#!/usr/bin/perl # The first line might be #!/usr/local/bin/perl etc # depending on the system. # # Do not delete this comment.##################### if ( $#ARGV < 0 ) { print "usage: $0 infile_root.\n"; print "This script reads two files **.fa and **.p.table\n"; print "and checks duplication. Output is a sorted list:\n"; print "**.gfa and **.g.table.\n"; print "This version is used when the sizes of table and list are identical.\n"; exit 1; } $| = 1; $infile_root = $ARGV[0]; $fafile = $infile_root . ".fa"; $ptable = $infile_root . ".p.table"; print "Input files are $fafile and $ptable.\n"; open(FAFILE, $fafile) || die "Unable to open file: $fafile.\n"; open(PTABLE, $ptable) || die "Unable to open file: $ptable.\n"; @tabledata = ; close PTABLE; @sorted_tabledata = sort @tabledata; $number_tabledata = @tabledata; @fadata = (); $i = 0; while() { $line = $_; if ( $line eq "\n" ) { next; } if ( $line =~ /^\>/ ) { chomp($line); $line =~ s/^\>//; $line = $i++ . "\t" . $line; push(@fadata,$line); } } close(FAFILE); @sorted_fadata = sort fasort @fadata; $number_fadata = @fadata; sub fasort { @a1 = split(/\t/,$a); @b1 = split(/\t/,$b); return $a1[1] cmp $b1[1]; } print "Number of items in $ptable = $number_tabledata.\n"; print "Number of items in $fafile = $number_fadata.\n"; $error = 0; print "\n"; for( $i = 0; $i < $number_tabledata - 1; $i++ ) { print "\r"; print $i; $line = $sorted_tabledata[$i]; $line2 = $sorted_tabledata[$i+1]; chomp($line); chomp($line2); @items = split(/\t/,$line); @items2 = split(/\t/,$line2); if($items[0] =~ /ORF\s+/) { print "Name should not be simply ORF in $ptable line $i.\n"; $error += 1; } if ( $items[0] eq $items2[0] ) { print "Name duplication detected in $ptable line $i: $items[0].\n"; $error += 1; } } print "\n"; for( $i = 0; $i < $number_fadata - 1; $i++ ) { print "\r"; print $i; $line = $sorted_fadata[$i]; $sorted_fadata[$i] = $i . "\t" . $sorted_fadata[$i]; $line2 = $sorted_fadata[$i+1]; chomp($line); chomp($line2); @items = split(/\s+/,$line); @items2 = split(/\s+/,$line2); if($items[1] =~ /ORF\s+/) { print "Name should not be simply ORF in $fafile line $i.\n"; print "$sorted_fadata[$i]\n"; $error += 1; } if ( $items[1] eq $items2[1] ) { print "Name duplication detected in $fafile sequence $i: $items[1].\n"; $error += 1; } } $sorted_fadata[$i] = $i . "\t" . $sorted_fadata[$i]; if ( $error > 0 ) { print "Error in names was found. $error errors.\n"; exit 1; } if ( $number_fadata != $number_tabledata ) { print "Number of items in $ptable is not equal to the number of sequences in $fafile.\n"; print "They are, respectively, $number_tabledata and $number_fadata.\n"; exit 1; } print "\n"; $error = 0; for( $i = 0; $i < $number_tabledata; $i++ ) { print "\r"; print $i; $line = $sorted_tabledata[$i]; chomp($line); @items = split(/\t/,$line); $line2 = $sorted_fadata[$i]; chomp($line2); @items2 = split(/\s+/,$line2); if ( $items[0] eq $items2[2] ) { last }; for( $j = 0; $j < $number_fadata; $j++ ) { $line2 = $sorted_fadata[$j]; chomp($line2); @items2 = split(/\s+/,$line2); if ( $items[0] eq $items2[2] ) { last }; } if ( $j == $number_fadata ) { print "Name in $ptable is not found in $fafile: $items[0].\n"; $error += 1; } } if ( $error > 0 ) { print "Name inconsistency error was found. $error errors.\n"; exit 1; } @recovered_fadata = sort(numeric @sorted_fadata); sub numeric { @a1 = split(/\t/,$a); @b1 = split(/\t/,$b); return $a1[1]<=>$b1[1]; } #open(TABLE, "> table") || die "Unable to open file: table.\n"; #print TABLE @recovered_fadata , "\n"; #close TABLE; print "\n"; print "Making fa file and modifying gene name ...\n"; print "and making gclust table ...\n"; $gfafile = $infile_root . ".gfa"; $gtable = $infile_root . ".g.table"; print "Output files are $gfafile and $gtable.\n"; open(GTABLE, "> $gtable") || die "Unable to open file: $gtable.\n"; print GTABLE "GCLUST_TABLE3\t$number_tabledata\n"; if ( -f $gfafile ) { system ("rm -f $gfafile" ); } open(FAFILE, $fafile) || die "Unable to open file: $fafile.\n"; open(OUT, "> out") || die "Unable to open temporary file: out.\n"; $i=1; $k_bak = 0; while(){ print "\r"; print $i; $line = $_; if($line =~ /^\>/){ $line =~ s/^\>//; # print OUT ">g" . $i . " " . $line; print OUT ">g" . $i . "\n"; # annotation is removed from the gfa file @z = split(/\s+/,$line); $gene = $z[0]; @x = split(/\t/,$recovered_fadata[$i-1]); if($i - 1 != $x[1]) { print "Error occurred in finding items in recovered fadata.\n"; print "i=$i, x=$x[1]\n"; $offset = 0; } else { # $offset = $x[0] - 1; $offset = $k_bak; if($offset < 0){ $offset = 0; } } for( $j = 0; $j < $number_tabledata; $j++ ) { $k = $j + $offset; if($k >= $number_tabledata) { $k -= $number_tabledata; } $line2 = $tabledata[$k]; chomp($line2); @linelist = split(/\t/, $line2); #split line with a tab if($linelist[0] eq $gene){ $k_bak = $k; $annotlen = length($linelist[2]); if ( $annotlen > 255 ) { $newannot = substr($linelist[2],0,254); $linelist[2] = $newannot; $tabledata[$j] = $linelist[0] . "\t" . $linelist[1] . "\t" . $linelist[2] . "\n"; } elsif($annotlen == 0) { $linelist[2] = "no description"; $tabledata[$j] = $linelist[0] . "\t" . $linelist[1] . "\t" . $linelist[2] . "\n"; } $line2 = $i . "\t" . $line2 . "\n"; print GTABLE $line2; last; } } if($j ==$number_tabledata){ print STDERR "\nError! Item $gene is not found in the p.table.\n"; } $i++; } else { print OUT $line; } } close GTABLE; close OUT; close FAFILE; system ("mv out $gfafile"); print "Now $gtable and $gfafile are ready.\n"; #end