#!/usr/bin/perl # The first line might be #!/usr/local/bin/perl etc # depending on the system. # # Do not delete this comment.##################### if ( $#ARGV < 0 ) { print "usage: $0 homfile prefix_file table_file_root.\n"; print "Note that there are three outputs: *.tbl, *.tblA, and *.tblS.\n"; print "This program analyzes number of groups for each species.\n"; print "prefix file should contain number in the first line and \n"; print "prefixes to be counted in the following lines, one per line. \n"; print "Compatible with gclust version 3.5.2f.\n"; exit 1; } $| = 1; $infile = $ARGV[0]; # hom file $prefix_file = $ARGV[1]; # prefix file $table_file_root = $ARGV[2]; # table file root $tbl = $table_file_root . ".tbl"; $tblA = $tbl . "A"; # all list $tblS = $tbl . "S"; # sorted list @prefix = (); open(PREFIX, $prefix_file) || die "Unable to open file: $prefix_file.\n"; @prefix = ; $numprefix = $prefix[0]; chop($numprefix); $number = @prefix; if ( $number < $numprefix + 1 ) { print STDOUT "Error in reading prefix file.\n"; } close(PREFIX); open(TBL, "> $tbl") || die "Unable to open file: $tbl.\n"; print TBL "input file is $infile.\n"; print TBL "prefix file is $prefix_file.\n"; print TBL "Number\tID \tLength\tseqs homologs"; for ( $i = 1; $i <= $numprefix; $i++ ){ chop($prefix[$i]); $temp = $prefix[$i]; $temp =~ s/_$//; print TBL "\t", $temp; } print TBL "\tAnnotations \n\n"; @all = ""; $first_line = ""; $string = "SeqID \tGroup\tLength\tseqs\thomologs"; for ( $i = 1; $i <= $numprefix; $i++ ){ $temp = $prefix[$i]; $temp =~ s/_$//; $temp = "\t" . $temp; $string .= $temp; } $string .= "\tAnnotations \n\n"; $first_line = $string; $string = ""; open(INFILE, $infile) || die "Unable to open file: $infile.\n"; open(TBLA, "> $tblA") || die "Unable to open file: $tblA.\n"; open(TBLS, "> $tblS") || die "Unable to open file: $tblS.\n"; print TBLA $first_line; print TBLS $first_line; $flag = 0; $groups = 1; $count[0] = 0; for ( $i = 1; $i <= $numprefix; $i++ ) { $count[$i] = 0; } $annot = ""; $toskip = 0; while(){ $line = $_; if ( $toskip == 1 ) { if ( $line =~ /^END/ || $line =~ /END\ Related\ groups/ ) { $toskip = 0; next; } else { next; } } if ( $line =~ /Related\ groups:/ ) { if( $line =~ /END\ Related\ groups/ ) { $toskip = 0; next; } else { $toskip = 1; next; } } if ( $line =~ /^Input/ || $line =~ /^Output\ of\ gclust/ || $line =~ /^Homology/ || $line =~ /^Threshold/ ) { next; } if ( $line =~ /^Host\ name\:\ / || $line =~ /^Started\:\ / ) { next; } if ( $line =~ /repeat\ mode\ with/ || $line =~ /clique\ mode/ ) { next; } if ( $line eq "\n" ) { next; } if ( $line =~ /^List\ of\ variables/ ) { $toskip = 1; next; } chop($line); if ( $flag == 0 && $line =~ /^Group/ ) { @linelist = split(/\s+/, $line); #split line with more than one spaces as delimiter $groups = $linelist[1]; $groups =~ s/\:$//; $numseq = $linelist[2]; $numhomo = $numseq; $counter = 0; $profile = ""; $flag = 1; next; } @linelist = split(/\t/, $line); #split line with a tab $items = @linelist; if ( $flag == 1 ) { if ( $line eq "" ) { next; } $annot = $linelist[ $items - 1 ]; print TBL $groups, "\t", $linelist[0], "\t", $linelist[1], "\t"; $flag = 0; @IDlist = ""; @annot_list = ""; $IDlist = 0; } $counter += 1; $IDname = $linelist[0]; $annot = $linelist[ $items - 1 ]; $IDlist_temp = $linelist[0] . "\t" . $groups . "\t" . $linelist[1]; push(@IDlist,$IDlist_temp); push(@annot_list,$annot); for ( $i = 1; $i <= $numprefix; $i++ ) { if ( $IDname =~ /^$prefix[$i]/ ) { $count[$i]++; last; } } if ( $counter >= $numseq ) { $profile = $numseq . "\t" . $numhomo; for ( $i = 1; $i <= $numprefix; $i++ ) { $profile = $profile . "\t" . $count[$i]; } $profile2 = $profile . "\t" . $annot_list[1]; $profile2 .= "\n"; print TBL $profile2; for($j = 1; $j<=$numseq; $j++){ $tblA_line = $IDlist[$j] . "\t" . $profile . "\t" . $annot_list[$j] . "\n"; push(@all, $tblA_line); print TBLA $tblA_line; } print TBLA "\n"; $annot = ""; for ( $i = 1; $i <= $numprefix; $i++ ) { $count[$i] = 0; } $counter = 0; $numseq = 0; $numhomo = 0; } } @sorted = sort(@all); $number = $#sorted; for($j = 1; $j<=$number; $j++){ print TBLS $sorted[$j]; } print TBLS "\n"; close TBL; close TBLA; close TBLS; #end