/****************************************
tbsort2.c
This is a C version of tbsort.
Input files are: 
  table file (***.tbl)
  group definition file (grp_def)
  pattern definition file (pat_def)

Copyright: Naoki Sato 2005
****************************************/

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

/* Boolean constants used as function return codes (TRUE = error, FALSE = OK).
   Guarded so that an earlier definition, if any, is kept. */
#ifndef TRUE
#define TRUE 1
#endif
#ifndef FALSE
#define FALSE 0
#endif
/* Size passed to fgets() when reading the group and pattern definition files */
#define MAXLINE 512
#define TABLELINE 2000	/* Size passed to fgets() when reading the table file */

/* Structure definition */
/* One data row of the table file, plus the pattern results computed from it.
   The arrays a, b and c are heap blocks allocated separately from the struct. */
typedef struct {
	int group;	/* group number (first table column) */
	char ID[100];	/* second table column */
	int length;	/* third table column, parsed as an integer */
	int seqs;	/* fourth table column, parsed as an integer */
	int *a;		/* num_sp entries: sequences per species, in table column order */
	int *b;		/* num_pat_type entries: per pattern, number of species of the
			   pattern's group that have a[] > 0 (set by computePattern) */
	int *c;		/* num_pat_type entries: 1 if b[] >= number*threshold of the
			   pattern, else 0 (set by computePattern) */
	char annot[100];	/* last table column (annotation text) */
} homologGroup;

/* One entry of the group definition file: a species name and the id of the
   species group it is assigned to. */
typedef struct {
	char name[100];	/* species name; matched against the table header columns */
	int grpID;	/* species-group id; compared with pattern.grpID */
} species;

/* One entry of the pattern definition file. */
typedef struct {
	int grpID;	/* species-group id this pattern refers to (read from file) */
	int number;	/* number of species carrying grpID (filled by enumerateGroups) */
	double threshold;	/* read from file; a pattern is satisfied when the
				   species count is >= number * threshold */
	char annot[100];	/* label read from file; used as a column heading */
} pattern;


/* globals */
int num_sp;		/* number of species (from the group file; raised if more entries are found) */
int num_grp;	/* number of homolog groups (from argv[2]; raised by readTable if the table is longer) */
int num_pat_type;	/* number of pattern types (from the pattern file; raised if more entries are found) */
species *sp,*sp_temp;	/* sp_temp: species in group-file order; sp: same species in table-column order */
pattern *pat;	/* num_pat_type pattern definitions */
homologGroup *HG;	/* homolog groups, indexed 1..num_grp (element 0 is not used) */

char fileGroup[100];	/* group definition file name (argv[4]) */
char filePattern[100];	/* pattern definition file name (argv[5]) */
char fileTable[100];	/* table file name (argv[1]) */
char outfile[100];	/* output file name, also the prefix of the per-pattern list files (argv[3]) */
int outflag;	/* argv[6] as an integer; not read anywhere else in this file */


/* prototype declaration */
/* The read* functions return TRUE on error and FALSE on success. */
int readGroup(void);	/* load sp_temp and num_sp from fileGroup */
int readPattern(void);	/* load pat and num_pat_type from filePattern */
int readTable(void);	/* load sp and HG from fileTable */
void computePattern(void);	/* fill HG[].b and HG[].c */
void printGroup(species *sp1);	/* print a species list to stdout */
void printPattern(void);	/* print the pattern definitions to stdout */
void printPatternTable(void);	/* write HG[].b and HG[].c to outfile */
void enumerateGroups(void);	/* fill pat[].number */
void printSummary(void);	/* per 0/1 combination: count on stdout, group list in a file */
void help(void);	/* print the usage line */


/***********************************/
void help(void)
/***********************************/
{
	/* Command-line synopsis, written verbatim to standard output. */
	static const char usage[] =
		"Usage: tbsort2 table_file number_of_groups outfile group_definition pattern_definition output_flag\n";

	fputs(usage,stdout);
}


/***********************************/
int readTable(void)
/***********************************/
{
	int i,j,k;
	FILE *fin;
	char str1[TABLELINE+1];
	char temp[100];
	char *word;
	int grp_number;
	int error_flag;

	if((fin=fopen(fileTable,"r"))==NULL){
		fprintf(stderr,"Error in reading table file: %s\n",fileTable);
		exit(1);
	}
	error_flag = 0;
	i = 0;
	while(fgets(str1,TABLELINE,fin)!=NULL){
		if(!strcmp(str1,"\n")) continue;	/* March 13 */
		if(i==0){
			sscanf(str1,"%s",temp);
			if(strcmp(temp,"Number")){
				/* May 12, 2007 */
				if(error_flag == 0){
					fgets(str1,TABLELINE,fin);
					error_flag = 1;
					continue;
				} else {
					fprintf(stderr,"Error in table format.\n");
					return TRUE;
				}
			}
			word = strtok(str1,"\t");
			for(j=0;j<3;j++) word = strtok(NULL,"\t");
			for(j=0;j<num_sp;j++){
				word = strtok(NULL,"\t");
				for(k=0;k<num_sp;k++){
					if(!strcmp(word,sp_temp[k].name)){
						strcpy(sp[j].name,sp_temp[k].name);
						sp[j].grpID = sp_temp[k].grpID;
						break;
					}
				}
				if(k==num_sp){
					fprintf(stderr,"Name %s not found in species list.\n",word);
					fprintf(stderr,"Error in species names in the table header.\n");
					return TRUE;
				}
			}
			word = strtok(NULL,"\t");
			if(strncmp(word,"Annotations",11)){
				fprintf(stderr,"Error in species names in the table header 2.\n");
				return TRUE;
			}
	
			i = +1;
			continue;
		}

		if(!strcmp(str1,"\n")) continue;

		word = strtok(str1,"\t");
		sscanf(word,"%d",&grp_number);
		if(grp_number != i){
			fprintf(stderr,"Error in group number during the reading of table file.\n");
			return TRUE;
		}
		if(num_grp < grp_number){
			if((HG=(homologGroup*)realloc(HG,(grp_number+1)*sizeof(homologGroup)))==NULL){
				fprintf(stderr,"Memory allocation error readTable 1.\n");
				exit(1);
			}	
			for(i=num_grp+1;i<=grp_number;i++){
				if((HG[i].a=(int*)calloc(num_sp,sizeof(int)))==NULL){
					fprintf(stderr,"Memory allocation error at readTable 2.\n");
					exit(1);
				}
				if((HG[i].b=(int*)calloc(num_pat_type,sizeof(int)))==NULL){
					fprintf(stderr,"Memory allocation error at readTable 3.\n");
					exit(1);
				}
			}
			num_grp = grp_number;
		}
		HG[grp_number].group = grp_number;	
		word = strtok(NULL,"\t");
		strcpy(HG[grp_number].ID,word);
		word = strtok(NULL,"\t");
		sscanf(word,"%d",&HG[grp_number].length);
		word = strtok(NULL,"\t");
		sscanf(word,"%d",&HG[grp_number].seqs);
		word = strtok(NULL,"\t");

		for(j=0;j<num_sp;j++){
			word = strtok(NULL,"\t");
			sscanf(word,"%d",&HG[grp_number].a[j]);
		}

		word = strtok(NULL,"\t");
		strcpy(HG[grp_number].annot,word);
		i += 1;
	}
	printf("Table file was successfully read.\n");
	printf("Number of groups: %d\n",i);
	fflush(stdout);

	fclose(fin);	
	return FALSE;
}
/***********************************/



/***********************************/
int readGroup(void)
/***********************************/
{
	int i;
	FILE *fin;
	char str1[MAXLINE+1];
	char temp[100];
	int flag;


	if((fin=fopen(fileGroup,"r"))==NULL){
		fprintf(stderr,"Error in reading group file: %s\n",fileGroup);
		return TRUE;
	}
	fgets(str1,MAXLINE,fin);
	sscanf(str1,"%d",&num_sp);
	if(num_sp <= 0){
		fprintf(stderr,"Incorrect number of species in group file.\n");
		return TRUE;
	}
	if((sp_temp=(species*)calloc(num_sp,sizeof(species)))==NULL){
		fprintf(stderr,"Momory allocation error in readGroup 1.\n");
		exit(1);
	}
	flag = 0;
	i = 0;
	while(fgets(str1,MAXLINE,fin)!=NULL){
		sscanf(str1,"%s",temp);
		if(!strncmp(temp,"//END",5)){
			flag = 1;
			break;
		}
		if(i >= num_sp){
			num_sp = i+1;
			if((sp_temp=(species*)realloc(sp_temp,num_sp*sizeof(species)))==NULL){
				fprintf(stderr,"Momory allocation error in readGroup 2.\n");
				exit(1);
			}
		}
		sscanf(str1,"%s %d",sp_temp[i].name,&sp_temp[i].grpID);
		i+=1;
	}

	if(flag==0){
		fprintf(stderr,"File end reached. //END line is missing.\n");
	}
	printf("Group definition file was successfully read.\n");
	printf("Number of species: %d\n",num_sp);

	fclose(fin);
	fflush(stdout);
	fflush(stderr);

	return FALSE;
}
/***********************************/



/***********************************/
int readPattern(void)
/***********************************/
{
	int i;
	FILE *fin;
	char str1[MAXLINE+1];
	char temp[100];
	int flag;


	if((fin=fopen(filePattern,"r"))==NULL){
		fprintf(stderr,"Error in reading pattern file: %s\n",filePattern);
		return TRUE;
	}
	fgets(str1,MAXLINE,fin);
	sscanf(str1,"%d",&num_pat_type);
	if(num_pat_type <= 0){
		fprintf(stderr,"Incorrect number of pattern type in pattern file.\n");
		return TRUE;
	}
	if((pat=(pattern*)calloc(num_pat_type,sizeof(pattern)))==NULL){
		fprintf(stderr,"Momory allocation error in readPattern 1.\n");
		exit(1);
	}
	flag = 0;
	i = 0;
	while(fgets(str1,MAXLINE,fin)!=NULL){
		sscanf(str1,"%s",temp);
		if(!strncmp(temp,"//END",5)){
			flag = 1;
			break;
		}
		if(i >= num_pat_type){
			num_pat_type = i+1;
			if((pat=(pattern*)realloc(pat,num_pat_type*sizeof(pattern)))==NULL){
				fprintf(stderr,"Momory allocation error in readPattern 2.\n");
				exit(1);
			}
		}
		sscanf(str1,"%d %lf %s",&pat[i].grpID,&pat[i].threshold,pat[i].annot);
		i+=1;
	}

	if(flag==0){
		fprintf(stderr,"File end reached. //END line is missing.\n");
	}
	printf("Pattern definition file was successfully read.\n");
	printf("Number of patterns: %d\n",num_pat_type);

	fclose(fin);
	fflush(stdout);
	fflush(stderr);

	return FALSE;
}
/***********************************/


/***********************************/
void enumerateGroups(void)
/***********************************/
/* For every pattern, store in pat[].number how many entries of sp_temp
   carry the pattern's group id. */
{
	int p,s;
	int members;

	for(p=0;p<num_pat_type;p++){
		members = 0;
		for(s=0;s<num_sp;s++){
			if(sp_temp[s].grpID != pat[p].grpID) continue;
			members++;
		}
		pat[p].number = members;
	}
}
/***********************************/


/***********************************/
void computePattern(void)
/***********************************/
{
	int i,j,k;

	for(i=1;i<=num_grp;i++){
		for(j=0;j<num_pat_type;j++){
			HG[i].b[j] = 0;
			for(k=0;k<num_sp;k++){
				if(pat[j].grpID == sp[k].grpID){
					if(HG[i].a[k] > 0) HG[i].b[j] += 1;
				}
			}
			if(HG[i].b[j] >= pat[j].number * pat[j].threshold) HG[i].c[j] = 1;
			else HG[i].c[j] = 0;
		}
	}
}
/***********************************/



/***********************************/
void printGroup(species *sp1)
/***********************************/
/* Print the first num_sp entries of sp1 to stdout as "name<TAB>group id"
   lines under a short heading, then flush stdout. */
{
	species *cur;
	species *end = sp1 + num_sp;

	fputs("Group assignment list\n",stdout);
	fputs("\n",stdout);
	fputs("Species       Group\n",stdout);
	fputs("\n",stdout);
	for(cur=sp1;cur<end;cur++){
		printf("%s\t%d\n",cur->name,cur->grpID);
	}
	fputs("\n",stdout);
	fflush(stdout);
}
/***********************************/


/***********************************/
void printPattern(void)
/***********************************/
/* Print every pattern definition (group id, species count, threshold and
   annotation) to stdout under a short heading, then flush stdout. */
{
	int p = 0;

	fputs("Pattern definition\n",stdout);
	fputs("\n",stdout);
	fputs("Group    Number   Threshold    Annotation\n",stdout);
	fputs("\n",stdout);
	while(p < num_pat_type){
		printf("%3d\t%4d\t%6.2f\t%s\n",pat[p].grpID,pat[p].number,pat[p].threshold,pat[p].annot);
		p++;
	}
	fputs("\n",stdout);
	fflush(stdout);
}
/***********************************/


/***********************************/
void printPatternTable(void)
/***********************************/
{
	int i,j;
	FILE *fout;


	if((fout=fopen(outfile,"w"))==NULL){
		fprintf(stderr,"Error in writing pattern table\n");
		return;
	}
	
	fprintf(fout,"Group             Name       ");
	for(i=0;i<num_pat_type;i++) fprintf(fout,"%s ",pat[i].annot);
	fprintf(fout," ->-> ");
	for(i=0;i<num_pat_type;i++) fprintf(fout,"%s ",pat[i].annot);
	fprintf(fout,"\n");

	for(i=1;i<=num_grp;i++){		/* March 13 */
		fprintf(fout,"%5d %20s  ",HG[i].group,HG[i].ID);
		for(j=0;j<num_pat_type;j++){
			fprintf(fout,"%8d ",HG[i].b[j]);
		}
		fprintf(fout,"  ->-> ");
		for(j=0;j<num_pat_type;j++){
			fprintf(fout,"%8d ",HG[i].c[j]);
		}
		fprintf(fout,"\n");
	}
	fclose(fout);
	
}
/***********************************/


/***********************************/
void printSummary(void)
/***********************************/
/*
 * Print, for every combination of satisfied/unsatisfied pattern flags, the
 * number of homolog groups whose c[] flags equal that combination.
 *
 * For each combination the matching groups (number, ID and per-species
 * counts) are also written to a file whose name is outfile followed by one
 * '0' or '1' per pattern type.  If such a file cannot be opened, an error is
 * reported and the count is still printed.
 *
 * Nothing is done when the number of combinations cannot be held in an int.
 */
{
	int i,j,m,n,counter;
	int max;
	int *temp;
	size_t baselen;
	FILE *fout;
	char *outfile2;

	/* 2^num_pat_type must fit in an int; computed with a shift, not pow() */
	if(num_pat_type <= 0 || num_pat_type >= (int)(sizeof(int)*8) - 1){
		fprintf(stderr,"Unsupported number of pattern types: %d\n",num_pat_type);
		return;
	}
	max = 1 << num_pat_type;

	/* The list file name is sized to fit instead of using a fixed buffer. */
	baselen = strlen(outfile);
	temp = (int*)calloc(num_pat_type,sizeof(int));
	outfile2 = (char*)malloc(baselen+(size_t)num_pat_type+1);
	if(temp==NULL || outfile2==NULL){
		fprintf(stderr,"Memory allocation error in printSummary.\n");
		exit(1);
	}
	memcpy(outfile2,outfile,baselen);
	outfile2[baselen+num_pat_type] = '\0';

	printf("Summary of patterns\n");
	for(i=0;i<num_pat_type;i++) printf("%8s ",pat[i].annot);
	printf("   Groups");
	printf("\n");

	for(n=0;n<max;n++){
		/* bit m of n is the flag wanted for pattern m */
		for(m=0;m<num_pat_type;m++){
			temp[m] = (n >> m) & 1;
			printf("%8d ",temp[m]);
			outfile2[baselen+m] = temp[m] ? '1' : '0';
		}
		printf("   ");
		if((fout=fopen(outfile2,"w"))==NULL){
			fprintf(stderr,"Error in writing pattern list: %s\n",outfile2);
		}
		counter = 0;
		for(i=1;i<=num_grp;i++){		/* March 13 */
			for(m=0;m<num_pat_type;m++){
				if(temp[m] != HG[i].c[m]) break;
			}
			if(m!=num_pat_type) continue;
			counter += 1;
			if(fout==NULL) continue;	/* keep counting without the list file */
			fprintf(fout,"%d\t%20s ",HG[i].group,HG[i].ID);
			for(j=0;j<num_sp;j++){
				fprintf(fout,"%5d ",HG[i].a[j]);
			}
			fprintf(fout,"\n");
		}
		printf("%5d\n",counter);
		if(fout!=NULL) fclose(fout);

	}
	printf("\n");

	free(outfile2);
	free(temp);
}
/***********************************/

/*************************************************/
int main(int argc, char**argv)
/************************************************/
/*
 * Arguments: table_file number_of_groups outfile group_definition
 *            pattern_definition output_flag
 *
 * Reads the group and pattern definitions, allocates the homolog-group
 * array, reads the table, computes the pattern flags and writes the pattern
 * table and the summary.  Exits with status 1 on any error.
 */
{
	int i;

	/* All six arguments are read below, so all of them must be present. */
	if(argc < 7){
		help();
		exit(1);
	}
	/* The file names are copied into fixed-size buffers; reject what does not fit. */
	if(strlen(argv[1]) >= sizeof fileTable || strlen(argv[3]) >= sizeof outfile ||
	   strlen(argv[4]) >= sizeof fileGroup || strlen(argv[5]) >= sizeof filePattern){
		fprintf(stderr,"File name argument is too long.\n");
		exit(1);
	}
	strcpy(fileTable,argv[1]);
	num_grp = atoi(argv[2]);
	strcpy(outfile,argv[3]);
	strcpy(fileGroup,argv[4]);
	strcpy(filePattern,argv[5]);
	outflag = atoi(argv[6]);

	/* num_grp sizes the HG array below; readTable enlarges it if needed. */
	if(num_grp < 0){
		fprintf(stderr,"Incorrect number of groups: %s\n",argv[2]);
		exit(1);
	}

	if(readGroup()) exit(1);
	printGroup(sp_temp);
	if(readPattern()) exit(1);
	enumerateGroups();
	printPattern();

/* now num_sp and num_pat_type are determined */
	/* HG is indexed 1..num_grp, hence the extra element */
	if((HG=(homologGroup*)calloc((size_t)num_grp+1,sizeof(homologGroup)))==NULL){
		fprintf(stderr,"Memory allocation error at location 1.\n");
		exit(1);
	}
	for(i=1;i<=num_grp;i++){
		if((HG[i].a=(int*)calloc(num_sp,sizeof(int)))==NULL){
			fprintf(stderr,"Memory allocation error at location 2.\n");
			exit(1);
		}
		if((HG[i].b=(int*)calloc(num_pat_type,sizeof(int)))==NULL){
			fprintf(stderr,"Memory allocation error at location 3.\n");
			exit(1);
		}
		if((HG[i].c=(int*)calloc(num_pat_type,sizeof(int)))==NULL){
			fprintf(stderr,"Memory allocation error at location 4.\n");
			exit(1);
		}
	}

	/* sp receives the species in the column order of the table header */
	if((sp=(species*)calloc(num_sp,sizeof(species)))==NULL){
		fprintf(stderr,"Momory allocation error at location 5.\n");
		exit(1);
	}
	if(readTable()) exit(1);
	computePattern();
	printPatternTable();
	printSummary();

	return FALSE;
}
