/*******************************************
node.c
GCLUST
previously a part of lsort3.h 
This program reads a list file (generated from BLAST result by bl2ls.pl),
and assembles similarity groups. Output is a *.grp file.
This is a C version of the lsort2b.pl.
  Copyright Naoki Sato 2002.

Added functionality of using catenated list file. June 2, 2002.
Added msort, lnkann. June 16, 2002.
Added SQlist. June 17, 2002.
Added lnkdb. June 25, 2002.
Selection of master entry. June 25, 2002.
Added hom. July 2, 2002.
Matrix is now in float. July 5, 2002.
Matrix is now in int. July 6, 2002.
score is now in float. July 6, 2002. Now double, May 13, 2003.
Added sublist5. July 7, 2002.
Phase 1. July 19, 2002.
RemoveUnrelated returns unsigned. Sept. 5, 2002.
Unnecessary output suppressed. Dec. 30, 2002.
Reconstruction of source code. April 30, 2003.
Gclust3 format. Dec. 25, 2003.
*******************************************/

#include "defines.h"

void SetGrpNo(Node **a,unsigned nodes);
void AssignOrg(Node **a,unsigned nodes);
org_list *ReadOrgFile(FILE *fin);
char *OrgPrefix(int s);
void InitDomain(Node **a, unsigned nodes);
int ReadVariables2(FILE *fin);
void SetItem0(Node **a,unsigned nodes);
Node **ReadData(FILE *fin,Node **a,unsigned *nodes,char *listfile,char *table);
Node **InitNode(unsigned i);
Node **ReallocNode(Node **a,unsigned i1,unsigned i2);
Node **ReadNode2(FILE *fin,Node **a,unsigned *nodes);
Node **WriteNode(Node **a,unsigned i,unsigned *nodes_p,char *line1);
Node **ReadAnnot2(Node **a,unsigned *nodes_p,FILE *ftable);
unsigned FindNode(Node **a,unsigned nodes,unsigned i);
void OptimizeNode(Node **a,unsigned nodes);
void CheckNode(Node **a,unsigned nodes);
Node **AssignSingletons(Node **a,unsigned nodes);
unsigned *MakeTable(Node **a,unsigned n);
void SelectNode2(Node **a,unsigned nodes);
void RecoverNode(Node **a,unsigned nodes);
unsigned GetMax_n4(Node **a,unsigned nodes);
unsigned GetMax_n3(Node **a,unsigned nodes);
unsigned GetMax_n3i(Node **a,unsigned nodes);
void SortList(Node **a,unsigned nodes,Boolean flag);
void qSortList(Node **a,unsigned L,unsigned r,Boolean flag);
int partition(Node **a,unsigned L,unsigned r);
int partition2(Node **a,unsigned L,unsigned r);
Node **ReadNodeM8(FILE *fin,Node **a,unsigned *nodes);
Node **WriteNodeM8(Node **a,unsigned i,unsigned *nodes_p,char *line1);

#ifndef __NODE
#define __NODE
#endif
#include "gclust.h"


/*********************************************************/
void SetGrpNo(Node **a,unsigned nodes)
/* Numbers the groups: walking a[1]..a[nodes] in order, every node
   whose n3 is non-zero receives the next consecutive number
   (starting at 1) in grpno. Nodes with n3 == 0 are left untouched. */
/*********************************************************/
{
	unsigned idx;
	unsigned last_no = 0;	/* last group number handed out */

	for(idx = 1; idx <= nodes; idx++){
		if(a[idx]->n3 == 0) continue;
		last_no += 1;
		a[idx]->grpno = last_no;
	}
}
/*********************************************************/


/***************************************************************/
void InitDomain(Node **a, unsigned nodes)
/* For every active node whose domain flag is 0, sets up a single
   domain entry in dmlist[0] that spans positions 1..len of the node
   itself, and sets DMnum and DMmax to 1. Other nodes are skipped. */
/***************************************************************/
{
	unsigned idx;
	Node *nd;

	for(idx = 1; idx <= nodes; idx++){
		nd = a[idx];
		if(nd->domain == 0 && nd->active){
			nd->DMnum = 1;
			nd->DMmax = 1;
			nd->dmlist[0].nodeID = nd->n0;
			nd->dmlist[0].start = 1;
			nd->dmlist[0].end = nd->len;
			nd->dmlist[0].ID = 1;
		}
	}
}
/***************************************************************/


/***************************************************************/
unsigned GetMax_n4(Node **a,unsigned nodes)
/* Returns the largest n4 among a[1]..a[nodes]; 0 when no node has
   a positive n4 (or when nodes is 0). */
/***************************************************************/
{
	unsigned best = 0;
	unsigned idx = 1;

	while(idx <= nodes){
		if(a[idx]->n4 > best){
			best = a[idx]->n4;
		}
		idx++;
	}

	return best;
}
/***************************************************************/


/***************************************************************/
unsigned GetMax_n3(Node **a,unsigned nodes)
/* Returns the largest n3 among the nodes a[1]..a[nodes] whose
   domain flag is 0; nodes with a non-zero domain are ignored.
   The result is 0 when no such node has a positive n3. */
/***************************************************************/
{
	unsigned best = 0;
	unsigned idx = 1;

	while(idx <= nodes){
		if(a[idx]->domain == 0 && a[idx]->n3 > best){
			best = a[idx]->n3;
		}
		idx++;
	}

	return best;
}
/***************************************************************/


/***************************************************************/
unsigned GetMax_n3i(Node **a,unsigned nodes)
/* Returns the index (1..nodes) of the first node with the largest
   n3 among the nodes whose domain flag is 0. Returns 0 when no
   such node has a positive n3. */
/***************************************************************/
{
	unsigned best = 0;	/* largest n3 seen so far */
	unsigned best_idx = 0;	/* index where it was seen */
	unsigned idx = 1;

	while(idx <= nodes){
		if(a[idx]->domain == 0 && a[idx]->n3 > best){
			best_idx = idx;
			best = a[idx]->n3;
		}
		idx++;
	}

	return best_idx;
}
/***************************************************************/



/***************************************************************/
void SetItem0(Node **a,unsigned nodes)
/* Makes sure that the self-homology item (the item whose ID equals
   the node's own n0) sits at position 0 of sqlist1.
   If item 0 already is the self item, the node is left alone when
   its score is <= 1.0 or when the list has fewer than two items.
   Otherwise the first self item found at positions 1..n1-1 is
   swapped into position 0; one such item is sufficient. */
/***************************************************************/
{
	unsigned idx,pos;
	Node *nd;

	for(idx = 1; idx <= nodes; idx++){
		nd = a[idx];

		if(nd->sqlist1[0].ID == nd->n0
			&& (nd->sqlist1[0].score <= 1.0 || nd->n1 < 2)){
			continue;	/* item 0 is already an acceptable self item */
		}

		/* look for the first self item behind position 0 */
		pos = 1;
		while(pos < nd->n1 && nd->sqlist1[pos].ID != nd->n0){
			pos++;
		}
		if(pos < nd->n1){
			swapSQlist(&a[idx]->sqlist1[0],&a[idx]->sqlist1[pos]);
		}
	}
}
/***************************************************************/


/*******************************************/
Node **InitNode(unsigned i)
/* Creates an array of i node pointers, a[0] to a[i-1], and allocates
   and initializes one Node for each of them.
   Each node gets n0 equal to its index, active = TRUE, len3 = 1,
   final_thr = 1.0, all other counters 0, empty name and annot, and
   one-element sqlist1, sqlist2, sqlist3, reflist5, sublist6 and dmlist
   arrays (the three SQlist entries are reset with clearSQlist).
   Returns the array, or NULL if i < 1 or an allocation fails; on an
   allocation failure everything allocated so far is released. */
/*******************************************/
{
	Node **a;
	Node *nd;
	unsigned j;
	unsigned k;

	if(i<1){
		fprintf(stderr, "Node member must be larger than or equal to 1 in InitNode.\n");
		return NULL;
	}

	/* The array holds pointers, so the element size is sizeof(Node *),
	   not sizeof(Node). calloc leaves every slot NULL, which the
	   cleanup code below relies on. */
	if((a=(Node**)calloc(i,sizeof *a))==NULL){
		fprintf(stderr, "Memory allocation error in InitNode 2\n.");
		return NULL;
	}

	for(j=0; j<i; j++){	
		if((nd=(Node*)calloc(1,sizeof *nd))==NULL){
			fprintf(stderr, "Memory allocation error in InitNode 3\n.");
			goto fail;
		}
		a[j]=nd;
		strcpy(nd->name,"");
		strcpy(nd->annot,"");
		nd->len=0;
		nd->corr_length=0;
		nd->active=TRUE;
		nd->n0=j;
		nd->n1=0;
		nd->n1b=0;
		nd->n2=0;
		nd->n2b=0;
		nd->len3=1;
		nd->n3=0;
		nd->n3_old=0;
		nd->n3b=0;
		nd->n30=0;
		nd->n4=0;
		nd->n5=0;
		nd->n6=0;
		nd->n7=0;
		nd->grpno = 0;
		nd->subgrpno = 0;
		nd->domain = 0;
		nd->DMnum = 0;
		nd->DMmax = 0;
		nd->final_thr=1.0;

		if((nd->sqlist1=(SQlist*)calloc(1,sizeof(SQlist)))==NULL){
			fprintf(stderr, "Memory allocation error in InitNode 4\n.");
			goto fail;
		}
		if((nd->sqlist2=(SQlist*)calloc(1,sizeof(SQlist)))==NULL){
			fprintf(stderr, "Memory allocation error in InitNode 5\n.");
			goto fail;
		}
		if((nd->sqlist3=(SQlist*)calloc(1,sizeof(SQlist)))==NULL){
			fprintf(stderr, "Memory allocation error in InitNode 6\n.");
			goto fail;
		}
		if((nd->reflist5=(unsigned*)calloc(1,sizeof(unsigned)))==NULL){
			fprintf(stderr, "Memory allocation error in InitNode 7\n.");
			goto fail;
		}
		if((nd->sublist6=(unsigned*)calloc(1,sizeof(unsigned)))==NULL){
			fprintf(stderr, "Memory allocation error in InitNode 8\n.");
			goto fail;
		}
		if((nd->dmlist=(DMlist*)calloc(1,sizeof(DMlist)))==NULL){
			fprintf(stderr, "Memory allocation error in InitNode 9\n.");
			goto fail;
		}
		clearSQlist(&nd->sqlist1[0]);
		clearSQlist(&nd->sqlist2[0]);
		clearSQlist(&nd->sqlist3[0]);
	}
	return a;

fail:
	/* Release every node created so far. Nodes are created in index
	   order, so the first NULL slot ends the scan. Members that were
	   not yet allocated are still zero from calloc, and free(NULL)
	   is a no-op. */
	for(k=0; k<i; k++){
		if(a[k]==NULL) break;
		free(a[k]->sqlist1);
		free(a[k]->sqlist2);
		free(a[k]->sqlist3);
		free(a[k]->reflist5);
		free(a[k]->sublist6);
		free(a[k]->dmlist);
		free(a[k]);
	}
	free(a);
	return NULL;
}
/***************************************************************/

/**************************************************************/
Node **ReallocNode(Node **a,unsigned i1,unsigned i2)
/* Grows the node array so that indices 0..i1+i2 are valid, and
   allocates and initializes the new nodes a[i1+1]..a[i1+i2] in the
   same way as InitNode does (n0 = index, active = TRUE, len3 = 1,
   final_thr = 1.0, other counters 0, one-element lists).
   i1 is the current value of nodes, i2 the number of nodes to add.
   Returns the (possibly moved) array, or NULL if i2 < 1 or an
   allocation fails. If the array itself cannot be enlarged, the
   original array is left valid and unchanged.
   NOTE(review): if a node allocation fails after the array was
   enlarged, NULL is returned and the enlarged array is not handed
   back to the caller - same as before; confirm callers exit then. */
/**************************************************************/
{
	Node **grown;
	Node *nd;
	unsigned j;
	unsigned s;

	fprintf(stderr,"ReallocNode is called with i1=%u, i2=%u.\n",i1,i2);
	if(i2<1){
		fprintf(stderr, "Node member must be larger than or equal to 1 in ReallocNode.\n");
		return NULL;
	}
	s=i1+i2+1;

	/* The array holds pointers: element size is sizeof(Node *).
	   The result goes into a temporary so that the old block is not
	   lost when realloc fails. */
	if((grown=(Node**)realloc(a,s*sizeof *grown))==NULL){
		fprintf(stderr, "Memory allocation error in ReallocNode 2\n.");
		return NULL;
	}
	a=grown;

	for(j=i1+1; j<s; j++){	
		if((nd=(Node*)calloc(1,sizeof *nd))==NULL){
			fprintf(stderr, "Memory allocation error in ReallocNode 3\n.");
			return NULL;
		}
		a[j]=nd;
		strcpy(nd->name,"");
		strcpy(nd->annot,"");
		nd->len=0;
		nd->corr_length=0;
		nd->active=TRUE;
		nd->n0=j;
		nd->n1=0;
		nd->n1b=0;
		nd->n2=0;
		nd->n2b=0;
		nd->len3=1;
		nd->n3=0;
		nd->n3_old=0;
		nd->n3b=0;
		nd->n30=0;
		nd->n4=0;
		nd->n5=0;
		nd->n6=0;
		nd->n7=0;
		nd->grpno = 0;
		nd->subgrpno = 0;
		nd->domain = 0;
		nd->DMnum = 0;
		nd->DMmax = 0;
		nd->final_thr=1.0;

		if((nd->sqlist1=(SQlist*)calloc(1,sizeof(SQlist)))==NULL){
			fprintf(stderr, "Memory allocation error in ReallocNode 4\n.");
			return NULL;
		}
		if((nd->sqlist2=(SQlist*)calloc(1,sizeof(SQlist)))==NULL){
			fprintf(stderr, "Memory allocation error in ReallocNode 5\n.");
			return NULL;
		}
		if((nd->sqlist3=(SQlist*)calloc(1,sizeof(SQlist)))==NULL){
			fprintf(stderr, "Memory allocation error in ReallocNode 6\n.");
			return NULL;
		}
		if((nd->reflist5=(unsigned*)calloc(1,sizeof(unsigned)))==NULL){
			fprintf(stderr, "Memory allocation error in ReallocNode 7\n.");
			return NULL;
		}
		if((nd->sublist6=(unsigned*)calloc(1,sizeof(unsigned)))==NULL){
			fprintf(stderr, "Memory allocation error in ReallocNode 8\n.");
			return NULL;
		}
		if((nd->dmlist=(DMlist*)calloc(1,sizeof(DMlist)))==NULL){
			fprintf(stderr, "Memory allocation error in ReallocNode 9\n.");
			return NULL;
		}
		clearSQlist(&nd->sqlist1[0]);
		clearSQlist(&nd->sqlist2[0]);
		clearSQlist(&nd->sqlist3[0]);
	}
	return a;
}
/***************************************************************/

/**************************************************************/
Node **WriteNode(Node **a,unsigned i,unsigned *nodes_p,char *line1)
/* Parses one list line and appends its homology items to sqlist1 of
   node a[i].

   line1 is a whitespace-delimited record (tabs or spaces):
     token 0        node name
     token 1        sequence length
     token 2        database name
     then groups of four tokens per hit:
       hit name (non-gclust3) or "g<number>" (gclust3 format),
       score, "Sstart-Send", "Qstart-Qend"
   so the total number of tokens must be 3 modulo 4.

   A line for a node that already has items (sqlist1[0].ID != 0) is
   added to the existing list. The global thr is used while reading:
   a hit whose score is >= thr*0.999, or whose name is not found among
   a[1]..a[*nodes_p], is dropped and its slot is reused by the next hit.

   Returns a on success. Returns NULL if i > *nodes_p, if the node has
   no name yet, if (outside gclust3 format) the name on the line differs
   from the node's name, if the token count is wrong, or if memory
   cannot be allocated. Exits the program if gclust3 is set and a hit
   name does not start with 'g'.

   The fix-up of n1 for nodes without a valid homology entry is not
   done here; per the original note it should be done once after
   ReadNode2 is finished. */
/**************************************************************/
{
	char str1[MAXLEN];
	char str2[MAXLEN];
	char str12[MAXLEN];
	char str11[MAXLEN];
	char *p;
	unsigned num=0;		/* number of items in line1 */
	Boolean flag=FALSE;	/* TRUE while inside a token */
	unsigned count=0;	/* index of the token being processed */
	unsigned k;
	unsigned curr_n1=0;	/* n1 before this line was added */
	unsigned num1=0;
	unsigned offset=0;
	double tmp=0.0;
	Boolean to_skip=FALSE;	/* current hit is to be dropped */
	Boolean addition_mode=FALSE;	/* The first item has been set */

	if(i>*nodes_p){
		fprintf(stderr, "Node member must be less than nodes in WriteNode.\n");
		return NULL;
	}
	if(a[i]->sqlist1[0].ID != 0) addition_mode=TRUE;

	/* str1 stays an empty string if line1 holds no token at all. */
	str1[0]='\0';
	sscanf(line1,"%s",str1);
	if(!strcmp(a[i]->name,"") || (!gclust3 && strcmp(a[i]->name,str1))){
		fprintf(stderr, "Inconsistent node name in WriteNode.\n");
		fprintf(stderr, "a[i]->name= %s.\n",a[i]->name);
		fprintf(stderr, "line1= %s.\n",line1);
		return NULL;
	}

	/* count the tokens in line1 */
	for(p=line1;*p!='\0';p++){
		if(*p!=' ' && *p!='\t' && *p!='\n'){
			if(flag) continue;
			else {
				flag=TRUE;
				num++;
			}
		} else {
			flag=FALSE;
		}
	}
	if(num % 4 != 3) {
		fprintf(stderr, "Incorrect item number in the list. num= %u\n",num);
		fprintf(stderr, "line1= %s.\n",line1);
		return NULL;
	}
		
	curr_n1 = a[i]->n1;
	num = (num + 1)/4;

	if(strcmp(a[i]->name,"")) offset = 1;	/* Now this is always true */
	num1 = num - offset;		/* num1 is the number of items to be added. */
	a[i]->n1 += num1;
	if(num1!=0){
		if((a[i]->sqlist1=(SQlist*)realloc(a[i]->sqlist1,(a[i]->n1+1)*sizeof(SQlist)))==NULL){
			fprintf(stderr, "Memory allocation error in WriteNode\n.");
			return NULL;
		}
		for(k=curr_n1+1;k<=a[i]->n1;k++){
			clearSQlist(&a[i]->sqlist1[k]);
		}
	}

	/* Walk through the tokens. sscanf skips leading white space before
	   a token, so p must already point at the token for the
	   p+=strlen(str1) step to land behind it. */
	p=line1;
	while(*p==' ' || *p=='\t'){p++;}
	while(sscanf(p,"%s",str1)!=EOF){
		p+=strlen(str1);
		while(*p==' ' || *p=='\t'){p++;}
		if(count==0) {
			/* node name: n0 has been set in InitNode */
			if(!addition_mode){
				a[i]->sqlist1[0].ID = i;
				a[i]->sqlist1[0].qID = i;
				a[i]->sqlist1[0].Sstart = 1;
				a[i]->sqlist1[0].Qstart = 1;
				/* score has been set in clearSQlist called from InitNode */
			}
		} else if(count==1){
/* this is necessary whether addition_mode is true or not */
			if(a[i]->len != (unsigned)atol(str1)){
				if(a[i]->len == 0) a[i]->len = (unsigned)atol(str1);
				else {
					fprintf(stderr,"\nSequence length is inconsistent in entry %s.\n",a[i]->name);	
					fprintf(stderr,"The value %u in the table was changed to %s\n",a[i]->len,str1);
					a[i]->len = (unsigned)atol(str1);
				}
			}
			if(a[i]->len > _largeprotein){
				a[i]->domain |= 0x4;
				printf("Node %u: %s is being set as a large protein. Size=%u\n",i,a[i]->name,a[i]->len);
			}
			a[i]->sqlist1[0].Send = a[i]->len;
			a[i]->sqlist1[0].Qend = a[i]->len;
		} else if(count==2){
/* this is valid whether addition_mode is true or not */
			if(!strcmp(a[i]->DBname,"")){
				/* NOTE(review): strncpy does not terminate the copy when
				   str1 has 28 or more characters; this relies on DBname
				   being larger than 28 bytes and zero-filled - confirm. */
				strncpy(a[i]->DBname,str1,28);
			}
		} else if(count % 4 == 3){
			/* hit name: resolve it to a node index */
			to_skip = FALSE;
			if(gclust3){
				if(*str1 == 'g'){
					k = atol(str1+1);
					a[i]->sqlist1[curr_n1+(count+1)/4-offset].ID=k;
				} else {
					fprintf(stderr,"This is not in the Gclust3 format.\n");
					exit(1);
				}
			} else {
				for(k=1;k<=*nodes_p;k++){
					if(!strcmp(a[k]->name,str1)){
						a[i]->sqlist1[curr_n1+(count+1)/4-offset].ID=k;
						break;
					}
				}
				if(k==*nodes_p + 1) {
					fprintf(stderr,"The name %s is not found in the table.\n",str1);
					fprintf(stderr,"%s\n",line1);
					to_skip = TRUE;
				}
			}
			a[i]->sqlist1[curr_n1+(count+1)/4-offset].qID=i;

		} else if(count % 4 == 0){
			/* score */
			/* Perl does not print the first number when the value is less than 1e-100 */
			if(*str1 == 'e' || *str1 == 'E'){
				strcpy(str2,"1");
				strcat(str2,str1);
				strcpy(str1,str2);
				strcpy(str2,"");
			}
			sscanf(str1,"%lf",&tmp);
			if(tmp >= thr * 0.999) to_skip=TRUE;
			else a[i]->sqlist1[curr_n1+count/4-offset].score = tmp;
		} else if(count % 4 == 1){
			/* subject range "start-end"; missing parts read as 0 */
			if(strchr(str1,'-')!=NULL) *strchr(str1,'-') = ' ';
			str11[0]='\0';
			str12[0]='\0';
			sscanf(str1,"%s %s",str11,str12);
			a[i]->sqlist1[curr_n1+count/4-offset].Sstart = (unsigned)atol(str11);
			a[i]->sqlist1[curr_n1+count/4-offset].Send = (unsigned)atol(str12);
		} else if(count % 4 == 2){
			/* query range "start-end"; missing parts read as 0 */
			if(strchr(str1,'-')!=NULL) *strchr(str1,'-') = ' ';
			str11[0]='\0';
			str12[0]='\0';
			sscanf(str1,"%s %s",str11,str12);
			a[i]->sqlist1[curr_n1+count/4-offset].Qstart = (unsigned)atol(str11);
			a[i]->sqlist1[curr_n1+count/4-offset].Qend = (unsigned)atol(str12);
			if(to_skip) {
				/* drop this hit: step the token counter back by one
				   group so that the next hit reuses the same slot */
				count -= 4;
				a[i]->n1 -= 1;
			}
			tmp = 0.0;
		}
		count++;
	}

	return a;
}
/***************************************************************/


/*******************************************/
void SelectNode2(Node **a,unsigned nodes)
/* Decides which nodes keep a non-zero n3, using the member lists
   idlist4 (n4 entries each, resolved to array indices via FindNode).

   Clique mode (clique_mode set):
     an active node i with n3 != 0 is "finished" when some active node
     k with a smaller index, n3 != 0, lists i's n0 in its own idlist4.
     Its n3 is then saved in n3b and set to 0.

   Normal or repeat modes:
     RecoverNode is called first. Then two passes are made, the first
     over nodes with final_thr < thr, the second over nodes with
     final_thr >= thr. In each pass, for an active node i with n3 != 0,
     the members k on the same side of thr are examined: if a member has
     a smaller index than i, node i is finished (n3 = 0) and its scan
     stops; otherwise the member's n3 is set to 0. Members on the other
     side of thr are only reported on stderr. */
/*******************************************/
{
	unsigned i,j,k,n4,m,n4k;

	printf("Selecting nodes...\n");

	/* 350 clique mode */

	if(clique_mode){
		for(i=1;i<=nodes;i++){
			if(!a[i]->active) continue;
			n4 = a[i]->n4;
			if(n4==0 || a[i]->n3 == 0) continue;
			for(j=0;j<n4;j++){
				k=FindNode(a,nodes,a[i]->idlist4[j]);
				if(k==i || !a[k]->active) continue;	
				if(a[k]->n3 != 0 && k <  i){
					n4k = a[k]->n4;
					for(m=0;m<n4k;m++){
						if(a[k]->idlist4[m] == a[i]->n0){
							fprintf(stderr,"Node %u has been finished.\n",a[i]->n0);
							a[i]->n3b = a[i]->n3;
							a[i]->n3 = 0;
							break;
						}
					}
				}
				/* Node i was entered with n3 != 0, so n3 == 0 means it
				   has just been finished. Stop here: going on would let a
				   second match copy the zeroed n3 into n3b and destroy
				   the saved value. */
				if(a[i]->n3 == 0) break;
			}
		}
		fflush(stdout);
		fflush(stderr);
		return;
	}




	/* normal or repeat modes */

	RecoverNode(a,nodes);

	/* pass 1: nodes with final_thr below thr */
	for(i=1;i<=nodes;i++){
		if(!a[i]->active) continue;		/* 308 */
		n4 = a[i]->n4;
		if(n4==0 || a[i]->n3 == 0) continue;
		if(a[i]->final_thr < thr){
			for(j=0;j<n4;j++){
				k=FindNode(a,nodes,a[i]->idlist4[j]);
				if(k==i || !a[k]->active) continue;		/* 308 */
				if(a[k]->final_thr >= thr){
					fprintf(stderr,"Node %u has not been a member of a cluster %u.\n",a[k]->n0,a[i]->n0);
					continue;
				}
				if(k < i){
					fprintf(stderr,"Node %u has been finished.\n",a[i]->n0);
					a[i]->n3 = 0;
					break;
				}
				a[k]->n3 = 0;
			}
		}
	}

	fflush(stdout);

	/* pass 2: nodes with final_thr at or above thr */
	for(i=1;i<=nodes;i++){
		if(!a[i]->active) continue;		/* 308 */
		n4 = a[i]->n4;
		if(n4==0 || a[i]->n3 == 0) continue;
		if(a[i]->final_thr >= thr){
			for(j=0;j<n4;j++){
				k=FindNode(a,nodes,a[i]->idlist4[j]);
				if(k==i || !a[k]->active) continue;	/* 308 */
				if(a[k]->final_thr < thr){
					fprintf(stderr,"Node %u is a member of a cluster %u.\n",a[k]->n0,a[i]->n0);
					continue;
				}
				if(k < i){
					fprintf(stderr,"Node %u has been finished.\n",a[i]->n0);
					a[i]->n3 = 0;
					break;
				}
				a[k]->n3 = 0;
			}
		}
	}

	return;
}
/*******************************************/

/*******************************************/
void RecoverNode(Node **a,unsigned nodes)
/* Restores n3 from the saved value n3b for every node a[1]..a[nodes]
   whose n3 is 0, except for nodes whose final_thr is below 1.0. */
/*******************************************/
{
	unsigned idx;
	Node *nd;

	for(idx = 1; idx <= nodes; idx++){
		nd = a[idx];
		if(!(nd->final_thr < 1.0) && nd->n3 == 0){
			nd->n3 = nd->n3b;
		}
	}
}
/*******************************************/

/*************************************************/
void SortList(Node **a,unsigned nodes,Boolean flag)
/* Sorts a[1]..a[nodes] with qSortList and reports start and end on
   stdout. If flag is TRUE, the sort is according to n4 (n3 breaks
   ties, see partition2); otherwise according to n3 (see partition). */
/*************************************************/
{
	fputs("Sorting list.\n",stdout);

	qSortList(a,1,nodes,flag);

	fputs("   Sort complete.\n",stdout);
	fflush(stdout);
}
/**************************************/

/**********************************************************/
void qSortList(Node **a,unsigned L,unsigned r,Boolean flag)
/* Quicksort of the node pointers a[L]..a[r] (both inclusive).
   flag selects the partitioning routine: partition2 (n4, then n3)
   when TRUE, partition (n3 only) when FALSE.
   Only the smaller side of each split is sorted recursively; the
   larger side is handled by the loop. This gives the same result as
   recursing on both sides but keeps the recursion depth logarithmic.
   The left side is never entered with v-1 when v == L, so the
   unsigned index cannot wrap around when L is 0. */
/**********************************************************/
{
	unsigned v;	/* final position of the pivot */

	while(L < r){
		if(flag) v=partition2(a,L,r);
		else v=partition(a,L,r);

		if(v - L < r - v){
			/* left side is smaller */
			if(v > L) qSortList(a,L,v-1,flag);
			L = v + 1;
		} else {
			/* right side is smaller (or equal) */
			qSortList(a,v+1,r,flag);
			if(v == 0) break;	/* nothing left of index 0 */
			r = v - 1;
		}
	}

	return;
}
/***********************************************/

/*********************************************************/
int partition(Node **a,unsigned L,unsigned r)
/* Partitions a[L]..a[r] around the n3 value of a[r] for a sort in
   descending order of n3: nodes with a larger n3 end up left of the
   pivot, nodes with a smaller n3 right of it.
   Returns the final index of the pivot node. */
/*********************************************************/
{
	unsigned lo,hi,key;
	Node *held;

	key = a[r]->n3;
	lo = L - 1;	/* stepped forward before its first use */
	hi = r;

	for(;;){
		/* from the left: first node that is not larger than the pivot */
		do {
			lo++;
		} while(a[lo]->n3 > key);

		/* from the right: first node that is not smaller than the pivot */
		do {
			hi--;
		} while(lo < hi && key > a[hi]->n3);

		if(lo >= hi) break;

		held = a[lo];
		a[lo] = a[hi];
		a[hi] = held;
	}

	/* put the pivot at its final place */
	held = a[lo];
	a[lo] = a[r];
	a[r] = held;

	return lo;
}
/********************************************/

/********************************************/
int partition2(Node **a,unsigned L,unsigned r)
/* Partitions a[L]..a[r] around a[r] for a sort in descending order
   of n4, with n3 (also descending) deciding between equal n4 values.
   Returns the final index of the pivot node. */
/********************************************/
{
	unsigned lo,hi,key4,key3;
	Node *held;

	key4 = a[r]->n4;
	key3 = a[r]->n3;
	lo = L - 1;	/* stepped forward before its first use */
	hi = r;

	for(;;){
		/* from the left: skip nodes that rank before the pivot */
		do {
			lo++;
		} while(a[lo]->n4 > key4 || (a[lo]->n4 == key4 && a[lo]->n3 > key3));

		/* from the right: skip nodes that rank after the pivot */
		do {
			hi--;
		} while(lo < hi && (key4 > a[hi]->n4 || (key4 == a[hi]->n4 && key3 > a[hi]->n3)));

		if(lo >= hi) break;

		held = a[lo];
		a[lo] = a[hi];
		a[hi] = held;
	}

	/* put the pivot at its final place */
	held = a[lo];
	a[lo] = a[r];
	a[r] = held;

	return lo;
}
/********************************************/

/***************************************************/
void AssignOrg(Node **a,unsigned nodes)
/* Sets species and kingdom of every node a[1]..a[nodes] from the
   organism table org[1]..org[num_entries].
   The organism prefix of a node is the part of its name before the
   first '_' (leading '_' characters are skipped, as strtok did).
   The prefix is compared in place, so neither the length of the name
   nor the length of the prefix is limited by a local buffer.
   Exits the program if a name contains no prefix or if the prefix is
   not present in the organism table. */
/***************************************************/
{
	unsigned i,j;
	const char *start;	/* first character of the prefix */
	size_t plen;		/* length of the prefix */

	for(i=1;i<=nodes;i++){
		start = a[i]->name;
		start += strspn(start,"_");
		plen = strcspn(start,"_");
		if(plen == 0){
			/* empty name or nothing but underscores */
			fprintf(stderr,"Node %u has no organism prefix in its name. Exiting ...\n",i);
			exit(1);
		}
		for(j=1;j<=num_entries;j++){
			if(strlen(org[j].prefix)==plen && !strncmp(start,org[j].prefix,plen)) break;
		}
		if(j<=num_entries){
			a[i]->species = org[j].species;
			a[i]->kingdom = org[j].kingdom;
		} else {
			fprintf(stderr,"Organism prefix %.*s not found in org_list. Exiting ...\n",(int)plen,start);
			exit(1);
		}

	}	
}
/***************************************************/

/***************************************************/
char *OrgPrefix(int s)
/* This function returns the prefix of the organism that
 * corresponds to species number s.
 * The organism table is searched in org[1]..org[num_entries],
 * the range that ReadOrgFile fills and AssignOrg searches.
 * Returns the string " " if s is negative, larger than
 * num_org_all, or not present in the table. */
/***************************************************/
{
	int t;
	char *empty = " ";

	if(s < 0 || s > num_org_all) return empty;
	for(t=1;t<=num_entries;t++){
		if(org[t].species == s){
			return org[t].prefix;
		}
	}
	return empty;
}
/***************************************************/


/***************************************************/
org_list *ReadOrgFile(FILE *fin)
/* Reads the organism definition from the file org_list.
   The first line holds two numbers: the number of entries and the
   total number of organisms; they are stored in the globals
   num_entries and num_org_all.
   Each following line, up to a line starting with END, holds
     prefix <TAB> name <TAB> species number <TAB> kingdom number
   Lines starting with '#' are comments. Blanks inside the name are
   allowed. Blank lines and lines without all four fields are ignored.
   Entries are stored in org[1], org[2], ... ; entries beyond the
   number announced in the first line are not stored.
   On return num_entries is the number of entries actually stored.
   Returns the newly allocated table (num_entries + 2 elements,
   zero-initialized), or NULL if the first line cannot be read or
   parsed, or if memory cannot be allocated. */
/***************************************************/
{
	char str1[MAXLINE + 2];
	char prefix[20];
	char name[100];
	int kingdom,species;
	int i;
	org_list *org;
	char *p;

	if(fgets(str1,MAXLINE,fin)==NULL){
		fprintf(stderr,"Unable to read org_list.\n");
		return NULL;
	}
	if(sscanf(str1,"%d\t%d",&num_entries,&num_org_all) < 1 || num_entries < 0){
		fprintf(stderr,"Incorrect number of entries in org_list.\n");
		return NULL;
	}
	if((org = (org_list*)calloc((size_t)num_entries + 2,sizeof(org_list)))==NULL){
		fprintf(stderr,"Memory allocation error in ReadOrgFile.\n");
		return NULL;
	}
	
	i=0;
	while(fgets(str1,MAXLINE,fin) != NULL && strncmp(str1,"END",3)){
		if(*str1 == '#') continue;	/* comment line, May 29, 2007 */

		/* Blanks are turned into '_' so that a name containing blanks
		   is read as one field; they are restored in name below. */
		for(p=str1;*p!='\0';p++){
			if(*p == ' ') *p = '_';
		}

		/* More entries than announced: stop without counting this one,
		   so that num_entries never exceeds the allocated table. */
		if(i >= num_entries) break;

		/* Field widths match the sizes of prefix[] and name[]. */
		if(sscanf(str1,"%19s\t%99s\t%d\t%d",prefix,name,&species,&kingdom) != 4){
			continue;	/* blank or incomplete line */
		}
		for(p=name;*p!='\0';p++){
			if(*p == '_') *p = ' ';
		}

		i++;
		strcpy(org[i].prefix,prefix);
		strcpy(org[i].name,name);
		org[i].species = species;
		org[i].kingdom = kingdom;
	}
	num_entries = i;

	return org;
}
/***************************************************/



/***************************************************/
int ReadVariables2(FILE *fin)
/* Saved variables are read.  
The first line should begin with 
GCLUST_VARIABLES 
Now reading of thr is enabled.

Each following line, up to a line starting with END, is searched for
a known parameter name; when one is found, the value is read from the
beginning of the same line into the corresponding global variable.
Lines starting with '#', ';' or '//' are comments, and lines without a
known name are ignored.
The names are tested in the order written below and the first match
wins, so the order of the tests matters.
A num_thr line is followed by num_thr lines (at most 45) holding one
threshold each; they are stored in thr_list, and reading stops after
this list.

Returns TRUE if the first line cannot be read or does not begin with
GCLUST_VARIABLES; otherwise FALSE. */
/***************************************************/
{
	char str1[MAXLINE + 2];
	char str2[MAXLINE + 2];
	char gclust_variables[20];
	int i;

	strcpy(gclust_variables,GCLUST_VARIABLES);
	if(fgets(str1,MAXLINE,fin)==NULL){
		fprintf(stderr,"Unable to read data.\n");
		return TRUE;
	}
	/* the first word of the first line must be the file signature */
	sscanf(str1,"%s",str2);
 	if(strcmp(str2,gclust_variables)){
		fprintf(stderr,"Incorrect data format.\n");
		return TRUE;
	}

	while(fgets(str1,MAXLINE,fin) != NULL && strncmp(str1,"END",3)){
		/* 352f9 comment line is enabled */
		if(strncmp(str1,"#",1)==0 || strncmp(str1,";",1)==0 || strncmp(str1,"//",2)==0) continue;
		if(strstr(str1,"largeprotein")!=NULL) sscanf(str1,"%u",&_largeprotein);
		else if(strstr(str1,"max_n3")!=NULL) sscanf(str1,"%u",&_max_n3);
		else if(strstr(str1,"max_matrix")!=NULL) sscanf(str1,"%u",&_max_matrix);
		else if(strstr(str1,"level1")!=NULL) sscanf(str1,"%le",&_level1);
		else if(strstr(str1,"level2")!=NULL) sscanf(str1,"%le",&_level2);
		else if(strstr(str1,"level3")!=NULL) sscanf(str1,"%le",&_level3);
		else if(strstr(str1,"level4")!=NULL) sscanf(str1,"%le",&_level4);
		else if(strstr(str1,"matchlevel")!=NULL) sscanf(str1,"%le",&_matchlevel);
		else if(strstr(str1,"minlevel")!=NULL) sscanf(str1,"%le",&_minlevel);
		else if(strstr(str1,"min_filled")!=NULL){
			/* "min_filled2" contains "min_filled", so it is told apart here */
			if(strstr(str1,"min_filled2")!=NULL) sscanf(str1,"%le",&_min_filled2);
			else sscanf(str1,"%le",&_min_filled);
		}
		else if(strstr(str1,"min_zeros")!=NULL) sscanf(str1,"%u",&_min_zeros);
		else if(strstr(str1,"min_0_occup")!=NULL) sscanf(str1,"%le",&_min_0_occup);
		else if(strstr(str1,"number_of_genomes")!=NULL) sscanf(str1,"%u",&number_of_genomes);
		else if(strstr(str1,"suppress_large_matrix")!=NULL) sscanf(str1,"%u",&suppress_large_matrix);
		else if(strstr(str1,"stack_size")!=NULL) sscanf(str1,"%u",&_stack_size);
		else if(strstr(str1,"min_domain_size")!=NULL) sscanf(str1,"%u",&_min_domain_size);
		else if(strstr(str1,"searchbridge_min_triangle")!=NULL) sscanf(str1,"%le",&_searchbridge_min_triangle);
		else if(strstr(str1,"searchbridge_min_n4")!=NULL) sscanf(str1,"%u",&_searchbridge_min_n4);
		else if(strstr(str1,"levelF1")!=NULL) sscanf(str1,"%le",&_f1);
		else if(strstr(str1,"levelF2")!=NULL) sscanf(str1,"%le",&_f2);
		else if(strstr(str1,"levelF3")!=NULL) sscanf(str1,"%le",&_f3);
		/* 353h various parameters are added */
		else if(strstr(str1,"min_overlap")!=NULL) sscanf(str1,"%d",&_min_overlap);
		else if(strstr(str1,"delta_diff")!=NULL) sscanf(str1,"%d",&_delta_diff);
		else if(strstr(str1,"delta_plus")!=NULL) sscanf(str1,"%d",&_delta_plus);
		else if(strstr(str1,"min_delta_overlap")!=NULL) sscanf(str1,"%le",&_min_delta_overlap);
		else if(strstr(str1,"org_measure_0")!=NULL) sscanf(str1,"%le",&_org_measure_0);
		else if(strstr(str1,"org_measure_1")!=NULL) sscanf(str1,"%le",&_org_measure_1);
		else if(strstr(str1,"allowance_level_0")!=NULL) sscanf(str1,"%le",&_allowance_level_0);
		else if(strstr(str1,"allowance_level_1")!=NULL) sscanf(str1,"%le",&_allowance_level_1);
		else if(strstr(str1,"JS_level_0")!=NULL) sscanf(str1,"%le",&_JS_level_0);
		else if(strstr(str1,"JS_level_1")!=NULL) sscanf(str1,"%le",&_JS_level_1);
		else if(strstr(str1,"MDR_level_0")!=NULL) sscanf(str1,"%le",&_MDR_level1_0);
		else if(strstr(str1,"MDR_level_1")!=NULL) sscanf(str1,"%le",&_MDR_level1_1);
		else if(strstr(str1,"MDR_max_cluster_0")!=NULL) sscanf(str1,"%le",&_MDR_max_cluster_0);
		else if(strstr(str1,"MDR_max_cluster_1")!=NULL) sscanf(str1,"%le",&_MDR_max_cluster_1);
		else if(strstr(str1,"MDR_length1")!=NULL) sscanf(str1,"%d",&_MDR_length1);
		else if(strstr(str1,"MDR_length2")!=NULL) sscanf(str1,"%d",&_MDR_length2);
		else if(strstr(str1,"MDR_score1")!=NULL) sscanf(str1,"%le",&_MDR_score1);
		else if(strstr(str1,"MDR_score2")!=NULL) sscanf(str1,"%le",&_MDR_score2);
		else if(strstr(str1,"MDR_clique_size")!=NULL) sscanf(str1,"%d",&_MDR_clique_size);
		else if(strstr(str1,"cross_edge_parameter")!=NULL) sscanf(str1,"%le",&_cross_edge_parameter);
		else if(strstr(str1,"CC_level_2")!=NULL) sscanf(str1,"%le",&_CC_level_2);
		else if(strstr(str1,"CC_level_3")!=NULL) sscanf(str1,"%le",&_CC_level_3);
		else if(strstr(str1,"CC_min_thr")!=NULL) sscanf(str1,"%le",&_CC_min_thr);

		/* list of thresholds: the count is limited to 45, and the list
		   is cut short at the end of the file or at an END line */
		else if(strstr(str1,"num_thr")!=NULL){ 
			sscanf(str1,"%d",&num_thr);
			if(num_thr > 45) num_thr=45;
			if(num_thr<=0) return FALSE;
			for(i=0;i<num_thr;i++){
				if(fgets(str1,MAXLINE,fin) == NULL || !strncmp(str1,"END",3)){
					num_thr = i;
					break;
				}
				sscanf(str1,"%le",&thr_list[i]);
			}
			break;	/* nothing is read after the threshold list */
		}
		else continue;
	}
		
	return FALSE;
}
/***************************************************/


/***************************************************/
Node **ReadData(FILE *fin,Node **a,unsigned *nodes_p,char *listfile,char *table)
/* Saved data are read.
   The first line should begin with
     GCLUST_DATA <TAB> <number of nodes> <TAB> version <TAB> <version>
   and the version must not sort before 3.5.0.a9.
   The next three lines hold the name of the list file (returned in
   listfile), the name of the table (returned in table), and the
   threshold used for the data; the global thr is lowered to that
   threshold if it is smaller.

   Then, for each node i = 1 .. number of nodes:
     Node <i>
     <name> <length> <annotation>
     N0 <n0> [Domain <domain>]
     -- only if "Domain" is present:
        [maskID <id>]
        DMnum <n>   followed by n lines  <start> <end> <ID>
        N3 <n>      followed by n SQlist lines
        N4 <n>      followed by n lines with one ID each
     N1 <n>         followed by n SQlist lines
     N2 <n>         followed by n SQlist lines
     <empty line>
   An SQlist line is  <ID> <Sstart> <Send> <score> <qID> <Qstart> <Qend>.
   The file should end with a line END.

   Nodes longer than _largeprotein get domain bit 0x4. Nodes with a
   non-zero domain, or with n3 larger than _max_matrix, are made
   inactive. The N4 list is kept only for nodes with domain bit 0x4 or
   in clique mode; the N2 list is not kept in clique mode.

   The node array is created here with InitNode; the value passed in a
   is not used. The number of nodes is returned in *nodes_p.
   Returns the node array, or NULL on a format or allocation error.
   Exits the program if InitNode fails. */
/***************************************************/
{
	char str1[MAXLINE + 2];
	char str2[MAXLINE + 2];
	char str3[MAXLINE + 2];
	char str4[MAXLINE + 2];
	char str5[MAXLINE + 2];
	unsigned len=2*MAXLINE;	/* length of line1 */
	unsigned i,j,i1;
	char gclust_data[20];
	SQlist *b;
	DMlist *dm;
	double thr_data;
	unsigned dummy_n4=0;
	char allowed_version[20];


	strcpy(allowed_version,"3.5.0.a9");

	strcpy(gclust_data,GCLUST_DATA);
	if(fgets(str1,MAXLINE,fin)==NULL){
		fprintf(stderr,"Unable to read data.\n");
		return NULL;
	}
	/* Fields missing from the header line must compare as empty
	   strings rather than as uninitialized memory. */
	str2[0]='\0';
	str3[0]='\0';
	str4[0]='\0';
	str5[0]='\0';
	sscanf(str1,"%s\t%s\t%s\t%s",str2,str3,str4,str5);
 	if(strcmp(str2,gclust_data)){
		fprintf(stderr,"Incorrect data format.\n");
		return NULL;
	}
 	if(strcmp(str4,"version")){
		fprintf(stderr,"Old data format before version 3.5.0.\n");
		return NULL;
	} else if(strcmp(str5,allowed_version) < 0){
			fprintf(stderr,"Old data format.\n");
			return NULL;
	}

	*nodes_p = (unsigned)atol(str3);
	if((a=InitNode(*nodes_p + 1))==NULL){
		exit(1);
	}
	j=0;

	fgets(str1,MAXLINE,fin);
	sscanf(str1,"%s",listfile);
	fgets(str1,MAXLINE,fin);
	sscanf(str1,"%s",table);
	fgets(str1,MAXLINE,fin);
	sscanf(str1,"%s",str2);
	thr_data=atof(str2);
	if(thr_data < thr) thr=thr_data;

	for(i=1;i<=*nodes_p;i++){
		fgets(str1,MAXLINE,fin);
		sscanf(str1,"%s %u",str2,&i1);
		if(strcmp(str2,"Node") || i!=i1){
			fprintf(stderr,"Incorrect data format in node %u, item Node.\n",i);
			return NULL;
		}

		/* name, length, annotation; a node without annotation gets an
		   empty one instead of whatever str3 held before */
		fgets(str1,MAXLINE,fin);
		str3[0]='\0';
		sscanf(str1,"%s %u %s",str2,&len,str3);
		strcpy(a[i]->name,str2);
		a[i]->len=len;
		strcpy(a[i]->annot,str3);
		if(len > _largeprotein){
			a[i]->domain |= 0x4;
			if(verbous_mode) printf("Node %u: %s is being set as a large protein. Size=%u\n",i,a[i]->name,len);
		}

		fgets(str1,MAXLINE,fin);
		sscanf(str1,"%s %s %s %s",str2,str3,str4,str5);
		if(strcmp(str2,"N0")){
			fprintf(stderr,"Incorrect data format in node %u, item N0.\n",i);
			return NULL;
		}

	/* 308 and 350 domain information is included in data.out file. */
		a[i]->n0 = (unsigned)atol(str3);
		if(!strcmp(str4,"Domain")){
			a[i]->domain = atoi(str5);
			if(a[i]->domain != 0){
				if(verbous_mode) printf("a[%u]->domain is set to %d\n",i,a[i]->domain);
				a[i]->active = FALSE;
			}

		/* domain list */

			fgets(str1,MAXLINE,fin);
			sscanf(str1,"%s %s",str2,str3);
			if(strcmp(str2,"maskID")){
				/* no maskID line: the line just read is taken as the
				   DMnum line below */
				fprintf(stderr,"Incorrect data format in node %u, item maskID.\n",i);
				fprintf(stderr,"But we continue ..\n");
			} else {
				a[i]->maskID = (unsigned long)atol(str3);
				fgets(str1,MAXLINE,fin);
			}
			
			sscanf(str1,"%s %s",str2,str3);
			if(strcmp(str2,"DMnum")){
				fprintf(stderr,"Incorrect data format in node %u, item DMnum.\n",i);
				return NULL;
			}
			a[i]->DMmax = a[i]->DMnum = (unsigned)atol(str3);
			/* release the one-element list made by InitNode before
			   it is replaced */
			free(a[i]->dmlist);
			if((a[i]->dmlist=(DMlist*)calloc((a[i]->DMnum+1),sizeof(DMlist)))==NULL){
				fprintf(stderr, "Memory allocation error for DMlist in ReadData\n.");
				return NULL;
			}
			for(j=0;j<a[i]->DMnum;j++){
				dm=&a[i]->dmlist[j];
				fgets(str1,MAXLINE,fin);
				sscanf(str1,"%u %u %lu",&dm->start,&dm->end,&dm->ID);
				a[i]->dmlist[j].nodeID = i;
			}

		/* n3 and sqlist3 */

			fgets(str1,MAXLINE,fin);
			sscanf(str1,"%s %s",str2,str3);
			if(strcmp(str2,"N3")){
				fprintf(stderr,"Incorrect data format in node %u, item N3.\n",i);
				return NULL;
			}
			a[i]->n3 = (unsigned)atol(str3);
			if((a[i]->sqlist3=(SQlist*)realloc(a[i]->sqlist3,(a[i]->n3+1)*sizeof(SQlist)))==NULL){
				fprintf(stderr, "Memory allocation error for n3 in ReadData\n.");
				return NULL;
			}
			for(j=0;j<a[i]->n3;j++){
				b=&a[i]->sqlist3[j];
				fgets(str1,MAXLINE,fin);
				sscanf(str1,"%u %u %u %s %u %u %u",&b->ID, &b->Sstart, &b->Send, str2, &b->qID,\
					&b->Qstart, &b->Qend);
				b->score = atof(str2);
			}
			if(a[i]->n3 > _max_matrix){
				a[i]->domain |= 0x04;	/* Now, this is set to large protein rather than multidomain. 355 */
				printf("a[%u]->domain is set to %d because n3 is larger than max_matrix.\n",i,a[i]->domain);
				fflush(stdout);
				a[i]->active = FALSE;
			}
				

		/* n4 and idlist4 */
	
			fgets(str1,MAXLINE,fin);
			sscanf(str1,"%s %s",str2,str3);
			if(strcmp(str2,"N4")){
				fprintf(stderr,"Incorrect data format in node %u, item N4.\n",i);
				return NULL;
			}
			dummy_n4 = (unsigned)atol(str3);
			if(a[i]->domain & 0x4 || clique_mode){
				a[i]->n4 = dummy_n4;
				if((a[i]->idlist4=(unsigned*)calloc(a[i]->n4+1,sizeof(unsigned)))==NULL){
					fprintf(stderr, "Memory allocation error for n4 in ReadData\n.");
					return NULL;
				}
				for(j=0;j<a[i]->n4;j++){
					fgets(str1,MAXLINE,fin);
					sscanf(str1,"%u",&(a[i]->idlist4[j]));
				}
			} else {
				/* the list is not kept: n4 is set to 1 and the
				   lines are read and discarded */
				a[i]->n4 = 1;
				if((a[i]->idlist4=(unsigned*)calloc(a[i]->n4+1,sizeof(unsigned)))==NULL){
					fprintf(stderr, "Memory allocation error for n4 in ReadData\n.");
					return NULL;
				}
				for(j=0;j<dummy_n4;j++) fgets(str1,MAXLINE,fin);
			}

		}
		/* so that a following N0 line without "Domain" is not
		   mistaken for one with it */
		strcpy(str4,"");
		strcpy(str5,"");

	/* end of domain information */


		fgets(str1,MAXLINE,fin);
		sscanf(str1,"%s %s",str2,str3);
		if(strcmp(str2,"N1")){
			fprintf(stderr,"Incorrect data format in node %u, item N1.\n",i);
			return NULL;
		}
		a[i]->n1 = (unsigned)atol(str3);
		if((a[i]->sqlist1=(SQlist*)realloc(a[i]->sqlist1,(a[i]->n1+1)*sizeof(SQlist)))==NULL){
			fprintf(stderr, "Memory allocation error in ReadData\n.");
			return NULL;
		}

		for(j=0;j<a[i]->n1;j++){
			b=&a[i]->sqlist1[j];
			fgets(str1,MAXLINE,fin);
			sscanf(str1,"%u %u %u %s %u %u %u",&b->ID, &b->Sstart, &b->Send, str2, &b->qID,\
				&b->Qstart, &b->Qend);
			b->score = atof(str2);
		}

		fgets(str1,MAXLINE,fin);
		sscanf(str1,"%s %s",str2,str3);
		if(strcmp(str2,"N2")){
			fprintf(stderr,"Incorrect data format in node %u, item N2.\n",i);
			return NULL;
		}
		a[i]->n2 = (unsigned)atol(str3);
		if(clique_mode){		/* 352e13 to spare memory */
			for(j=0;j<a[i]->n2;j++){
				fgets(str1,MAXLINE,fin);
			}
			a[i]->n2 = 1;
		} else {
			if((a[i]->sqlist2=(SQlist*)realloc(a[i]->sqlist2,(a[i]->n2+1)*sizeof(SQlist)))==NULL){
				fprintf(stderr, "Memory allocation error in ReadData 2\n.");
				return NULL;
			}

			for(j=0;j<a[i]->n2;j++){
				b=&a[i]->sqlist2[j];
				fgets(str1,MAXLINE,fin);
				sscanf(str1,"%u %u %u %s %u %u %u",&b->ID, &b->Sstart, &b->Send, str2, &b->qID,\
					&b->Qstart, &b->Qend);
				b->score = atof(str2);
			}
		}

		fgets(str1,MAXLINE,fin);	/* empty line */

		PrintProgress(i,10,1000);
	}
	fgets(str1,MAXLINE,fin);
	sscanf(str1,"%s",str2);
	if(strcmp(str2,"END")){
		fprintf(stderr,"Incorrect end of data file. But we continue.\n");
	}
	
	return a;
}
/***************************************************/


/***************************************************/
Node **ReadAnnot2(Node **a,unsigned *nodes_p,FILE *ftable)
/* A new table format is necessary. 
The first line should begin with 
GCLUST_TABLE <TAB> <number>
The annotation should not exceed line limit. */
/*Simplified reading with gclust2 format*/
/* gclust3 format makes further easier. */
/***************************************************/
{
	char str1[MAXLINE + 2];
	char str2[MAXLINE + 2];
	char str3[MAXLINE + 2];
	char str4[MAXLINE + 2];
	char str5[MAXLINE + 2];
	char *p;
	char *q;
	char *line1;
	unsigned ct=0;
	unsigned len=2*MAXLINE;	/* length of line1 */
	unsigned curr_len;
	unsigned i,j,num;
	unsigned begin;
	char gclust_table[20];
	char gclust_table2[20];
	char gclust_table3[20];
	Boolean gclust2=FALSE;

	strcpy(gclust_table,GCLUST_TABLE);
	strcpy(gclust_table2,GCLUST_TABLE2);
	strcpy(gclust_table3,GCLUST_TABLE3);

	if((line1=InitLine(len))==NULL){
		return NULL;
	}
	curr_len=(unsigned int)strlen(line1) + 1;
	ct=0;
	
	if(fgets(str1,MAXLINE,ftable)==NULL){
		fprintf(stderr,"Unable to read table.\n");
		return NULL;
	}
	sscanf(str1,"%s\t%s",str2,str3);
 	if(!strcmp(str2,gclust_table2)){
		gclust2=TRUE;
 	} else if(!strcmp(str2,gclust_table3)){
		gclust3=TRUE;
 	} else if(strcmp(str2,gclust_table)){
		fprintf(stderr,"Incorrect table format.\n");
		return NULL;
	}

	*nodes_p = (unsigned)atol(str3);
	if((a=InitNode(*nodes_p + 1))==NULL){
		exit(1);
	}
	j=0;

	if(gclust3){
		printf("The table file is in the Gclust3 format.\n");
		printf("The sequence file before BLASTP is assumed to be in the Gclust3 format.\n");
		while(fgets(str1,MAXLINE,ftable)!=NULL){
			ct+=1;
			PrintProgress(ct,10,1000);
			j++;
			sscanf(str1,"%u %s %u %s",&num,str2,&len,str3);
			if(num != j) {
				fprintf(stderr,"Number in GCLUST3 table is strange. num=%u, j=%u.\n",num,j);
				exit(1);
			}
			strcpy(a[j]->name,str2);
			a[j]->len=len;

			p=strtok(str1,"\t");
			p=strtok(NULL,"\t");
			p=strtok(NULL,"\t");
			p=strtok(NULL,"\t");
			if(p==NULL) strcpy(str3,"no description");
			else {
				while(*p==' ') p++;
				strncpy(str3,p,MAXLINE);
			}
			if(!strcmp(str3,"")) strcpy(str3,"no description");
			strncpy(a[j]->annot,str3,ANNOTLEN);
			a[j]->annot[ANNOTLEN]='\0';
			for(p=&a[j]->annot[ANNOTLEN];&a[j]->annot[0];p--){
				if(*p==' ' || *p==';' || *p=='\n') *p='\0';
				else if(*p=='\0') continue;
				else break;
			}
			p=a[j]->annot;
			while(*p!='\0'){
				if(*p==' ') *p='_';
				p++;
			}
		}
		*nodes_p = j;
		return a;
	}

	if(gclust2){
		while(fgets(str1,MAXLINE,ftable)!=NULL){
			ct+=1;
			PrintProgress(ct,10,1000);
			j++;
			sscanf(str1,"%s %u %s",str2,&len,str3);
			strcpy(a[j]->name,str2);

			strcpy(str5,str2);
			for(q=str5;*q!='\0';q++){
				if(*q == '_'){
					*q = '\0';
					break;
				}
			}		 /* str5 holds genome name */
			for(i=0;i<max_gi;i++){
				if(i==max_gi) break;
				if(!strcmp(gi[i].name,str5)){
					break;
				}
			}
			if(i==max_gi){
				strcpy(gi[i].name,str5);
				gi[i].begin = j;
				max_gi++;
			}

			a[j]->len=len;
			strcpy(a[j]->annot,str3);
		}
		*nodes_p = j;
		return a;
	}

	while(fgets(str1,MAXLINE,ftable)!=NULL){
		if(!strcmp(line1,"") && !strcmp(str1,"\n")) continue; 
		sscanf(str1,"%s",str2);
		if(!strcmp(line1,"") && !strcmp(str2,"ORF")) {
			continue;
		}

		if(curr_len + strlen(str1) > len){
			if((line1=ReallocLine(line1,curr_len,MAXLINE))==NULL){
				return NULL;
			}
			len=curr_len + MAXLINE;
		}
		strcat(line1,str1);
		curr_len=(unsigned int)strlen(line1) + 1;
		if(strlen(str1) >= MAXLINE - 1 || str1[strlen(str1)-1]!='\n') continue;
		else {
			ct+=1;
			PrintProgress(ct,10,1000);
			if(line1[strlen(line1)-1]=='\n') line1[strlen(line1)-1]='\0';
			p=strtok(line1,"\t");
			strncpy(str2,p,MAXLINE);
			if(str2[strlen(str2)-1] == '_' || strlen(str2)==0){
				fprintf(stderr,"Skipping invalid line %u: %s.\n",ct,str2);
				strcpy(line1,"");
				curr_len=(unsigned int)strlen(line1) + 1;
				continue;
			}
			strcpy(str5,str2);
			for(q=str5;*q!='\0';q++){
				if(*q == '_'){
					*q = '\0';
					break;
				}
			}		 /* str5 holds genome name */
			begin = 1;
			for(i=0;i<max_gi;i++){
				if(i==max_gi) break;
				if(!strcmp(gi[i].name,str5)){
					begin = gi[i].begin;
					break;
				}
			}
			if(i==max_gi){
				strcpy(gi[i].name,str5);
				gi[i].begin = j+1;
				max_gi++;
			}

			if(j>0 && j>begin) {
				for(i=begin;i<=j;i++){
					if(!strcmp(a[i]->name,str2)) break;
				}
				if(i<=j){
					fprintf(stderr,"Name duplication detected: %s in line %u.\n",str2,ct);
					fprintf(stderr,"Skipping this line.\n");
					strcpy(line1,"");
					curr_len=strlen(line1) + 1;
					continue;
				}
			}
			p=strtok(NULL,"\t");
			if(p!=NULL) strncpy(str3,p,MAXLINE);
			else strcpy(str3,"0");
			p=strtok(NULL,"\t");
			if(p==NULL) strcpy(str4,"no description");
			else {
				while(*p==' '){
					p++;
				}
				strncpy(str4,p,MAXLINE);
			}
			if(!strcmp(str4,"")) strcpy(str4,"no description");

			j++;

			if(j==*nodes_p + 1){
				if((a=ReallocNode(a,*nodes_p,1))==NULL){
					exit(1);
				}
				*nodes_p += 1;
			}

			strcpy(a[j]->name,str2);
			a[j]->len=(unsigned int)atol(str3);
			/* check of sequence length is added. 353i */
			if(a[j]->len > _largeprotein) a[j]->domain |= 0x4;
			strncpy(a[j]->annot,str4,ANNOTLEN);
			a[j]->annot[ANNOTLEN]='\0';
			for(p=&a[j]->annot[ANNOTLEN];&a[j]->annot[0];p--){
				if(*p==' ' || *p==';' || *p=='\n') *p='\0';
				else if(*p=='\0') continue;
				else break;
			}

			p=a[j]->annot;
			while(*p!='\0'){
				if(*p==' ') *p='_';
				p++;
			}
			strcpy(line1,"");
			curr_len=strlen(line1) + 1;
		}
		curr_len=strlen(line1) + 1;
	}

	*nodes_p = j;
	return a;
}
/***************************************************/

/***************************************************/
unsigned *MakeTable(Node **a,unsigned n)
/* Builds a lookup table from node number (n0) to position in a[].
Table[k] is the index i (1..n) with a[i]->n0 == k; entries for which no
node exists stay 0. The table has n+1 elements and belongs to the
caller, who must free it.
Returns NULL (and releases the table) if any n0 lies outside 1..n.
Exits if memory cannot be allocated. */
/***************************************************/
{
	unsigned i,j;
	unsigned *Table;

	if((Table=(unsigned*)calloc((n+1),sizeof(unsigned)))==NULL){
		fprintf(stderr,"Error in MakeTable.\n");
		exit(1);
	}
	for(i=1;i<=n;i++){
		j=a[i]->n0;
		if(j < 1 || j > n){
			fprintf(stderr,"Error in MakeTable 2.\n");
			free(Table);	/* do not leak the partial table */
			return NULL;
		}
		Table[j]=i;
	}

	return Table;
}
/***************************************************/

/***************************************************/
unsigned FindNode(Node **a,unsigned nodes,unsigned i)
/* Returns the index j with a[j]->n0 == i, or 0 if there is no such node
or the arguments are unusable (a == NULL, nodes == 0, i == 0, i > nodes).
A lookup table made by MakeTable is cached between calls. It is rebuilt
when the number of nodes changes and when the cached entry for i is
found to be stale; the previous table is released on each rebuild. */
/***************************************************/
{
	unsigned j;
	static unsigned n=0;	/* local nodes */
	static unsigned *Table=NULL;

	if(a==NULL) return 0;
	if(nodes==0) return 0;
	if(i>nodes) return 0;
	if(i==0) return 0;

	if(n!=nodes || Table==NULL){
		if(n > nodes){
			fprintf(stderr,"nodes=%u, n=%u.\n",nodes,n);
			fflush(stderr);
		}
		free(Table);
		Table=MakeTable(a,nodes);
		if(Table==NULL){
			n=0;	/* forces a new attempt at the next call */
			return 0;
		}
		n=nodes;
	}

	/* A slot holding 0 means that no node was registered for i;
	   a[0] is never consulted. */
	j=Table[i];
	if(j==0 || i != a[j]->n0){
		/* the order of a[] may have changed since the table was made */
		free(Table);
		Table=MakeTable(a,n);
		if(Table==NULL){
			n=0;
			return 0;
		}
		j=Table[i];
	}
	if(j==0 || i != a[j]->n0){
		fprintf(stderr,"Error in FindNode with %u.\n",i);
		return 0;
	}

	return j;
}
/***************************************************/

/***************************************************/
void OptimizeNode(Node **a,unsigned nodes)
/* not used now */
/* For each node with n3 != 0, final_thr >= 1.0 and n30 != 0, the members
listed in idlist4 are searched for the one with the largest n30 that
exceeds the node's own value (members with final_thr < 1.0 are ignored).
If such a member exists, its n3 is recounted from sqlist3 (entries are
counted from index 1 up to the first one with ID == 0), the n3 of the
current node is cleared, and the two pointers in a[] are exchanged.
The exchange is skipped when MatchIDlist returns non-zero and the
current node has the longer idlist4. */
/***************************************************/
{
	unsigned idx;		/* node under examination */
	unsigned pos;		/* position within idlist4 */
	unsigned member;	/* index in a[] of a list member */
	unsigned best_n30;	/* largest n30 found so far */
	unsigned best;		/* index of the member holding best_n30 */
	unsigned used;		/* recounted length of sqlist3 */
	Node *cur;

	printf("\nOptimization...\n");

	for(idx=1;idx<=nodes;idx++){
		PrintProgress(idx,10,1000);
		cur=a[idx];
		if(cur->n3==0 || cur->final_thr < 1.0) continue;
		best_n30=cur->n30;
		if(best_n30==0) continue;

		best=0;
		for(pos=0;pos<cur->n4;pos++){
			member=FindNode(a,nodes,cur->idlist4[pos]);
			if(member==0) continue;
			if(!(a[member]->final_thr < 1.0) && best_n30 < a[member]->n30){
				best_n30=a[member]->n30;
				best=member;
			}
		}
		if(best==0 || best==idx) continue;

		if(MatchIDlist(a,nodes,idx,best)){
			if(cur->n4 > a[best]->n4) continue;
			printf("IDlist4 for nodes %u and %u are not equal. %s\n",idx,best,a[best]->name);
		}

		used=1;
		while(a[best]->sqlist3[used].ID!=0) used++;
		a[best]->n3=used;
		cur->n3=0;

		/* exchange the two entries of a[] */
		a[idx]=a[best];
		a[best]=cur;

		printf("Exchanged %u and %u.\n",idx,best);
	}

	printf("\n");
	printf("   Optimization complete.\n");
	fflush(stdout);
}
/***************************************************/

/***************************************************/
void CheckNode(Node **a,unsigned nodes)
/* Checks that every node number appears only once in the groups.
The global array ch[] is used to mark the node numbers already seen.

First pass (all nodes): inactive nodes, and nodes with n3 == 0 whose
domain has bit 0x01 or 0x04 set, are marked by themselves. For the other
nodes with n3 != 0, each member of idlist4 is marked; a member that is
already marked is a duplication.
  - clique mode: a duplicated node with n4 == 1 is hidden (n3 is saved
    in n3b, n3 and n4 are cleared); otherwise a message is printed.
  - otherwise: the duplication is only reported.

Second pass:
  - clique mode: hidden nodes whose own number was never marked, and
    whose domain has neither bit 0x01 nor 0x04 set, are revived as
    singletons.
  - otherwise: if the running total differs from nodes, nodes with
    n3 == 0, n4 == 1, final_thr >= 1.0 and domain == 0 that are not
    listed in the idlist4 of any node with n3 != 0 are recovered by
    setting n3 = 1. This loop covers nodes 1..nodes inclusive, as the
    other loops of this function do. */
/***************************************************/
{
	unsigned i1, i2, j1, j2;
	unsigned num;
	unsigned N0,N1,N2;
	unsigned total=0;
	Boolean found=FALSE;

	printf("\nChecking nodes...\n");

	if(clique_mode) printf("List of duplicated nodes to be deleted and to be revived: \n");

	for(i1=1;i1<=nodes;i1++){		/* corrected in 308 */
		PrintProgress(i1,10,1000);
		N0=a[i1]->n0;	/* bug fix in 353b */
		N1=a[i1]->n4;
		if(!a[i1]->active){
			ch[N0] = TRUE;
			total += 1;
			continue;
		}
		if(a[i1]->n3_bak != 0 || a[i1]->n3 != 0){
			total += N1;
		}
		if(a[i1]->n3 == 0) {
			if(a[i1]->domain & 0x01 || a[i1]->domain & 0x04) {
				ch[N0] = TRUE;
				total += 1;
			}
			continue;
		}

		for(j1=0;j1<N1;j1++){
			N2=a[i1]->idlist4[j1];
			if(N2 == 0 || N2 > nodes) continue;
			if(ch[N2]){
				num=FindNode(a,nodes,N2);
				if(num==0) continue;
				if(clique_mode){
					if(a[i1]->n4 == 1){
						/* hide the duplicated singleton; it may be
						   revived in the second pass */
						printf(" -%u- ",a[i1]->n0);
						a[i1]->n3b = a[i1]->n3;
						a[i1]->n3 = 0;
						a[i1]->n4 = 0;
						total -= 1;
					} else {
						printf("Remaining duplication: %u ",i1);
					}
				} else {
					printf("Duplication detected: node %u-%u, item=%s.\n",\
						a[i1]->n0,j1,a[num]->name);
				}

			} else ch[N2]=TRUE;
		}	
	}
	if(clique_mode) printf("\n");

	if(clique_mode){
		for(i1=1;i1<=nodes;i1++){
			N0 = a[i1]->n0;
			if(ch[N0] != 0) continue;
			if(a[i1]->domain & 0x05) continue;
			if(a[i1]->n4 == 0 && a[i1]->n3 == 0 && a[i1]->n3b != 0){
				a[i1]->n4 = 1;
				a[i1]->n3 = a[i1]->n3b;
				a[i1]->pID = a[i1]->n0;		/* 355 */
				printf(" +%s ", a[i1]->name);
			}
		}

	} else {
		if(total!=nodes){
			printf("\nTotal number = %u, while total nodes should be %u.\n",total,nodes);
			printf("Trying to recover hidden nodes.\n");
			/* <= nodes: the last node must be examined, too */
			for(i1=1;i1<=nodes;i1++){
				PrintProgress(i1,10,1000);
				if(a[i1]->n3 != 0) continue;
				if(a[i1]->final_thr < 1.0 || (a[i1]->domain != 0)) continue;
				N1=a[i1]->n4;
				if(N1==1){
					found=FALSE;
					/* the search stops at the first group listing the node */
					for(i2=1;i2<=nodes && !found;i2++){
						if(a[i2]->n3 == 0) continue;
						N2=a[i2]->n4;
						for(j2=0;j2<N2;j2++){
							if(a[i1]->n0== a[i2]->idlist4[j2]) {
								found=TRUE;
								break;
							}
						}
					}
					if(found==FALSE){
						printf("Recovering node %u.\n",a[i1]->n0);
						a[i1]->n3=1;
						total += 1;
					}	
				}
			}
			printf("Total number = %u, total nodes = %u.\n",total, nodes);			
		}
	}

	printf("\n   check complete.\n");
	fflush(stdout);
	return;
}
/***************************************************/

/***************************************************************/
Node **AssignSingletons(Node **a,unsigned nodes)

/* 350c
 * This subroutine is used to re-assign singletons to some 
 * existing nodes. Information in matchlist7 is used.
 *
 * What the code does (clique mode only; otherwise a is returned
 * unchanged):
 * 1. Only nodes with n4 == 1, n7 >= 2 and without the large-protein
 *    bit (domain & 0x04) are examined.
 * 2. For each member of matchlist7 (other than the node itself), the
 *    pID of the member is looked up. Members that cannot be found,
 *    have pID == 0, or are large proteins are ignored. The number of
 *    occurrences of each pID is counted in p_list
 *    (p_list[2n] = pID, p_list[2n+1] = occurrence).
 * 3. The pID with the highest count is the candidate target. It is
 *    rejected if it cannot be found, is the node itself, or is a
 *    large protein.
 * 4. The singleton is assigned to the target
 *    - if max_count * 2 + 1 >= n7 (signal 2), or
 *    - if n7 > number_of_genomes and max_count * 3 > n7 (signal 3).
 * 5. On assignment, n3 of the singleton is saved in n3b and cleared,
 *    its n0 is appended to idlist4 of the target, and its pID is set
 *    to the n0 of the target.
 *
 *   This subroutine was unfinished until version 354.
 *   Version 355: Codes were re-edited. July 2007.
 *   Version 355o: Codes totally rewritten. Aug. 2007.
 *
 * Returns a. Exits on memory allocation failure.
 */
/***************************************************************/
{
	unsigned i,j,jj,m,n;
	unsigned n4,n7,n4m;
	unsigned pID;
	unsigned target;
	Boolean found_target = FALSE;
	/* items from MergeDistantRelatives */
	unsigned *p_list;	/* first item is pID, second item is occurrence */
	unsigned *new_list;
	unsigned max_p;
	unsigned max_n7,count,max_count,max_pID;
	
	if(!clique_mode) return a;

	printf("\nLooking for singletons and trying to assign them to some nodes ...\n");

	/* Initialization of p_list.
	   p_list receives up to n7 pairs for every node examined in the
	   loop below, including multidomain and fragment nodes, so its
	   size is taken from the largest n7 of all nodes. */
	max_n7 = 0;
	for(i=1;i<=nodes;i++){
		if(max_n7 < a[i]->n7) max_n7 = a[i]->n7;
	}

	/* calloc clears the list */
	if((p_list = (unsigned*)(calloc(2 * max_n7 + 2,sizeof(unsigned))))==NULL){
		fprintf(stderr,"Memory allocation error in AssignSingletons 0.\n");
		exit(1);
	}
	max_p = 0;

	/* Loop */
	for(i=1;i<=nodes;i++){
		PrintProgress(i,10,1000);
		if((n4=a[i]->n4) != 1) continue;	/* only n4=1 */
		if((n7=a[i]->n7) < 2) continue;

		if(a[i]->domain & 0x04) continue;

		found_target = FALSE;

		/* clear the pairs used for the previous node */
		for(n=0;n<max_p;n++){
			p_list[2 * n] = 0;
			p_list[2 * n + 1] = 0;
		}
		max_p = 0;
		for(j=0;j<n7;j++){
			jj = a[i]->matchlist7[j].ID;
			if(jj==a[i]->n0) continue;
			if((m = FindNode(a,nodes,jj)) == 0) continue;	/* unknown member */
			if((pID = a[m]->pID) == 0) continue;
			if(a[m]->domain & 0x04) continue;

			for(n=0;n<max_p;n++){
				if(p_list[2 * n] == pID){
					p_list[2 * n + 1] += 1;
					break;
				}
			}
			if(n==max_p){
				p_list[2 * n] = pID;
				p_list[2 * n + 1] = 1;
				max_p += 1;
			}
		}

		max_pID = 0;
		max_count = 0;
		for(n=0;n<max_p;n++){
			if((count = p_list[2 * n + 1]) > max_count){
				max_count = count;
				max_pID = p_list[2 * n];
			}
		}

		if((target = FindNode(a,nodes,max_pID)) == 0 || target == i) continue;
		if(a[target]->domain & 0x04) continue;
		/* n7 >= 2 here, so the former signal 1 (n7 == 1) cannot occur */
		if(max_count * 2 + 1>= n7){
			found_target = 2;
		}else if(n7 > number_of_genomes && max_count * 3 > n7){
			found_target = 3;
		}

		if(found_target){
			a[i]->n3b = a[i]->n3;
			a[i]->n3 = 0;
			n4m = a[target]->n4;
			if((new_list = (unsigned*)realloc(a[target]->idlist4,(n4m +2)*sizeof(unsigned)))==NULL){
				fprintf(stderr,"Memory allocation error in AssignSingletons1.\n");
				exit(1);
			}
			a[target]->idlist4 = new_list;
			a[target]->idlist4[n4m++] = a[i]->n0;
			a[target]->n4 = n4m;
			a[i]->pID = a[target]->n0;
			if(a[i]->domain & 0x01) printf("Multidomain node ");
			else printf("Node ");
			printf("%u %s has been added to node %u %s. signal %d.\n",i,a[i]->name,target,a[target]->name, found_target);
		}

	}

	free(p_list);

	return a;
}
/***************************************************************/

/***************************************************************/
Node **ReadNode2(FILE *fin,Node **a,unsigned *nodes_p)
/* Reads the list file and passes each complete logical line to
WriteNode. A logical line may span several physical lines; it is
complete when a physical line ending with a newline has been read.
Lines containing "BLAST" and lines whose first item is "input" are
skipped, as are logical lines holding only white space.

The node is identified by the first item of the line:
  Gclust3 format: g<number>, where <number> must lie in 1..*nodes_p;
  otherwise: the name is searched in a[] beginning at the first entry
  of its genome (text before the first '_'), as registered in gi[].
Lines whose node cannot be identified are reported and skipped.
The global array ch[] is used to report entries for which no line was
found; it is cleared by InitClist before returning.

Returns the node array. Exits on allocation failure, on a failure of
WriteNode, and if a line does not start with 'g' in Gclust3 format.

NOTE(review): line1 is obtained from InitLine and never released here;
its allocator is not visible from this function. */
/***************************************************************/
{
	char str1[MAXLINE + 2];		/* current physical line */
	char str2[MAXLINE + 2];		/* first item of the line */
	char str5[MAXLINE + 2];		/* genome name */
	char *line1;
	char *q;
	unsigned ct=0;
	unsigned len=10*MAXLINE;	/* length of line1 */
	unsigned curr_len;
	unsigned i;
	unsigned begin;

	if((line1=InitLine(len))==NULL){
		exit(1);
	}
	curr_len=strlen(line1) + 1;
	InitClist(*nodes_p);

	while(fgets(str1,MAXLINE,fin)!=NULL){
		if(!strcmp(line1,"") && !strcmp(str1,"\n")) continue; 
		if(strstr(str1,"BLAST")!=NULL) continue;
		/* str2 stays empty if the line holds only white space */
		strcpy(str2,"");
		sscanf(str1,"%s",str2);
		if(!strcmp(str2,"input")) continue;

		if(curr_len + strlen(str1) > len){
			if((line1=ReallocLine(line1,curr_len,MAXLINE))==NULL){
				exit(1);
			}
			len=curr_len + MAXLINE;
		}
		strcat(line1,str1);
		curr_len=strlen(line1) + 1;

		/* the logical line continues until a newline has been read */
		if(strlen(str1) >= MAXLINE - 1 || str1[strlen(str1)-1]!='\n') continue;

		ct+=1;
		PrintProgress(ct,10,1000);
		if(sscanf(line1,"%s",str2) != 1){
			/* nothing but white space: there is no name to look up */
			strcpy(line1,"");
			curr_len=strlen(line1) + 1;
			continue;
		}

		if(gclust3){
			begin=1;
			if(*str2 == 'g'){
				i = atol(str2+1);
			} else {
				fprintf(stderr,"Sequence file is not in Gclust3 format.\n");
				exit(1);
			}
		} else {
			strcpy(str5,str2);
			for(q=str5;*q!='\0';q++){
				if(*q == '_'){
					*q = '\0';
					break;
				}
			}       /* str5 holds genome name */
			begin = 0;
			for(i=0;i<max_gi;i++){
				if(!strcmp(gi[i].name,str5)){
					begin = gi[i].begin;
					break;
				}
			}
			if(i<max_gi){
				for(i=begin;i<=*nodes_p;i++){
					if(!strcmp(str2,a[i]->name)) break;
				}
			}
		}
		/* Nodes are numbered from 1. i == 0 results from a Gclust3 name
		   without a usable number and must not be used as an index. */
		if(begin == 0 || i == 0 || i>=*nodes_p + 1){
			fprintf(stderr,"\nThe name %s is not found in the table file.\n",str2);
			fprintf(stderr,"%s: ",str1);
			fprintf(stderr,"Skipping this item.\n");
		} else {
			if((a=WriteNode(a,i,nodes_p,line1))==NULL){
				exit(1);
			}
			ch[i] = TRUE;
		}
		strcpy(line1,"");
		curr_len=strlen(line1) + 1;
	}
/* check of entries 353i */
	for(i=1;i<=*nodes_p;i++){
		if(!ch[i]){
			fprintf(stderr,"Entry %u is missing in input file.\n",i);
		}
	}
	InitClist(*nodes_p);

	return a;
}
/***************************************************************/

/***************************************************************/
Node **ReadNodeM8(FILE *fin,Node **a,unsigned *nodes_p)
/* This is used to read the -m8 table directly.
Version 354. May 29, 2007

Every line must begin with the query ID in the form g<number>, where
<number> lies in 1..*nodes_p. Empty lines and lines holding only white
space are skipped. When the query ID differs from that of the previous
line, the following lines with the same query ID are counted by reading
ahead (the file position is restored afterwards), sqlist1 of the node
is enlarged accordingly, item 0 is preset to the full-length match of
the node with itself, and n1 is reset to 0. Each line is then stored
by WriteNodeM8.

NOTE(review): because n1 is reset to 0, the first line stored by
WriteNodeM8 replaces the preset item 0, and a query ID that reappears
later in the file restarts at item 0 - confirm that this is intended.

Nodes for which no line was found and whose n1 is 0 receive a single
match with themselves, with the score thr_list[num_thr].
The global array ch[] is used for this check and is cleared by
InitClist before returning.

Returns the node array, or NULL on memory allocation failure. Exits if
a line does not begin with 'g', if the node number is out of range, or
if WriteNodeM8 fails. */
/***************************************************************/
{
	char str1[MAXLINE + 2];		/* current line */
	char str2[MAXLINE + 2];		/* first item of a line */
	char str5[MAXLINE + 2];		/* line read ahead */
	unsigned i,k;
	unsigned current_i=0;	/* query of the previous line; 0 = none yet */
	long fileloc;		/* pointer to the current location within the file */
	char item[MAXLINE];		/* gxxxxx number */
	unsigned counter; 
	unsigned new_n1;
	unsigned current_n1;


	InitClist(*nodes_p);

	while(fgets(str1,MAXLINE,fin)!=NULL){
		if(!strcmp(str1,"\n")) continue; 
		if(sscanf(str1,"%s",str2) != 1) continue;	/* only white space */

		if(*str2 != 'g'){
			fprintf(stderr,"Sequence file is not in Gclust3 format.\n");
			exit(1);
		}
		i = atol(str2+1);
		/* a[i] is accessed below, so i is checked before any use */
		if(i == 0 || i > *nodes_p){
			fprintf(stderr,"Node number %u is out of range in ReadNodeM8.\n",i);
			exit(1);
		}

		if(current_i != i) {
			PrintProgress(i,10,1000);
			strcpy(item,str2);
			current_i = i;
			counter = 1;
			fileloc = ftell(fin);
		/* enough memory is allocated to sqlist1 */
			strcpy(str5,"");
			while(fgets(str5,MAXLINE,fin)!=NULL){
				if(!strcmp(str5,"\n")) continue; 
				/* lines skipped by the main loop are not counted */
				if(sscanf(str5,"%s",str2) != 1) continue;
				if(strcmp(item,str2)) break;
				counter += 1;
			}
			if(fseek(fin,fileloc,SEEK_SET)){
				fprintf(stderr,"File seek error.\n");
			}
			current_n1 = a[i]->n1;
			new_n1 = current_n1 + counter;
			if((a[i]->sqlist1=(SQlist*)realloc(a[i]->sqlist1,(new_n1 + 1)*sizeof(SQlist)))==NULL){
				fprintf(stderr, "Memory allocation error in ReadNodeM8\n.");
				return NULL;
			}
			for(k=current_n1+1;k<=new_n1;k++){
				clearSQlist(&a[i]->sqlist1[k]);
			}
			a[i]->sqlist1[0].ID = i;
			a[i]->sqlist1[0].qID = i;
			a[i]->sqlist1[0].Sstart = 1;
			a[i]->sqlist1[0].Qstart = 1;
			a[i]->sqlist1[0].Send = a[i]->len;
			a[i]->sqlist1[0].Qend = a[i]->len;
			if(a[i]->len > _largeprotein){
				a[i]->domain |= 0x4;
				printf("Node %u: %s is being set as a large protein. Size=%u\n",i,a[i]->name,a[i]->len);
			}
			a[i]->n1 = 0;
		}

		if((a=WriteNodeM8(a,i,nodes_p,str1))==NULL){
			exit(1);
		}
		ch[i] = TRUE;
		strcpy(str1,"");
	}


/* check of entries 353i */
	for(i=1;i<=*nodes_p;i++){
		if(!ch[i]){
			fprintf(stderr,"Entry %u is missing in input file, but we try to recover.\n",i);
			/* For the entries, which have no homologous sequences, even with themselves. */
			if(a[i]->n1 == 0) {
				a[i]->sqlist1[0].ID = i;
				a[i]->sqlist1[0].qID = i;
				a[i]->sqlist1[0].Sstart = 1;
				a[i]->sqlist1[0].Qstart = 1;
				a[i]->sqlist1[0].Send = a[i]->len;
				a[i]->sqlist1[0].Qend = a[i]->len;
				a[i]->sqlist1[0].score = thr_list[num_thr];
				a[i]->sqlist1[0].domain = 0;
				a[i]->sqlist1[0].crosslink = FALSE;
				a[i]->n1 = 1;
				ch[i] = TRUE;
			}
		}
	}
	InitClist(*nodes_p);

	return a;
}
/***************************************************************/

/**************************************************************/
Node **WriteNodeM8(Node **a,unsigned i,unsigned *nodes_p,char *line1)
/* This is used to read the -m8 table directly. */
/* Stores one line of the TAB-separated -m8 table in the next free item
(index n1) of a[i]->sqlist1 and increments a[i]->n1. The caller must
have allocated that item. line1 is modified by strtok.

Fields used (counted from 0): 1 = subject ID in the form g<number>,
6 = Q start, 7 = Q end, 8 = S start, 9 = S end, 10 = E-value.
qID is set to i. The E-value is stored as score only if it is below
thr * 0.999.

NOTE(review): a line whose E-value is not below the threshold is still
counted in n1, with its score left as it was - confirm that this is
intended.

Returns a, or NULL if i exceeds *nodes_p, the name of the node is
empty, the subject ID does not start with 'g', or the line has fewer
than 11 fields. n1 is not changed when NULL is returned. */
/**************************************************************/
{
	char *p;
	char *field[5];		/* Q start, Q end, S start, S end, E-value */
	unsigned k;
	unsigned curr_n1=0;
	double tmp=0.0;

	if(i>*nodes_p){
		fprintf(stderr, "Node member must be less than nodes in WriteNode.\n");
		return NULL;
	}

	if(!strcmp(a[i]->name,"")){
		fprintf(stderr, "Empty node name in WriteNodeM8.");
		fprintf(stderr, "i= %u.\n",i);
		fprintf(stderr, "line1= %s.\n",line1);
		return NULL;
	}

	curr_n1 = a[i]->n1;

	/* strtok returns NULL as soon as the line runs out of fields */
	p=strtok(line1,"\t");	/* Query ID */
	if(p!=NULL) p=strtok(NULL,"\t");	/* Subject ID */
	if(p==NULL || *p != 'g'){
		fprintf(stderr,"Error in subject ID.\n");
		return NULL;
	}
	a[i]->sqlist1[curr_n1].qID = i;
	a[i]->sqlist1[curr_n1].ID = atol(p+1);

	/* items 2 to 5 are skipped, items 6 to 10 are kept */
	for(k=0;k<9;k++){
		if((p=strtok(NULL,"\t"))==NULL){
			fprintf(stderr,"Too few fields in WriteNodeM8. i= %u.\n",i);
			return NULL;
		}
		if(k>=4) field[k-4]=p;
	}

	a[i]->sqlist1[curr_n1].Qstart = (unsigned)atol(field[0]);
	a[i]->sqlist1[curr_n1].Qend = (unsigned)atol(field[1]);
	a[i]->sqlist1[curr_n1].Sstart = (unsigned)atol(field[2]);
	a[i]->sqlist1[curr_n1].Send = (unsigned)atol(field[3]);

	tmp = atof(field[4]);	/* E-value */
	if(tmp < thr * 0.999) a[i]->sqlist1[curr_n1].score = tmp;

	a[i]->n1 = curr_n1 + 1;

	return a;
}
/**************************************************************/

/*    end of file node.c    */
