/*******************************************
matrix.c
GCLUST
This program reads a list file (generated from BLAST result by bl2ls.pl),
and assembles similarity groups. Output is a *.grp file.
This is a C version of the lsort2b.pl.
  Copyright Naoki Sato 2002.

Selection of master entry. June 25, 2002.
Added hom. July 2, 2002.
Added Unit. July 3, 2002.
Modified input format. July 4, 2002.
Matrix is now float. July 5, 2002.
Matrix is now int. July 6, 2002.
Added sublist5. July 7, 2002.
Added sublist6. sublist5 changed to reflist5. July 10, 2002.
Phase 1. July 19, 2002.
R,R1,R2 are made global. Aug. 9, 2002.
u is now global. Sept. 8, 2002.
R is now simple pointer. Sept. 9, 2002.
Bug fix for matrix. Sept. 10, 2002.
Bug fix for Unit matrix. Sept. 30, 2002.
Added mode homsub. Oct. 18, 2002.
Reconstruction. April 30, 2003.
*******************************************/

#include "defines.h"

/* Prototypes of the functions defined in this file. The int matrices are
   flat arrays; the functions below address element (row,col) as
   row*dimension + col. */
int *InitMatrix(unsigned d);
int *ReallocMatrix(int *m,unsigned d);
void ClearMatrix(int *matrix,unsigned d);
void WriteMatrix(Node **a,unsigned nodes,unsigned i,int *m,unsigned d);
void XorMatrix(Node **a,unsigned nodes,unsigned i,int *m,int *n,unsigned d);
void FreeMatrix(int *m);
Unit **InitUnitMatrix(unsigned i);
void ReallocUnitMatrix(unsigned i1,unsigned i2);
void ClearUnitMatrix(unsigned i);
void FreeUnitMatrix(void);
void WriteUnitMatrix(Node **a,unsigned nodes,unsigned i,int *m,unsigned d,char *out_mode);
void WriteSubMatrix(Node **a,unsigned nodes,unsigned i,int *m,unsigned d);
void WriteSubUnitMatrix(Node **a,unsigned nodes,unsigned i,int *m,unsigned d,char *out_mode);
void EvaluateMatrix(Node **a,unsigned nodes,unsigned k,int *m,unsigned max);
/* Returns a count; see the comment on the definition for what is counted. */
unsigned RemoveUnrelated(Node **a,unsigned nodes,unsigned k,int *m,unsigned max);
/* mode selects the id list that is reordered: 4 (idlist4) or 6 (sublist6). */
void DiagonalMatrix(Node **a,unsigned nodes,unsigned k,int *m,unsigned max,int mode);

#ifndef __MATRIX__
#define __MATRIX__
#endif

#include "gclust.h"


/***************************************************/
int *InitMatrix(unsigned d)
/* Allocate a zero-filled int matrix of dimension d as one flat array of
   d*d+1 elements and report its size on stdout.
   Returns the new buffer (owned by the caller), or NULL after printing a
   message on stderr when d*d+1 does not fit in an unsigned or when the
   allocation itself fails. */
/***************************************************/
{
	int *m;
	unsigned size;

	/* d*d+1 wraps silently in unsigned arithmetic for large d, which would
	   yield a buffer far smaller than the dimension promises. */
	if(d != 0 && d > ((unsigned)-1 - 1u) / d){
		fprintf(stderr,"Error in InitMatrix1.\n");
		fprintf(stderr,"dimension=%u\n",d);
		return NULL;
	}
	size = d*d+1;

	printf("Size of new matrix m: %u, dimension: %u.\n",size,d);

	m = calloc(size,sizeof *m);
	if(m==NULL){
		fprintf(stderr,"Error in InitMatrix1.\n");
		fprintf(stderr,"dimension=%u\n",d);
		return NULL;
	}
	return m;
}
/***************************************************/

/***************************************************/
int *ReallocMatrix(int *m,unsigned d)
/* Resize the flat int matrix m to d*d+1 elements and report the new size
   on stdout. Elements beyond the previous size are NOT initialised, and
   existing elements keep their flat positions (they are not re-laid-out
   for the new dimension).
   Returns the resized buffer, or NULL after a message on stderr when the
   size cannot be represented or realloc fails; in that case the block
   passed in as m is left untouched and still belongs to the caller. */
/***************************************************/
{
	unsigned size;
	int *resized;

	/* Reject dimensions for which d*d+1 would wrap in unsigned arithmetic. */
	if(d != 0 && d > ((unsigned)-1 - 1u) / d){
		fprintf(stderr,"Error in ReallocMatrix1.\n");
		return NULL;
	}
	size = d*d+1;

	printf("Size of reallocated matrix m: %u, dimension: %u.\n",size,d);

	/* Guard the byte count as well (matters where size_t is 32 bits). */
	if(size > (size_t)-1 / sizeof *m){
		fprintf(stderr,"Error in ReallocMatrix1.\n");
		return NULL;
	}

	resized = realloc(m,(size_t)size*sizeof *m);
	if(resized==NULL){
		fprintf(stderr,"Error in ReallocMatrix1.\n");
		return NULL;
	}
	return resized;
}
/***************************************************/

/***************************************************/
void ClearMatrix(int *m,unsigned d)
/* Set all d*d+1 elements of the flat matrix m to zero.
   For dimensions above 1000 the size is reported on stdout first. */
/***************************************************/
{
	unsigned remaining = d*d+1;
	int *cell = m;

	if(d>1000) printf("Size of cleared large matrix m: %u, dimension: %u.\n",remaining,d);

	/* Walk the buffer once, front to back. */
	while(remaining > 0){
		*cell++ = 0;
		remaining--;
	}
}
/***************************************************/

/*************************************************************************/
void XorMatrix(Node **a,unsigned nodes,unsigned i,int *m,int *n,unsigned d)
/* matrix m and n are initialized in main.c */
/* For node i with n4 list entries (nothing is done when n4 is below
   number_of_genomes or below 50):
   1. n[j1*d+j2] is set to the sum over k of m[j1*d+k] ^ m[k*d+j2];
   2. both matrices are printed with PrintMatrix;
   3. m[0..n4-1] is overwritten with, for each index j1, the number of
      entries in row j1 and column j1 of n that reach the threshold
      (30% of n4);
   4. if the largest such count exceeds twice the threshold, every node of
      idlist4 that has this count and whose domain field is still 0 gets
      domain = 1, with a message on stdout.
   Note that m is modified (step 3). */
/*************************************************************************/
{
	unsigned j1,j2,k;
	unsigned n4;
	int score;
	unsigned pos1,pos2,pos3;
	int threshold,maximum;

	n4 = a[i]->n4;
	threshold = (int)(n4 * 0.3);

	if(n4 < number_of_genomes || n4 < 50) return;

	for(j1=0;j1<n4;j1++){
		for(j2=0;j2<n4;j2++){
			score = 0;
			for(k=0;k<n4;k++){
				pos1=j1 * d + k;
				pos2=k * d + j2;
				score += m[pos1] ^ m[pos2];
			}
			pos3 = j1 * d + j2;
			n[pos3] = score;
		}
	}

	PrintMatrix(stdout,a,nodes,i,m,d,"1");
	PrintMatrix(stdout,a,nodes,i,n,d,"1");


	/* The first part of m is used for further calculation */

	for(pos3=0;pos3<n4;pos3++) m[pos3] = 0;

	for(j1=0;j1<n4;j1++){
		score = 0;
		for(k=0;k<n4;k++){
			pos3 = j1 * d + k;
			if(n[pos3] >= threshold) score += 1;
			pos3 = k * d + j1;
			if(n[pos3] >= threshold) score += 1;
		}
		m[j1] = score;
	}

	maximum = 0;
	for(j1=0;j1<n4;j1++){
		if(m[j1] > maximum) {
			maximum = m[j1];
		}
	}

	if(maximum <= 2 * threshold) return;

	/* i is unsigned, so it is printed with %u. */
	printf("Node %u %s\tN4=%u, maximum=%d\n",i,a[i]->name,n4,maximum);

	for(j1=0;j1<n4;j1++){
		if(m[j1]==maximum){
			k=FindNode(a,nodes,a[i]->idlist4[j1]);
			/* The other functions in this file treat a FindNode result of 0
			   as "not found"; do not touch a[0] in that case. */
			if(k==0) continue;
			if(a[k]->domain != 0) continue;
			a[k]->domain = 1;
			printf("Node %u %s is being set as a multidomain protein.\n",k,a[k]->name);
			fflush(stdout);
		}
	}


	return;
}
/*************************************************************************/


/*************************************************************************/
void WriteMatrix(Node **a,unsigned nodes,unsigned i,int *m,unsigned d)
/* Fill m with the pairwise scores of the n4 entries of a[i]->idlist4:
   m[row*d+col] = GetScore(row node, col node). Entries whose id FindNode
   reports as 0 are skipped, leaving their cells untouched. Every row node
   whose grpno is still 0 receives the current global grpno. */
/*************************************************************************/
{
	unsigned row,col;
	unsigned rowNode,colNode;
	unsigned entries = a[i]->n4;

	for(row=0;row<entries;row++){
		rowNode=FindNode(a,nodes,a[i]->idlist4[row]);
		if(rowNode!=0){
			if(a[rowNode]->grpno == 0) a[rowNode]->grpno = grpno;
			for(col=0;col<entries;col++){
				colNode=FindNode(a,nodes,a[i]->idlist4[col]);
				if(colNode!=0){
					int cellScore=GetScore(a,nodes,rowNode,colNode);
					m[row * d + col]=cellScore;
				}
			}
		}
	}
}
/*************************************************************************/

/*************************************************************************/
void WriteSubMatrix(Node **a,unsigned nodes,unsigned i,int *m,unsigned d)
/* Same as WriteMatrix, but over the n6 entries of a[i]->sublist6 and
   tagging row nodes with the global subgrpno (when their subgrpno is 0).
   Does nothing when i is 0 or larger than nodes, or when the list is
   empty. */
/*************************************************************************/
{
	unsigned row,col;
	unsigned rowNode,colNode;
	unsigned entries;

	if(i==0 || i>nodes) return;

	/* An empty list simply makes the outer loop run zero times. */
	entries = a[i]->n6;
	for(row=0;row<entries;row++){
		rowNode=FindNode(a,nodes,a[i]->sublist6[row]);
		if(rowNode!=0){
			if(a[rowNode]->subgrpno==0) a[rowNode]->subgrpno = subgrpno;
			for(col=0;col<entries;col++){
				colNode=FindNode(a,nodes,a[i]->sublist6[col]);
				if(colNode!=0){
					int cellScore=GetScore(a,nodes,rowNode,colNode);
					m[row * d + col] = cellScore;
				}
			}
		}
	}
}
/*************************************************************************/


/**********************************************/
void FreeMatrix(int *m)
/* currently not used. */
/* Release a matrix obtained from InitMatrix/ReallocMatrix.
   Passing NULL is allowed and does nothing. */
/**********************************************/
{
	/* free() already ignores a null pointer. */
	free(m);
}
/**********************************************/


/***************************************************************/
Unit **InitUnitMatrix(unsigned i)
/* Allocate a Unit matrix of dimension i: a handle (pointer to pointer)
   to one flat array of i*i+1 Unit cells. Each cell gets a zeroed
   one-element SQlist buffer in .s and .n = 0. The size is reported on
   stdout. Any failure prints a message on stderr and exits the program. */
/***************************************************************/
{
	Unit **u1;
	unsigned j;
	unsigned size;

	/* i*i+1 would wrap silently in unsigned arithmetic for large i. */
	if(i != 0 && i > ((unsigned)-1 - 1u) / i){
		fprintf(stderr,"Error in InitUnitMatrix: dimension %u is too large.\n",i);
		exit(1);
	}
	size  = i * i + 1;

	printf("Size of new matrix u: %u, dimension: %u.\n",size,i);

	/* The handle holds a single Unit*; size it as such, not as a Unit. */
	if((u1=(Unit**)calloc(1,sizeof *u1))==NULL){
		fprintf(stderr,"Error in InitUnitMatrix1.\n");
		exit(1);
	}
	if((*u1=(Unit*)calloc(size,sizeof **u1))==NULL){
		fprintf(stderr,"Error in InitUnitMatrix1.\n");
		exit(1);
	}
	for(j=0;j<size;j++){
		if(((*u1)[j].s = (SQlist*)calloc(1,sizeof(SQlist)))==NULL){
			fprintf(stderr,"Error in InitUnitMatrix4.\n");
			exit(1);
		}
		(*u1)[j].n = 0;
	}

	return u1;
}
/***************************************************************/

/***************************************************************/
void ReallocUnitMatrix(unsigned i1,unsigned i2)
/* i1 is old value, i2 is new value                            */
/***************************************************************/
{
	unsigned j;
	unsigned old_size;
	unsigned new_size;

	old_size = i1 * i1 + 1;
	new_size  = i2 * i2 + 1;

	printf("Current size of u: %u, dimension: %u.\n",old_size,i1);
	printf("    New size of u: %u, dimension: %u.\n",new_size,i2);

	if(i1 >= i2) return;


	if((*u=(Unit*)realloc(*u,new_size*sizeof(Unit)))==NULL){
		fprintf(stderr,"Error in ReallocUnitMatrix1.\n");
		exit(1);
	}

	for(j=old_size;j<new_size;j++){
		if(((*u)[j].s = (SQlist*)calloc(1,sizeof(SQlist)))==NULL){
			fprintf(stderr,"Error in ReallocUnitMatrix4.\n");
			exit(1);
		}
		(*u)[j].n = 0;
	}
	
	return;
}
/***************************************************************/

/***************************************************************/
void ClearUnitMatrix(unsigned i)
/* Reset the first i*i+1 cells of the global Unit matrix u: for every cell
   with n > 0, clearSQlist is applied to its first n SQlist items and n is
   set back to 0. Cells with n == 0 are left alone. */
/***************************************************************/
{
	unsigned total = i * i + 1;
	unsigned cell,item,used;

	for(cell=0;cell<total;cell++){
		used = (*u)[cell].n;
		if(used!=0){
			for(item=0;item<used;item++){
				clearSQlist(&((*u)[cell].s)[item]);
			}
			(*u)[cell].n = 0;
		}
	}
}
/***************************************************************/

/****************************************************************************************************/
void WriteSubUnitMatrix(Node **a,unsigned nodes,unsigned i,int *m,unsigned d,char *out_mode)
/* Fill m for the n6 entries of a[i]->sublist6. For each pair (j1,j2) the
   sqlist1 records of the row node whose ID equals sublist6[j2] are copied
   into the global scratch unit uu, then m[j1*d+j2] is set according to
   out_mode:
     "r": (int)(1000 * OverlapScore2(records, count, row len, col len));
          the global region R is cleared first via ClearRegion.
     "s": (int)(-100 * log10(score)), or 10000 when score < 1e-100, where
          score is the single record's score, or for several records the
          smallest score (capped at 1.0).
   Any other out_mode prints an error and returns. Pairs with no matching
   record, or whose node FindNode reports as 0, leave their cell untouched.
   Row nodes with subgrpno == 0 receive the global subgrpno.
   NOTE(review): the bounds check assumes m holds mat_size*mat_size+1
   elements (as InitMatrix(mat_size) would allocate) - confirm in caller. */
/****************************************************************************************************/
{
	unsigned j1,j2,k1,k2;
	unsigned L1,t1,t2;
	unsigned n6,n3,pID,p;
	double score;
	double value;
	unsigned pos;
	unsigned max_mat_size = mat_size*mat_size+1;

	if(strcmp(out_mode,"r") && strcmp(out_mode,"s")){
		fprintf(stderr,"Incorrect out_mode in WriteSubUnitMatrix.\n");
		return;
	}
	if(!strcmp(out_mode,"r")){
		if(a[i]->n3 > 0) n3 = a[i]->n3;
		else n3 = a[i]->n3b;
		pID = a[i]->pID;
		p = FindNode(a,nodes,pID);
		if(p!=0){
			if(n3 < a[p]->n3) n3 = a[p]->n3;
		}
		ClearRegion(R,n3);
	}
	n6 = a[i]->n6;
	for(j1=0;j1<n6;j1++){
		k1=FindNode(a,nodes,a[i]->sublist6[j1]);
		if(k1==0) continue;
		if(a[k1]->subgrpno==0) a[k1]->subgrpno = subgrpno;
		for(j2=0;j2<n6;j2++){
			pos=j1 * d + j2;
			/* max_mat_size is an element count, so the last valid index is
			   max_mat_size-1; pos == max_mat_size must be rejected too. */
			if(pos>=max_mat_size) {
				fprintf(stderr,"Overflow in Unit matrix in WriteSubUnitMatrix.\n");
				fprintf(stderr,"Skipping loop. The result might not be correct.\n");
				fflush(stderr);
				break;
			}
			k2=FindNode(a,nodes,a[i]->sublist6[j2]);
			if(k2==0) continue;

			/* First pass: count the matching records. */
			t2=0;
			for(L1=0;L1<a[k1]->n1;L1++){
				if(a[k1]->sqlist1[L1].ID == a[i]->sublist6[j2]) t2++;
			}
			/* The scratch buffer is only grown when more than one record
			   has to fit. */
			if(t2>1){
				if((uu.s=(SQlist*)realloc(uu.s,(t2+1)*sizeof(SQlist)))==NULL){
					fprintf(stderr,"Error in WriteSubUnitMatrix1.\n");
					exit(1);
				}
				for(t1=0;t1<t2;t1++){
					clearSQlist(uu.s+t1);
				}
			}
			uu.n = t2;

			/* Second pass: copy the matching records into the scratch unit. */
			t1=0;
			for(L1=0;L1<a[k1]->n1;L1++){
				if(a[k1]->sqlist1[L1].ID == a[i]->sublist6[j2]){
					if(t1>=t2) {
						fprintf(stderr,"t1 >= t2 in WriteSubUnitMatrix2.\n");
						break;
					}
					copySQlist(uu.s+(t1++),&a[k1]->sqlist1[L1]);
				}
			}

			if(t2==0) continue;
			if(*out_mode=='r'){
				value = OverlapScore2(uu.s,t2,a[k1]->len,a[k2]->len);
				m[pos] = (int)(1000.0 * value);
			} else {
				if(t2 > 1){
					/* Several records: take the smallest score, capped at 1.0. */
					score = 1.0;
					for(L1=0;L1<t2;L1++){
						if(score > uu.s[L1].score) score = uu.s[L1].score;
					}
				} else {
					score = uu.s->score;
				}
				if(score<1e-100) m[pos]=10000;
				else m[pos] = (int)(- 100.0 * log10(score));
			}
		}
	}

	return;
}
/*************************************************************************/

/****************************************************************************************************/
void WriteUnitMatrix(Node **a,unsigned nodes,unsigned i,int *m,unsigned d,char *out_mode)
/* Fill m for the n4 entries of a[i]->idlist4. For each pair (j1,j2) the
   sqlist1 records of the row node whose ID equals idlist4[j2] are copied
   into the global scratch unit uu, then m[j1*d+j2] is set according to
   out_mode:
     "r": (int)(1000 * OverlapScore2(records, count, row len, col len));
          the global region R is cleared first via ClearRegion.
     "s": 100*E + M with E = ceil(-log10(score)) and M = (int)(score*10^E)
          ("new format of score: 355z"), or 20001 when score < 1e-200.
          score is the single record's score, or for several records the
          smallest score (capped at 1.0).
   Any other out_mode prints an error and returns. Pairs with no matching
   record, or whose node FindNode reports as 0, leave their cell untouched.
   Row nodes with grpno == 0 receive the global grpno.
   NOTE(review): the bounds check assumes m holds mat_size*mat_size+1
   elements (as InitMatrix(mat_size) would allocate) - confirm in caller. */
/****************************************************************************************************/
{
	unsigned j1,j2,k1,k2;
	unsigned L1,t1,t2;
	unsigned n4;
	double score;
	double value;
	unsigned pos;
	unsigned n3;
	unsigned max_mat_size = mat_size*mat_size+1;
	double var1,var2;

	if(strcmp(out_mode,"r") && strcmp(out_mode,"s")){
		fprintf(stderr,"Incorrect out_mode in WriteUnitMatrix.\n");
		return;
	}
	if(!strcmp(out_mode,"r")){
		n3 = a[i]->n3;
		ClearRegion(R,n3);
	}
	n4 = a[i]->n4;
	for(j1=0;j1<n4;j1++){
		k1=FindNode(a,nodes,a[i]->idlist4[j1]);
		if(k1==0) continue;
		if(a[k1]->grpno == 0) a[k1]->grpno = grpno;
		for(j2=0;j2<n4;j2++){
			pos=j1 * d + j2;
			/* max_mat_size is an element count, so the last valid index is
			   max_mat_size-1; pos == max_mat_size must be rejected too. */
			if(pos>=max_mat_size) {
				fprintf(stderr,"Overflow in Unit matrix in WriteUnitMatrix.\n");
				fprintf(stderr,"Skipping loop. The result might not be correct.\n");
				fflush(stderr);
				break;
			}
			k2=FindNode(a,nodes,a[i]->idlist4[j2]);
			if(k2==0) continue;

			/* First pass: count the matching records. */
			t2=0;
			for(L1=0;L1<a[k1]->n1;L1++){
				if(a[k1]->sqlist1[L1].ID == a[i]->idlist4[j2]) t2++;
			}
			/* The scratch buffer is only grown when more than one record
			   has to fit. */
			if(t2>1){
				if((uu.s=(SQlist*)realloc(uu.s,t2*sizeof(SQlist)))==NULL){
					fprintf(stderr,"Error in WriteUnitMatrix1.\n");
					exit(1);
				}
				for(L1=0;L1<t2;L1++){
					clearSQlist(uu.s+L1);
				}
			}
			uu.n = t2;

			/* Second pass: copy the matching records into the scratch unit. */
			t1=0;
			for(L1=0;L1<a[k1]->n1;L1++){
				if(a[k1]->sqlist1[L1].ID == a[i]->idlist4[j2]){
					copySQlist(uu.s + (t1++),&a[k1]->sqlist1[L1]);
				}
			}

			if(t2==0) continue;

			if(*out_mode=='r'){
				value = OverlapScore2(uu.s,t2,a[k1]->len,a[k2]->len);
				m[pos] = (int)(1000.0 * value);
			} else {
				if(t2 > 1){
					/* Several records: take the smallest score, capped at 1.0. */
					score = 1.0;
					for(L1=0;L1<t2;L1++){
						if(score > uu.s[L1].score) score = uu.s[L1].score;
					}
				} else {
					score = uu.s->score;
				}
				if(score < 1e-200) m[pos]=20001;
				/* new format of score: 355z */
				else {
					var1 = ceil(-log10(score));
					var2 = score * pow(10.0, var1);
					m[pos] = 100*(int)var1 + (int)var2;
				}
			}
		}
	}

	return;
}
/*************************************************************************/

/**********************************************/
void FreeUnitMatrix(void)
/* currently not used. */
/* Release the global Unit matrix u: the flat cell array and the handle
   itself. u is reset to NULL afterwards so that a second call is a no-op.
   The per-cell .s buffers are not released here, because this function
   does not know how many cells the array holds. */
/**********************************************/
{
	if(u==NULL) return;
	/* free(NULL) is harmless, so the handle is released even when the
	   cell array was never allocated. */
	free(*u);
	free(u);
	u = NULL;
	return;
}
/**********************************************/


/***************************************************************/
void EvaluateMatrix(Node **a,unsigned nodes,unsigned k,int *m,unsigned max)
/* Sum m[row*max+col]/1000 over the n6 x n6 corner of m and divide by
   n6*n6. When that ratio is below _min_filled, the sub-list of node k is
   dissolved: every other member found by FindNode becomes a one-element
   list holding its own n0, its slot in a[k]->sublist6 is zeroed, and the
   first entry of the parent's idlist4 that equals the member's n0 and has
   reflist5 == 0 gets reflist5 set to that n0. Finally node k itself is
   reduced to a one-element list holding its own n0.
   Nothing is changed when n6 is 0 or exceeds max, when the parent (pID)
   is not found, or when the parent has n3 == 0 (reported on stderr). */
/***************************************************************/
{
	unsigned row,col,idx,slot;
	unsigned members,parentEntries;
	unsigned hits=0;
	unsigned parentID,parent,member;
	double ratio;

	members=a[k]->n6;
	if(members == 0 || members > max) return;

	for(row=0;row<members;row++){
		for(col=0;col<members;col++){
			hits += m[row*max + col]/1000;
		}
	}
	ratio = (double)hits / (double)(members * members);

	/* Sufficiently filled: keep the sub-list as it is. */
	if(!(ratio < _min_filled)) return;

	parentID = a[k]->pID;
	parent = FindNode(a,nodes,parentID);
	if(parent==0) return;
	if(a[parent]->n3 == 0){
		fprintf(stderr,"Error in finding parent in EvaluateMatrix.\n");
		return;
	}
	parentEntries = a[parent]->n4;

	for(idx=0;idx<members;idx++){
		member=FindNode(a,nodes,a[k]->sublist6[idx]);
		if(member==0 || member==k) continue;

		/* Detach the member into a list of its own. */
		a[member]->n6 = 1;
		a[member]->sublist6[0] = a[member]->n0;
		a[k]->sublist6[idx] = 0;

		/* Restore the first matching, still empty reference of the parent. */
		for(slot=0;slot<parentEntries;slot++){
			if(a[parent]->idlist4[slot] != a[member]->n0) continue;
			if(a[parent]->reflist5[slot] != 0) continue;
			a[parent]->reflist5[slot] = a[member]->n0;
			break;
		}
	}

	a[k]->n6 = 1;
	a[k]->sublist6[0] = a[k]->n0;
}
/***************************************************************/

/***************************************************************/
unsigned RemoveUnrelated(Node **a,unsigned nodes,unsigned k,int *m,unsigned max)
/* remove items with similarity to small number of sequences.
Two conditions: 1. rate of 1 per line is less than MIN_FILLED2.
                2. a single line contains more than a certain limit
                   of zeroes. This limit is determined by two
                   factors. One is total number of 0 in a matrix.
                   Another is occupancy of 0 in a line per whole matrix.
                   These two factors are determined by environmental
                   variables, MIN_ZEROS and MIN_0_OCCUP, respectively.
Now this function returns the number of items removed.

Details as implemented:
  - "1" means m[pos]/1000; a line is row i plus column i of the n6 x n6
    corner of m (the diagonal cell is therefore counted twice, hence the
    "- 1" in the fill rate).
  - An item is kept when its fill rate is >= _min_filled2 and either
    total0 < _min_zeros or line0 < _min_0_occup * total0.
  - A removed item becomes a one-element sub-list of its own, its slot in
    a[k]->sublist6 is zeroed, and the parent's first matching idlist4
    entry with reflist5 == 0 gets its reflist5 restored.
  - Returns 0 when n6 is 0 or exceeds max, and also when the parent (pID)
    cannot be found or has n3 == 0.
NOTE(review): the counter is incremented for every item that fails the
keep test, including node k itself and entries FindNode reports as 0,
which are skipped rather than removed - confirm callers expect this. */
/***************************************************************/
{
	unsigned i,j,n4,i2,n6;
	unsigned pos;
	unsigned count=0;
	double filled;
	unsigned pID;
	unsigned p;
	unsigned total0,line0;
	unsigned removed=0;

	n6=a[k]->n6;
	if(n6 == 0) return 0;
	if(n6 > max) return 0;

	for(i=0;i<n6;i++){
		for(j=0;j<n6;j++){
			pos = i*max + j;
			count += m[pos]/1000;
		}
	}
	total0 = n6 * n6 - count;

	for(i=0;i<n6;i++){
		count = 0;
		for(j=0;j<n6;j++){
			pos = i*max + j;
			count += m[pos]/1000;
			pos = j*max + i;
			count += m[pos]/1000;
		}
		/* Do the subtraction in double: with unsigned arithmetic a line
		   with count == 0 wrapped to a huge value and was treated as
		   almost completely filled. */
		filled = ((double)count - 1.0) / (double)(2 * n6 - 1);
		line0 = 2 * n6 - count;

		if(filled >= _min_filled2){
			if( total0 < _min_zeros ) continue;
/* else was removed */
			if( line0 < _min_0_occup * total0 ) continue;
		}
		removed += 1;
		pID = a[k]->pID;
		p = FindNode(a,nodes,pID);
		if(p==0) return 0;
		if(a[p]->n3 == 0){
			fprintf(stderr,"Error in finding parent in RemoveUnrelated.\n");
			return 0;
		}
		n4 = a[p]->n4;

		j=FindNode(a,nodes,a[k]->sublist6[i]);
		if(j==k || j==0) continue;
		a[j]->n6 = 1;
		a[j]->sublist6[0] = a[j]->n0;
		a[k]->sublist6[i] = 0;

	/* recover the record in list4 of the parent */
		for(i2=0;i2<n4;i2++){
			if(a[p]->idlist4[i2] == a[j]->n0 && a[p]->reflist5[i2] == 0){
				a[p]->reflist5[i2] = a[j]->n0;
				break;
			}
		}
	}

	return removed;
}
/***************************************************************/


/***************************************************************/
static void exchangeMatrixEntries(Node **a,unsigned k,int *m,unsigned max,unsigned n,int mode,unsigned x,unsigned y)
/* Helper of DiagonalMatrix: swap rows x and y, then columns x and y, over
   the first n columns/rows of m, and swap the matching ids in sublist6
   (mode 6) or idlist4 (otherwise) of node k. */
{
	unsigned c;
	unsigned posX,posY;

	for(c=0;c<n;c++){
		posX = x*max + c;
		posY = y*max + c;
		swapInt( m+posX, m+posY );
	}
	for(c=0;c<n;c++){
		posX = c*max + x;
		posY = c*max + y;
		swapInt( m+posX, m+posY );
	}
	if(mode==6) swapUnsigned(&a[k]->sublist6[x],&a[k]->sublist6[y]);
	else swapUnsigned(&a[k]->idlist4[x],&a[k]->idlist4[y]);
}

void DiagonalMatrix(Node **a,unsigned nodes,unsigned k,int *m,unsigned max,int mode)
/* mode = 4 or 6. */
/* Reorder the rows/columns of m together with the id list of node k
   (sublist6 for mode 6, idlist4 for mode 4; n is n6 or n4 accordingly).
   Working from the last index "last" down to 1:
     1. among the indices start..last, the one whose row plus column sum
        over columns/rows 0..last is smallest (and below 2000*(last+1))
        is exchanged with "last";
     2. every index j in start..last-1 for which m[j][last] or m[last][j]
        is zero is exchanged with the running front index, which then
        advances; the front index becomes the new start.
   Returns without change when k is 0 or above nodes, max is 0, mode is
   neither 4 nor 6, or n is <= 1 or above max. An index beyond max*max
   prints an error and exits.
   NOTE(review): presumably this groups related entries towards a block
   structure - confirm against the caller. */
/***************************************************************/
{
	unsigned last,cand,col,j,n;
	unsigned rowPos,colPos;
	unsigned sum,best;
	unsigned pick,front;
	unsigned start=0;
	unsigned limit = max * max;

	if(k==0 || k > nodes || max==0) return;
	if(mode != 4 && mode != 6) return;

	if(mode == 6) n=a[k]->n6;
	else n=a[k]->n4;

	if(n <= 1 || n > max) return;

	for(last=n-1;last>0;last--){
		/* Step 1: find the least connected candidate. */
		pick=last;
		best = 2000 * (last + 1);
		for(cand=start;cand<=last;cand++){
			sum = 0;
			for(col=0;col<=last;col++){
				rowPos = cand*max + col;
				colPos = col*max + cand;
				sum += m[rowPos] + m[colPos];
			}
			if(sum < best){
				best = sum;
				pick=cand;
			}
		}
		if(pick!=last) exchangeMatrixEntries(a,k,m,max,n,mode,pick,last);

		/* Step 2: collect the indices not linked to "last" at the front. */
		front=start;
		for(j=start;j<last;j++){
			rowPos = j * max + last;
			colPos = last * max + j;
			if(rowPos > limit || colPos > limit){
				fprintf(stderr,"Error in DiagonalMatrix 1.\n");
				fprintf(stderr,"pos1=%u, pos2=%u, max=%u\n",rowPos,colPos,max);
				exit(1);
			}
			if( m[rowPos] != 0 && m[colPos] != 0 ) continue;
			exchangeMatrixEntries(a,k,m,max,n,mode,j,front);
			front++;
		}
		start = front;
	}
}
/***************************************************************/

/*    end of file matrix.c    */
