New additions in version 354 (May 29, 2007).

main.c

** in the input routine
			} else if(!strcmp(argv[count],"-m8")) {			/* 354 */
				m8_mode = TRUE;
** after reading annotation
		if(!gclust3) {
			printf("\nTotal genomes: max_gi=%u.\n",max_gi);
			if(m8_mode){		/* 354 */
				fprintf(stderr,"The m8 mode is only used with the annotation table in the gclust3 format.\n");
				exit(1);
			}
		}
** reading list file
		if(m8_mode){		/* 354 */
			if((a=ReadNodeM8(fin,a,&nodes))==NULL){
				fprintf(stderr,"Read error.\n");
				exit(1);
			}

node.c

/***************************************************************/
Node **ReadNodeM8(FILE *fin,Node **a,unsigned *nodes_p)
/* This is used to read the -m8 table directly.

   Each non-blank line of fin must start with a query ID of the form
   gNNN.  When the query ID changes, the following lines are scanned
   ahead to count how many consecutive lines carry the same ID, the
   file is rewound to where the scan started, and a[i]->sqlist1 is
   grown to hold that many additional records.  Slot 0 is then
   initialised as a full-length self hit and a[i]->n1 is reset to 0,
   so WriteNodeM8 stores the hits of this query starting at index 0.
   Every line (including the first of a group) is handed to
   WriteNodeM8, and ch[i] is marked.

   After the whole file is read, entries 1..*nodes_p that were never
   marked in ch[] and have n1 == 0 receive a self hit only.

   fin      input stream; must be seekable because of the look-ahead.
   a        node array, indexed by the number following 'g'.
   nodes_p  points to the number of nodes; IDs above it are rejected.

   Returns a on success, NULL on an out-of-range ID, a file
   positioning failure or a memory allocation failure.  A line that
   does not start with 'g', or a failure inside WriteNodeM8, ends the
   program with exit(1).

Version 354. May 29, 2007  */
/***************************************************************/
{
	char str1[MAXLINE + 2];		/* current input line */
	char str2[MAXLINE + 2];		/* first token (query ID) of a line */
	char str5[MAXLINE + 2];		/* line read during the look-ahead */
	char item[MAXLINE + 2];		/* gxxxxx number of the current query */
	long fileloc;		/* location within the file where the look-ahead starts */
	unsigned i,k;
	unsigned current_i=0;		/* query number of the group being read */
	int have_current=0;		/* becomes 1 once current_i holds a real value */
	unsigned counter;		/* number of lines belonging to the current query */
	unsigned new_n1;
	unsigned current_n1;
	SQlist *grown;

	InitClist(*nodes_p);

	while(fgets(str1,MAXLINE,fin)!=NULL){
		if(!strcmp(str1,"\n")) continue;
		/* A line holding only white space yields no token; treat it as blank. */
		if(sscanf(str1,"%s",str2)!=1) continue;

		if(*str2 != 'g'){
			fprintf(stderr,"Sequence file is not in Gclust3 format.\n");
			exit(1);
		}

		i = atol(str2+1);
		/* a[i] is dereferenced below, so the index has to be checked first. */
		if(i > *nodes_p){
			fprintf(stderr,"Node number %u exceeds the number of nodes (%u) in ReadNodeM8.\n",i,*nodes_p);
			return NULL;
		}

		if(!have_current || current_i != i) {
			PrintProgress(i,10,1000);
			strcpy(item,str2);
			current_i = i;
			have_current = 1;
			counter = 1;	/* the line already held in str1 */

			if((fileloc = ftell(fin)) < 0){
				fprintf(stderr,"File position error.\n");
				return NULL;
			}
			/* Look ahead: count the following lines with the same query ID. */
			while(fgets(str5,MAXLINE,fin)!=NULL){
				if(!strcmp(str5,"\n")) continue;
				if(sscanf(str5,"%s",str2)!=1) continue;
				if(strcmp(item,str2)) break;
				counter += 1;
			}
			/* Without a successful rewind the counted lines would be lost. */
			if(fseek(fin,fileloc,SEEK_SET)){
				fprintf(stderr,"File seek error.\n");
				return NULL;
			}

			current_n1 = a[i]->n1;
			new_n1 = current_n1 + counter;

			/* Keep the old pointer until realloc is known to have succeeded. */
			grown = (SQlist*)realloc(a[i]->sqlist1,(new_n1 + 1)*sizeof(SQlist));
			if(grown == NULL){
				fprintf(stderr, "Memory allocation error in ReadNodeM8.\n");
				return NULL;
			}
			a[i]->sqlist1 = grown;

			for(k=current_n1+1;k<=new_n1;k++){
				clearSQlist(&a[i]->sqlist1[k]);
			}

			/* Slot 0: the sequence matched against itself over its full length. */
			a[i]->sqlist1[0].ID = i;
			a[i]->sqlist1[0].qID = i;
			a[i]->sqlist1[0].Sstart = 1;
			a[i]->sqlist1[0].Qstart = 1;
			a[i]->sqlist1[0].Send = a[i]->len;
			a[i]->sqlist1[0].Qend = a[i]->len;
			if(a[i]->len > _largeprotein){
				a[i]->domain |= 0x4;
				printf("Node %u: %s is being set as a large protein. Size=%u\n",i,a[i]->name,a[i]->len);
			}
			a[i]->n1 = 0;
		}

		if((a=WriteNodeM8(a,i,nodes_p,str1))==NULL){
			exit(1);
		}
		ch[i] = TRUE;
	}


/* check of entries 353i */
	for(i=1;i<=*nodes_p;i++){
		if(!ch[i]){
			fprintf(stderr,"Entry %u is missing in input file, but we try to recover.\n",i);
			/* For the entries, which have no homologous sequences, even with themselves. */
			if(a[i]->n1 == 0) {
				a[i]->sqlist1[0].ID = i;
				a[i]->sqlist1[0].qID = i;
				a[i]->sqlist1[0].Sstart = 1;
				a[i]->sqlist1[0].Qstart = 1;
				a[i]->sqlist1[0].Send = a[i]->len;
				a[i]->sqlist1[0].Qend = a[i]->len;
				a[i]->n1 = 1;
				ch[i] = TRUE;
			}
		}
	}
	InitClist(*nodes_p);

	return a;
}
/***************************************************************/

/**************************************************************/
Node **WriteNodeM8(Node **a,unsigned i,unsigned *nodes_p,char *line1)
/* This is used to read the -m8 table directly.

   Stores one tab-separated -m8 line as record a[i]->sqlist1[n1] and
   then increments a[i]->n1.  The columns used are (counting from 0):
   1 subject ID (gNNN), 6 query start, 7 query end, 8 subject start,
   9 subject end, 10 E-value.  Columns 2 to 5 are ignored.

   a        node array.
   i        index of the query node; must not exceed *nodes_p.
   nodes_p  points to the number of nodes.
   line1    the input line; it is cut into tokens in place by strtok.

   Returns a on success.  Returns NULL, leaving a[i] unchanged, when
   i is out of range, the node has an empty name, the line has fewer
   than 11 columns, or the subject ID does not start with 'g'.
   The caller must have made room for index a[i]->n1 in sqlist1. */
/**************************************************************/
{
	enum {
		COL_SUBJECT = 1,
		COL_QSTART = 6,
		COL_QEND = 7,
		COL_SSTART = 8,
		COL_SEND = 9,
		COL_EVALUE = 10,
		COLS_NEEDED = 11
	};
	char *field[COLS_NEEDED];
	char *p;
	unsigned num=0;		/* number of items found in line1 */
	unsigned curr_n1;
	double tmp;

	if(i>*nodes_p){
		fprintf(stderr, "Node member must be less than nodes in WriteNodeM8.\n");
		return NULL;
	}

	/* Checked before tokenising so that the whole line can still be printed. */
	if(!strcmp(a[i]->name,"")){
		fprintf(stderr, "Empty node name in WriteNodeM8.");
		fprintf(stderr, "i= %u.\n",i);
		fprintf(stderr, "line1= %s.\n",line1);
		return NULL;
	}

	/* Split first, so that a short line is reported instead of
	   dereferencing the NULL that strtok returns at the end of input. */
	p = strtok(line1,"\t");
	while(p != NULL){
		field[num++] = p;
		if(num == COLS_NEEDED) break;
		p = strtok(NULL,"\t");
	}
	if(num < COLS_NEEDED){
		fprintf(stderr,"Too few columns (%u) in -m8 line for node %u.\n",num,i);
		return NULL;
	}

	if(*field[COL_SUBJECT] != 'g'){
		fprintf(stderr,"Error in subject ID.\n");
		return NULL;
	}

	/* The line is valid: only now is the node modified. */
	curr_n1 = a[i]->n1;

	a[i]->sqlist1[curr_n1].qID = i;
	a[i]->sqlist1[curr_n1].ID = atol(field[COL_SUBJECT]+1);
	a[i]->sqlist1[curr_n1].Qstart = (unsigned)atol(field[COL_QSTART]);
	a[i]->sqlist1[curr_n1].Qend = (unsigned)atol(field[COL_QEND]);
	a[i]->sqlist1[curr_n1].Sstart = (unsigned)atol(field[COL_SSTART]);
	a[i]->sqlist1[curr_n1].Send = (unsigned)atol(field[COL_SEND]);

	/* The score is overwritten only when the E-value is below
	   thr * 0.999; otherwise the record keeps whatever score it had.
	   NOTE(review): earlier code set a "to_skip" flag here that was
	   never acted upon, so such hits are still counted in n1 --
	   confirm whether they were meant to be dropped. */
	tmp = atof(field[COL_EVALUE]);
	if(!(tmp >= thr * 0.999)) a[i]->sqlist1[curr_n1].score = tmp;

	a[i]->n1 = curr_n1 + 1;

	return a;
}
/**************************************************************/


/***************************************************/
org_list *ReadOrgFile(FILE *fin)
/* organism definition is read from file org_list.

   The first line holds the number of entries and the number of
   organisms; they are stored in the globals num_entries and num_org.
   Each following line, up to a line starting with "END" or the end
   of the file, holds four fields: prefix, name, species, kingdom.
   Blanks inside a line are turned into '_' so that a name containing
   blanks is read as one token; every '_' in the name is turned back
   into a blank afterwards.

   Lines starting with '#' and empty lines are skipped and do not use
   up an entry.  A line without all four fields is reported on stderr
   and skipped as well.  Reading stops once num_entries entries have
   been stored.

   Returns a zero-initialised array of num_entries + 1 elements with
   the entries stored at indices 1..num_entries (index 0 is unused),
   or NULL when the first line cannot be read or parsed, or when
   memory cannot be allocated.  The caller owns the array.
*/
/***************************************************/
{
	char str1[MAXLINE + 2];
	char prefix[20];
	char name[100];
	int kingdom,species;
	int i;
	org_list *org;
	char *p;

	if(fgets(str1,MAXLINE,fin)==NULL){
		fprintf(stderr,"Unable to read org_list.\n");
		return NULL;
	}
	/* The entry count is needed for the allocation below. */
	if(sscanf(str1,"%d\t%d",&num_entries,&num_org) < 1 || num_entries < 0){
		fprintf(stderr,"Invalid first line in org_list.\n");
		return NULL;
	}
	if((org = (org_list*)calloc(num_entries + 1,sizeof(org_list)))==NULL){
		fprintf(stderr,"Memory allocation error in ReadOrgFile.\n");
		return NULL;
	}

	i=0;
	while(fgets(str1,MAXLINE,fin) != NULL && strncmp(str1,"END",3)){
		/* comment line, May 29, 2007: the first character decides;
		   comparing the array with "#" would compare addresses. */
		if(str1[0] == '#') continue;
		/* fgets keeps the newline, so an empty line is "\n" (or "\r\n"). */
		if(str1[0] == '\n' || str1[0] == '\r' || str1[0] == '\0') continue;

		if(i >= num_entries) break;

		for(p=str1;*p!='\0';p++){
			if(*p == ' ') *p = '_';
		}
		/* Field widths keep the tokens inside prefix[20] and name[100]. */
		if(sscanf(str1,"%19s\t%99s\t%d\t%d",prefix,name,&species,&kingdom) != 4){
			fprintf(stderr,"Malformed line in org_list is skipped: %s",str1);
			continue;
		}
		for(p=name;*p!='\0';p++){
			if(*p == '_') *p = ' ';
		}

		i++;
		/* Bounded copies: the destination sizes are taken from the structure itself. */
		snprintf(org[i].prefix,sizeof org[i].prefix,"%s",prefix);
		snprintf(org[i].name,sizeof org[i].name,"%s",name);
		org[i].species = species;
		org[i].kingdom = kingdom;
	}

	return org;
}
/***************************************************/


help.c

/***************************************************************/
void help_gclust(void)
/* Print the usage summary and the list of command line options of
   gclust to standard output. */
/***************************************************************/
{
	/* The help text, one output line per element, in printing order. */
	static const char *const help_text[] = {
		"*************************** GCLUST HELP ***********************************\n",
		"Usage: gclust listfile -option1 -tab=TABLE -option2 ...\n\n",
		"Usage: gclust -read=data.out -option1 -option2 ...\n\n",
		"       This program reads a list file prepared by the bl2ls2.pl, \n",
		"       and converts it to a group file.\n",
		"       options:  hom, homsub, save, nocalc.\n",
		"       TABLE should be in a new format, with the number of sequences\n",
		"       in the first line.\n",
		"       -read=data.out: read output file produced by the -save option.\n",
		"       -out=1, r, or s: 1-0, homology region, similarity score.\n",
		"            In the save mode, this option is not necessary.\n",
		/* The -mat option (single or multiple) is not used now and is not listed. */
		"       -thr=threshold E-value.\n",
		"       -t	print a new table in the gclust2 format, for a rapid reading.\n",
		"       -save	print all data to a file, for rapid processing next time.\n",
		"       -nocalc	no calculation. This is used to print variables.\n",
		"       -taper	remove low level homology from normal size entries.\n",
		"       -v	print variables to a file var_list.new.\n",
		"       -repeat	repeat mode with changing threshold.\n",
		"       -clique	clique mode. This is new from version 3.5.\n",
		"       -org	Use organism list in org_list. New from 3.5.2\n",
		"       -regroup	Re-grouping before final output. New from 3.5.2\n",
		/* currently, no additional function is assigned to verbous mode. */
		"       -verbous	verbous mode in which the list of n4 is printed. Only in repeat mode\n",
		"       -printN4	print N4 table for debugging. Only in repeat mode\n",
		"       -2Dtable	print 2D tables of homologs. Only in clique mode\n",
		"       -m8	read the -m 8 table of BLAST output. This is used with \n",
		"          	an annotation table in the gclust3 format.\n",
		"****************************************************************************\n"
	};
	const unsigned total = (unsigned)(sizeof help_text / sizeof help_text[0]);
	unsigned row;

	/* None of the lines contains a conversion specification, so each one
	   is written out unchanged. */
	for(row = 0; row < total; row++){
		fputs(help_text[row],stdout);
	}
}
/***************************************************************/

end of new additions.

