/*******************************************
 defines.h
Previously, lsort3.h
This program reads a list file (generated from BLAST result by bl2ls.pl),
and assembles similarity groups. Output is a *.grp file.
This is a C version of the lsort2b.pl.
  Copyright Naoki Sato 2002.

Added functionality of using catenated list file. June 2, 2002.
Added msort, lnkann. June 16, 2002.
Added SQlist. June 17, 2002.
Added lnkdb. June 25, 2002.
Selection of master entry. June 25, 2002.
Added hom. July 2, 2002.
Matrix is now in float. July 5, 2002.
Matrix is now in int. July 6, 2002.
score is now in float. July 6, 2002. Now double. May 14, 2003.
Added sublist5. July 7, 2002.
Phase 1. July 19, 2002.
RemoveUnrelated returns unsigned. Sept. 5, 2002.
Unnecessary output suppressed. Dec. 30, 2002.
Reconstruction of source code. April 30, 2003.
Recurrent clustering. May 16, 2003.
Gclust3 format. Dec. 25, 2003.
Start of version 308. Nov. 1st, 2004.
 Use of overlapscore2 in creating cluster by MergeList.
 This is done using the flag active. Nov. 1st, 2004.
Gclust3.5	Dec. 14, 2004.
 Major revision: 
  1. Domain definition.
  2. Clustering of quasi complete cliques using E-value, overlapscore, and domain composition.
  3. Addition of marginal members and identification of multidomain members.
 Bug fixes:
  1. PrintTable3 for printing table in gclust3 format.
Gclust 3.5.2 use of organism information.
  Relatives.
  Repeated domains.
 nocalc mode.
Gclust 3.5.2 f1
  Modification in SearchBridge.
  MergeDistantRelatives.
Gclust 3.5.3
  Use of information content in organism-optimized clustering.
  Variable level of regrouping.
*******************************************/
/*
#define VERSION "3.5.5"
*/
#include "version.h"
#include <stdlib.h>
#include <stdio.h>
#include <float.h>
#include <math.h>
#include <string.h>

/* Fallback boolean constants; each is defined only if no earlier header has
   already provided a macro of the same name. */
#ifndef TRUE
#define TRUE 1
#endif
#ifndef FALSE
#define FALSE 0
#endif

/* Fallback for EOF. NOTE(review): <stdio.h> is included above and is required
   by the C standard to define EOF, so this branch is presumably never taken. */
#ifndef EOF
#define EOF (-1)
#endif

/* Boolean is a macro alias for char (not a typedef), so the guard below only
   detects an earlier *macro* named Boolean, not an earlier typedef. */
#ifndef Boolean
#define Boolean char
#endif

/* Fixed array and buffer sizes. */
#define GN_MAX 1000	/* maximum number of gn_idx entries; dimension of the gi[] array declared below */
#define MAXLEN 200	/* NOTE(review): not used in this header; presumably a string buffer length -- confirm at call sites */
#define MAXLINE 1024	/* NOTE(review): not used in this header; presumably an input line buffer length -- confirm at call sites */
/* Many of the following values are tunable in the file var_list (see GCLUST_VARIABLES_FILE).
   NOTE(review): each macro presumably serves as the default of the like-named
   underscore-prefixed variable declared below (e.g. LEVEL1 and _level1) -- confirm in the
   file that defines those variables. */
#define LARGEPROTEIN 2000	/* proteins larger than this size will be treated as large proteins */
#define MAX_N3 35000	/* maximum value for n3. */
#define MAX_MATRIX 10000	/* maximum value for matrix size. Above this, proteins are defined as multidomain. */
#define LEVEL1 0.90	/* minimal overlap for identical region */
#define LEVEL2 1.7	/* minimal relative size for implication. */
#define LEVEL3 0.20	/* minimal level of overlap for domain definition in SearchDomain */
#define LEVEL4 0.50	/* minimal level of overlap for domain identification in SearchDomain */
#define ANNOTLEN 50	/* length limit of annotation line; Node.annot holds ANNOTLEN + 1 chars */
#define MATCHLEVEL 0.20	/* minimal overlap value for single domain proteins. */
#define MINLEVEL 0.10	/* minimal overlap value for identifying maximal similarity in MakeSublist. */
#define MIN_FILLED 0.45 	/* minimal rate of filled per whole matrix */
#define MIN_FILLED2 0.35 	/* minimal rate of filled per line */
#define MIN_ZEROS 10	/* minimal number of 0 in a matrix in RemoveUnrelated */
#define MIN_0_OCCUP 0.80 	/* minimal occupancy of 0 in a line per whole matrix */
#define MIN_INCREASE 1.5	/* minimal value for the increase of n4 with varying thr */
#define DEFAULT_THR 0.998	/* NOTE(review): presumably the default of the global thr -- confirm */
#define NUMBER_OF_GENOMES 20	/* default number of species to be used in repeat mode */
#define MIN_DOMAIN_SIZE 50	/* minimal size of domain for domain identification */
/* #define STANDARD_MATRIX_SIZE 60 */	/* 350. matrix size of largest cliques. Three times number of genomes */
#define MIN_OVERLAP 4	/* 352. minimum level of overlap to be considered */
#define LEVEL_F1 0.5	/* 352e10. parameters in OptimizeOrg in anal.c */
#define LEVEL_F2 0.9	/* 352e10. see LEVEL_F1 */
#define LEVEL_F3 0.3	/* 352e10. see LEVEL_F1 */
#define REGROUP_LEVEL 5	/* 353f. NOTE(review): presumably the default of _regroup_level -- confirm */
/* 353h: The following two parameters are used in search_local_maxima in the org mode.
   delta_plus is added to delta_min if (delta_max - delta_min) is larger than delta_diff.
   This increases the size of the initial cluster, but the effect on the final clustering may
   depend on the situation. */
#define DELTA_DIFF 3
#define DELTA_PLUS 1
/* 353h: The following four parameters are used in CountOrgInIDlist in the org mode.
   org_measure = _org_measure_0 - _org_measure_1 * _regroup_level;
   allowance_level = _allowance_level_0 + _allowance_level_1 * _regroup_level;
   The two resulting values (org_measure, allowance_level) are used to judge regrouping. */
#define ORG_MEASURE_0 5.0
#define ORG_MEASURE_1 0.3
#define ALLOWANCE_LEVEL_0 0.65
#define ALLOWANCE_LEVEL_1 0.025
/* 355k: The following two parameters are used in CountOrgInID.
 * JS_level = _JS_level_0 + _JS_level_1 * _regroup_level;
*/
#define JS_LEVEL_0 0.07
#define JS_LEVEL_1 0.01
/* 353h: The following four parameters are used in MergeDistantRelatives.
   level1 = _MDR_level1_0 - _MDR_level1_1 * _regroup_level;
   max_cluster = (_MDR_max_cluster_0 + _MDR_max_cluster_1 * _regroup_level) * number_of_genomes;
   The two resulting values (level1, max_cluster) are used to judge regrouping. */
#define MDR_LEVEL1_0 0.95
#define MDR_LEVEL1_1 0.03
#define MDR_MAX_CLUSTER_0 1.25
#define MDR_MAX_CLUSTER_1 0.10
/* NOTE(review): the remaining MDR_* values are not described in this header; judging by the
   prefix they presumably also belong to MergeDistantRelatives -- confirm at the point of use. */
#define MDR_LENGTH1 70
#define MDR_LENGTH2 120
#define MDR_SCORE1 1e-30
#define MDR_SCORE2 1e-45
#define MDR_CLIQUE_SIZE 3
#define MDR_CROSS_EDGE_PARAMETER 0.40
#define CC_LEVEL_2 0.9	/* CreateCliques */
#define CC_LEVEL_3 0.2	/* NOTE(review): presumably CreateCliques as well (CC_ prefix) -- confirm */
#define CC_MIN_THR 1e-40	/* NOTE(review): presumably CreateCliques as well (CC_ prefix) -- confirm */
#define MIN_DELTA_OVERLAP 0.2	/* 355y DeltaTest */

/* The following macros define file-header keywords and file names as string literals.
   NOTE(review): which strings are header keywords and which are file names is not
   stated in this header; the per-line notes below are assumptions to confirm. */
#define GCLUST_TABLE "GCLUST_TABLE"	/* presumably a header keyword */
#define GCLUST_TABLE2 "GCLUST_TABLE2"	/* presumably a header keyword */
#define GCLUST_TABLE3 "GCLUST_TABLE3"	/* presumably a header keyword */
#define NEWTABLE "new.g.table"	/* presumably a file name */
#define MISSINGTABLE "missing.list"	/* presumably a file name */
#define GCLUST_DATA "data.out"	/* presumably a file name */
#define GCLUST_VARIABLES "GCLUST_VARIABLES"	/* presumably a header keyword */
#define GCLUST_VARIABLES_FILE "var_list"	/* the var_list file in which tunable values can be set */
#define ORG_LIST_FILE "org_list"	/* presumably the file name of the organism list */

/* Additional constants */
#define SUPPRESS_LARGE_MATRIX 1000	/* suppress printing matrices larger than this size */
#define STACK_SIZE 1000		/* size of stack and count in list.c */
#define HIGHEST 5	/* highest level of max_count in list.c RemoveMultiDomain */
/* NOTE(review): the name says MAX but the description below says "minimum level" -- confirm
   the intended meaning in RemoveMultiDomain.
   NOTE(review): MAX_OCCUPANCY and MIN_OCCUPANCY are fractional, while _max_occupancy and
   _min_occupancy are declared as unsigned further down; if these values are assigned to those
   variables they would be truncated to 0 -- confirm against the definitions. */
#define MAX_OCCUPANCY 0.9	/* minimum level a single nodeID within a single idlist4 in RemoveMultiDomain. */
#define MIN_OCCUPANCY 0.1	/* minimum level of one of the three major nodeIDs within an idlist4 in RemoveMultiDomain. */
#define SEARCHBRIDGE_MIN_TRIANGLE 0.4	/* minimum occupancy of the major domains within triangle in SearchBridge */
#define SEARCHBRIDGE_MIN_N4 8	/* minimum n4 for identifying multidomain in SearchBridge */

/* Start of definition of structures */
typedef struct{
	unsigned ID;	/* subject ID */
	unsigned Sstart;
	unsigned Send;
	double score;
	unsigned qID;	/* query ID */
	unsigned Qstart;
	unsigned Qend;
	unsigned domain;	/* ID of domains */
	Boolean crosslink;
} SQlist;

/* DMlist: one entry of a node's domain list (see Node.dmlist): a node ID,
   a start/end range and two unsigned long IDs. */
typedef struct{
	unsigned nodeID;
	unsigned start;
	unsigned end;
	unsigned long ID;	/* NOTE(review): presumably the domain ID, by analogy with pDM_ID -- confirm */
	unsigned long pDM_ID;	/* 351a4 domain ID in parent node, for multidomain proteins. */
} DMlist;

/* Stack: a counter, a node ID, an unsigned long ID and a fixed array of 40
   per-ID counters.
   NOTE(review): presumably the element type of the stack in list.c that
   STACK_SIZE refers to; the meaning of the hard-coded 40 is not stated here
   -- confirm in list.c. */
typedef struct{
	int count;
	unsigned nodeID;
	unsigned long ID;
	unsigned IDcount[40];
} Stack;

/* gn_idx: a short name (at most 9 characters plus the terminating NUL if used
   as a C string) paired with an unsigned "begin" value. The global gi[] is an
   array of GN_MAX of these.
   NOTE(review): presumably "begin" is the first index belonging to the named
   entry -- confirm at the point of use. */
typedef struct{
	char name[10];
	unsigned begin;
} gn_idx;

/* Match: an ID with its score, overlap value and a domain ID.
   Element type of Node.matchlist7. */
typedef struct{
	unsigned ID;
	double score;
	double overlap;
	unsigned long dmID;	/* NOTE(review): presumably a domain ID (cf. DMlist) -- confirm */
} Match;

/* org_list: one organism entry: a prefix string, a name, and integer species
   and kingdom codes. The global "org" points to these. */
typedef struct{
	char prefix[20];
	char name[100];
	int species;
	int kingdom;	/* 1 Prokaryotes, 2 Archaea, 3 Eukaryotes */
} org_list;

/* Relatives (352c): one entry of a node's list of relatives (see
   Node.relatives and Node.nR): a group number, a pID, an ID and a count. */
typedef struct{
	unsigned grpno;
	unsigned pID;	/* NOTE(review): presumably the ID of the parent, as in Node.pID -- confirm */
	unsigned ID;
	unsigned count;		/* 355n */
} Relatives;	/* 352c */

/* Node: the per-entry record. It holds identification strings, organism
   codes, length information, a set of counters (n0 .. n7, nR) with their
   backup copies, group and domain bookkeeping, and pointers to the associated
   lists.
   NOTE(review): each nK counter presumably gives the number of entries in the
   list whose name ends in the same digit (n1/sqlist1, n2/sqlist2, n3/sqlist3,
   n4/idlist4, n5/reflist5, n6/sublist6, n7/matchlist7) -- confirm at the
   allocation sites. */
typedef struct{
	char name[100];
	char annot[ANNOTLEN + 1];
	char DBname[30];
	int species;	/* 352 Species names are defined in org_list. */
	int kingdom;	/* 352  1 Prokaryotes, 2 Archaea, 3 Eukaryotes, 7 Mitochondria 11 Plastids 19 Nucleomorphs */
	char domain;	/* 0 for none, 1 for multidomain, 2 for fragment, 4 for large protein */
	unsigned len;
	unsigned corr_length;	/* 355t length without transit sequence */
	Boolean active;	/* TRUE by default. Inactivated if defined as multidomain, etc. But may be re-activated finally. */
	unsigned n0;
	unsigned n1;
	unsigned n1b;	/* backup copy of original n1 */
	unsigned n2;
	unsigned n2b;	/* backup copy of original n2 */
	unsigned len3;	/* allocated size of sqlist3 */
	unsigned n30;	/* number of unique IDs. */
	unsigned n3;
	unsigned n3_bak;	/* hidden copy of n3 */
	unsigned n3_old;	/* backup copy of n3 in a previous iteration */
	unsigned n3b;	/* backup copy of n3 */
	unsigned n3b_bak;	/* hidden copy of n3b */
	unsigned n3b_old;	/* backup copy of n3b in a previous iteration */
	unsigned n4;
	unsigned n4b;	/* backup copy of n4 with previous thr */
	unsigned n5;
	unsigned n6;
	unsigned n7;
	unsigned nR;	/* number of relatives. 352c */
	unsigned pID;	/* ID of the parent */
	unsigned pDM;	/* domain assignment with respect to the domains in the parent */
	unsigned grpno;
	unsigned subgrpno;
	unsigned DMnum;		/* number of domains */
	unsigned long maskID;
	unsigned effDMnum;		/* effective number of domains that are shared by other nodes */
	unsigned DMmax;		/* maximum number of domain ID */
	DMlist *dmlist;		/* domain list */
	double final_thr;	/* final value of thr */
	double overlap;	/* minimum overlap value, 351 */
	int delta_max;
	SQlist *sqlist1;
	SQlist *sqlist2;
	SQlist *sqlist3;
	SQlist *sqlist3b;	/* backup copy of sqlist3 with previous thr */
	unsigned *idlist4;
	unsigned *idlist4b;	/* backup copy of idlist4 with previous thr */
	unsigned *reflist5;
	unsigned *sublist6;
	Match *matchlist7;
	Relatives *relatives;	/* 352c */
	unsigned *rel_list;		/* 355v */
	int max_rel;
} Node;

/* Unit: a pointer to SQlist entries together with an unsigned count.
   NOTE(review): presumably n is the number of entries reachable through s
   -- confirm at the point of use. */
typedef struct{
	SQlist *s;
	unsigned n;
} Unit;

/* Region: a start/end pair. The globals R, R1 and R2 point to these.
   NOTE(review): whether "end" is inclusive is not stated in this header. */
typedef struct{
	unsigned start;
	unsigned end;
} Region;

/* Av_Sw: a pair of doubles, an average and a value named Sw.
   NOTE(review): the meaning of Sw is not stated in this header -- confirm at
   the point of use. */
typedef struct{
	double average;
	double Sw;
} Av_Sw;

/* DMlr: four unsigned bounds (Al, Ar, Bl, Br) and an overlap value.
   NOTE(review): presumably the left/right ends of two regions A and B and the
   size of their overlap -- confirm at the point of use. */
typedef struct{
	unsigned Al;
	unsigned Ar;
	unsigned Bl;
	unsigned Br;
	unsigned overlap;
} DMlr;

/* Declarations of the program-wide global variables. They are skipped when
   __MAIN is defined; presumably the source file that defines __MAIN before
   including this header provides the actual definitions -- confirm.
   NOTE(review): __MAIN and the file-scope names beginning with an underscore
   are identifiers reserved by the C standard; renaming them would have to be
   done consistently across all source files. */
#ifndef __MAIN
extern gn_idx gi[GN_MAX];
extern unsigned max_gi;
extern char *optVar[];
extern unsigned maxVar;
extern unsigned grpno;
extern unsigned subgrpno;
extern Region *R,*R1,*R2;
extern Unit **u;
extern Unit uu;
extern unsigned mat_size;
extern unsigned initial_mat_size;
extern Boolean no_sub;
extern double thr;
extern double last_thr;
extern unsigned suppress_large_matrix;
extern unsigned number_of_genomes;
extern unsigned *ch;
extern Boolean gclust3;
extern int *xy,*z,*sp,*d;
extern org_list *org;
extern int num_org,num_org_all,num_entries;
/* tunable variables */
/* NOTE(review): these presumably take the like-named upper-case macros above
   as defaults and may be overridden through the var_list file -- confirm. */
extern int _min_overlap;
extern unsigned _largeprotein;
extern unsigned _max_n3,_max_matrix;
extern double _level1;
extern double _level2;
extern double _matchlevel;
extern double _minlevel;
extern double _min_filled;
extern double _min_filled2;
extern unsigned _min_zeros;
extern double _min_0_occup;
extern double _min_increase;
extern double _level3;
extern double _level4;
extern unsigned _min_domain_size;
extern unsigned _stack_size;
extern double _searchbridge_min_triangle;
extern unsigned _searchbridge_min_n4;
/* extern unsigned _standard_matrix_size;*/
extern Boolean _print_all_domains;
extern Boolean print_2D_tables;
extern double _f1,_f2,_f3;	/* OptimizeOrg in anal.c, 352e10 */
extern int _regroup_level;	/* 353f */
extern int _delta_diff;
extern int _delta_plus;
extern double _org_measure_0;
extern double _org_measure_1;
extern double _allowance_level_0;
extern double _allowance_level_1;
extern double _JS_level_0;
extern double _JS_level_1;
extern double _MDR_level1_0;
extern double _MDR_level1_1;
extern double _MDR_max_cluster_0;
extern double _MDR_max_cluster_1;
extern unsigned _MDR_length1;
extern unsigned _MDR_length2;
extern double _MDR_score1;
extern double _MDR_score2;
extern int _MDR_clique_size;
extern double _cross_edge_parameter;
extern double _CC_level_2;
extern double _CC_level_3;
extern double _CC_min_thr;
extern double _min_delta_overlap;

extern int num_thr;
extern double thr_list[50];
extern unsigned dist_score[21];
extern unsigned num_score;
extern unsigned long bit2[];
extern Boolean repeat_mode;       /* repeat mode. */
extern Boolean clique_mode;       /* 350 clique mode. */
extern Boolean verbous_mode; /* verbose output of list of n4 */
extern Boolean use_org;	/* 352 */
extern Boolean ashikiri;	/* 352e14 */
extern Boolean error_expandDMlist;
/* NOTE(review): the next two are declared unsigned, whereas MAX_OCCUPANCY (0.9)
   and MIN_OCCUPANCY (0.1) above are fractional; if those values are stored
   here they are truncated to 0 -- confirm against the definitions. */
extern unsigned _max_occupancy;
extern unsigned _min_occupancy;

extern Boolean print_N4_table;

#endif

/**********************************************/
/* end of file defines.h */
