/* A software package to align alignments within a specified region. */ /* Revised Nov. 6, 1993 (Alignment array --> Alignment Script) */ /* Revised Nov. 10, 1993 (Restrict yama2 to interesting regions only) */ #include #include #define max(x,y) ((x) >= (y) ? (x) : (y)) #define min(x,y) ((x) <= (y) ? (x) : (y)) #define DEFAULT_LL 50 /* output line length */ #define CHAR_SIZE 128 /* Maximum Alphabet Size */ #define SIGMA 8 /* Alphabet Size */ #define SYMA 0 #define SYMC 1 #define SYMG 2 #define SYMT 3 #define SYMN 4 #define DASH 5 #define LSTAR 6 #define RSTAR 7 #define SUB(a,b) PD(i,j,(a),(b)) = PD(i,j,(b),(a)) = \ PD(j,i,(a),(b)) = PD(j,i,(b),(a)) #define SUBI(a,b) PD(i,j,(a),(b)) = PD(j,i,(b),(a)) #define SUBJ(a,b) PD(j,i,(a),(b)) = PD(i,j,(b),(a)) #define DAG(a) PD(i,j,(a),(a)) = PD(j,i,(a),(a)) #define DT(x) PT[i][j]x = PT[j][i]x #define DTI(x) PT[i][j]x #define DTJ(x) PT[j][i]x #define NUMBER 100 /* Maximum number of pieces */ #define ANUMBER 12 /* Maximum number of species */ #define MININT -9999999 #define IPL 25 /* Input Piece Length */ #define TRUE 1 #define FALSE 0 #define DNODE 0 /* Diagonal Node */ #define VNODE 1 /* Vertical Node */ #define HNODE 2 /* Horizontal Node */ #define XNODE 3 /* Wild Type */ #define WIDTH 10 /* Default bandwidth */ /* #define USAGE "Usage: yama2 [-r ALL_SIM] [-w width] [-e ] [-c ] [-t] [L=??] " */ #define USAGE "Usage: yama2 [-r ALL_SIM] [-w width] [-c ] [-t] [L=??] " /* score file format : seq1 seq2 60 2 8 -10 seq2 seq3 50 2 10 -10 default 60 2 10 -10 seq2 3 2 4 3 default 0 0 0 0 Default : to output LAT-format text -t : normal text output L : text length input sequence order format: "((seq1[20,50] seq2[30,45]) seq3[20,50])" output scripts: lat header ... l script1 script2 ... first sequence l script1 script2 ... second sequence l script1 script2 ... third sequence ... positive script means # of characters negative script means # of gap symbols (dashes) */ /* char PD(NUMBER+1,NUMBER+1,SIGMA,SIGMA); */ /* symbol distance */ static char *PD_storage; /* symbol distance */ static int K; /* number of pieces */ static int AK; /* number of sequences */ /* Macros to define the scoring matrix */ #define PD_SIZE(K) (sizeof(char) * SIGMA * SIGMA * ((K)+1) * ((K)+1)) #define PD_ALLOC(K) do{PD_storage = (char*)ckalloc(PD_SIZE(K));}while(0) #define PD(a,b,c,d) PD_storage[(d) + SIGMA*((c) + SIGMA*((b) + (AK+1)*(a)))] #define PD2(a,b) (PD_storage + SIGMA*SIGMA*((b) + (AK+1)*(a))) static char PT[ANUMBER+1][ANUMBER+1][2][2][4][4]; /* Altschul gap counts */ static char *S [NUMBER+1]; /* sequences */ static int L [NUMBER+1]; /* lengths of sequences */ static int From[NUMBER+1]; /* Starting position */ static int To[NUMBER+1]; /* End position */ static char *Sname[NUMBER+1]; /* sequence piece name */ static char *ASname[ANUMBER+1]; /* sequence name */ static int StoA[NUMBER+1]; /* piece to sequence */ static char Ename[30]; /* endpoints file name */ static char Rname[30]; /* region file names */ static char tflag = 1; /* lat output format */ static char cflag = 0; /* 0 if score file not specified */ static char eflag = 0; /* 1 if endpoints file specified */ static int LL; /* line length */ static char Encode[CHAR_SIZE]; /* A --> SYMA ... */ static char Decode[CHAR_SIZE]; /* SYMA --> A ... */ int *Seq_Script[NUMBER+1]; /* aligned sequence script */ int Entry_No[NUMBER+1]; /* Number of script entries */ int *LB, *RB; /* Left and Right Boundaries */ static int lmost, rmost; /* Leftmost and Rightmost of the region */ static int sl; /* total length of the alignment */ static int d_no; /* number of delete operations in ALS */ static int i_no; /* number of insert operations in ALS */ static int r_no; /* number of insert operations in ALS */ static int *sapp; /* Current script append ptr */ static int last; /* Last script op appended */ /* Append "Delete k" op */ #define DEL(k) \ { sl += k; \ if (last < 0) \ last = sapp[-1] -= (k); \ else { \ last = *sapp++ = -(k); \ ++d_no; \ } \ } /* Append "Insert k" op */ #define INS(k) \ { sl += k; \ if (last > 0) \ last = sapp[-1] += (k); \ else { \ last = *sapp++ = (k); \ ++i_no; \ } \ } #define REP { ++sl; last = *sapp++ = 0; ++r_no;} /* Append "Replace" op */ /* - -> * for some cases */ #define ET(ii,jj) ((ii == DASH)? jj : ii) /* compute the value for gap table T */ /* 0 : DASH; 1 : letter; 2 : LSTAR ; 3 : RSTAR */ #define GV(ii) ((ii==DASH)? 0 : ((ii==LSTAR)? 2 : ((ii==RSTAR)? 3 : 1))) #define ETGV(kk,ii,jj) { \ kk = ET(ii,jj); \ kk = GV(kk); \ } static int *C1, *C2; /* Precomputed Column scores for two alignments */ static int *RH, *RV, *RD; /* score vectors */ static char **E2; /* End gaps indicator */ static char **VE2; /* End gaps indicator */ /* aligning tree */ typedef struct tnode *Tptr; typedef struct tnode { Tptr left; /* left link */ Tptr right; /* right link */ int L; /* leftmost sequence */ int R; /* rightmost sequence */ char **AL; /* return alignment */ int N; /* alignment length */ } Tnode; char order[NUMBER*IPL]; /* alignment order */ void main(), data(), StoAL(), MALStoMAL(), MDisplay(), LDisplay(), SDisplay(), order_print(), print_score(); Tptr build_tree(); int traverse_tree(); void Read_regions(), LR(); void main(int argc, char *argv[]) { FILE *stream, *ep, *fp, *lav_fp, *ckopen(); int e, m, v; char cname[30], bname[30], *sname, fname[30], *get_seqname(); char **AL; int *ALS; char *ckalloc(); int i,j; int c; Tptr tp; int bd,ip, width, from, to; /* gflag = 0; */ strcpy(Rname,"ALL_SIM"); width = WIDTH; LL = DEFAULT_LL; bd = 0; ip = 2; if (argc <= 1) fatal(USAGE); for ( ++argv; --argc; argv++ ) if ( (*argv)[0] == '-' ) { /* optional parameters */ /* if ( (*argv)[1] == 'g' ) { if ( (*argv)[2] == '2' ) gflag = 2; else gflag = 1; } else if ( (*argv)[1] == 't' ) tflag = 0; */ if ( (*argv)[1] == 't' ) tflag = 0; /* text output */ else if ( (*argv)[1] == 'w' ) { /* width is provided */ ++argv; --argc; sscanf(*argv,"%d",&width); } else if ( (*argv)[1] == 'd' ) { /* block distance is provided */ ++argv; --argc; sscanf(*argv,"%d",&bd); ++argv; --argc; sscanf(*argv,"%d",&ip); } else if ( (*argv)[1] == 'c' ) { /* score file is provided */ ++argv; --argc; sscanf(*argv,"%s",cname); score_name(cname); cflag = 1; } else if ( (*argv)[1] == 'r' ) { /* ALL_SIM file is provided */ ++argv; --argc; sscanf(*argv,"%s",Rname); } else if ( (*argv)[1] == 'e' ) { /* end points and block file are provided */ ++argv; --argc; sscanf(*argv,"%s",Ename); eflag = 1; } else fatal(USAGE); } else if ((*argv)[0] == 'L' && (*argv)[1] == '=') LL = atoi(*argv + 2); else { strcpy(order,*argv); tp = build_tree(order); } /* for (i=1; i<=K; ++i) printf("Seq %d: Length: %d %s\n",i,L[i],S[i]+1); */ /* if (!eflag) fatal("block file is not provided!"); ep = ckopen(Ename,"r"); Read_blocks(ep,bd,ip,Sname,K); fclose(ep); */ /* Read_regions(Rname,width,Sname,K,From,To); */ /* get all alignments */ get_all(Rname,width,Sname,From,To); /* get sequence names, July 13, 1993 */ fp = ckopen(Rname,"r"); AK = 0; while (fscanf(fp,"%s",&fname) != EOF) { lav_fp = ckopen(fname,"r"); lav_read_init(lav_fp); for (j=0; j<2; ++j) { sname = get_seqname(&from, &to, 0); i = 0; while (i=AK) { ASname[i] = sname; ++AK; } } fclose(lav_fp); } fclose(fp); /* Compute StoA */ for (i=1; i<=K; ++i) { j = 0; while (j=AK) fatal("Alignment files are not enough."); } PD_ALLOC(AK); /* allocate space for cost matrices */ if (tflag) { printf("#:lat\n\n"); printf("d {\n \"YAMA2 output with parameters:\n"); } /* print out the aligning order */ order_print(order); /* if (cflag) printf(" score file: %s\n",cname); else printf(" default score file used\n"); */ data(); /* get input scoring scheme */ script_init(); /* initialize the scripts */ c = traverse_tree(tp); /* traverse the aligning tree */ /* get the align_score */ /* Notice that alignment score returned by traverse_tree may not be the same as align_score. This is beacause some quasi-gaps and end-gaps are not penalized in the alignment score. */ c = align_score(K-1,tp->N); if (tflag) { /* Lat output */ printf(" \"\n"); printf("}\n"); SDisplay(c); /* or lav output LDisplay(tp->AL,K-1,tp->N,c); */ } else { /* texture output */ printf("\n alignment score = %d\n\n",c); script2AL(tp->L, tp->R-tp->L, &tp->AL, tp->N); MDisplay(tp->AL,K-1,tp->N); free(tp->AL[0]); free(tp->AL); } exit(0); } /* order_print - print out the sequence order */ void order_print(char *order) { char *op, buf[80], flag; int LL = 60, IL = 54, i; int line_no=0; flag = TRUE; while (flag) { while (*order == ' ') ++order; i = 0; op = order; if (++line_no > 1) LL = IL; while ((*op++ != '\0') && (i++ < LL)); op--; if (*op == '\0') flag = FALSE; if (i > 0) { while (*op != ']' && *op !=')' && *op != '\0' && i-- > 0) op--; if (i <= 0) { flag = FALSE; if (line_no == 1) printf(" order=%s\n",order); else printf(" %s\n",order); printf("Last line is too long!\n"); } else { strncpy(buf,order,i); buf[i] = '\0'; if (line_no == 1) printf(" order=%s\n",buf); else printf(" %s\n",buf); } } order = ++op; } } /* build_tree - return the root of the aligning tree */ Tptr build_tree(char *order) { Tptr Stack[NUMBER], p; int i,j,k; int sp; char temp[20]; char *ckalloc(); char *tname[NUMBER+1]; /* sequence name */ int tfrom[NUMBER], tto[NUMBER], tK; char *get_seqname(); FILE *fp, *ckopen(); int strsame(); char fname[40]; if (eflag) { /* read end points */ fp = ckopen(Ename,"r"); lav_read_init(fp); tK = 0; while ((tname[tK] = get_seqname(&tfrom[tK], &tto[tK],0))) if (++tK > NUMBER) fatal("The maximum number of sequences is %d",NUMBER); fclose(fp); } sp = 0; K = 0; i = 0; while (order[i] != '\0') { switch (order[i]) { case ')': if (sp < 2) fatal("wrong input format!"); p = (Tptr)ckalloc(sizeof(Tnode)); p->right = Stack[--sp]; p->left = Stack[--sp]; p->L = (p->left)->L; p->R = (p->right)->R; Stack[sp++] = p; case '(': case ']': case ' ': case '\t': ++i; break; default: if (++K > NUMBER) fatal("The maximum number of sequences is %d",NUMBER); j = strcspn(order+i,"( )\0\t"); strncpy(fname,order+i,j); /* printf("fname\n",K,fname[K]); */ i += j; L[K] = get_seq(fname,&S[K],&Sname[K],&From[K],&To[K]); S[K]--; /* printf("K=%d, From=%d, To=%d\n",K,From[K],To[K]); fflush(stdout); */ p = (Tptr)ckalloc(sizeof(Tnode)); p->left = p->right = NULL; p->L = p->R = K; Stack[sp++] = p; break; } } if (sp != 1) fatal("wrong format!"); return Stack[0]; } /* traverse_tree - traverse the aligning tree */ int traverse_tree(Tptr tp) { int *ALS; int SL, i, l, r, n, tb; int D_NO, I_NO, R_NO; int c; int c1, c2; Tptr p,q; char *ckalloc(); p = tp->left; q = tp->right; if (p->L == p->R) { /* one to many */ /* p->AL = (char **) ckalloc((L[p->L]+1)*sizeof(char *)); p->AL[0] = (char *) ckalloc(L[p->L]+1); for (i=1; i<=L[p->L]; ++i) p->AL[i] = (p->AL[i-1])+1; StoAL(S[p->L],L[p->L],p->AL); */ p->N = L[p->L]; c1 = 0; } else c1 = traverse_tree(p); if (q->L == q->R) { /* many to one */ /* q->AL = (char **) ckalloc((L[q->L]+1)*sizeof(char *)); q->AL[0] = (char *) ckalloc(L[q->L]+1); for (i=1; i<=L[q->L]; ++i) q->AL[i] = (q->AL[i-1])+1; StoAL(S[q->L],L[q->L],q->AL); */ q->N = L[q->L]; c2 = 0; } else c2 = traverse_tree(q); /* printf("(%d %d)",tp->L,tp->R); */ ALS = (int *)ckalloc((p->N + q->N +1)*sizeof(int)); LB = (int *) ckalloc((p->N + 1) * sizeof(int)); RB = (int *) ckalloc((p->N + 1) * sizeof(int)); /* for (i=0; i<=p->N; ++i) { LB[i] = ((i*8 <= q->N)? i*8 : q->N); RB[i] = ((LB[i]+100 <= q->N)? LB[i]+100 : q->N); printf("i=%d, %d %d\n",i,LB[i],RB[i]); } RB[p->N] = q->N; */ /* printf("aligning %d %d\n",p->L,q->L); fflush(stdout); */ /* get region boundary lines */ /* lmost and rmost denote the leftmost and rightmost positions of the first alignment that should be aligned. */ LR(p->R - p->L, p->N, p->L, q->R - q->L, q->N, q->L, LB, RB, &lmost, &rmost); /* printf("LR done\n"); printf("lmost=%d, romst=%d, N=%d\n",lmost,rmost,p->N); printf("llb=%d, lrb=%d, rlb=%d, rrb=%d\n",LB[lmost],RB[lmost],LB[rmost],RB[rmost]); for (i=0; i<=p->N; ++i) printf("i = %d, %d %d\n",i,LB[i],RB[i]); fflush(stdout); */ c = YAMA2(p->L, p->R - p->L, p->N, q->L, q->R - q->L, q->N, ALS, &SL, &D_NO, &I_NO, &R_NO); /* printf("yama2 done\n"); printf("SL= %d\n",SL); printf("optimal score= %d\n",c); fflush(stdout); */ ALS2script(ALS, D_NO, I_NO, R_NO, p->L, p->R, q->L, q->R); n = tp->R - tp->L + 1; tp->N = SL; /* tp->AL = (char **)ckalloc((SL+1)*sizeof(char *)); tp->AL[0] = (char *)ckalloc(n*(SL+1)); for (i=1; i<=SL; ++i) tp->AL[i] = tp->AL[i-1] + n; MALStoMAL(p->AL,p->R - p->L, p->N, q->AL, q->R - q->L, q->N, ALS, SL, tp->AL); script2AL(tp->L, tp->R-tp->L, tp->AL); free(p->AL[0]); free(p->AL); free(q->AL[0]); free(q->AL); */ free(ALS); free(LB); free(RB); return c+c1+c2; } /* script_init - initialize sequence scripts */ int script_init() { int i; char *ckalloc(); for (i=1; i<=K; ++i) { Seq_Script[i] = (int *) ckalloc(sizeof(int)); Seq_Script[i][0] = L[i]; Entry_No[i] = 1; } } /* script2AL - sequence scripts to alignment array */ int script2AL(int l, int m, char ***al, int n) { int i, ci, si, j, k, i1; int c, *sp; char *sq; char **AL, *ckalloc(); /* fprintf(stderr, "Script2AL: l=%d, m=%d\n",l,m); */ /* allocate space for alignment array */ AL = *al = (char **)ckalloc((n+1)*sizeof(char *)); AL[0] = (char *)ckalloc((m+1)*(n+1)); for (i=1; i<=n; ++i) AL[i] = AL[i-1] + (m+1); /* translate sequence scripts to an alignment array */ for (j=0; j<=m; ++j) { AL[0][j] = DASH; ci = 1; si = 1; sp = Seq_Script[l+j]; sq = S[l+j]; for (k=0; k 0) { /* non-gap */ for (i=ci, i1=si; i 0) { /* insertion */ /* printf("op=%d, val=%d\n",op,val); fflush(stdout); */ if (i == OP_NO - 1) { /* last entry */ if (tp[tpi-1] > 0 ) tp[tpi++] = -op; else tp[tpi-1] = tp[tpi-1]-op; } else { if (sp[spi] > 0) { if (val == sp[spi]) { if (spi == 0) tp[tpi++] = -op; else tp[tpi-1] = tp[tpi-1] - op; } else { /* split */ tp[tpi++] = sp[spi] - val; tp[tpi++] = -op; sp[spi] = val; } } else { sp[spi] = sp[spi] - op; } } } else { /* deletion */ /* printf("op=%d, val=%d\n",op,val); fflush(stdout); */ t = -op; while (t > 0) { /* printf("t=%d, val=%d\n",t,val); fflush(stdout); */ if (t >= val) { t = t - val; tp[tpi++] = sp[spi++]; if (spi < en ) val = abs(sp[spi]); } else { val = val - t; t = 0; } } } } /* for (t=0; t 0 ) tp[tpi++] = op; else tp[tpi-1] = tp[tpi-1]+op; } else { if (sp[spi] > 0) { if (val == sp[spi]) { if (spi == 0) tp[tpi++] = op; else tp[tpi-1] = tp[tpi-1] + op; } else { /* split */ tp[tpi++] = sp[spi] - val; tp[tpi++] = op; sp[spi] = val; } } else { sp[spi] = sp[spi] + op; } } } else { /* insertion */ /* printf("op=%d, val=%d\n",op,val); fflush(stdout); */ t = op; while (t > 0) { if (t >= val) { t = t - val; tp[tpi++] = sp[spi++]; if (spi < en ) val = abs(sp[spi]); } else { val = val - t; t = 0; } } } } /* for (t=0; t 0) { col[i] = Encode[S[i1][SI[i]]]; } else { col[i] = DASH; } } } /* script2ei - Script to an end gap indicator */ int script2ei(int *EN, int *SI, char *ei, int num, int l) { int i, i1; for (i=0; i<=num; ++i) { i1 = l+i; if (SI[i] == 0) ei[i] = LSTAR; /* Column 0 */ else if (Seq_Script[i1][EN[i]] > 0) { ei[i] = DASH; } else { if (SI[i] == 1) ei[i] = LSTAR; else if (SI[i] > L[i1]) ei[i] = RSTAR; else ei[i] = DASH; } } } /* script2hei - Script to an end gap indicator (for horizontal move) */ int script2hei(int *EN, int *SI, char *hei, int num, int l) { int i, i1; for (i=0; i<=num; ++i) { i1 = l+i; if (SI[i] == 0) hei[i] = LSTAR; else if (Seq_Script[i1][EN[i]] > 0) { if (SI[i] == L[i1]) hei[i] = RSTAR; else hei[i] = DASH; } else { if (SI[i] == 1) hei[i] = LSTAR; else if (SI[i] > L[i1]) hei[i] = RSTAR; else hei[i] = DASH; } } } /* forward_script - move script one more column */ int forward_script(int *EN, int *LO, int *SI, int num, int l) { int i, i1; for (i=0; i<=num; ++i) { i1 = l+i; if ((Seq_Script[i1][EN[i]] > 0) || (SI[i] == 0)) { ++SI[i]; } if (--LO[i] <= 0) { /* no leftover */ ++EN[i]; LO[i] = abs(Seq_Script[i1][EN[i]]); } } } /* flocate_entry - given a column index, locate the forward script entry */ int flocate_entry(int *EN, int *LO, int *SI, int num, int l, int col_index, int n) { int i, i1; int en, value, lo, si, ci, t; for (i=0; i<=num; ++i) { i1 = l+i; if (col_index == 0) { /* column 0 */ SI[i] = 0; EN[i] = 0; LO[i] = abs(Seq_Script[i1][0])+1; } else { ci = 0; si = 0; en = 0; value = Seq_Script[i1][en]; lo = abs(value); while (ci+lo < col_index) { if (value > 0) si = si+value; ci = ci+lo; value = Seq_Script[i1][++en]; lo = abs(value); } t = col_index-ci; lo = lo-t+1; if (value > 0) si = si+t; else si = si+1; SI[i] = si; EN[i] = en; LO[i] = lo; } /* fprintf(stderr,"i=%d, SI[i]=%d\n",i,SI[i]); fprintf(stderr,"si=%d, en=%d, lo=%d\n",si,en,lo); */ } } /* backward_script - move script one more column backward */ int backward_script(int *EN, int *LO, int *SI, int num, int l) { int i, i1; for (i=0; i<=num; ++i) { i1 = l+i; if (--LO[i] <= 0) { /* no leftover */ /* printf("en=%d\n",EN[i]); fflush(stdout); */ if (EN[i]>0) { --EN[i]; if (Seq_Script[i1][EN[i]] > 0) { --SI[i]; } } else { SI[i] = 0; /* printf("EN=0, i=%d, num=%d\n",i,num); */ } LO[i] = abs(Seq_Script[i1][EN[i]]); } else { if (Seq_Script[i1][EN[i]] > 0) { --SI[i]; } } } } /* locate_entry - given a column index, locate the script entry */ int locate_entry(int *EN, int *LO, int *SI, int num, int l, int col_index, int n) { int i, i1; int en, value, lo, si, ci, t; for (i=0; i<=num; ++i) { i1 = l+i; if (col_index == 0) { /* column 0 */ SI[i] = 0; EN[i] = 0; LO[i] = 0; } else { ci = n; si = L[i1]+1; en = Entry_No[i1]-1; value = Seq_Script[i1][en]; lo = abs(value); while (ci-lo >= col_index) { if (value > 0) si = si-value; ci = ci-lo; value = Seq_Script[i1][--en]; lo = abs(value); } t = ci - col_index; lo = lo - t; if (value > 0) si = si-t-1; SI[i] = si; EN[i] = en; LO[i] = lo; } /* printf("i=%d, SI[i]=%d\n",i,SI[i]); printf("si=%d, en=%d, lo=%d\n",si,en,lo); fflush(stdout); */ } } #define COST \ { DAG(DASH) = 0; \ DAG(SYMA) = m; \ DAG(SYMC) = m; \ DAG(SYMG) = m; \ DAG(SYMT) = m; \ SUB(DASH,SYMA) = -GE; \ SUB(DASH,SYMC) = -GE; \ SUB(DASH,SYMG) = -GE; \ SUB(DASH,SYMT) = -GE; \ SUB(SYMA,SYMC) = v; \ SUB(SYMA,SYMG) = v; \ SUB(SYMA,SYMT) = v; \ SUB(SYMC,SYMG) = v; \ SUB(SYMC,SYMT) = v; \ SUB(SYMG,SYMT) = v; \ \ DT([0][0][0][0]) = 0; /* -- : -- */ \ DT([0][0][0][1]) = G; /* -- : -x */ \ DT([0][0][1][0]) = G; /* -x : -- */ \ DT([0][0][1][1]) = 0; /* -x : -x */ \ DT([0][1][0][0]) = 0; /* -- : x- */ \ DT([0][1][0][1]) = 0; /* -- : xx */ \ DT([0][1][1][0]) = G; /* -x : x- */ \ DT([0][1][1][1]) = 0; /* -x : xx */ \ DT([1][0][0][0]) = 0; /* x- : -- */ \ DT([1][0][0][1]) = G; /* x- : -x */ \ DT([1][0][1][0]) = 0; /* xx : -- */ \ DT([1][0][1][1]) = 0; /* xx : -x */ \ DT([1][1][0][0]) = 0; /* x- : x- */ \ DT([1][1][0][1]) = G; /* x- : xx */ \ DT([1][1][1][0]) = G; /* xx : x- */ \ DT([1][1][1][1]) = 0; /* xx : xx */ \ \ \ /* Left end gaps */ \ DAG(LSTAR) = 0; \ SUBI(LSTAR,SYMA) = -LGGEI; \ SUBI(LSTAR,SYMC) = -LGGEI; \ SUBI(LSTAR,SYMG) = -LGGEI; \ SUBI(LSTAR,SYMT) = -LGGEI; \ SUBJ(LSTAR,SYMA) = -LGGEJ; \ SUBJ(LSTAR,SYMC) = -LGGEJ; \ SUBJ(LSTAR,SYMG) = -LGGEJ; \ SUBJ(LSTAR,SYMT) = -LGGEJ; \ SUB(LSTAR,DASH) = 0; \ \ DT([0][0][0][2]) = 0; /* -- : -* */ \ DTI([0][0][1][2]) = LGGJ; /* -x : -* */ \ DTJ([0][0][1][2]) = LGGI; /* -x : -* */ \ DT([0][0][2][0]) = 0; /* -* : -- */ \ DTI([0][0][2][1]) = LGGI; /* -* : -x */ \ DTJ([0][0][2][1]) = LGGJ; /* -* : -x */ \ DT([0][0][2][2]) = 0; /* -* : -* */ \ DT([0][1][0][2]) = 0; /* -- : x* */ \ DTI([0][1][1][2]) = LGGJ; /* -x : x* */ \ DTJ([0][1][1][2]) = LGGI; /* -x : x* */ \ DT([0][1][2][0]) = 0; /* -* : x- */ \ DT([0][1][2][1]) = 0; /* -* : xx */ \ DT([0][1][2][2]) = 0; /* -* : x* */ \ DT([1][0][0][2]) = 0; /* x- : -* */ \ DT([1][0][1][2]) = 0; /* xx : -* */ \ DT([1][0][2][0]) = 0; /* x* : -- */ \ DTI([1][0][2][1]) = LGGI; /* x* : -x */ \ DTJ([1][0][2][1]) = LGGJ; /* x* : -x */ \ DT([1][0][2][2]) = 0; /* x* : -* */ \ DT([1][1][0][2]) = 0; /* x- : x* */ \ DTI([1][1][1][2]) = LGGJ; /* xx : x* */ \ DTJ([1][1][1][2]) = LGGI; /* xx : x* */ \ DT([1][1][2][0]) = 0; /* x* : x- */ \ DTI([1][1][2][1]) = LGGI; /* x* : xx */ \ DTJ([1][1][2][1]) = LGGJ; /* x* : xx */ \ DT([1][1][2][2]) = 0; /* x* : x* */ \ \ /* Right end gaps */ \ DAG(RSTAR) = 0; \ SUBI(RSTAR,SYMA) = -RGGEI; \ SUBI(RSTAR,SYMC) = -RGGEI; \ SUBI(RSTAR,SYMG) = -RGGEI; \ SUBI(RSTAR,SYMT) = -RGGEI; \ SUBJ(RSTAR,SYMA) = -RGGEJ; \ SUBJ(RSTAR,SYMC) = -RGGEJ; \ SUBJ(RSTAR,SYMG) = -RGGEJ; \ SUBJ(RSTAR,SYMT) = -RGGEJ; \ SUB(RSTAR,DASH) = 0; \ \ DT([0][0][0][3]) = 0; /* -- : -* */ \ DTI([0][0][1][3]) = RGGJ; /* -x : -* */ \ DTJ([0][0][1][3]) = RGGI; /* -x : -* */ \ DT([0][0][3][0]) = 0; /* -* : -- */ \ DTI([0][0][3][1]) = RGGI; /* -* : -x */ \ DTJ([0][0][3][1]) = RGGJ; /* -* : -x */ \ DT([0][0][3][3]) = 0; /* -* : -* */ \ DT([0][1][0][3]) = 0; /* -- : x* */ \ DTI([0][1][1][3]) = RGGJ; /* -x : x* */ \ DTJ([0][1][1][3]) = RGGI; /* -x : x* */ \ DT([0][1][3][0]) = 0; /* -* : x- */ \ DT([0][1][3][1]) = 0; /* -* : xx */ \ DT([0][1][3][3]) = 0; /* -* : x* */ \ DT([1][0][0][3]) = 0; /* x- : -* */ \ DT([1][0][1][3]) = 0; /* xx : -* */ \ DT([1][0][3][0]) = 0; /* x* : -- */ \ DTI([1][0][3][1]) = RGGI; /* x* : -x */ \ DTJ([1][0][3][1]) = RGGJ; /* x* : -x */ \ DT([1][0][3][3]) = 0; /* x* : -* */ \ DT([1][1][0][3]) = 0; /* x- : x* */ \ DTI([1][1][1][3]) = RGGJ; /* xx : x* */ \ DTJ([1][1][1][3]) = RGGI; /* xx : x* */ \ DT([1][1][3][0]) = 0; /* x* : x- */ \ DTI([1][1][3][1]) = RGGI; /* x* : xx */ \ DTJ([1][1][3][1]) = RGGJ; /* x* : xx */ \ DT([1][1][3][3]) = 0; /* x* : x* */ \ \ SUB(LSTAR,RSTAR) = 0; \ DT([0][0][2][3]) = 0; /* -* : -@ */ \ DT([0][0][3][2]) = 0; \ DT([0][1][2][3]) = 0; \ DT([0][1][3][2]) = 0; \ DT([1][0][2][3]) = 0; \ DT([1][0][3][2]) = 0; \ DT([1][1][2][3]) = 0; \ DT([1][1][3][2]) = 0; \ } /* data : read input scoring scheme */ void data () { auto int i, j; auto int m, v; auto int G, GE; /* gap cost */ auto int LGGI, LGGJ, LGGEI, LGGEJ; /* end gap cost */ auto int RGGI, RGGJ, RGGEI, RGGEJ; /* end gap cost */ auto char first_time; for (i=0; i 0) { ++j; ++k; alk = AL[k]; for (I=0; I<=M1; ++I) alk[I]=DASH; alt = AL2[j]; for (I=0; I<=M2; ++I) alk[M1+1+I]=alt[I]; op--; } else { ++i; ++k; alk = AL[k]; alt = AL1[i]; for (I=0; I<=M1; ++I) alk[I]=alt[I]; for (I=0; I<=M2; ++I) alk[M1+1+I]=DASH; op++; } } } } /* MDisplay - display a multiple alignment */ void MDisplay(char **AL, int M, int N) { int i,j,k; int POS[NUMBER]; for (k=0; k<=M; ++k) POS[k] = From[k+1]; for (i=1; i<=N; i=i+LL) { printf(" "); for (j=i+1; j<=N+1 && j=1 && AL2[j][i]==DASH) { VE2[j][i] = E2[j][i] = RSTAR; --j; } VE2[j][i] = RSTAR; } /* Compute projection cost within AL1 */ /* Compute C1[lmost+1] ... C1[rmost] */ /* c1 = 0; flocate_entry(EN,LO,SI,M1,L1,lmost,N1); script2col(EN,SI,pal1,M1,L1); forward_script(EN,LO,SI,M1,L1); for (i=lmost+1; i<=rmost; ++i) { t = 0; script2col(EN,SI,cal1,M1,L1); script2ei(EN,SI,e1, M1,L1); for (I=0; I 0) { /* starting with some deletions */ DEL(lmost); init_node = VNODE; } else { init_node = XNODE; } end_node = XNODE; LB[lmost] = 0; /* Make sure that it's normalized. */ RB[rmost] = N2; /* Compute an optimal path from init_node at (lmost,0) to end_node at (rmost,N2) */ c = yama2(lmost,0,init_node,rmost,N2,end_node); if (rmost < N1) DEL(N1-rmost); /* finished with some deletions */ /* fprintf(stderr,"c=%d, c1=%d, c2=%d\n",c,c1,c2); */ *SL = sl; *D_NO = d_no; *I_NO = i_no; *R_NO = r_no; /* free(C1+lmost); free(C2); */ free(RH); free(RV); free(RD); free(E2[0]); free(E2); free(VE2[0]); free(VE2); free(HP); free(VP); free(DP); free(HN); free(VN); free(DN); free(PP[0]+lmost); free(PP[1]+lmost); free(PP[2]+lmost); free(PN[0]+lmost); free(PN[1]+lmost); free(PN[2]+lmost); free(RI+lmost); if (AL1) { free(AL1[0]); free(AL1); } if (AL2) { free(AL2[0]); free(AL2); } free(EN); free(SI); free(LO); free(pal1); free(cal1); free(e1); free(he1); return c; } /* yama2 - return an optimal alignment of two alignments (within a region) */ int yama2(int I1, int J1, char Node1, int I2, int J2, char Node2) { int i, j, I, J; int h, v, d, t, c; int hg, vg, dg; char *cal2, *pal2; char ci, cj, pi, pj, ei, ej; int midi, midj; int midc, score; char *e2, *ve2; int lb, rb; int i1, j1, i2, j2, b, b1, b2; int cmid, nmid, pmid; char flag, hflag, vflag, dflag; int hp, vp, dp, tp; char hn, vn, dn, tn, n1, n2; int l_save, r_save; /* printf("I1=%d, J1=%d, Node1=%d, I2=%d, J2=%d, Node2=%d\n",I1,J1,Node1,I2,J2,Node2); printf("i=%d, %d %d\n",i,LB[i],RB[i]); for (i=I1; i<=I2; ++i) if (LB[i]>RB[i]) { printf("Boundary error.\n"); printf("i=%d, %d %d\n",i,LB[i],RB[i]); fflush(stdout); exit(0); } fflush(stdout); */ /* Backward computation */ /* Initialize the last row */ if (Node2 == XNODE) h = v = d = RH[J2] = RV[J2] = RD[J2] = 0; else { h = v = d = RH[J2] = RV[J2] = RD[J2] = MININT; if (Node2 == HNODE) h = RH[J2] = 0; else if (Node2 == VNODE) RV[J2] = 0; else RD[J2] = 0; } RI[I2+J2] = I2; cmid = J2; if (I1 < I2) nmid = (LB[I2-1] + RB[I2-1] + 1)/2; else nmid = J1; /* initialize script pointers */ locate_entry(EN,LO,SI,M1,L1,I2,N1); script2col(EN,SI,cal1,M1,L1); script2hei(EN,SI,he1,M1,L1); /* he1 = HE1[I2]; */ cal2 = AL2[J2]; for (j=J2-1; j>=LB[I2]; --j) { pal2 = AL2[j]; /* c = C2[j+1]; */ c = 0; e2 = E2[j+1]; for (I=0; I<=M2; ++I) { cj = ET(cal2[I],e2[I]); for (J=0; J<=M1; ++J) c += PD(StoA[L1+J],StoA[L2+I],he1[J],cj); } d = v = h += c; /* for (I=0; I= nmid) { /* (I2,j+1) is a partition point */ HP[j] = VP[j] = DP[j] = I2+j+1; HN[j] = VN[j] = DN[j] = HNODE; } else { HP[j] = VP[j] = DP[j] = HP[j+1]; HN[j] = VN[j] = DN[j] = HN[j+1]; } if (j >= nmid) { /* (I2,j) is a partition point */ b = I2+j; RI[b] = I2; PP[DNODE][b] = PP[VNODE][b] = PP[HNODE][b] = b+1; PN[DNODE][b] = PN[VNODE][b] = PN[HNODE][b] = HNODE; } /* printf("i=%d, j=%d, d= %d, v=%d, h=%d\n",I2,j,d,v,h); */ } for (j=LB[I2]-1; j>=J1; --j) { RH[j] = h+MININT; RD[j] = d+MININT; RV[j] = v+MININT; } /* Compute dynamic programming matrix */ for (i=I2-1; i>=I1; --i) { /* The order of the procedure calls is important. */ script2ei(EN,SI,e1,M1,L1); backward_script(EN,LO,SI,M1,L1); script2col(EN,SI,pal1,M1,L1); script2hei(EN,SI,he1,M1,L1); /* e1 = E1[i+1]; pal1 = AL1[i]; for (I=0; I<=M1; ++I) if (pal1[I] != tt[I]) { printf("L1=%d, M1=%d, i=%d, I=%d\n",L1,M1,i,I); printf("SI=%d, LO=%d, EN=%d, Seq_Script=%d %d\n",SI[I], LO[I], EN[I], Seq_Script[L1+I][EN[I]+1], Seq_Script[L1+I][EN[I]]); printf("pal1=%c, tt=%c\n",pal1[I], tt[I]); printf("AL1[i][0]=%c, AL1[i+1][0]=%c\n",AL1[i][0],AL1[i+1][0]); } he1 = HE1[i]; */ pal2 = AL2[RB[i]]; lb = LB[i]; rb = RB[i]; pmid = cmid; cmid = nmid; if (i > I1) nmid = (LB[i-1]+RB[i-1] + 1)/2; else nmid = J1; /* Initialize the last column */ /* c = C1[i+1]; */ c = 0; ve2 = VE2[rb]; for (I=0; I<=M1; ++I) { ci = ET(cal1[I],e1[I]); for (J=0; J<=M2; ++J) c += PD(StoA[L1+I],StoA[L2+J],ci,ve2[J]); } d = h = v = RV[rb] + c; /* for (I=0; I=nmid && rb<=cmid) || (rb>cmid && rb<=min(LB[i+1], RB[i]))) { flag = TRUE; b = i+rb; RI[b] = i; } else flag = FALSE; if ((rb>=cmid && rb<=pmid) || (i+1pmid && rb<=min(LB[i+2], RB[i+1]))) { vflag = TRUE; } else vflag = FALSE; if (vflag) { /* (i+1, rb) is a partition point */ hp = vp = dp = i+rb+1; hn = vn = dn = VNODE; } else { hp = vp = dp = VP[rb]; hn = vn = dn = VN[rb]; } if (rb < RB[i+1]) { /* diagonal edge should be considered. */ cal2 = AL2[rb+1]; pal2 = AL2[rb]; e2 = E2[rb+1]; /* Compute RD, RH, RV affected by the diagonal edge */ /* c = C1[i+1] + C2[rb+1]; */ c = 0; for (I=0; I<=M1; ++I) { ci = ET(cal1[I],e1[I]); for (J=0; J<=M2; ++J) { cj = ET(cal2[J],e2[J]); c += PD(StoA[L1+I],StoA[L2+J],ci,cj); } } hg = vg = dg = RD[rb+1]; for (I=0; I<=M1; ++I) { ETGV(ci,cal1[I],e1[I]) pi = pal1[I]; for (J=0; J<=M2; ++J) { ETGV(cj,cal2[J],e2[J]) pj = pal2[J]; hg -= PT[StoA[L1+I]][StoA[L2+J]][0][pj!=DASH][ci][cj]; vg -= PT[StoA[L1+I]][StoA[L2+J]][pi!=DASH][0][ci][cj]; dg -= PT[StoA[L1+I]][StoA[L2+J]][pi!=DASH][pj!=DASH][ci][cj]; } } /* for (I=0; I=cmid && rb+1<=pmid) || (i+1pmid && rb+1<=min(LB[i+2], RB[i+1]))) { dflag = TRUE; } else dflag = FALSE; if (dflag) { /* (i+1, rb+1) is a partition point. */ tp = i+rb+2; tn = DNODE; } else { tp = DP[rb+1]; tn = DN[rb+1]; } if (c+hg > h) { h = c+hg; hp = tp; hn = tn; } if (c+vg > v) { v = c+vg; vp = tp; vn = tn; } if (c+dg > d) { d = c+dg; dp = tp; dn = tn; } } if (flag) { /* save information in partition nodes */ PP[DNODE][b] = dp; PN[DNODE][b] = dn; PP[VNODE][b] = vp; PN[VNODE][b] = vn; PP[HNODE][b] = hp; PN[HNODE][b] = hn; } /* Compute the remaining columns */ cal2 = AL2[rb]; for (j=rb-1; j>=lb; --j) { pal2 = AL2[j]; e2 = E2[j+1]; ve2 = VE2[j]; hflag = flag; dflag = vflag; if ((j>=nmid && j<=cmid) || (j>cmid && j<=min(LB[i+1], RB[i]))) { flag = TRUE; b = i+j; RI[b] = i; } else flag = FALSE; if ((j>=cmid && j<=pmid) || (i+1pmid && j<=min(LB[i+2], RB[i+1]))) { vflag = TRUE; } else vflag = FALSE; /* Compute RD, RH, RV affected by the diagonal edge */ /* c = C1[i+1] + C2[j+1]; */ c = 0; for (I=0; I<=M1; ++I) { ci = ET(cal1[I],e1[I]); for (J=0; J<=M2; ++J) { cj = ET(cal2[J],e2[J]); c += PD(StoA[L1+I],StoA[L2+J],ci,cj); } } hg = vg = dg = RD[j+1]; for (I=0; I<=M1; ++I) { ETGV(ci,cal1[I],e1[I]) pi = pal1[I]; for (J=0; J<=M2; ++J) { ETGV(cj,cal2[J],e2[J]) pj = pal2[J]; hg -= PT[StoA[L1+I]][StoA[L2+J]][0][pj!=DASH][ci][cj]; vg -= PT[StoA[L1+I]][StoA[L2+J]][pi!=DASH][0][ci][cj]; dg -= PT[StoA[L1+I]][StoA[L2+J]][pi!=DASH][pj!=DASH][ci][cj]; } } /* for (I=0; I h) { h = c+hg; hp = tp; hn = tn; } if (c+vg > v) { v = c+vg; vp = tp; vn = tn; } if (c+dg > d) { d = c+dg; dp = tp; dn = tn; } /* Compute RD, RH, RV affected by the vertical edge */ /* c = C1[i+1]; */ c = 0; for (I=0; I<=M1; ++I) { ci = ET(cal1[I],e1[I]); for (J=0; J<=M2; ++J) c += PD(StoA[L1+I],StoA[L2+J],ci,ve2[J]); } hg = dg = vg = RV[j]; for (I=0; I<=M1; ++I) { /* ci = GV(cal1[I]); */ ETGV(ci,cal1[I],e1[I]) pi = pal1[I]; for (J=0; J<=M2; ++J) { ej = GV(ve2[J]); pj = pal2[J]; hg -= PT[StoA[L1+I]][StoA[L2+J]][0][pj!=DASH][ci][ej]; vg -= PT[StoA[L1+I]][StoA[L2+J]][pi!=DASH][0][ci][ej]; dg -= PT[StoA[L1+I]][StoA[L2+J]][pi!=DASH][pj!=DASH][ci][ej]; } } /* for (I=0; I h) { h = c+hg; hp = tp; hn = tn; } if (c+vg > v) { v = c+vg; vp = tp; vn = tn; } if (c+dg > d) { d = c+dg; dp = tp; dn = tn; } if (flag) { /* save information in partition nodes */ PP[DNODE][b] = dp; PN[DNODE][b] = dn; PP[VNODE][b] = vp; PN[VNODE][b] = vn; PP[HNODE][b] = hp; PN[HNODE][b] = hn; } /* printf("i=%d, j=%d, d=%d, h=%d, v=%d\n",i,j,d,h,v); */ cal2 = pal2; } tal1 = cal1; cal1 = pal1; pal1 = tal1; RH[lb] = h; RV[lb] = v; RD[lb] = d; HP[lb] = hp; VP[lb] = vp; DP[lb] = dp; HN[lb] = hn; VN[lb] = vn; DN[lb] = dn; } b = I2 + J2; i1 = I1; j1 = J1; b1 = I1 + J1; if (Node1 == XNODE) { if (d>=h && d>=v) n1 = DNODE; else if (h>=v) n1 = HNODE; else n1 = VNODE; } else n1 = Node1; while (b1 != b) { /* printf("b=%d, b1=%d\n",b,b1); fflush(stdout); */ b2 = PP[n1][b1]; i2 = RI[b2]; j2 = b2 - i2; n2 = PN[n1][b1]; /* printf("i1=%d, j1=%d, n1=%d, i2=%d, j2=%d, n2=%d\n",i1,j1,n1,i2,j2,n2); fflush(stdout); */ if (i1 == i2) { if (j2 > j1) INS(1) } else if (j1 == j2) DEL(1) else if (i1+1 == i2 && j1+1 == j2) { if (n2 == VNODE) { INS(1) DEL(1)} else if (n2 == HNODE) { DEL(1) INS(1) } else REP } else { l_save = LB[i2]; r_save = RB[i2]; if (j1 < (LB[i1]+RB[i1] + 1)/2) { for (i = i2; i > i1; --i) { nmid = (LB[i-1]+RB[i-1] + 1)/2; LB[i] = max(j1, LB[i]); RB[i] = nmid-1; } LB[i1] = j1; RB[i1] = j1; RB[i2] = j2; } else { for (i = i1; i < i2; ++i) { cmid = (LB[i]+RB[i] + 1)/2; LB[i] = max(cmid,LB[i+1])+1; RB[i] = min(j2, RB[i]); } LB[i1] = j1; LB[i2] = j2; RB[i2] = j2; } yama2(i1,j1,n1,i2,j2,n2); LB[i2] = l_save; RB[i2] = r_save; } i1 = i2; j1 = j2; n1 = n2; b1 = b2; } if (Node1 == HNODE) return h; else if (Node1 == DNODE) return d; else if (Node1 == VNODE) return v; else return max3(h,d,v); } /* align_score - return the score of the alignment AL */ int align_score(int M, int N) { int I, J, i, j; int score; char *cal, *pal, *tal; char ci, cj, pi, ti, tj; char *e1; int *EN, *SI, *LO; char *ckalloc(); cal = ckalloc(M+1); pal = ckalloc(M+1); e1 = ckalloc(M+1); j = (M1+1)*sizeof(int); EN = (int *) ckalloc(j); SI = (int *) ckalloc(j); LO = (int *) ckalloc(j); for (I=0; I<=M; ++I) { EN[I] = 0; SI[I] = 1; LO[I] = abs(Seq_Script[1+I][0]); pal[I] = DASH; } score = 0; for (j=1; j<=N; ++j) { script2col(EN,SI,cal,M,1); script2ei(EN,SI,e1, M,1); for (I=0; I=1 && AL[j][i]==DASH) { E1[j][i] = RSTAR; --j; } } score = 0; pal = AL[0]; for (j=1; j<=N; ++j) { e1 = E1[j]; cal = AL[j]; for (I=0; Iy) if (x>z) return x; else return z; else if (y>z) return y; else return z; }