/* sample program for invoking sub.c & AC.c */ /* This software proceeds as follows: Step 1. invoke ALL_SUB() to construct the DAG for a given threshold score CUT_OFF; Step 2. Construct() is called to build up the pattern matching machine; Step 3. DAG_pat_match() uses the built matching machine to find all pattern occurrences in the DAG; Step 4. A backward chaining process back_chain() to link the occurrences along the path in the DAG; Step 5. output the path in the DAG with the maximum pattern score. Note: If more than one s-t paths in the DAG have the maximum pattern score, we output the one with the highest alignment score. Usage: pat_AC seq1 seq2 ms(<0) g(>=0) h(>0) cut_off pat_file */ #include #include "sub.h" #include "AC.h" #define CNODE 0 #define DNODE 1 #define ENODE 2 /* Chain stores all patterns starts in row i */ typedef struct CHAIN_NODE { int j; int pat_no; NODE *np; struct CHAIN_NODE *link; } CHAIN_NODE; CHAIN_NODE **Chain; char *In_pat; /* denote if the aligned pair in some pattern */ int *S; /* alignment script */ int *pbs; /* pattern backward aligned score */ static void DAG_pat_match(), back_chain(), sort_list(), output_path(); static void pattern_bs(); main(int argc, char *argv[]) { double atof(); char *ckalloc(); FILE *qp, *ckopen(); char *seq1, *seq2; char *name1, *name2; int from1, to1, from2, to2; int M, N; /* lengths of sequences */ long low, up, ms, G, H, W[128][128], CUT_OFF; int *L, *R; int i, j; int c; if (argc != 8) fatalf("%s seq1 seq2 ms(<0) g(>=0) h(>0) cut_off pat_file",argv[0]); M = get_seq(argv[1], &seq1, &name1, &from1, &to1); N = get_seq(argv[2], &seq2, &name2, &from2, &to2); ms = DIGIT * atof(argv[3]); if (ms > 0) fatal("The mismatch weight should be nonpositive."); G = DIGIT * atof(argv[4]); if (G < 0) fatal("The gap-open penalty should be nonnegative."); H = DIGIT * atof(argv[5]); if (H < 0) fatal("The gap-extend penalty should be positive."); CUT_OFF = DIGIT * atof(argv[6]); /* set up match and mismatch weights */ for (i = 0; i < 128; i++) for (j = 0; j < 128; j++) if (i == j) W[i][j] = DIGIT; else W[i][j] = ms; printf("%s[%d,%d]: length = %d\n",name1,from1,to1,M); printf("%s[%d,%d]: length = %d\n",name2,from2,to2,N); printf("mismatch = %.1f, gap_open = %.1f, gap_ext = %.1f, cut_off = %.1f\n\n",ms/DIGIT,G/DIGIT,H/DIGIT,CUT_OFF/DIGIT); /* compute the DAG for a given threshold score CUT_OFF */ c = ALL_SUB(seq1-1,seq2-1,M,N,W,G,H,CUT_OFF); if (c == -1) { printf("No alignment is of score >= cut_off\n"); exit(1); } /* Construct an automaton for pat_file */ Construct(argv[7]); pattern_bs(); /* pattern matching along the DAG */ Chain = (CHAIN_NODE **) ckalloc((M+1)*sizeof(CHAIN_NODE *)); for (i=0; i<=M; ++i) Chain[i] = NULL; DAG_pat_match(M); /* backward computation to compute the best pattern score */ back_chain(M); printf("pattern score = %d %d %d\n", F[0]->cs, F[0]->ds, F[0]->es); printf("pattern aligned score = %d %d %d\n", F[0]->cbs, F[0]->dbs, F[0]->ebs); S = sapp = (int *)ckalloc((M+N)*sizeof(int)); In_pat = (char *)ckalloc((M+1)*sizeof(char)); for (i=0; i<=M; ++i) In_pat[i] = FALSE; /* A forward traverse starts from (0,0) to find the path with the best pattern score */ output_path(M, N); /* compute the alignment score of the output path (double check) */ c = CHECK_SCORE(seq1-1,seq2-1,M,N,S); printf("\nCheck_score=%.1f\n",((float)c)/DIGIT); /* Display the alignment */ DISPLAY(seq1-1,seq2-1,M,N,S,from1,from2); LDISPLAY(name1,from1,to1,name2,from2,to2,S,c); exit(0); } /* pattern_bs - compute the aligned score for each pattern */ void pattern_bs() { int i, score; char *ckalloc(), *p; pbs = (int *) ckalloc(P*sizeof(int)); for (i=0; iclink) && (AA[i+1] == BB[np1->j])) { /* a possible state transition */ state = np->state; while (gf[state][Encode[AA[i+1]]] == FAIL) state = fs[state]; state = gf[state][Encode[AA[i+1]]]; /* save the state in grid point np1 */ np1->state = state; if (s_output[state]) { /* pattern found */ sp = s_output[state]; /* printf("In position %d %d:",i+1,np1->j); */ for (sp=s_output[state]; sp; sp=sp->link) { /* add the pattern to the chain */ /* printf(" %d", sp->pat_no); */ cp = (CHAIN_NODE *) ckalloc(sizeof(CHAIN_NODE)); cp->pat_no = sp->pat_no; cp->j = np1->j - pat_len[sp->pat_no]; cp->np = np1; ti = i + 1 - pat_len[sp->pat_no]; cp->link = Chain[ti]; Chain[ti] = cp; } /* printf("\n"); */ } } np = np->link; } } /* sort the Chain for each row except the last one */ for (i=0; ilink) { for (cp1=cp->link; cp1; cp1=cp1->link) { if (cp1->j > cp->j) { /* swap */ j = cp->j; pat_no = cp->pat_no; np = cp->np; cp->j = cp1->j; cp->pat_no = cp1->pat_no; cp->np = cp1->np; cp1->j = j; cp1->pat_no = pat_no; cp1->np = np; } } } } /* back_chain - a backward traverse of the DAG to chain up the patterns */ void back_chain(int M) { int i; NODE *np, *np1; CHAIN_NODE *cp; int cs, ds, es, t, bs, tbs; NODE *cnext, *dnext, *enext; /* backward computation to compute the best pattern chain score */ /* Tie breaking rule: if more than one paths result the maximum pattern score, we take the one with the highest alignment score */ /* Initialize the last row */ for (np=LastF[M]; np; np=np->blink) { np->cs = 0; np->ds = 0; np->es = 0; np->cnext = LastF[M]; np->dnext = LastF[M]; np->enext = LastF[M]; np->cbs = np->r; np->dbs = np->s; np->ebs = np->t; } /* backward computation */ for (i=M-1; i>=0; --i) { cp = Chain[i]; for (np=LastF[i]; np; np=np->blink) { /* Compute np->cs */ bs = MININT; cs = 0; np->pat_no = -1; if ((np1 = np->dlink) && (np->c + np1->s - m >= cut)) { /* edge from c node to d node */ if ((np1->ds > cs) || ((np1->ds == cs) && (np1->dbs - m >= bs))) { cs = np1->ds; cnext = np1; bs = np1->dbs - m; } } if ((np1 = np->elink) && (np->c + np1->t - m >= cut)) { /* edge from c node to e node */ if ((np1->es > cs) || ((np1->es == cs) && (np1->ebs - m >= bs))) { cs = np1->es; cnext = np1; bs = np1->ebs - m; } } if (np1 = np->clink) { /* substitution edge is in some suboptimal path */ if ((np1->cs > cs) || ((np1->cs == cs) && (np1->cbs + w[AA[i+1]][BB[np1->j]] >= bs))) { cs = np1->cs; cnext = np1; bs = np1->cbs + w[AA[i+1]][BB[np1->j]]; } } while (cp && cp->j == np->j) { /* possible jumps with patterns */ /* printf("position %d %d: %d\n",i,cp->j, cp->pat_no); */ t = pat_score[cp->pat_no] + cp->np->cs; if ((t > cs) || ((t == cs) && (cp->np->cbs + pbs[cp->pat_no] >= bs))) { cs = t; cnext = cp->np; np->pat_no = cp->pat_no; bs = cp->np->cbs + pbs[cp->pat_no]; } cp = cp->link; } np->cbs = bs; /* Compute np->ds */ if (np->d + np->r >= cut) { ds = cs; dnext = cnext; tbs = bs; } else { ds = 0; tbs = MININT; } if ((np1 = np->dlink) && (np->d + np1->s - h >= cut)) { /* edge from d node to d node */ if ((np1->ds > ds) || ((np1->ds == ds) && (np1->dbs - h > tbs))) { ds = np1->ds; dnext = np1; tbs = np1->dbs - h; } } np->dbs = tbs; /* Compute np->es */ if (np->e + np->r >= cut) { es = cs; enext = cnext; tbs = bs; } else { es = 0; tbs = MININT; } if ((np1 = np->elink) && (np->e + np1->t - h >= cut)) { /* edge from e node to e node */ if ((np1->es > es) || ((np1->es == es) && (np1->ebs - h > tbs))) { es = np1->es; enext = np1; tbs = np1->ebs - h; } } np->ebs = tbs; np->cs = cs; np->ds = ds; np->es = es; np->cnext = cnext; np->dnext = dnext; np->enext = enext; } } } /* output_path - output a path with the maximum pattern score */ void output_path(int M, int N) { NODE *np, *np1; char type; int t; /* start from c node at (0,0) */ np = F[0]; type = CNODE; while ((np->i < M) || (np->j < N)) { if (type == CNODE) np1 = np->cnext; else if (type == DNODE) np1 = np->dnext; else np1 = np->enext; /* printf("np: i=%d, j=%d\n",np->i,np->j); printf("np1: i=%d, j=%d\n",np1->i,np1->j); */ if (np->i == np1->i) { /* -> e node */ t = np1->j - np->j; /* printf("insert %d\n",t); */ INS(t) type = ENODE; } else if (np->j == np1->j) { /* -> d node */ t = np1->i - np->i; /* printf("delete %d\n",t); */ DEL(t) type = DNODE; } else { /* -> c node */ /* printf("replace %d\n",np1->i-np->i); */ for (t=np->i; ti; ++t) REP type = CNODE; if (np->pat_no >= 0) for (t=np->i+1; t<=np1->i; ++t) In_pat[t] = TRUE; } np = np1; } } /* Alignment display routines */ #define DLENGTH 60 static char ALINE[DLENGTH+1], BLINE[DLENGTH+1], CLINE[DLENGTH+1]; int DISPLAY(char A[], char B[], int M, int N, int *S, int AP, int BP) { register char *a, *b, *c; register int i, j, op; int lines, ap, bp; i = j = op = lines = 0; ap = AP; bp = BP; a = ALINE; b = BLINE; c = CLINE; while (i < M || j < N) { if (op == 0 && *S == 0) { op = *S++; *a = A[++i]; *b = B[++j]; /* *c++ = (*a++ == *b++) ? '|' : ' '; */ if (*a++ == *b++) { if (In_pat[i]) *c++ = '*'; else *c++ = '|'; } else *c++ = ' '; } else { if (op == 0) op = *S++; if (op > 0) { *a++ = ' '; *b++ = B[++j]; op--; } else { *a++ = A[++i]; *b++ = ' '; op++; } *c++ = '-'; } if (a >= ALINE+DLENGTH || i >= M && j >= N) { *a = *b = *c = '\0'; printf("\n%5d ",DLENGTH*lines++); for (b = ALINE+10; b <= a; b += 10) printf(" . :"); if (b <= a+5) printf(" ."); printf("\n%5d %s\n %s\n%5d %s\n",ap,ALINE,CLINE,bp,BLINE); ap = AP + i; bp = BP + j; a = ALINE; b = BLINE; c = CLINE; } } } /* LDISPLAY - display pairwise alignment in "lat" format */ int LDISPLAY(char *name1, int from1, int to1, char *name2, int from2, int to2, int *S, int c) { FILE *fp, *ckopen(); int i, j, op, count; fp = ckopen("lat.out","w"); fprintf(fp,"#:lav\n\n"); fprintf(fp,"d { \" \"\n}\n"); fprintf(fp,"s {\n \"%s\" %d %d\n \"%s\" %d %d\n}\n",name1,from1,to1,name2,from2,to2); fprintf(fp,"a {\n s %d\n b %d %d\n e %d %d\n",c,1,1,to1-from1+1,to2-from2+1); i = 1; j = 1; count = 0; while (i<=to1-from1+1 || j<=to2-from2+1) { op = *S++; if (op == 0) { ++i; ++j; ++count; } else { if (count > 0) { fprintf(fp," l %d %d %d %d %.1f\n",i-count,j-count,i-1,j-1,0.0); count = 0; } if (op > 0) { fprintf(fp," l %d %d %d %d %.1f\n",i,j,i-1,j+op-1,0.0); j += op; } else { fprintf(fp," l %d %d %d %d %.1f\n",i,j,i-op-1,j-1,0.0); i -= op; } } } if (count > 0) { /* output the last block */ fprintf(fp," l %d %d %d %d %.1f\n",i-count,j-count,i-1,j-1,0.0); count = 0; } fprintf(fp,"}\n"); fclose(fp); } /* CHECK_SCORE - return the score of the alignment stored in S */ int CHECK_SCORE(char A[], char B[], int M, int N, int S[]) { register long i, j, op; long score; score = i = j = op = 0; while (i < M || j < N) { op = *S++; /* printf("op=%d\n",op); */ if (op == 0) score = w[A[++i]][B[++j]] + score; else if (op > 0) { score = score - (g+op*h); j = j+op; } else { score = score - (g-op*h); i = i-op; } } return(score); }