/* A PACKAGE FOR SEQUENCE COMPARISON WITH AFFINE WEIGHTS AND UNIQUENESS BITS */ /* Globally passed params and macros */ #include #define DIGIT 10.0 #define MININT -999999 #define max(x,y) ((x) >= (y) ? (x) : (y)) static char *AA, *BB; static long (*w)[128]; /* w = W */ static long g, h, m; /* g = G, h = H, m = g+h */ #define gap(k) ((k) <= 0 ? 0 : g+h*(k)) /* k-symbol indel cost */ static int *sapp; /* Current script append ptr */ static int last; /* Last script op appended */ /* Append "Delete k" op */ #define DEL(k) \ { if (last < 0) \ last = sapp[-1] -= (k); \ else \ last = *sapp++ = -(k); \ } /* Append "Insert k" op */ #define INS(k) \ { if (last < 0) \ { sapp[-1] = (k); *sapp++ = last; } \ else \ last = *sapp++ = (k); \ } #define REP { last = *sapp++ = 0; } /* Append "Replace" op */ static long *CC, *DD; /* Forward cost-only vectors */ static long *RR, *SS; /* Reverse cost-only vectors */ static long *ICC, *IDD; /* Forward cost-only initial vectors */ static long *IRR, *ISS; /* Reverse cost-only initial vectors */ static long *VC, *VE; /* Forward cost-only vertical vectors */ static long *VR, *VT; /* Reverse cost-only vertical vectors */ static char *UC, *UR; /* Uniqueness bits vectors(col & row) */ static long opt; /* align(A,B,M,N,tb,te) returns the cost of an optimum conversion between A[1..M] and B[1..N] that begins(ends) with a delete if tb(te) is zero and appends such a conversion to the current script. */ static long align(A,B,M,N,tb,te) char *A, *B; int M, N; long tb, te; { int midi, midj, type; /* Midpoint, type, and cost */ long midc; long c1, c2; { register int i, j; register long c, e, d, s; long t, *wa; /* Boundary cases: M <= 1 or N == 0 */ if (N <= 0) { if (M > 0) DEL(M) return -gap(M); } if (M <= 1) { if (M <= 0) { INS(N); return -gap(N); } if (tb < te) tb = te; midc = (tb-h) - gap(N); midj = 0; wa = w[A[1]]; for (j = 1; j <= N; j++) { c = -gap(j-1) + wa[B[j]] - gap(N-j); if (c > midc) { midc = c; midj = j; } } if (midj == 0) { INS(N) DEL(1) } else { if (midj > 1) INS(midj-1) REP if (midj < N) INS(N-midj) } return midc; } /* Divide: Find optimum midpoint (midi,midj) of cost midc */ midi = M/2; /* Forward phase: */ CC[0] = 0; /* Compute C(M/2,k) & D(M/2,k) for all k */ t = -g; for (j = 1; j <= N; j++) { CC[j] = t = t-h; DD[j] = t-g; } t = tb; for (i = 1; i <= midi; i++) { s = CC[0]; CC[0] = c = t = t-h; e = t-g; wa = w[A[i]]; for (j = 1; j <= N; j++) { if ((c = c - m) > (e = e - h)) e = c; if ((c = CC[j] - m) > (d = DD[j] - h)) d = c; c = s + wa[B[j]]; if (e > c) c = e; if (d > c) c = d; s = CC[j]; CC[j] = c; DD[j] = d; } } DD[0] = CC[0]; RR[N] = 0; /* Reverse phase: */ t = -g; /* Compute R(M/2,k) & S(M/2,k) for all k */ for (j = N-1; j >= 0; j--) { RR[j] = t = t-h; SS[j] = t-g; } t = te; for (i = M-1; i >= midi; i--) { s = RR[N]; RR[N] = c = t = t-h; e = t-g; wa = w[A[i+1]]; for (j = N-1; j >= 0; j--) { if ((c = c - m) > (e = e - h)) e = c; if ((c = RR[j] - m) > (d = SS[j] - h)) d = c; c = s + wa[B[j+1]]; if (e > c) c = e; if (d > c) c = d; s = RR[j]; RR[j] = c; SS[j] = d; } } SS[N] = RR[N]; midc = CC[0]+RR[0]; /* Find optimal midpoint */ midj = 0; type = 1; for (j = 0; j <= N; j++) if ((c = CC[j] + RR[j]) >= midc) if (c > midc || CC[j] != DD[j] && RR[j] == SS[j]) { midc = c; midj = j; } for (j = N; j >= 0; j--) if ((c = DD[j] + SS[j] + g) > midc) { midc = c; midj = j; type = 2; } } /* Conquer: recursively around midpoint */ if (type == 1) { c1 = align(A,B,midi,midj,tb,-g); c2 = align(A+midi,B+midj,M-midi,N-midj,-g,te); } else { align(A,B,midi-1,midj,tb,0); DEL(2); align(A+midi+1,B+midj,M-midi-1,N-midj,0,te); } return midc; } /* unibits - compute uniqueness bits */ long unibits(I1,J1,I2,J2) int I1, J1, I2, J2; { int midi, midi1, midj; /* Midpoint and cost */ long tc, td, tr, ts; long opt_num; char type; { register int i, j, M, N, midj1; register long c, e, d, s; long t, *wa; char b; /* printf("I1=%d, J1=%d, I2=%d, J2=%d\n",I1,J1,I2,J2); fflush(stdout); */ M = I2 - I1; N = J2 - J1; /* boundary cases: M <= 1 or N == 0 */ if (N <= 0) { for (i = I1+1; i <= I2; ++i) UR[i] = 1; return; } if (M <= 1) { if (M <= 0) { return; } opt_num = 0; midj1 = -1; if ((IDD[J1]+ISS[J1]-h == opt) || (ICC[J1]+ISS[J1]-m == opt)) { ++opt_num; midj1 = midj = J1; } wa = w[AA[I2]]; for (j = J1+1; j <= J2; ++j) { if (ICC[j-1]+IRR[j]+wa[BB[j]] == opt) { ++opt_num; if (midj1 == -1) midj1 = j-1; midj = j; } if ((IDD[j]+ISS[j]-h == opt) || (ICC[j]+ISS[j]-m == opt)) { ++opt_num; if (midj1 == -1) midj1 = j; midj = j; } } if (opt_num <= 1) UR[I2] = 1; for (j = midj1+1; j <= midj; ++j) UC[j] = 0; return; } /* Initialize CC, DD, RR, SS */ for (j = J1; j <= J2; ++j) { CC[j] = ICC[j]; DD[j] = IDD[j]; RR[j] = IRR[j]; SS[j] = ISS[j]; } /* Divide: Find optimum midpoint (midi,midj) of cost midc */ midi = (I1+I2)/2; /* Phase 1 : horozontal computation */ /* Find the rightmost crossing edge */ /* Forward computation */ t = max(CC[J1]-g,DD[J1]); for (i = I1+1; i <= midi; ++i) { s = CC[J1]; CC[J1] = c = t = t - h; e = t-g; wa = w[AA[i]]; for (j = J1+1; j <= J2; ++j) { if ((c = c - m) > (e = e - h)) e = c; if ((c = CC[j] - m) > (d = DD[j] - h)) d = c; c = s + wa[BB[j]]; if (e > c) c = e; if (d > c) c = d; s = CC[j]; CC[j] = c; DD[j] = d; } } DD[J1] = CC[J1]; /* Backward computation */ t = SS[J2]; for ( i = I2-1; i > midi; --i) { s = RR[J2]; t = t-h; RR[J2] = e = t-g; wa = w[AA[i+1]]; for (j = J2-1; j >= J1; --j) { c = s + wa[BB[j+1]]; d = SS[j]-h; if (e-m > c) c = e-m; if (d-g > c) c = d-g; if (c > d) d = c; if (c > (e = e-h)) e = c; s = RR[j]; RR[j] = c; SS[j] = d; } } SS[J2] = t; /* Determine the rightmost crossing point and # of crossing edges at row midi */ opt_num = 0; midj1 = -1; if ((DD[J1]+SS[J1]-h == opt) || (CC[J1]+SS[J1]-m == opt)) { ++opt_num; midj1 = midj = J1; type = 2; } wa = w[AA[midi+1]]; for (j = J1+1; j <= J2; ++j) { if (CC[j-1]+RR[j]+wa[BB[j]] == opt) { ++opt_num; midj = j; type = 1; if (midj1 == -1) midj1 = j-1; } if ((DD[j]+SS[j]-h == opt) || (CC[j]+SS[j]-m == opt)) { ++opt_num; midj = j; type = 2; if (midj1 == -1) midj1 = j; } } /* Uniqueness bits for these columns' insertion edges are 0 */ for (j = midj1+1; j <= midj; ++j) UC[j] = 0; /* End of phase 1 */ /* Phase 2 : vertical computation */ /* Determine the lowest crossing point at column midj */ midi1 = midi + 1; if (opt_num <= 1) UR[midi1] = 1; else { /* Rightward computation */ t = max(CC[J1]-g,DD[J1]); for (i = midi+1; i <= I2; ++i) { VC[i] = t = t-h; VE[i] = t-g; } for (j = J1+1; j < midj; ++j) { b = BB[j]; c = CC[j]; d = DD[j]; s = CC[j-1]; for (i = midi+1; i <= I2; ++i) { if ((c = c - m) > (d = d - h)) d = c; if ((c = VC[i] - m) > (e = VE[i] - h)) e = c; c = s + w[AA[i]][b]; if (d > c) c = d; if (e > c) c = e; s = VC[i]; VC[i] = c; VE[i] = e; } } /* Leftward computation */ t = ISS[J2]; for (i = I2-1; i > midi; --i) { t = t-h; VR[i] = VT[i] = t-g; } for (j = J2-1; j >= midj; --j) { b = BB[j+1]; d = ISS[j]; s = IRR[j+1]; for (i = I2-1; i > midi; --i) { c = s + w[AA[i+1]][b]; e = VT[i]-h; if (d-m > c) c = d-m; if (e-g > c) c = e-g; if (c > (d = d-h)) d = c; if (c > e) e = c; s = VR[i]; VR[i] = c; VT[i] = e; } } VR[I2] = IRR[midj]; VT[I2] = max(IRR[midj],IRR[J2]-h*(J2-midj)); /* printf("IRR[midj]=%d, IRR[J2]-h*(J2-midj)=%d\n",IRR[midj],IRR[J2]-h*(J2-midj)); printf("VC[I2-1]=%d, VC[I2]=%d, VE[I2]=%d, VR[I2]=%d, VT[I2]=%d\n",VC[I2-1],VC[I2],VE[I2],VR[I2],VT[I2]); */ /* Determine the lowest crossing point at column midj: part 1 */ b = BB[midj]; for (i = midi1 + 1; i <= I2; ++i) { if (VC[i-1]+VR[i]+w[AA[i]][b] == opt) midi1 = i; if ((VE[i]+VT[i]-h == opt) || (VC[i]+VT[i]-m == opt)) midi1 = i; } tc = CC[midj-1]; /* Determine the lowest crossing point at column midj: part 2 */ if (midi1 < I2) { t = max(CC[J1]-g,DD[J1]); for (i = midi+1; i < I2; ++i) { s = CC[J1]; CC[J1] = c = t = t - h; e = t-g; wa = w[AA[i]]; for (j = J1+1; j < midj; ++j) { if ((c = c - m) > (e = e - h)) e = c; if ((c = CC[j] - m) > (d = DD[j] - h)) d = c; c = s + wa[BB[j]]; if (e > c) c = e; if (d > c) c = d; s = CC[j]; CC[j] = c; DD[j] = d; } } DD[J1] = CC[J1]; if ((DD[J1]+ISS[J1]-h == opt) || (CC[J1]+ISS[J1]-m == opt)) { ++opt_num; midi1 = I2; } wa = w[AA[I2]]; for (j = J1+1; j < midj; ++j) { if (CC[j-1]+IRR[j]+wa[BB[j]] == opt) { ++opt_num; midi1 = I2; } if ((DD[j]+ISS[j]-h == opt) || (CC[j]+ISS[j]-m == opt)) { ++opt_num; midi1 = I2; } } } } /* End of phase 2 */ /* Setup initial values for subproblems */ /* Subproblem 1 */ s = RR[J2]; SS[J2] = t = SS[J2]-h; RR[J2] = e = t-g; wa = w[AA[midi+1]]; for (j = J2-1; j >= J1; --j) { c = s + wa[BB[j+1]]; d = SS[j]-h; if (e-m > c) c = e-m; if (d-g > c) c = d-g; if (c > d) d = c; if (c > (e = e-h)) e = c; s = RR[j]; RR[j] = c; SS[j] = d; } /* Phase 3 : compute the rightmost crossing edge above subproblem 2 */ if (midj < J2) { if (opt_num > 1) { VC[midi] = tc; /* Forward pass */ for (i = midi+1; i < midi1; ++i) { s = VC[i-1]; c = VC[i]; e = VE[i]; wa = w[AA[i]]; for (j = midj; j <= J2; ++j) { if ((c = c - m) > (e = e - h)) e = c; if ((c = CC[j] - m) > (d = DD[j] - h)) d = c; c = s + wa[BB[j]]; if (e > c) c = e; if (d > c) c = d; s = CC[j]; CC[j] = c; DD[j] = d; } } /* Backward pass */ for (j = midj+1; j <= J2; ++j) { RR[j] = IRR[j]; SS[j] = ISS[j]; } t = SS[J2]; for ( i = I2-1; i >= midi1; --i) { s = RR[J2]; t = t-h; RR[J2] = e = t-g; wa = w[AA[i+1]]; for (j = J2-1; j > midj; --j) { c = s + wa[BB[j+1]]; d = SS[j]-h; if (e-m > c) c = e-m; if (d-g > c) c = d-g; if (c > d) d = c; if (c > (e = e-h)) e = c; s = RR[j]; RR[j] = c; SS[j] = d; } } SS[J2] = t; /* Determine the rightmost crossing edge above subproblem 2 */ wa = w[AA[midi1]]; midj1 = midj; for (j = midj+1; j <= J2; ++j) { if (CC[j-1]+RR[j]+wa[BB[j]] == opt) { midj1 = j; } if ((DD[j]+SS[j]-h == opt) || (CC[j]+SS[j]-m == opt)) { midj1 = j; } } /* Uniqueness bits for these columns' insertion edges are 0 */ for (j = midj+1; j <= midj1; ++j) UC[j] = 0; /* Subproblem 2 */ s = VC[midi1-1]; c = VC[midi1]; e = VE[midi1]; wa = w[AA[midi1]]; for (j = midj; j <= J2; ++j) { if ((c = c - m) > (e = e - h)) e = c; if ((c = CC[j] - m) > (d = DD[j] - h)) d = c; c = s + wa[BB[j]]; if (e > c) c = e; if (d > c) c = d; s = CC[j]; CC[j] = c; DD[j] = d; } } else { /* opt_num <= 1 */ /* Subproblem 2 */ t = max(CC[J1]-g,DD[J1]); s = CC[J1]; CC[J1] = c = t = t - h; e = t-g; wa = w[AA[midi1]]; for (j = J1+1; j <= J2; ++j) { if ((c = c - m) > (e = e - h)) e = c; if ((c = CC[j] - m) > (d = DD[j] - h)) d = c; c = s + wa[BB[j]]; if (e > c) c = e; if (d > c) c = d; s = CC[j]; CC[j] = c; DD[j] = d; } DD[J1] = CC[J1]; } } /* End of phase 3 */ /* Setup ICC, IDD, IRR, ISS */ tc = CC[midj]; td = DD[midj]; tr = IRR[midj]; ts = ISS[midj]; for (j = J1; j <= midj; ++j) { IRR[j] = RR[j]; ISS[j] = SS[j]; } for (j = midj+1; j <= J2; ++j) { ICC[j] = CC[j]; IDD[j] = DD[j]; } /* printf("midi=%d, midi1=%d, midj=%d, midj1=%d, type=%d, opt_num=%d\n",midi,midi1,midj,midj1,type,opt_num); fflush(stdout); */ } /* Phase 4: recursively solve subproblems */ if (type == 1) unibits(I1,J1,midi,midj-1); else unibits(I1,J1,midi,midj); ICC[midj] = tc; IDD[midj] = td; IRR[midj] = tr; ISS[midj] = ts; unibits(midi1,midj,I2,J2); } /* unibits_init - initializations for unibits() */ unibits_init(M,N) int M, N; { register int i, j; long t; ICC[0] = 0; IDD[0] = -g; t = -g; for (j = 1; j <= N; ++j) { ICC[j] = t = t-h; IDD[j] = t-g; } IRR[N] = ISS[N] = 0; t = -g; for (j = N-1; j >= 0; --j) { IRR[j] = ISS[j] = t = t-h; } for (i = 1; i <= M; ++i) UR[i] = 0; for (j = 1; j <= N; ++j) UC[j] = 1; } /* Interface and top level of comparator */ long ALIGN(A,B,M,N,W,G,H,S) char A[],B[]; int M,N; long W[][128],G,H; int S[]; { int i, j; long ck; char *ckalloc(); AA = A; /* Setup global parameters */ BB = B; w = W; g = G; h = H; m = g+h; sapp = S; last = 0; UR = (char *) ckalloc(M+1); UC = (char *) ckalloc(N+1); j = (N+1) * sizeof(long); CC = (long *) ckalloc(j); DD = (long *) ckalloc(j); RR = (long *) ckalloc(j); SS = (long *) ckalloc(j); ICC = (long *) ckalloc(j); IDD = (long *) ckalloc(j); IRR = (long *) ckalloc(j); ISS = (long *) ckalloc(j); j = (M+1) * sizeof(long); VC = (long *) ckalloc(j); VE = (long *) ckalloc(j); VR = (long *) ckalloc(j); VT = (long *) ckalloc(j); opt = align(A,B,M,N,-g,-g); /* OK, do it */ ck = CHECK_SCORE(A,B,M,N,S); if (opt != ck) printf("Check_score error.\n"); unibits_init(M,N); unibits(0,0,M,N); return opt; } /* Alignment display routine */ static char ALINE[51], BLINE[51], CLINE[51]; long DISPLAY(A,B,M,N,S,AP,BP) char A[], B[]; long M, N; long S[], AP, BP; { register char *a, *b, *c; register long i, j, op; long lines, ap, bp; i = j = op = lines = 0; ap = AP; bp = BP; a = ALINE; b = BLINE; c = CLINE; while (i < M || j < N) { if (op == 0 && *S == 0) { op = *S++; *a = A[++i]; *b = B[++j]; if (UR[i] == 0) *c++ = (*a++ == *b++) ? '|' : ' '; else *c++ = (*a++ == *b++) ? '*' : '!'; } else { if (op == 0) op = *S++; if (op > 0) { *a++ = ' '; *b++ = B[++j]; *c++ = (UC[j] == 0) ? '-' : '='; op--; } else { *a++ = A[++i]; *b++ = ' '; *c++ = (UR[i] == 0) ? '-' : '='; op++; } } if (a >= ALINE+50 || i >= M && j >= N) { *a = *b = *c = '\0'; printf("\n%5d ",50*lines++); for (b = ALINE+10; b <= a; b += 10) printf(" . :"); if (b <= a+5) printf(" ."); printf("\n%5d %s\n %s\n%5d %s\n",ap,ALINE,CLINE,bp,BLINE); ap = AP + i; bp = BP + j; a = ALINE; b = BLINE; c = CLINE; } } } /* CHECK_SCORE - return the score of the alignment stored in S */ static long CHECK_SCORE(A,B,M,N,S) char A[], B[]; long M, N; long S[]; { register long i, j, op; long score; score = i = j = op = 0; while (i < M || j < N) { op = *S++; if (op == 0) score = w[A[++i]][B[++j]] + score; else if (op > 0) { score = score - (g+op*h); j = j+op; } else { score = score - (g-op*h); i = i-op; } } return(score); } /* ROBUST - return robustness measure */ long ROBUST(A,B,M,N,S,OPT) char A[], B[]; long M, N; long S[]; long OPT; { register long i, j, k, op; long score, rc, ac; printf("\n\n"); ac = rc = score = i = j = op = 0; while (i < M || j < N) { op = *S++; if (op == 0) { ac++; ++i; ++j; if (UR[i] != 0) rc++; /* score = ralign(A,B,M,N,i,j,0); if (OPT - score != 0) { rc++; printf("Spair(%6d,%6d): robustness measure= %.1f, ", i,j,((float)(OPT-score))/DIGIT); printf("ur = %d\n",UR[i]); } else { printf("Spair(%6d,%6d): not robust, ", i,j); printf("ur = %d\n",UR[i]); } */ } else if (op > 0) { for (k = 1; k <= op; ++k) { ac++; ++j; if (UC[j] != 0) rc++; /* score = ralign(A,B,M,N,i,j,2); if (OPT - score != 0) { rc++; printf("Ipair(%6d,%6d): robustness measure= %.1f, ", i,j,((float)(OPT-score))/DIGIT); printf("uc = %d\n",UC[j]); } else { printf("Ipair(%6d,%6d): not robust, ", i,j); printf("uc = %d\n",UC[j]); } */ } } else { for (k = 1; k <= -op; ++k) { ac++; ++i; if (UR[i] != 0) rc++; /* score = ralign(A,B,M,N,++i,j,1); if (OPT - score != 0) { rc++; printf("Dpair(%6d,%6d): robustness measure= %.1f, ", i,j,((float)(OPT-score))/DIGIT); printf("ur = %d\n",UR[i]); } else { printf("Dpair(%6d,%6d): not robust, ", i,j); printf("ur = %d\n",UR[i]); } */ } } } if (ac == 0) printf("No diagonal aligned pairs\n"); else { printf("Percentage of robust pairs is %3d %% \n",(int)(((float)rc/(float)ac)*100.0)); } } static long ralign(A,B,M,N,I,J,type) char *A, *B; int M, N, I, J; char type; { register int i, j; register long c, e, d, s; long t, *wa; CC[0] = 0; t = -g; for (j = 1; j <= N; j++) { if (type == 2 && I == 0 && J == j) t = MININT; CC[j] = t = t-h; DD[j] = t-g; } t = -g; for (i = 1; i <= M; i++) { s = CC[0]; if (type == 1 && J == 0 && I == i) t = MININT; CC[0] = c = t = t-h; e = t-g; wa = w[A[i]]; for (j = 1; j <= N; j++) { if (type == 2 && i == I && j == J) c = e = MININT; else if ((c = c - m) > (e = e - h)) e = c; if (type == 1 && i == I && j == J) d = MININT; else if ((c = CC[j] - m) > (d = DD[j] - h)) d = c; if (type == 0 && i == I && j == J) c = e; else c = s + wa[B[j]]; if (e > c) c = e; if (d > c) c = d; s = CC[j]; CC[j] = c; DD[j] = d; } } return CC[N]; } /* lib.c - library of C procedures. */ /* fatal - print message and die */ fatal(msg) char *msg; { fprintf(stderr, "%s\n", msg); exit(1); } /* fatalf - format message, print it, and die */ fatalf(msg, val) char *msg, *val; { fprintf(stderr, msg, val); putc('\n', stderr); exit(1); } /* ckopen - open file; check for success */ FILE *ckopen(name, mode) char *name, *mode; { FILE *fopen(), *fp; if ((fp = fopen(name, mode)) == NULL) fatalf("Cannot open %s.", name); return(fp); } /* ckalloc - allocate space; check for success */ char *ckalloc(amount) long amount; { char *malloc(), *p; if ((p = malloc( (unsigned) amount)) == NULL) fatal("Ran out of memory."); return(p); }