/* * This program reads two DNA sequences from files and finds their local * similarities. It can detect alignments containing gaps, while striking a * balance between speed and sensitivity. The algorithm is based on three ideas: * (1) techniques developed by Galil and coworkers that considerably improve * time efficiency, * (2) modification of an approach of Hirschberg that dramatically improves * space efficiency, and * (3) techniques that allow K best alignments to be determined efficiently * for K > 1. * * * The basic command syntax is * * FKLG K SEQ1 SEQ2 * * where K is the number of best local alignments we need, and * SEQ1 and SEQ2 name files containing DNA sequences. * * The FKLG command permits optional additional arguments that reset certain * internal parameters. The available parameters are: * * R or r gives the score for replacement. * O or o gives the score for opening up a gap. * E or e gives the score for extending one more symbol in a gap. * W or w gives the word size (<=MAXW). * (cost of a gap of length L is O + E*L * defaults: R = 0.2, O = 6.0, E = 0.2, W = 5) * * Thus a typical command is * * FKLG K SEQ1 SEQ2 R=0.2 O=3.0 E=0.5 W=5 * * ODDITY: any character other than A, C, G or T is changed to 'A'. */ #include #include "list.h" #include "db_list.h" #define ALPHABET_SIZE 4 /* number of letters in the alphabet */ #define DEFAULT_IR 2 /* default score for replacement (0.2) */ #define DEFAULT_IO 60 /* default score for opening up a gap (6.0) */ #define DEFAULT_IE 2 /* default score for gap extension (0.2) */ #define DEFAULT_W 5 /* default word size */ #define MAXW 8 /* maximum word size */ #define DIGIT 10.0 /* all parameters are accurate to the tenth */ #define GC_ROW 200 /* garbage collection frequency */ #define MININT -99999 /* minimum value */ #define false 0 #define true 1 #define INTER_ADD(c) \ { if (!cintersect[in_row]) \ cintersect[in_row] = idb_newList(); \ db_insert(cintersect[in_row],c,NULL); \ ccol_int[c] = in_row; \ } typedef struct string5 { long pos; /* starting position */ struct string5 *link; /* previous location with same 5-mer */ } string5; typedef struct cut_frag { /* cross fragment */ fragment *ff; /* pointer to the forward fragment */ fragment *bf; /* pointer to the backward fragment */ } cut_frag; typedef struct frag_pt { /* endpoint of the fragment */ fragment *fp; /* pointer to the fragment */ int col; /* end column of the fragment */ struct frag_pt *link; } frag_pt; int *bucket; /* hash table for rowwise computation */ int *rcbucket; /* hash table for columnwise computation */ int *bls; int *rcbls; int *blink; int *rcblink; char *seq1, *seq2; int W; /* word size */ int BUCKETS; /* hash table size (ALPHABET_SIZE ** W) */ int len1, len2; /* lengths of sequences */ int offset, offset1; /* offset to make index positive */ int number_frags; /* number of fragments generated */ int deleted_frags; /* number of fragments deleted */ int output_frags; /* number of fragments output */ int old_i_start, old_i_end, old_j_start, old_j_end; /* WEBB */ /* [sb1,sb2]X[se1,se2] is the area for aligning globally */ int sb1, se1, sb2, se2, mid1, mid2, cur_end, cur_end1; frag_pt **end_row; /* lists of endpoints row by row */ frag_pt **end_col; /* lists of endpoints column by column */ fragment **diag_prev; /* store the previous fragment on the same diagonal */ fragment **bdiag_prev; /* store the previous fragment on the same diagonal */ int *col_int; /* store the row position of intersection point */ int *rccol_int; /* store the row position of intersection point */ char *diag_flag; /* 1 if some intersection point is on the diagonal */ char *rcdiag_flag; /* 1 if some intersection point is on the diagonal */ idb_list *intersect; /* list for intersection points */ idb_list *rcintersect; /* list for intersection points */ ulist *used_frag; /* mark used fragments */ cut_frag **cut; /* store cross fragments */ int K; /* output K best local alignments */ long IR,IO,IE; /* cost of a gap of length L is IO/DIGIT + (IE/DIGIT)*L */ fragment *pfo, *pbo; /* pseudo origins */ fragment *pff, *pbf; /* pseudo aligning fragments */ int best_score = 0; /* best score in the recomputation */ list ri; /* right influence list */ list bri; /* backward right influence list */ db_list li_c, li_d; /* left influence lists */ db_list bli_c, bli_d; /* backward left influence lists */ typedef struct kbclass { /* k best equivalent classes */ fragment *first; /* first fragment in the alignment */ fragment *last; /* last fragment in the alignment */ int score; /* best score for the class */ int t; /* [t,b]X[l,r] contains all points whose */ int b; /* score >= mw in the class */ int l; int r; } kbclass; kbclass **S; /* store k best classes */ int lastS = 0; /* store number of classes */ int curS = 0; /* locality usage */ int minS = 0; /* store the class whose best score is mw */ int mw = 0; /* minimum score of all classes */ kbclass *findmaxS(); int pt_lscore(); int rcpt_lscore(); frag_pt *tailor(); frag_pt *otr_survive(); frag_pt *otl_survive(); int pos_diag(); int rcpos_diag(); int rotl_win(); int cotl_win(); main(int argc, char *argv[]) { double atof(); int i, j, tw; char *ckalloc(); if (argc < 4) fatalf("%s K seq1 seq2 [R=] [O=] [E=] [W=] (all > 0)" ,argv[0]); IR = DEFAULT_IR; IO = DEFAULT_IO; IE = DEFAULT_IE; W = DEFAULT_W; while (--argc > 3) { if (argv[argc][1] != '=') fatalf("argument %d has improper form", argc); switch (argv[argc][0]) { case 'R': case 'r': IR = DIGIT*atof(argv[argc]+2); break; case 'O': case 'o': IO = DIGIT*atof(argv[argc]+2); break; case 'E': case 'e': IE = DIGIT*atof(argv[argc]+2); break; case 'W': case 'w': tw = atoi(argv[argc]+2); if (tw <= 0) fatal("W <= 0 is not allowed."); W = (tw <= MAXW)? tw : MAXW; break; default: fatal("Options are R, O, E, W."); } } #ifdef STATS printf("#:lav\n\n"); #endif len1 = get_seq(argv[2], &seq1); len2 = get_seq(argv[3], &seq2); K = atoi(argv[1]); if (IR > 2*IE) fatal("repl_cost should not be greater than 2*gap_extend"); if (K<0 || IR<0 || IO<0 || IE<0) fatal("error! all input should be nonnegative."); S = (kbclass **)ckalloc((K+1)*sizeof(kbclass *)); BUCKETS = power(ALPHABET_SIZE,W); j = BUCKETS*sizeof(int); bucket = (int *)ckalloc(j); bls = (int *)ckalloc(j); blink = (int *)ckalloc((len2+1)*sizeof(int)); rcbucket = (int *)ckalloc(j); rcbls = (int *)ckalloc(j); rcblink = (int *)ckalloc((len1+1)*sizeof(int)); for (i=0; i=0; --i) { diag_prev[i] = NULL; bdiag_prev[i] = NULL; diag_flag[i] = 0; rcdiag_flag[i] = 0; } if (strsame(argv[2],argv[3])) { /* align sequence with itself */ used_frag[0] = unewList(); uinsert(used_frag[0],0); } #ifdef STATS printf("d {\n\"CHAO output with parameters:\n"); printf("W = %d, ",W); printf("K = %d, M = 1, I = V = %.1f,\nO = %.1f, E = %.1f\"\n}\n", K, ((float)IR)/DIGIT, ((float)IO)/DIGIT, ((float)IE)/DIGIT); printf("s {\n \"%s\" 1 %d\n \"%s\" 1 %d\n}\n", argv[2], len1, argv[3], len2); #endif printf("%s %d\n%s %d\n", argv[2], len1, argv[3], len2); printf("%d\n",K); setup(); scan(); #ifdef STATS printf("\nnumber_frags= %d\n",number_frags); #endif free_all(len1-1,0,len2-1,diag_prev,ri,li_c,li_d); class_print(); #ifdef STATS printf("\nnumber_frags= %d\n",number_frags); printf("deleted_frags= %d\n",deleted_frags); #endif exit(0); } /* get_seq - read a sequence */ int get_seq(char *file_name, char **seqptr) { FILE *qp, *ckopen(); char *p, *fgets(), *strchr(), *ckalloc(); int c, n; qp = ckopen(file_name, "r"); for (n=0; (c=getc(qp)) != EOF;) if (c != '\n') ++n; *seqptr = ckalloc(n+1); rewind(qp); p = *seqptr; for (p = *seqptr; ; ) { if (fgets(p, (int)n, qp) == NULL) { fclose(qp); *p = '\0'; break; } if (p == *seqptr && !isupper(*p)) continue; while (isupper(*p)) { if (strchr("ACGT", *p) == NULL) *p = 'A'; ++p; } if (*p != '\n' && *p != '\0') { fprintf(stderr, "Illegal character %c in file %s.\n", *p, file_name); exit(1); } } *p = '\0'; return p - *seqptr; } int encoding[128]; int mask; /* hash table initialization */ table_init() { /* perform initializations */ encoding['C'] = 1; encoding['G'] = 2; encoding['T'] = 3; mask = (1 << (W+W-2)) - 1; } /* build hash table */ fbld_table() { long ecode; int i, pos; char *q, *endp; int s5; char *ckalloc(); q = seq2 + sb2 - 1; ecode = 0L; for (i = 1; i < W; ++i) ecode = (ecode << 2) + encoding[*++q]; pos = sb2; endp = seq2 + se2; while (++q <= endp) { ecode = ((ecode & mask) << 2) + encoding[*q]; s5 = pos++; if (bucket[ecode] != -1) { blink[bls[ecode]] = s5; } else { bucket[ecode] = s5; } blink[s5] = -1; bls[ecode] = s5; } } /* build hash table */ bbld_table() { long ecode; int i, pos; char *q, *endp; int s5; char *ckalloc(); q = seq2 + se2 + 1; ecode = 0L; for (i = 1; i < W; ++i) ecode = (ecode << 2) + encoding[*--q]; pos = se2; endp = seq2 + sb2; while (--q >= endp) { ecode = ((ecode & mask) << 2) + encoding[*q]; s5 = pos--; if (bucket[ecode] != -1) { blink[bls[ecode]] = s5; } else { bucket[ecode] = s5; } blink[s5] = -1; bls[ecode] = s5; } } /* build hash table */ long rcbld_table(int *buc, int *cbls, int *cblink, char *seq, int b, int e) { long ecode; int i, pos; char *q, *endp; int s5; char *ckalloc(); q = seq + e + 1; ecode = 0L; for (i = 1; i < W; ++i) ecode = (ecode << 2) + encoding[*--q]; pos = e; endp = (b-W+1 >= 0)? seq + b-W+1 : seq; while (--q >= endp) { ecode = ((ecode & mask) << 2) + encoding[*q]; s5 = pos--; if (buc[ecode] != -1) { cblink[cbls[ecode]] = s5; } else { buc[ecode] = s5; } cblink[s5] = -1; cbls[ecode] = s5; } return(ecode); } /* ex_table - extend the hash table */ long ex_table(int *buc, int *cbls, int *cblink, char *seq, int pos,long ecode) { char *q; char *ckalloc(); q = seq + (pos-W+1); ecode = ((ecode & mask) << 2) + encoding[*q]; if (buc[ecode] != -1) { cblink[cbls[ecode]] = pos; } else { buc[ecode] = pos; } cblink[pos] = -1; cbls[ecode] = pos; return(ecode); } /* free_table - free the memory occupied by the table */ free_table(int *buc) { int i; for (i = 0; i < BUCKETS; ++i) { buc[i] = -1; } } /* update_active - update active region */ update_active(list cri, db_list cli_c, db_list cli_d, int l) { /* update left influence active regions */ db_reset_pos(cli_c); db_reset_pos(cli_d); resolve(cli_c,cli_d,intersect,diag_flag,col_int,l,pt_lscore); db_reset_pos(cli_c); db_reset_pos(cli_d); l_merge(cli_c,cli_d,intersect,diag_flag,col_int,end_row[l],l,pt_lscore); /* update right influence active regions */ reset_pos(cri); r_survive(end_row[l]); r_merge(cri,end_row[l],l); if (intersect[l]) { idb_freeList(intersect[l]); intersect[l] = NULL; } end_row[l] = NULL; } /* rcupdate_active - update active region */ rcupdate_active(list cri, db_list cli_c, db_list cli_d, int l) { /* update left influence active regions */ db_reset_pos(cli_c); db_reset_pos(cli_d); resolve(cli_c,cli_d,rcintersect,rcdiag_flag,rccol_int,l,rcpt_lscore); db_reset_pos(cli_c); db_reset_pos(cli_d); l_merge(cli_c,cli_d,rcintersect,rcdiag_flag,rccol_int,end_col[l],l,rcpt_lscore); /* update right influence active regions */ reset_pos(cri); r_survive(end_col[l]); rcr_merge(cri,end_col[l],l); if (rcintersect[l]) { idb_freeList(rcintersect[l]); rcintersect[l] = NULL; } end_col[l] = NULL; } /* rotupdate_active - update active regions orthogonally */ rotupdate_active(frag_pt *rcopy, int col, int row) { frag_pt *r; reset_pos(ri); r = otr_survive(rcopy,col); rotr_merge(r); r = otl_survive(rcopy,col); otl_merge(r,col,row,li_c,li_d,rcintersect,rccol_int, rcdiag_flag,rcpos_diag,rotl_win); } /* cotupdate_active - update active regions orthogonally */ cotupdate_active(frag_pt *rcopy, int col, int row) { frag_pt *r; reset_pos(bri); r = otr_survive(rcopy,col); cotr_merge(r); r = otl_survive(rcopy,col); otl_merge(r,col,row,bli_c,bli_d,intersect,col_int, diag_flag,pos_diag,cotl_win); } /* free_all - free all useless space */ free_all(int i, int j1, int j2, fragment *cdiag_prev[], list cri, db_list cli_c, db_list cli_d) { int l, m; fragment *df; m = len1 + (j2 - i); for (l = len1+(j1-i); l <= m; ++l) if (df = cdiag_prev[l]) { decre(df); cdiag_prev[l] = NULL; } freeList(cri); db_freeList(cli_c); db_freeList(cli_d); } scan() /* scan sequence 1 */ { long ecode; int i, l, m, pos; char *q; int s5; fragment *df; q = seq1 - 1; ecode = 0L; for (i = 1; i < W; ++i) ecode = (ecode << 2) + encoding[*++q]; pos = 0; while (*++q) { reset_pos(ri); db_reset_pos(li_c); db_reset_pos(li_d); ecode = ((ecode & mask) << 2) + encoding[*q]; for (s5 = bucket[ecode]; s5 != -1; s5 = blink[s5]) found(pos, s5); /* free part of space occupied by diag_prev[] */ if (pos % GC_ROW == 0) { m = len1 - pos + len2; for (l = len1-pos; l <= m; ++l) { if ((df = diag_prev[l]) && (df->score - IR * (pos - df->i - df->k) < 0)) { decre(df); diag_prev[l] = NULL; } } } /* update active regions */ update_active(ri,li_c,li_d,pos); if (df = diag_prev[len1+len2-1-pos]) { diag_prev[len1+len2-1-pos] = NULL; decre(df); } ++pos; } for (l = pos; l < len1; ++l) update_active(ri,li_c,li_d,l); m = len2; /* len1 + (len2-len1) */ for (l = len1+(len2-1-pos); l>m; --l) if (df=diag_prev[l]) { decre(df); diag_prev[l]=NULL; } } fscan_init() { int i,j; ri = newList(); li_c = db_newList(); li_d = db_newList(); for (i=sb2; i<=se2; ++i) col_int[i] = 0; j = len1 + (se2-sb1); for (i = len1 + (sb2-mid1); i <= j; ++i) diag_flag[i]=0; cur_end = mid1; } bscan_init() { int i,j; bri = newList(); bli_c = db_newList(); bli_d = db_newList(); j=len2-sb2; for (i=len2-se2; i<=j; ++i) col_int[i] = 0; j=len2-(sb2-se1); for (i = len2-(se2-mid2); i<=j; ++i) diag_flag[i]=0; cur_end = len1 - mid2; } fscan() /* scan sequence 1 */ { long ecode; int i, pos; char *q; int s5; fragment *df; q = seq1 + sb1 - 1; ecode = 0L; for (i = 1; i < W; ++i) ecode = (ecode << 2) + encoding[*++q]; pos = sb1; while (*++q && pos <= mid1) { reset_pos(ri); db_reset_pos(li_c); db_reset_pos(li_d); ecode = ((ecode & mask) << 2) + encoding[*q]; for (s5 = bucket[ecode]; s5 != -1; s5 = blink[s5]) ffound(pos, s5); /* update active regions */ update_active(ri,li_c,li_d,pos); if ((pos != mid1) && (df = diag_prev[len1+se2-pos])) { decre(df); diag_prev[len1+se2-pos] = NULL; } ++pos; } } bscan() /* scan sequence 1 */ { long ecode; int i, pos, dd; char *q; int s5; fragment *df; q = seq1 + se1 + 1; ecode = 0L; for (i = 1; i < W; ++i) ecode = (ecode << 2) + encoding[*--q]; pos = se1; while (*--q && pos >= mid2) { reset_pos(bri); db_reset_pos(bli_c); db_reset_pos(bli_d); ecode = ((ecode & mask) << 2) + encoding[*q]; for (s5 = bucket[ecode]; s5 != -1; s5 = blink[s5]) bfound(pos, s5); /* update active regions */ update_active(bri,bli_c,bli_d,len1-pos); dd = len2-sb2+pos; if ((pos != mid2) && (df = bdiag_prev[dd])) { bdiag_prev[dd] = NULL; decre(df); } --pos; } while (pos >= mid2) { /* update active regions */ update_active(bri,bli_c,bli_d,len1-pos); dd = len2-sb2+pos; if ((pos != mid2) && (df = bdiag_prev[dd])) { bdiag_prev[dd] = NULL; decre(df); } --pos; } } /* found a matching 5-mer at (i,j) */ found(int i, int j) { int k; fragment *f; char *ckalloc(); if (i && j && seq1[i-1] == seq2[j-1]) return; if (usearch(used_frag[i],j)) return; for (k = W; seq1[i+k] && seq2[j+k] && seq1[i+k] == seq2[j+k]; ++k) ; f = (fragment *) ckalloc(sizeof(fragment)); f->i = i; f->j = j; f->k = k; f->ref = 0; local(f); enter_endrow(f); ++number_frags; } /* rfscan - recompute the given area: [t,b]X[l,r] */ rfscan(int t, int b, int l, int r) { long ecode; int i, pos; char *q; int s5; fragment *df; if (b-t+1 < W || r-l+1 < W) return; sb1 = t; se1 = mid1 = b; sb2 = l; se2 = r; free_table(bucket); fbld_table(); fscan_init(); q = seq1 + sb1 - 1; ecode = 0L; for (i = 1; i < W; ++i) ecode = (ecode << 2) + encoding[*++q]; pos = sb1; while (*++q && pos <= mid1) { reset_pos(ri); db_reset_pos(li_c); db_reset_pos(li_d); ecode = ((ecode & mask) << 2) + encoding[*q]; for (s5 = bucket[ecode]; s5 != -1; s5 = blink[s5]) rffound(pos, s5); /* update active regions */ update_active(ri,li_c,li_d,pos); if (df = diag_prev[len1+se2-pos]) { decre(df); diag_prev[len1+se2-pos] = NULL; } ++pos; } while (pos<=mid1) { update_active(ri,li_c,li_d,pos); if (df = diag_prev[len1+se2-pos]) { decre(df); diag_prev[len1+se2-pos] = NULL; } ++pos; } free_all(mid1,sb2,se2,diag_prev,ri,li_c,li_d); } /* rffound a matching 5-mer at (i,j) */ rffound(int i, int j) { int k; fragment *f; char *ckalloc(); if (i && j && seq1[i-1] == seq2[j-1]) return; if (usearch(used_frag[i],j)) return; for (k = W; seq1[i+k] && seq2[j+k] && seq1[i+k] == seq2[j+k]; ++k); if (i+k-1 > se1 || j+k-1 > se2) return; /* out of boundary */ f = (fragment *) ckalloc(sizeof(fragment)); f->i = i; f->j = j; f->k = k; f->ref = 0; local(f); enter_endrow(f); ++number_frags; } /* enter_endrow - enter end_point of the given fragment */ enter_endrow(fragment *f) { int row,col; frag_pt *e, *e1, *e2; char *ckalloc(); row = f->i + f->k - 1; col = f->j + f->k - 1; /* put each fragment row by row according to its end point */ e = (frag_pt *) ckalloc(sizeof(frag_pt)); e->fp = f; e->col = col; if (end_row[row]) { /* end_row[row] isn't empty */ e1 = end_row[row]; e2 = e1->link; if (e1->col > e->col) { /* e contains the smallest column # */ e->link = e1; end_row[row] = e; } else { while (e2 && e2->col < e->col) { e1 = e2; e2 = e2->link; } e1->link = e; e->link = e2; } } else { /* end_row[row] is empty */ e->link = end_row[row]; end_row[row] = e; } } /* enter_endcol - enter end_point of the given fragment */ enter_endcol(fragment *f) { int row,col; frag_pt *e, *e1, *e2; char *ckalloc(); row = f->i + f->k - 1; col = f->j + f->k - 1; /* put each fragment row by row according to its end point */ e = (frag_pt *) ckalloc(sizeof(frag_pt)); e->fp = f; e->col = row; if (end_col[col]) { /* end_col[col] isn't empty */ e1 = end_col[col]; e2 = e1->link; if (e1->col > e->col) { /* e contains the smallest column # */ e->link = e1; end_col[col] = e; } else { while (e2 && e2->col < e->col) { e1 = e2; e2 = e2->link; } e1->link = e; e->link = e2; } } else { /* end_col[col] is empty */ e->link = end_col[col]; end_col[col] = e; } } /* ffound a matching 5-mer at (i,j) */ ffound(int i, int j) { int k; fragment *f; char *ckalloc(); if (i && j && seq1[i-1] == seq2[j-1]) return; if (usearch(used_frag[i],j)) return; for (k = W; seq1[i+k] && seq2[j+k] && seq1[i+k] == seq2[j+k]; ++k); if (i+k-1 > se1 || j+k-1 > se2) return; /* out of boundary */ f = (fragment *) ckalloc(sizeof(fragment)); f->i = i; f->j = j; f->k = k; f->ref = 0; ++number_frags; if (i+k-1>mid1) { cut[mid1+j-i] = (cut_frag *) ckalloc(sizeof(cut_frag)); cut[mid1+j-i]->ff = f; calign(pfo,diag_prev,ri,li_c,li_d,f); return; } else align(pfo,diag_prev,ri,li_c,li_d,f); enter_endrow(f); } /* bfound a matching 5-mer at (i,j) */ bfound(int i, int j) { int k; fragment *f; char *ckalloc(); if (seq1[i+1] && seq2[j+1] && seq1[i+1] == seq2[j+1]) return; /* for (k = W; seq1[i-k] && seq2[j-k] && seq1[i-k] == seq2[j-k]; ++k); */ for (k = W; i-k>=0 && j-k>=0 && seq1[i-k] == seq2[j-k]; ++k); if (usearch(used_frag[i-k+1],j-k+1)) return; if (i-k+1 < sb1 || j-k+1 < sb2) return; /* out of boundary */ f = (fragment *) ckalloc(sizeof(fragment)); f->i = len1 - i; f->j = len2 - j; f->k = k; f->ref = 0; ++number_frags; if (i-k+1 < mid2) { cut[mid1+(j-i)]->bf = f; calign(pbo,bdiag_prev,bri,bli_c,bli_d,f); return; } else align(pbo,bdiag_prev,bri,bli_c,bli_d,f); enter_endrow(f); } /* setup - initialization */ setup() { char *ckalloc(); /* skip lists initialization */ init(); db_init(); /* hash table initialization */ table_init(); /* pseudo fragments initialization */ pfo = (fragment *) ckalloc(sizeof(fragment)); pfo->k = 0; pfo->score = 0; pfo->bgf = pfo; pbo = (fragment *) ckalloc(sizeof(fragment)); pbo->k = 0; pbo->score = 0; pbo->bgf = pbo; pff = (fragment *) ckalloc(sizeof(fragment)); pff->k = 0; pbf = (fragment *) ckalloc(sizeof(fragment)); pbf->k = 0; /* initialization */ ri = newList(); li_c = db_newList(); li_d = db_newList(); sb2 = 0; se2 = len2 - 1; cur_end = len1 - 1; fbld_table(); pfo->ref = 1; pbo->ref = 1; offset = offset1 = len1; } global_setup(fragment *bf, fragment *ef) { sb1 = bf->i + bf->k; se1 = ef->i - 1; sb2 = bf->j + bf->k; se2 = ef->j - 1; mid1 = (sb1 + se1) / 2; mid2 = mid1 + 1; pfo->i = sb1; pfo->j = sb2; pbo->i = len1-se1; pbo->j = len2-se2; } /* global - find the best global alignment in a given area */ global(fragment *bf, fragment *ef, int gs) { int i, ll; fragment *fdf, *bdf; fragment *fsb, *fse, *bsb, *bse, *mf; int js; global_setup(bf,ef); if ((se1-sb1+1 < W) || (se2-sb2+1 < W)) return; free_table(bucket); fbld_table(); fscan_init(); fscan(); free_table(bucket); bbld_table(); bscan_init(); bscan(); pf_set(pff,mid2,se2+1); pf_set(pbf,len1-mid1,len2-se2); mf = NULL; reset_pos(bri); db_reset_pos(bli_c); db_reset_pos(bli_d); ll = se2-sb2+1; for (i = 0; i <= ll; ++i) { if (cut[pff->j]) { fdf = cut[pff->j]->ff; bdf = cut[pff->j]->bf; js = join(fdf,bdf)-DIGIT*(bdf->k); if (js == gs) { mf = fdf; incre(mf); fse = fdf->bgf; incre(fse); bse = bdf->bgf; incre(bse); break; } } reset_pos(ri); db_reset_pos(li_c); db_reset_pos(li_d); malign(pfo,diag_prev,ri,li_c,li_d,pff); malign(pbo,bdiag_prev,bri,bli_c,bli_d,pbf); js = join(pff,pbf); if ((pff->j-pff->i != (pff->bgf)->j-(pff->bgf)->i) && (pbf->j-pbf->i != (pbf->bgf)->j-(pbf->bgf)->i)) js = js + IO; if (js == gs) { fse = pff->bgf; bse = pbf->bgf; incre(fse); incre(bse); break; } --(pff->j); ++(pbf->j); } if (js != gs) { /* type 2 connection */ reset_pos(bri); db_reset_pos(bli_c); db_reset_pos(bli_d); pf_set(pff,mid2,se2+1); pf_set(pbf,len1-mid1,len2-se2); ll = se2-sb2+1; for (i = 0; i <= ll; ++i) { reset_pos(ri); db_reset_pos(li_c); db_reset_pos(li_d); malign1(pfo,ri,li_c,li_d,pff); malign1(pbo,bri,bli_c,bli_d,pbf); js = join(pff,pbf); if ((pff->j-pff->i != (pff->bgf)->j-(pff->bgf)->i) && (pbf->j-pbf->i != (pbf->bgf)->j-(pbf->bgf)->i)) js = js + IO; if (js == gs) { fse = pff->bgf; bse = pbf->bgf; incre(fse); incre(bse); break; } --(pff->j); ++(pbf->j); } if (js != gs) { #ifdef STATS printf("wrong!!! \n"); printf("sb1=%d se1=%d mid1=%d \n",sb1,se1,mid1); printf("gs=%d, js=%d\n",gs,js); #endif free_all(mid1,sb2,se2,diag_prev,ri,li_c,li_d); free_all(len1-mid2,len2-se2,len2-sb2,bdiag_prev,bri,bli_c,bli_d); return; } } fsb = fse->bgf; if (fse == pfo) fse = NULL; if (fsb == pfo || fsb == fse) fsb = NULL; bsb = bse->bgf; if (bse == pbo) bse = NULL; else { bse->i = len1 - bse->i - bse->k + 1; bse->j = len2 - bse->j - bse->k + 1; } if (bsb == pbo || bsb == bse) bsb = NULL; else { bsb->i = len1 - bsb->i - bsb->k + 1; bsb->j = len2 - bsb->j - bsb->k + 1; } for (i=sb2; i<=se2; ++i) if (cut[i]) { decre(cut[i]->ff); decre(cut[i]->bf); free(cut[i]); cut[i]=NULL; } free_all(mid1,sb2,se2,diag_prev,ri,li_c,li_d); free_all(len1-mid2,len2-se2,len2-sb2,bdiag_prev,bri,bli_c,bli_d); if (fsb) { frag_print(fsb); gs = fse->score-fsb->score-DIGIT*fse->k; global(fsb,fse,gs); } if (fse) { frag_print(fse); decre(fse); } if (mf) { frag_print(mf); decre(mf); } if (bse) { frag_print(bse); if (bsb) { gs = bse->score-bsb->score-DIGIT*bse->k; global(bse,bsb,gs); frag_print(bsb); } decre(bse); } } /* join - return the cost after join */ int join(fragment *ff, fragment *bf) { return(ff->score + bf->score); } int tt,bb,ll,rr,tt1,ll1; /* current recomputation area */ int ecol,erow; long row_code, col_code; /* recom_setup - initialization for recomputation */ recom_setup(kbclass *sp) { int i,j; erow = tt1 = tt = sp->t; bb = sp->b; ecol = ll1 = ll = sp->l; rr = sp->r; free_table(bucket); free_table(rcbucket); col_code = rcbld_table(bucket,bls,blink,seq2,ll1,rr); row_code = rcbld_table(rcbucket,rcbls,rcblink,seq1,tt1,bb); /* Lists initialization for column-ward recomputation */ ri = newList(); li_c = db_newList(); li_d = db_newList(); /* Lists initialization for row-ward recomputation */ bri = newList(); bli_c = db_newList(); bli_d = db_newList(); for (i=len2-rr; i<=len2; ++i) col_int[i] = 0; for (i=len1-bb; i<=len1; ++i) rccol_int[i] = 0; j = len2+bb; for (i=len2-rr; i<=j; ++i) diag_flag[i] = 0; j = len1+rr; for (i=len1-bb; i<=j; ++i) rcdiag_flag[i] = 0; } /* recompute - recompute the scores of the area affected by removing sp */ recompute(kbclass *sp) { recom_setup(sp); offset = cur_end = len2; /* len2 - 0 */ offset1 = cur_end1 = len1; /* len1 - 0 */ trcscan(); for (;;) { while(tt>erow || ll>ecol) { offset = cur_end = len1; offset1 = cur_end1 = len2; while (tt>erow) ex_row(); offset = cur_end = len2; offset1 = cur_end1 = len1; while (ll>ecol) ex_col(); } if (disjoint() || (tt==0 && ll==0)) break; } rcfree(); offset = offset1 = len1; #ifdef STATS printf("before tt=%d, bb=%d, ll=%d, rr=%d\n",tt,bb,ll,rr); printf("best_score= %d\n",best_score); #endif if (best_score > minscoreS()) { syn_area(); #ifdef STATS printf("after tt=%d, bb=%d, ll=%d, rr=%d\n",tt,bb,ll,rr); #endif rfscan(tt,bb,ll,rr); } best_score = 0; } /* disjoint - return true if [tt,bb]X[ll,rr]-[tt1,bb]X[ll1,rr] shares no vertices with the areas bounded by S */ int disjoint() { kbclass *sp; int i,j; int endj; for (i=0; it <= bb && sp->b >= tt && sp->l <= rr && sp->r >= ll && (sp->t < tt1 || sp->l < ll1)) { if (sp->t < tt1) tt1 = sp->t; if (sp->l < ll1) ll1 = sp->l; if (erow > tt1) erow = tt1; if (ecol > ll1) ecol = ll1; /* extend erow and ecol */ endj = len2 + (bb-ll); for (j=len2+(tt-rr); j<=endj; ++j) ractive_ext(diag_prev[j]); sreset_pos(ri); sreset_pos(bri); db_sreset_pos(li_c); db_sreset_pos(li_d); db_sreset_pos(bli_c); db_sreset_pos(bli_d); while(next_key(ri)!=-1) ractive_ext(snext(ri)); while(next_key(bri)!=-1) ractive_ext(snext(bri)); while(db_next_key(li_c)!=-1) lactive_ext(db_snext(li_c)); while(db_next_key(li_d)!=-1) lactive_ext(db_snext(li_d)); while(db_next_key(bli_c)!=-1) lactive_ext(db_snext(bli_c)); while(db_next_key(bli_d)!=-1) lactive_ext(db_snext(bli_d)); return false; } } return true; } /* syn_area - synchronize [tt,bb]X[ll,rr] with all other areas */ syn_area() { kbclass *sp; int i,si,sj; int flag; flag = true; while (flag) { flag = false; for (i=0; ifirst)->i; sj = (sp->first)->j; if (si <= bb && sp->b >= tt && sj <= rr && sp->r >= ll && (si < tt || sj < ll)) { if (si < tt) tt = si; if (sj < ll) ll = sj; flag = true; } } } } /* ex_row - extend one row (incrementally) */ ex_row() { int s5; int tpos; --tt; if (tt-W+1 >= 0) { row_code = ex_table(rcbucket,rcbls,rcblink,seq1,tt,row_code); reset_pos(bri); db_reset_pos(bli_c); db_reset_pos(bli_d); for (s5 = bucket[row_code]; s5 != -1; s5 = blink[s5]) rcfound(tt, s5); } /* update active regions */ tpos = len1-tt; end_row[tpos] = tailor(end_row[tpos],len2-ll); rotupdate_active(end_row[tpos],len2-ll,tpos); update_active(bri,bli_c,bli_d,tpos); } /* ex_col - extend one col (incrementally) */ ex_col() { int s5; int tpos; --ll; if (ll-W+1 >= 0) { col_code = ex_table(bucket,bls,blink,seq2,ll,col_code); reset_pos(ri); db_reset_pos(li_c); db_reset_pos(li_d); for (s5 = rcbucket[col_code]; s5 != -1; s5 = rcblink[s5]) trcfound(s5,ll); } /* update active regions */ tpos = len2-ll; end_col[tpos] = tailor(end_col[tpos],len1-tt); cotupdate_active(end_col[tpos],len1-tt,tpos); rcupdate_active(ri,li_c,li_d,tpos); } /* rcfound a matching 5-mer at (i,j) */ rcfound(int i, int j) { int k; fragment *f; char *ckalloc(); if (seq1[i+1] && seq2[j+1] && seq1[i+1] == seq2[j+1]) return; /* for (k = W; seq1[i-k] && seq2[j-k] && seq1[i-k] == seq2[j-k]; ++k); */ for (k = W; i-k>=0 && j-k>=0 && seq1[i-k] == seq2[j-k]; ++k); if (usearch(used_frag[i-k+1],j-k+1)) return; f = (fragment *) ckalloc(sizeof(fragment)); f->i = len1 - i; f->j = len2 - j; f->k = k; f->ref = 0; rclocal(f,f->j,pos_diag(f),bri,bli_c,bli_d); if ((f->j + f->k - 1 <= len2-ll) || (f->i + f->k - 1 > len1-tt)) enter_endrow(f); if (f->j + f->k - 1 > len2-ll) enter_endcol(f); ++number_frags; } /* trcscan - scan seq1 and seq2 backward (columnwise) */ trcscan() { long ecode; int i, pos, tpos; char *q; int s5; q = seq2 + rr + 1; ecode = 0L; for (i = 1; i < W; ++i) ecode = (ecode << 2) + encoding[*--q]; pos = rr; while (*--q && pos >= ll) { reset_pos(ri); db_reset_pos(li_c); db_reset_pos(li_d); ecode = ((ecode & mask) << 2) + encoding[*q]; for (s5 = rcbucket[ecode]; s5 != -1; s5 = rcblink[s5]) trcfound(s5,pos); /* update active regions */ tpos = len2-pos; end_col[tpos] = tailor(end_col[tpos],len1-tt); cotupdate_active(end_col[tpos],len1-tt,tpos); rcupdate_active(ri,li_c,li_d,tpos); --pos; } } /* trcfound a matching 5-mer at (i,j) */ trcfound(int i, int j) { int k; fragment *f; char *ckalloc(); if (seq1[i+1] && seq2[j+1] && seq1[i+1] == seq2[j+1]) return; /* for (k = W; seq1[i-k] && seq2[j-k] && seq1[i-k] == seq2[j-k]; ++k); */ for (k = W; i-k>=0 && j-k>=0 && seq1[i-k] == seq2[j-k]; ++k); if (usearch(used_frag[i-k+1],j-k+1)) return; f = (fragment *) ckalloc(sizeof(fragment)); f->i = len1 - i; f->j = len2 - j; f->k = k; f->ref = 0; rclocal(f,f->i,rcpos_diag(f),ri,li_c,li_d); if ((f->i + f->k - 1<= len1-tt) || (f->j + f->k - 1> len2-ll)) enter_endcol(f); if (f->i + f->k - 1> len1-tt) enter_endrow(f); ++number_frags; } rclocal(fragment *f, int cur_col, int cur_diag, list cri, db_list cli_c, db_list cli_d) { int max_score, cs; fragment *pf, *cf, *df; int col, diag, ocur_diag; max_score = 0.0; f->bgf = f; /* compute the best left influence */ cf = db_rsearch_next(cli_c, cur_col); col = db_cur_key(cli_c); df = db_lsearch_next(cli_d, cur_diag); diag = db_cur_key(cli_d); pf = ((cur_col - col > cur_diag-diag)&&(diag != -1)) ? df : cf; if (pf) { cs = pf->score - weight(pf, f); if (cs > max_score) { max_score = cs; f->bgf = pf->bgf; incre(f->bgf); } } /* compute the best mismatch */ ocur_diag = pos_diag(f); if (pf = diag_prev[ocur_diag]) { cs = pf->score - weight(pf, f); if (cs > max_score) { max_score = cs; if (f != f->bgf) decre(f->bgf); f->bgf = pf->bgf; incre(f->bgf); } decre(pf); } diag_prev[ocur_diag] = f; incre(f); /* compute the best right influence */ pf = rsearch_next(cri,cur_diag); if (pf) { cs = pf->score - weight(pf, f); if (cs > max_score) { max_score = cs; if (f != f->bgf) decre(f->bgf); f->bgf = pf->bgf; incre(f->bgf); } } f->score = max_score + DIGIT * f->k; if (f->score > best_score) best_score = f->score; lactive_ext(f); } /* lactive_ext - check if we need to extend the area reached by active fragments (for all cases) */ lactive_ext(fragment *f) { int endi,endj,ai,aj,ai1,aj1,cur_score; if (!f) return; /* check if we need to extend the area reached by active fragments */ if ((f->bgf)->i <= len1-tt1 && (f->bgf)->j <= len2-ll1) { endi = f->i + f->k - 1; endj = f->j + f->k - 1; cur_score = f->score; ai = endi + cur_score / IR + 2; /* ai1 = endi + (cur_score-IO)/IE + 1; */ ai1 = endi + cur_score/IE + 2; if (ai1 > ai) ai = ai1; if (ai > len1) ai = len1; ai = len1-ai; if (ai < erow) erow = ai; aj = endj + cur_score / IR + 2; /* aj1 = endj + (cur_score-IO)/IE + 1; */ aj1 = endj + cur_score/IE + 2; if (aj1 > aj) aj = aj1; if (aj > len2) aj = len2; aj = len2-aj; if (aj < ecol) ecol = aj; } } /* ractive_ext - check if we need to extend the area reached by active fragments (for right influence list or mismatch) */ ractive_ext(fragment *f) { int endi,endj,ai,aj,cur_score; if (!f) return; /* check if we need to extend the area reached by active fragments */ if ((f->bgf)->i <= len1-tt1 && (f->bgf)->j <= len2-ll1) { endi = f->i + f->k - 1; endj = f->j + f->k - 1; cur_score = f->score; ai = endi + cur_score / IR + 2; if (ai > len1) ai = len1; ai = len1-ai; if (ai < erow) erow = ai; aj = endj + cur_score / IR + 2; if (aj > len2) aj = len2; aj = len2-aj; if (aj < ecol) ecol = aj; } } rcfree() { int i,j; frag_pt *p, *q; for (i=len1-tt+1; i<=len1; ++i) { if (intersect[i]) { idb_freeList(intersect[i]); intersect[i] = NULL; } if (end_row[i]) { for (p=end_row[i]; p; q=p, p=p->link, free(q)); end_row[i] = NULL; } } for (i=len2-ll+1; i<=len2; ++i) { if (rcintersect[i]) { idb_freeList(rcintersect[i]); rcintersect[i] = NULL; } if (end_col[i]) { for (p=end_col[i]; p; q=p, p=p->link, free(q)); end_col[i] = NULL; } } j = len2-ll+bb; for (i=len2-rr+tt; i<=j; ++i) if (diag_prev[i]) { decre(diag_prev[i]); diag_prev[i]=NULL; } freeList(ri); db_freeList(li_c); db_freeList(li_d); freeList(bri); db_freeList(bli_c); db_freeList(bli_d); } /* printS - print the given class */ printS(kbclass *sp) { int gs, len; #ifdef STATS printf("first: "); frag_print(sp->first); printf("last: "); frag_print(sp->last); printf("score = %.1f\n", ((float) sp->score)/DIGIT); printf("t = %d, b = %d, l = %d, r = %d\n\n", sp->t,sp->b,sp->l,sp->r); #endif #ifdef STATS old_i_end = 0; printf("a {\n s %.1f\n", ((float) sp->score)/DIGIT); len = sp->last->k - 1; printf(" b %d %d\n e %d %d\n", sp->first->i, sp->first->j, sp->last->i + len, sp->last->j + len); #endif printf("%.1f\n", ((float) sp->score)/DIGIT); frag_print(sp->first); frag_print(sp->last); if (sp->last) { if (sp->last != (sp->last)->bgf) { frag_print((sp->last)->bgf); gs = sp->score - DIGIT * (((sp->last)->bgf)->k + (sp->last)->k); global((sp->last)->bgf,sp->last,gs); } frag_print(sp->last); decre(sp->last); } #ifdef STATS printf(" l %d %d %d %d 0.0\n}\n", old_i_start, old_j_start, old_i_end, old_j_end); #endif } /* class_print - print K best local alignments */ class_print() { int i,j; kbclass *sp; j = lastS; for (i=0; ib,0,sp->r); else recompute(sp); } free(sp); } } /* findmaxS - return the class with the maximum score */ kbclass *findmaxS() { int i, ms, mi; kbclass *sp; mi = 0; ms = S[0]->score; for (i=1; iscore > ms) { ms = S[i]->score; mi = i; } sp = S[mi]; if (minS == lastS-1) minS = mi; if (lastS>1) { S[mi] = S[lastS-1]; S[lastS-1] = NULL; } --lastS; return(sp); } /* findS - find the equivalent class corresponding to f */ kbclass *findS(fragment *f) { int i, j; fragment *sf; for (i=curS, j=0; jfirst; if (sf->i == f->i && sf->j == f->j) { curS = i; return S[i]; } } return(NULL); } /* set_class - setup values for a new class */ set_class(int si, fragment *f) { kbclass *sp; int endi, endj, t1, t2; int endi1, endj1; sp = S[si]; sp->first = f->bgf; sp->last = f; sp->score = f->score; endi = f->i + f->k - 1; endj = f->j + f->k - 1; t1 = (f->score - mw) / IR; t2 = (f->score - mw - IO) / IE; if (t2 > t1) t1 = t2; endi1 = endi + t1; if (endi1 >= len1) endi1 = len1-1; if (endi1 < endi) endi1 = endi; endj1 = endj + t1; if (endj1 >= len2) endj1 = len2-1; if (endj1 < endj) endj1 = endj; sp->t = endi; sp->b = endi1; sp->l = endj; sp->r = endj1; curS = si; } /* replaceS - replace the min_score class */ replaceS(fragment *f) { int i; incre(f); decre(S[minS]->last); set_class(minS,f); for (i=0; iscore < S[minS]->score) minS = i; } /* insertS - insert a new class */ insertS(fragment *f) { char *ckalloc(); incre(f); S[lastS] = (kbclass *) ckalloc(sizeof(kbclass)); set_class(lastS,f); if (S[lastS]->score < S[minS]->score) minS = lastS; ++lastS; } /* sizeS - return the number of current equivalent classes */ int sizeS() { return lastS; } /* minscoreS - return the minimum score in all classes */ int minscoreS() { return S[minS]->score; } int enterS(fragment *f, int w, int l) { kbclass *s; int i; int endi, endj; int endi1, endj1, t1, t2; if (s = findS(f->bgf)) { if (s->score < f->score) { incre(f); decre(s->last); s->score = f->score; s->first = f->bgf; s->last = f; if (s == S[minS]) for (i=0; iscore < S[minS]->score) minS = i; } endi = f->i + f->k - 1; endj = f->j + f->k - 1; t1 = (f->score - mw) / IR; t2 = (f->score - mw - IO) / IE; if (t2 > t1) t1 = t2; endi1 = endi + t1; if (endi1 >= len1) endi1 = len1-1; if (endi1 < endi) endi1 = endi; endj1 = endj + t1; if (endj1 >= len2) endj1 = len2-1; if (endj1 < endj) endj1 = endj; if (s->t > endi) s->t = endi; if (s->b < endi1) s->b = endi1; if (s->l > endj) s->l = endj; if (s->r < endj1) s->r = endj1; } else { if (sizeS() == l) replaceS(f); else insertS(f); } if (sizeS() == l) return minscoreS(); else return w; } /* find the best local alignment */ local(fragment *f) { int max_score, cs; fragment *pf, *cf, *df; int cur_diag, col, diag; max_score = 0.0; cur_diag = pos_diag(f); f->bgf = f; /* compute the best left influence */ cf = db_rsearch_next(li_c, f->j); col = db_cur_key(li_c); df = db_lsearch_next(li_d, cur_diag); diag = db_cur_key(li_d); pf = ((f->j - col > cur_diag-diag)&&(diag != -1)) ? df : cf; if (pf) { cs = pf->score - weight(pf, f); if (cs > max_score || (cs == max_score && topo_win(f,pf))) { max_score = cs; f->bgf = pf->bgf; incre(f->bgf); } } /* compute the best mismatch */ if (pf = diag_prev[cur_diag]) { cs = pf->score - weight(pf, f); if (cs > max_score || (cs == max_score && topo_win(f,pf))) { max_score = cs; if (f != f->bgf) decre(f->bgf); f->bgf = pf->bgf; incre(f->bgf); } decre(pf); } diag_prev[cur_diag] = f; incre(f); /* compute the best right influence */ pf = rsearch_next(ri,cur_diag); if (pf) { cs = pf->score - weight(pf, f); if (cs > max_score || (cs == max_score && topo_win(f,pf))) { max_score = cs; if (f != f->bgf) decre(f->bgf); f->bgf = pf->bgf; incre(f->bgf); } } f->score = max_score + DIGIT * f->k; /* update current equivalent classes */ if (f->score > mw) mw = enterS(f,mw,K); } /* find the best alignment */ align(fragment *pof, fragment *cdiag_prev[],list cri,db_list cli_c, db_list cli_d, fragment *f) { int max_score, cs; fragment *pf, *cf, *df; int cur_diag, col, diag; cur_diag = pos_diag(f); /* compute the best mismatch */ if (pf = cdiag_prev[cur_diag]) { cs = pf->score - weight(pf, f); max_score = cs; f->bgf = pf->bgf; incre(f->bgf); decre(pf); } else { f->bgf = f; max_score = -1 * weight(pof,f); } cdiag_prev[cur_diag] = f; incre(f); /* compute the best right influence */ pf = rsearch_next(cri,cur_diag); if (pf) { cs = pf->score - weight(pf, f); if (cs > max_score) { max_score = cs; if (f != f->bgf) decre(f->bgf); f->bgf = pf->bgf; incre(f->bgf); } } /* compute the best left influence */ cf = db_rsearch_next(cli_c, f->j); col = db_cur_key(cli_c); df = db_lsearch_next(cli_d, cur_diag); diag = db_cur_key(cli_d); pf = ((f->j - col > cur_diag-diag)&&(diag != -1)) ? df : cf; if (pf) { cs = pf->score - weight(pf, f); if (cs > max_score) { max_score = cs; if (f != f->bgf) decre(f->bgf); f->bgf = pf->bgf; incre(f->bgf); } } /* update the current score */ f->score = max_score + DIGIT * f->k; } /* find the best alignment for cut_frag */ calign(fragment *pof, fragment *cdiag_prev[], list cri, db_list cli_c, db_list cli_d, fragment *f) { int max_score, cs; fragment *pf, *cf, *df; int cur_diag, col, diag; cur_diag = pos_diag(f); /* compute the best mismatch */ if (pf = cdiag_prev[cur_diag]) { cs = pf->score - weight(pf, f); max_score = cs; f->bgf = pf; incre(pf); } else { f->bgf = pof; incre(pof); max_score = -1 * weight(pof,f); } incre(f); /* compute the best right influence */ pf = rsearch_next(cri,cur_diag); if (pf) { cs = pf->score - weight(pf, f); if (cs > max_score) { max_score = cs; decre(f->bgf); f->bgf = pf; incre(pf); } } /* compute the best left influence */ cf = db_rsearch_next(cli_c, f->j); col = db_cur_key(cli_c); df = db_lsearch_next(cli_d, cur_diag); diag = db_cur_key(cli_d); pf = ((f->j - col > cur_diag-diag)&&(diag != -1)) ? df : cf; if (pf) { cs = pf->score - weight(pf, f); if (cs > max_score) { max_score = cs; decre(f->bgf); f->bgf = pf; incre(pf); } } /* update the current score */ f->score = max_score + DIGIT * f->k; } /* malign - midpoint alignment */ malign(fragment *pof, fragment *cdiag_prev[], list cri, db_list cli_c, db_list cli_d, fragment *f) { int max_score, cs; fragment *pf, *cf, *df; int cur_diag, col, diag; cur_diag = pos_diag(f); f->bgf = pof; /* compute the best mismatch */ if (pf = cdiag_prev[cur_diag]) { cs = pf->score - weight(pf, f); max_score = cs; f->bgf = pf; } else max_score = -1 * weight(pof,f); /* compute the best right influence */ pf = rsearch_next(cri,cur_diag); if (pf) { cs = pf->score - weight(pf, f); if (cs > max_score) { max_score = cs; f->bgf = pf; } } /* compute the best left influence */ cf = db_rsearch_next(cli_c, f->j); col = db_cur_key(cli_c); df = db_lsearch_next(cli_d, cur_diag); diag = db_cur_key(cli_d); pf = ((f->j - col > cur_diag-diag)&&(diag != -1)) ? df : cf; if (pf) { cs = pf->score - weight(pf, f); if (cs > max_score) { max_score = cs; f->bgf = pf; } } /* update the current score */ f->score = max_score + DIGIT * f->k; } /* malign1 - midpoint alignment */ malign1(fragment *pof, list cri, db_list cli_c, db_list cli_d, fragment *f) { int max_score, cs; fragment *pf, *cf, *df; int cur_diag, col, diag; cur_diag = pos_diag(f); f->bgf = pof; if (cur_diag == pos_diag(pof)) max_score = MININT; else max_score = -1 * weight(pof,f); /* compute the best right influence */ pf = rsearch_next(cri,cur_diag); if (pf) { cs = pf->score - weight(pf, f); if (cs > max_score) { max_score = cs; f->bgf = pf; } } /* compute the best left influence */ cf = db_rsearch_next(cli_c, f->j); col = db_cur_key(cli_c); df = db_lsearch_next(cli_d, cur_diag); diag = db_cur_key(cli_d); pf = ((f->j - col > cur_diag-diag)&&(diag != -1)) ? df : cf; if (pf) { cs = pf->score - weight(pf, f); if (cs > max_score) { max_score = cs; f->bgf = pf; } } /* update the current score */ f->score = max_score + DIGIT * f->k; } /* tailor - return proper rcopy */ frag_pt *tailor(frag_pt *rcopy, int col) { frag_pt *p, *t1, *t2; if (!rcopy) return NULL; p = rcopy; t1 = NULL; while (p && p->col <= col) { t1 = p; p = p->link; } while (p) { t2 = p->link; free(p); p = t2; } if (t1) { /* rcopy->col <= col */ t1->link = NULL; return rcopy; } else return NULL; } /* otr_survive - reduce the rcopy orthogonally */ frag_pt *otr_survive(frag_pt *rcopy, int col) { frag_pt *p, *q, *r, *t1, *t2; int cur_score; char *ckalloc(); if (!rcopy) return NULL; r = NULL; q = NULL; p = rcopy; while (p) { if ((p->fp)->score-IR*(col-p->col) > 0) { r = (frag_pt *) ckalloc(sizeof(frag_pt)); r->fp = p->fp; r->col = p->col; r->link = q; q = r; } p = p->link; } /* sweep right */ p = r; while (p) { cur_score = (p->fp)->score; for (q = p->link; q && ((cur_score - IE*(p->col-q->col)) > (q->fp)->score - IR*(p->col-q->col)) ; q = q->link); t1 = p->link; while (t1 != q) { t2 = t1->link; free(t1); t1 = t2; } p->link = q; p = q; } return r; } /* rotr_merge - r_merge orthogonally */ rotr_merge(frag_pt *rcopy) { frag_pt *p, *q; fragment *pf, *rf; for (p = rcopy; p; q=p, p=p->link, free(q)) { pf = p->fp; rf = rsearch_next(ri,rcpos_diag(pf)); /* determine if we need to keep pf in cri */ if (!rf || (rf && rotr_win(pf,rf))) rcr_add_point(ri,pf,p->col); } } /* rotr_win - true if pf is better in common right influence area */ int rotr_win(fragment *pf, fragment *rf) { int r1, r2, diag_diff; diag_diff = rcpos_diag(pf) - rcpos_diag(rf); r1 = pf->j + pf->k - 1; r2 = rf->j + rf->k - 1; if (rf->score-IR*(r1-r2)-IE*diag_diff < pf->score) return true; return false; } /* cotr_merge - r_merge orthogonally */ cotr_merge(frag_pt *rcopy) { frag_pt *p, *q; fragment *pf, *rf; for (p = rcopy; p; q=p, p=p->link, free(q)) { pf = p->fp; rf = rsearch_next(bri,pos_diag(pf)); /* determine if we need to keep pf in cri */ if (!rf || (rf && cotr_win(pf,rf))) r_add_point(bri,pf,p->col); } } /* cotr_win - true if pf is better in common right influence area */ int cotr_win(fragment *pf, fragment *rf) { int r1, r2, diag_diff; diag_diff = pos_diag(pf) - pos_diag(rf); r1 = pf->i + pf->k - 1; r2 = rf->i + rf->k - 1; if (rf->score-IR*(r1-r2)-IE*diag_diff < pf->score) return true; return false; } /* otl_survive - reduce the rcopy orthogonally (left influence) */ frag_pt *otl_survive(frag_pt *rcopy, int col) { frag_pt *p, *q, *r, *t; int cur_score,col_diff; fragment *pf; char *ckalloc(); if (!rcopy) return NULL; r = NULL; t = NULL; p = rcopy; while (p) { pf = p->fp; cur_score = pf->score; col_diff = col-p->col; if ((cur_score-(IO+IE*col_diff) > 0) || (cur_score-IR*col_diff> 0)) { r = (frag_pt *) ckalloc(sizeof(frag_pt)); r->fp = p->fp; r->col = p->col; r->link = t; t = r; for (q = p->link; q && ((cur_score - IE*(q->col-p->col)) >= (q->fp)->score) ; q = q->link); p = q; } else p=p->link; } return r; } /* otl_merge - l_merge for orthogonally recomputation */ otl_merge(frag_pt *r, int col, int row, db_list cli_c, db_list cli_d, idb_list cintersect[], int ccol_int[], char cdiag_flag[], int (*cpos_diag)(), int (*otl_win)()) { fragment *df, *cf, *rf, *pf, *nf; int cur_diag, lrow, diag, in_row, next_diag; frag_pt *p; if (!r) return; db_reset_pos(cli_c); db_reset_pos(cli_d); cur_diag = offset1 + (row-col); cf = db_rsearch_next(cli_c, row); lrow = db_cur_key(cli_c); df = db_rsearch_next(cli_d,cur_diag); diag = db_cur_key(cli_d); rf = ((row-lrow > cur_diag-diag) && (diag != -1)) ? df : cf; if ((rf == NULL) && (diag == -1)) { /* cli_c && cli_d are empty */ db_insert_next(cli_c,row,r->fp); incre(r->fp); for (p = r; p; free(p),p=r) { r = p->link; if (r) df = r->fp; else df = NULL; db_insert_next(cli_d,(*cpos_diag)(p->fp),df); incre(df); } return; } if ((*otl_win)(r->fp,rf)) { /* one more column boundary */ db_insert_next(cli_c,row,r->fp); incre(r->fp); if (rf == df) { /* cintersect list is affected */ in_row = row - diag + offset1; if (in_row < cur_end1) { if (!cintersect[in_row]) cintersect[in_row]=idb_newList(); db_insert(cintersect[in_row],row,NULL); ccol_int[row] = in_row; cdiag_flag[diag] = 1; } } } incre(rf); while (r) { pf = r->fp; cur_diag = (*cpos_diag)(pf); p = r->link; if (p) df = p->fp; else df = NULL; next_diag = db_next_key(cli_d); nf = db_next_val(cli_d); if (next_diag == -1) { db_insert_next(cli_d,cur_diag,df); incre(df); free(r); r = p; } else if (next_diag > cur_diag) { if ((*otl_win)(pf,rf)) { cf = ((*otl_win)(df,rf))? df:rf; db_insert_next(cli_d,cur_diag,cf); incre(cf); } free(r); r = p; } else if (next_diag == cur_diag) { decre(rf); rf = db_next(cli_d); incre(rf); if ((*otl_win)(df,nf)) { db_update_node(cli_d,df); decre(nf); incre(df); } free(r); r = p; } else if ((*otl_win)(pf,rf)) { /* next_diag < cur_diag and pf is better */ decre(rf); rf = nf; db_delete_next(cli_d); } else { /* next_diag < cur_diag and rf is better */ decre(rf); rf = db_next(cli_d); incre(rf); if ((*otl_win)(pf,nf)) { db_update_node(cli_d,pf); decre(nf); incre(pf); } } } decre(rf); } /* rotl_win - true if pf is better in common left influence area */ int rotl_win(fragment *pf, fragment *rf) { int c1, c2, diag_diff; if (!rf) return true; if (!pf) return false; diag_diff = rcpos_diag(rf) - rcpos_diag(pf); c1 = pf->i + pf->k - 1; c2 = rf->i + rf->k - 1; if (rf->score-IR*(c1-c2)-IE*diag_diff < pf->score) return true; return false; } /* cotl_win - true if pf is better in common left influence area */ int cotl_win(fragment *pf, fragment *rf) { int c1, c2, diag_diff; if (!rf) return true; if (!pf) return false; diag_diff = pos_diag(rf) - pos_diag(pf); c1 = pf->j + pf->k - 1; c2 = rf->j + rf->k - 1; if (rf->score-IR*(c1-c2)-IE*diag_diff < pf->score) return true; return false; } /* r_survive - reduce the rcopy */ int r_survive(frag_pt *rcopy) { frag_pt *p, *q, *r, *t; int cur_score; p = rcopy; while (p) { cur_score = (p->fp)->score; /* sweep right */ for (q = p->link; q && ((cur_score - IE * (q->col - p->col)) > (q->fp)->score || ((cur_score - IE * (q->col - p->col)) == (q->fp)->score && topo_win(q->fp,p->fp))); q = q->link); r = p->link; while (r != q) { t = r->link; free(r); r = t; } p->link = q; p = q; } } /* r_merge - update the active right influence points */ r_merge(list cri, frag_pt *rcopy, int row) { frag_pt *p, *q; int row_diff, diag_diff; fragment *pf, *rf; for (p = rcopy; p; q = p, p = p->link, free(q)) { pf = p->fp; rf = rsearch_next(cri,pos_diag(pf)); /* determine if we need to keep pf in cri */ if (rf) { row_diff = row - (rf->i + rf->k - 1); diag_diff = diag(pf) - diag(rf); if (pf->score > rf->score-IE*diag_diff-IR * row_diff || (pf->score == rf->score-IE*diag_diff-IR*row_diff && topo_win(rf,pf))) r_add_point(cri,pf,row); } else r_add_point(cri,pf,row); } } /* r_add_point - add point to right influence list cri */ r_add_point(list cri, fragment *f, int row) { fragment *nf; int row_diff,diag_diff; nf = next_val(cri); if (pos_diag(f) == next_key(cri)) { next(cri); if ((f->score > (nf->score) -IR*(row-(nf->i +nf->k -1))) || ((f->score == (nf->score) -IR*(row-(nf->i +nf->k -1))) && topo_win(nf,f))) { update_node(cri,f); decre(nf); incre(f); } } else { insert_next(cri,pos_diag(f),f); incre(f); } nf = next_val(cri); while (nf) { row_diff = row - (nf->i + nf->k -1); diag_diff = diag(nf) - diag(f); if ((nf->score - IR * row_diff < f->score - IE * diag_diff) || ((nf->score - IR * row_diff == f->score - IE * diag_diff) && topo_win(nf,f))) { delete_next(cri); decre(nf); nf = next_val(cri); } else nf = NULL; } } /* resolve - resolve the conflicts of the intersection lists */ resolve(db_list cli_c, db_list cli_d, idb_list cintersect[], char cdiag_flag[], int ccol_int[], int row,int (*cpt_lscore)()) { int col, diag; fragment *cp, *dp, *cf, *df, *nf; int prev_col, prev_diag, next_col, in_row; int d_score, c_score; idb_list inter; if (!cintersect[row]) return; inter = cintersect[row]; idb_reset_pos(inter); for (col = idb_key_next(inter); col != -1; col = idb_key_next(inter)){ diag = col - row + offset; cp = db_rsearch_next(cli_c,col); cf = db_next_val(cli_c); c_score = (*cpt_lscore)(cf,col,diag); dp = db_rsearch_next(cli_d,diag); prev_col = db_cur_key(cli_c); prev_diag = db_cur_key(cli_d); df = ((col - prev_col > diag - prev_diag) && (prev_diag != -1))? dp : cp; d_score = (*cpt_lscore)(df,col,diag); if (c_score > d_score || (c_score == d_score && topo_win(df,cf))) { /* extend the column boundary */ nf = db_next_val(cli_d); db_delete_next(cli_d); decre(nf); cdiag_flag[diag] = 0; in_row = col-prev_diag+offset; ccol_int[col] = 0; if (df == dp && in_row<=cur_end) { /* intersection lists are affected */ INTER_ADD(col); cdiag_flag[prev_diag] = 1; } } else { /* extend the diagonal boundary */ db_delete_next(cli_c); next_col = db_next_key(cli_c); in_row = next_col-diag+offset; if ((next_col != -1) && in_row<=cur_end &&(ccol_int[next_col] == 0)) { INTER_ADD(next_col); cdiag_flag[diag] = 1; } else cdiag_flag[diag] = 0; nf = db_next_val(cli_d); db_next(cli_d); db_update_node(cli_d,cf); decre(nf); ccol_int[col] = 0; } } } /* l_merge - merge lcopy into cli_c & cli_d */ l_merge(db_list cli_c, db_list cli_d, idb_list cintersect[], char cdiag_flag[], int ccol_int[], frag_pt *lcopy, int row, int (*cpt_lscore)()) { int col, diag; int c_col, d_diag, in_row; int next_col, p_score; int prev_diag; fragment *f, *cf, *df, *dp, *nf, *lp; frag_pt *e; for (e = lcopy; e; e = e->link) { col = e->col; f = e->fp; diag = col - row + offset; cf = db_rsearch_next(cli_c,col); c_col = db_cur_key(cli_c); df = db_lsearch_next(cli_d,diag); d_diag = db_cur_key(cli_d); next_col = db_next_key(cli_c); lp = ((col - c_col >= diag - d_diag) && (d_diag != -1)) ? df : cf; if (lp != df) { /* it's owned by the column bound */ if (col != next_col) { /* next_col > col */ p_score = (*cpt_lscore)(lp,col,diag); if (f->score > p_score || (f->score == p_score && topo_win(lp,f))) { in_row = next_col-diag+offset; if ((next_col != -1) && in_row<=cur_end && (ccol_int[next_col] == 0)){ INTER_ADD(next_col); cdiag_flag[diag] = 1; } db_insert_next(cli_d,diag,lp); incre(lp); db_insert_next(cli_c,col,f); incre(f); } } else { /* next_col == col */ nf = db_next(cli_c); p_score = (*cpt_lscore)(nf,col,diag); if (f->score > p_score || (f->score == p_score && topo_win(nf,f))) { db_update_node(cli_c,f); db_insert_next(cli_d,diag,nf); incre(f); /* nf->ref remains the same */ next_col = db_next_key(cli_c); in_row = next_col-diag+offset; if ((next_col != -1) && in_row<=cur_end && (ccol_int[next_col] == 0)) { INTER_ADD(next_col); cdiag_flag[diag] = 1; } } } } else { /* it is owned by the diagonal bound */ if ((col != next_col) && (diag != d_diag)) { p_score = (*cpt_lscore)(df,col,diag); if ((df == NULL) || (f->score > p_score) || (f->score == p_score && topo_win(df,f))){ if (d_diag == -1) { /* it isn't in any active region */ in_row = next_col-diag+offset; if (next_col != -1 && in_row<=cur_end) { INTER_ADD(next_col); cdiag_flag[diag] = 1; } } else if (cdiag_flag[d_diag] && ccol_int[next_col]) { db_delete(cintersect[ccol_int[next_col]],next_col); in_row = col-d_diag+offset; INTER_ADD(col); in_row = next_col-diag+offset; INTER_ADD(next_col); cdiag_flag[diag] = 1; } else { in_row = col-d_diag+offset; if (in_row<=cur_end) { INTER_ADD(col); cdiag_flag[d_diag] = 1; } in_row=next_col-diag+offset; if (next_col != -1 && !ccol_int[next_col] && in_row<=cur_end) { INTER_ADD(next_col); cdiag_flag[diag]=1; } } db_insert_next(cli_c,col,f); incre(f); db_insert_next(cli_d,diag,df); incre(df); } } else if ((col != next_col) && (diag == d_diag)) { dp = db_prev_val(cli_d); prev_diag = db_prev_key(cli_d); df = ((col - c_col >= diag - prev_diag) && (prev_diag != -1))? dp : cf; p_score = (*cpt_lscore)(df,col,diag); if (f->score > p_score || (f->score == p_score && topo_win(df,f))) { db_insert_next(cli_c,col,f); incre(f); in_row = col-prev_diag+offset; if (df == dp && in_row<=cur_end) { INTER_ADD(col); cdiag_flag[prev_diag] = 1; } } } else { /* col == next_col */ nf = db_next(cli_c); p_score = (*cpt_lscore)(nf,col,diag); if (f->score > p_score || (f->score == p_score && topo_win(nf,f))) { db_update_node(cli_c,f); db_insert_next(cli_d,diag,nf); incre(f); next_col = db_next_key(cli_c); in_row = next_col-diag+offset; if ((next_col != -1) && in_row<=cur_end && (ccol_int[next_col] == 0)) { INTER_ADD(next_col); cdiag_flag[diag] = 1; } } } } } } /* rcr_merge - update the active right influence points */ rcr_merge(list cri, frag_pt *rcopy, int row) { frag_pt *p, *q; int row_diff, diag_diff; fragment *pf, *rf; for (p = rcopy; p; q = p, p = p->link, free(q)) { pf = p->fp; rf = rsearch_next(cri,rcpos_diag(pf)); /* determine if we need to keep pf in cri */ if (rf) { row_diff = row - (rf->j + rf->k - 1); diag_diff = rcdiag(pf) - rcdiag(rf); if (pf->score > rf->score-IE*diag_diff-IR * row_diff || (pf->score == rf->score-IE*diag_diff-IR*row_diff && topo_win(rf,pf))) rcr_add_point(cri,pf,row); } else rcr_add_point(cri,pf,row); } } /* rcr_add_point - add point to right influence list cri */ rcr_add_point(list cri, fragment *f, int row) { fragment *nf; int row_diff,diag_diff; nf = next_val(cri); if (rcpos_diag(f) == next_key(cri)) { next(cri); if ((f->score > (nf->score) -IR*(row-(nf->j +nf->k -1))) || ((f->score == (nf->score) -IR*(row-(nf->j +nf->k -1))) && topo_win(nf,f))) { update_node(cri,f); decre(nf); incre(f); } } else { insert_next(cri,rcpos_diag(f),f); incre(f); } nf = next_val(cri); while (nf) { row_diff = row - (nf->j + nf->k -1); diag_diff = rcdiag(nf) - rcdiag(f); if ((nf->score - IR * row_diff < f->score - IE * diag_diff) || ((nf->score - IR * row_diff == f->score - IE * diag_diff) && topo_win(nf,f))) { delete_next(cri); decre(nf); nf = next_val(cri); } else nf = NULL; } } /* topo_win - return true if f->bgf is greater than pf->bgf in the topological order */ int topo_win(fragment *pf, fragment *f) { if (!pf) return(true); if (!f) return(false); if (diag(f->bgf) > diag(pf->bgf)) return(true); if (diag(f->bgf) == diag(pf->bgf) && (f->bgf)->i >= (pf->bgf)->i) return(true); return(false); } /* pt_lscore - compute the fading score of the given fragment (left inf)*/ int pt_lscore(fragment *pf, int col, int diag) { int col_diff, diag_diff; if (!pf) return(MININT); else { col_diff = col - (pf->j + pf->k -1); diag_diff = pos_diag(pf) - diag; return(pf->score -IE*diag_diff-IR*col_diff); } } /* rcpt_lscore - compute the fading score of the given fragment (left inf)*/ int rcpt_lscore(fragment *pf, int col, int diag) { int col_diff, diag_diff; if (!pf) return(MININT); else { col_diff = col - (pf->i + pf->k -1); diag_diff = rcpos_diag(pf) - diag; return(pf->score -IE*diag_diff-IR*col_diff); } } /* weight - compute the cost of the connection of fragments pf and f */ int weight(fragment *pf, fragment *f) { int msi, msj; msi = IR*(f->i - pf->i - pf->k); msj = IR*(f->j - pf->j - pf->k); if (diag(pf) == diag(f)) return(msi); else if (diag(pf) < diag(f)) return(gap_cost(diag(f)-diag(pf)) + msi); else return(gap_cost(diag(pf)-diag(f)) + msj); } /* gap_cost - compute the cost of a gap, given length l */ int gap_cost(int l) { return(IO+IE*l); } /* diag - give the diagonal number for fragment */ int diag(fragment *f) { return(f->j - f->i); } /* rcdiag - give the diagonal number for reversed fragment */ int rcdiag(fragment *f) { return(f->i - f->j); } /* pos_diag - give the positive diagonal number for fragment */ int pos_diag(fragment *f) { return(f->j - f->i + len1); } /* rcpos_diag - give the positive diagonal number for reversed fragment */ int rcpos_diag(fragment *f) { return(f->i - f->j + len2); } /* pf_set - set the coordinate of the given pseudo fragment */ pf_set(fragment *f, int i, int j) { f->i = i; f->j = j; } /* power - raise base to n-th power; n >= 0 */ int power(int base, int n) { int i, p; p = 1; for (i=1; i<=n; ++i) p = p*base; return p; } /* fatal - print message and die */ fatal(char *msg) { fprintf(stderr, "%s\n", msg); exit(1); } /* fatalf - format message, print it, and die */ fatalf(char *msg, char *val) { fprintf(stderr, msg, val); putc('\n', stderr); exit(1); } /* ckopen - open file; check for success */ FILE *ckopen(char *name, char *mode) { FILE *fopen(), *fp; if ((fp = fopen(name, mode)) == NULL) fatalf("Cannot open %s.", name); return(fp); } /* ckalloc - allocate space; check for success */ char *ckalloc(int amount) { char *malloc(), *p; if ((p = malloc( (unsigned) amount)) == NULL) fatal("Ran out of memory."); return(p); } /* decre - decrease the reference count */ decre(fragment *f) { if (f && (--(f->ref) == 0)) { if (f != f->bgf) decre(f->bgf); free(f); ++deleted_frags; } } /* incre - increase the reference count */ incre(fragment *f) { if (f) ++(f->ref); } /* frag_print - print the given fragment */ frag_print(fragment *f) { if (f) { printf("%6d %6d %6d \n",f->i,f->j,f->k); if (!used_frag[f->i]) used_frag[f->i]=unewList(); uinsert(used_frag[f->i],f->j); } ++output_frags; } Xfrag_print(fragment *f) { int len, decr_i, decr_j, decr; if (f) { len = f->k - 1; if (old_i_end) { if (old_j_end - old_i_end == f->j - f->i) { old_i_end = f->i + len; old_j_end = f->j + len; } else { decr_i = f->i - old_i_end; decr_j = f->j - old_j_end; decr = (decr_i < decr_j) ? decr_i : decr_j; --decr; printf(" l %d %d %d %d 0.0\n", old_i_start, old_j_start, old_i_end, old_j_end); old_i_start = f->i - decr; old_i_end = f->i + len; old_j_start = f->j - decr; old_j_end = f->j + len; } } else { old_i_start = f->i; old_i_end = f->i + len; old_j_start = f->j; old_j_end = f->j + len; } if (!used_frag[f->i]) used_frag[f->i]=unewList(); uinsert(used_frag[f->i],f->j); } ++output_frags; } /* strsame - tell whether two strings are identical */ int strsame(char *s, char *t) { return(strcmp(s, t) == 0); } /* strsave - save string s somewhere; return address */ char *strsave(char *s) { char *ckalloc(), *p, *strcpy(); p = ckalloc(strlen(s)+1); /* +1 to hold '\0' */ return(strcpy(p, s)); }