#define fname "alignment.aln" typedef struct ce { struct clump_node *A, *B; struct cedgelist *Asrc; struct cedgelist *Bsrc; char dirA, dirB; } cedge; typedef struct cedgelist { struct cedgelist *next; cedge *edge; } c_elist; typedef struct clump_node { int size, visit; FRAGMENT *fr[100]; struct clump_node *last; int i; c_elist *inedges, *outedges; } CLUMP_NODE; typedef struct bl { int a, b; int len; struct bl *next; } block; typedef struct al { char *col; struct al *succ, *prec; } ALIGN; #define min(x,y) (x < y) ? x : y; FRAGMENT *fl, *cl; static int DIR; void lav_read_init(); char *get_seqname(); void build_clump_edges(int which, FRAGMENT *g, CLUMP *cvec, int j, CLUMP_NODE *cl) { register ELIST *m, **t; register EDGE *e; int i, st; cedge *ce; c_elist *clist1, *clist2; if (which) if (g->indegree >= 0) { t = &(g->inedges); which = IN; } else { t = &(g->outedges); which = OUT; } else if (g->outdegree >= 0) { t = &(g->outedges); which = OUT; } else { t = &(g->inedges); which = IN; } m = *t; if (m != NULL) { do { e = m->edge; if (e->A == g) { i = ((CLUMP *) e->B->attrib) - cvec; st = e->dirB; } else { i = ((CLUMP *) e->A->attrib) - cvec; st = e->dirA; } if (i < j) {m = m->esucc; continue;} ce = (cedge *) malloc(sizeof(cedge)); clist1 = (c_elist *) malloc(sizeof(c_elist)); clist2 = (c_elist *) malloc(sizeof(c_elist)); clist1->edge = clist2->edge = ce; ce->Asrc = clist1; ce->Bsrc = clist2; ce->A = &cl[i]; ce->B = &cl[j]; ce->dirA = st; ce->dirB = which; if (which == IN) { clist2->next = cl[j].inedges; cl[j].inedges = clist2; } else { clist1->next = cl[j].outedges; cl[j].outedges = clist1; } if (st == OUT) { clist1->next = cl[i].outedges; cl[i].outedges = clist1; } else { clist2->next = cl[i].inedges; cl[i].inedges = clist2; } m = m->esucc; } while (m != *t); } } build_clump_graph(int nclumps, CLUMP *cvec, CLUMP_NODE **clv) { register FRAGMENT *g; register int dir, i, j; register EDGE *e; CLUMP_NODE *cn; register FRAGMENT **f; cn = (CLUMP_NODE *) malloc(sizeof(CLUMP_NODE)*nclumps); for (i = 0; i < nclumps; i++) { g = cvec[i].end1; f = cn[i].fr; j = 0; cn[i].inedges = cn[i].outedges = NULL; cn[i].visit = 0; f[j++] = g; if (g->indegree >= 0) dir = IN; else dir = OUT; while (1) { if (dir == OUT) { if (g->indegree >= 0) break; e = g->inedges->edge; } else { if (g->outdegree >= 0) break; e = g->outedges->edge; } if (e->A == g) { dir = e->dirB; g = e->B; } else { dir = e->dirA; g = e->A; } f[j++] = g; } cn[i].size = j; } for (i = 0; i < nclumps; i++) { build_clump_edges(0, cvec[i].end1, cvec, i, cn); build_clump_edges(1, cvec[i].end2, cvec, i, cn); } *clv = cn; } char *seq, *get_seq(); static char complement[]="TVGH CD M KN YSA BWXR"; char com(char a) { return complement[a-'A']; } char **stack; int stack_top; void reverse_seq(char *seq) { int i = strlen(seq); int j, k; char c; for (j = 0, k = i-1; j < k; j++, k--) { c = com(seq[j]); seq[j] = com(seq[k]); seq[k] = c; } if (j == k) seq[j] = com(seq[j]); stack[stack_top++] = seq; } void roll_back() { int i, j = stack_top; stack_top = 0; for (i = 0; i < j; i++) reverse_seq(stack[i]); } int is_rev(char *seq) { int i; for (i = 0; i < stack_top; i++) if (seq == stack[i]) return 1; return 0; } EDGE *edge_of(FRAGMENT *f1, FRAGMENT *f2) { ELIST *el; EDGE *e; el = f2->inedges; if (el) do { e = el->edge; if (e->A == f1 || e->B == f1) return e; el = el->esucc; } while (el != f2->inedges); el = f1->outedges; if (el) do { e = el->edge; if (e->A == f2 || e->B == f2) return e; el = el->esucc; } while (el != f1->outedges); el = f1->inedges; if (el) do { e = el->edge; if (e->A == f2 || e->B == f2) return e; el = el->esucc; } while (el != f1->inedges); el = f2->outedges; if (el) do { e = el->edge; if (e->A == f1 || e->B == f1) return e; el = el->esucc; } while (el != f2->outedges); return NULL; } void get_endpoint(EDGE *e, FRAGMENT *f1, FRAGMENT *f2, char *seq, char *s2, int *b1, int *b2, int *e1, int *e2, int *r) /* assuming f1 before f2*/ { int i1, i2, i, j; i1 = strlen(seq); i2 = strlen(s2); /* printf("e->A=%c e->B=%c e->coord1=%d e->coord2=%d e->orient=%c e->otype=%c\n", ((e->A==f1)? '1':'2'), ((e->B==f1)? '1':'2'), e->coord1, e->coord2, ((e->orient==FA_SAME)? 'S':'R'), ((e->otype==FA_CONTAIN)?'C':'D')); */ if (e->otype == FA_CONTAIN) { if (is_rev(seq)) { if (e->orient == FA_SAME) *r = 1; else *r = 0; *b1 = i1-e->coord2; *e1 = i1 - e->coord1; } else { if (e->orient != FA_SAME) *r = 1; else *r = 0; *b1 = e->coord1; *e1 = e->coord2; } *b2 = 1; *e2 = strlen(s2); return; } i = e->coord1; j = e->coord2; *e1 = i1; *b2 = 1; if (is_rev(seq)) { /*printf("1\n");*/ if (e->orient == FA_SAME) { *r = 1; *b1 = i1-abs(j); *e2 = i2-abs(i); } else { *r = 0; if (e->A == f1) { *b1 = i1-j; *e2 = i2+i;} else { *b1 = -i; *e2 = j; } } } else { /*printf("2\n");*/ if (e->orient == FA_SAME) { *r = 0; *b1 = abs(i); *e2 = abs(j); } else { *r = 1; if (e->A == f1) { *b1 = i; *e2 = -j;} else { *b1 = i1+j;*e2 = i2-i;} } } } void extending(block *b1, block *b2, int j1, int j2) { int x; char *ckalloc(); block *b; x = b1->b-1; b1->a -= x; b1->len += x; b1->b = 1; if (j1 - b2->a < j2 - b2->b) { b2->len = j1-b2->a+1; } else { b2->len = j2-b2->b+1; } b = (block *) ckalloc(sizeof(block)); b->a = j1+1; b->b = j2+1; b->len = 0; b2->next = b; b->next = NULL; } block *guess(int b1, int b2, int i1, int i2) { block *b, *c; b = (block *) malloc(sizeof(block)); b->b = 1; b->a = b1-b2+1; b->len = min(i1 - b->a+1, i2); c = b->next = (block *) malloc(sizeof(block)); c->a = i1+1, c->b = i2 +1; c->len = 0; c->next = NULL; return b; } FRAGMENT **flist; block *sim(FRAGMENT *fr1, FRAGMENT *fr2, int *n) { static char a[] = "../code.d/band1 .versa.hsp .versa.seq > sim.out"; char *s; FILE *fp; block *lp, *bp, *bbp; EDGE *e; int f1, f2, b1, b2, e1, e2, r1, r2, i1, i, i2; if ((e = edge_of(fr1, fr2)) == NULL) { for (i = 0; 1; i++) if ((e=edge_of(flist[i], fr2))) break; *n = i; fr1 = flist[i]; } f1 = fr1->extid; f2 = fr2->extid; s = get_seq(f1); seq = get_seq(f2); /*printf("f1=%d f2=%d\n", f1, f2);*/ i1 = strlen(s); i2 = strlen(seq); get_endpoint(e, fr1, fr2, s, seq, &b1, &b2, &e1, &e2, &r1); if (r1) reverse_seq(seq); fp = fopen(".versa.seq", "w"); /*printf("%s\n", s);*/ fprintf(fp, "> %d\n", f1); fprintf(fp, "%s\n", s); fprintf(fp, "> %d\n%s\n", f2, seq); fclose(fp); fp = fopen(".versa.hsp", "w"); fprintf(fp, "%d %d 500 p(%d/%d/%d/%d)\n", f1, f2, b1, b2, e1-b1+1, e2-b2+1); fclose(fp); system(a); fp = fopen("sim.out", "r"); lav_read_init(fp); if (!get_block(&bp)) { fclose(fp); return guess(b1, b2, i1, i2);} lp = bp; while (get_block(&bbp)) { lp->next = bbp; lp = bbp; } fclose(fp); extending(bp, lp, i1, i2); return bp; } ALIGN **ap; insert(ALIGN **head, ALIGN *p, ALIGN *q, int nn, int n, char s) { ALIGN *a; int i; char *ckalloc(); a = (ALIGN *) ckalloc(sizeof(ALIGN)); a->col = (char *) ckalloc(n); for (i = 0; i < nn; i++) a->col[i] = '-'; for (i = nn+1; i < n; i++) a->col[i] = ' '; a->col[nn] = s; a->succ = q; a->prec = p; p->succ = a; q->prec = a; /*if (q == *head) *head = a;*/ } combine(ALIGN **a, block *b, int n1, int nn, int n) { ALIGN *aa; block *bp, *blp; int i,j; aa = ap[n1]; /*printf("combine=%d %d %d\n", n1, nn, n);*/ for (blp = NULL, bp = b; bp; blp = bp, bp = bp->next) { if (blp) { if (blp->a + blp->len == bp->a) { for (i = blp->b+blp->len; i < bp->b; i++) if (aa != *a && aa->col[n1] == '-') { aa->col[nn] = seq[i-1]; aa = aa->succ; } else insert(a, aa->prec, aa, nn, n, seq[i-1]); } else { for (i = blp->a+blp->len; i < bp->a; ) { aa->col[nn] = '-'; if (aa->col[n1] != '-') i++; aa = aa->succ; } } free(blp); } else { for (i = 1; i < bp->a; ) { if (aa->col[n1] != '-') i++; aa = aa->succ; } ap[nn] = aa; } for (i = bp->a, j = bp->b; i < bp->a + bp->len;) { if (aa->col[n1] == '-') aa->col[nn] = '-'; else { aa->col[nn] = seq[j-1]; i++; j++; } aa = aa->succ; } } free(blp); } int order_fr(CLUMP_NODE *cp) { EDGE *e; FRAGMENT *f1, *f2; char *seq; int i, dir; if (cp->size == 1) { if (DIR == IN) { seq = get_seq(cp->fr[0]->extid); reverse_seq(seq); } return 0; } if (DIR == OUT) { if (cp->fr[0]->inedges == NULL) return 0; if (cp->fr[cp->size-1]->inedges == NULL) return 1; } dir = IN; for (i = 1; i < cp->size; i++) { e = edge_of(cp->fr[i-1], cp->fr[i]); if (e->orient != FA_SAME) dir *= -1; } f1 = cp->fr[i-1]; if (dir == IN) { if (e->orient == FA_SAME) { if (e->A == cp->fr[i-1]) { if (DIR == IN) { seq = get_seq(cp->fr[0]->extid); reverse_seq(seq); return 0; } else return 1; } else { if (DIR == IN) { seq = get_seq(f1->extid); reverse_seq(seq); return 1; } else return 0; } } else if (e->coord1 < 0) if (DIR == OUT) return 0; else {seq = get_seq(f1->extid); reverse_seq(seq); return 1;} else if (DIR == OUT) return 1; else {seq = get_seq(cp->fr[0]->extid); reverse_seq(seq); return 0;} } if (e->orient == FA_SAME) { if (DIR == OUT && e->A == f1) return 1; if (DIR == IN && e->A != f1) return 1; } else { if (DIR == OUT && e->coord2 < 0) return 1; if (DIR == IN && e->coord1 < 0) return 1; } if (cp->last == NULL ) i = 1; else if ((e=edge_of(f1, cp->last->fr[0]))==NULL) { e = edge_of((f2=cp->fr[0]), cp->last->fr[0]); if (!e) i = 1; else { if (e->orient == FA_SAME) { if (e->A == f2) i = 1; else i = 0; } else { if (e->coord1 < 0) i = 0; else i = 1; } } } else { if (e->orient == FA_SAME) { if (e->A == f1) i = 0; else i = 1; } else { if (e->coord1 < 0) i = 1; else i = 0; } } if (DIR == OUT) { if (i == 1) { seq = get_seq(f1->extid); reverse_seq(seq); return 1; } seq = get_seq(cp->fr[0]->extid); reverse_seq(seq); return 0; } if (i == 0) return 1; return 0; } int get_pair(CLUMP_NODE *clump, CLUMP_NODE *cp, FRAGMENT **f1, FRAGMENT **f2, int *n1, int nn,int n_fr, ALIGN **a) { static int dir , i, in_or_de; static CLUMP_NODE *cold = NULL; static FRAGMENT *last_f2; char *s, *t; FRAGMENT *f; EDGE *e; ELIST *el; int j; block *b; ALIGN *aa; if (nn == 1 && (!cold)) { in_or_de = order_fr(cp); if (in_or_de) i = cp->size-1; else i= 0; s = get_seq(cp->fr[i]->extid); *a = (ALIGN *) malloc(sizeof(ALIGN)); aa = *a; aa->col = (char *) malloc(n_fr); for (j = 1; j < n_fr; j++) aa->col[j] = ' '; aa->col[0] = *s; for (t = s+1; *t; t++) { aa->succ = (ALIGN *) malloc(sizeof(ALIGN)); aa->succ->prec = aa; aa = aa->succ; aa->col = (char *) malloc(n_fr); for (j = 1; j < n_fr; j++) aa->col[j] = ' '; aa->col[0] = *t; } aa->succ = *a; ap[0] = *a; (*a)->prec = aa; cold = cp; dir = OUT; flist[0] = cp->fr[i]; last_f2 = cp->fr[i]; } if (cold == cp) { *f1 = cp->fr[i]; if (in_or_de) { --i; if (i < 0) {if (!cp->last) cold = NULL;return 0;} *f2 = cp->fr[i]; el = (*f2)->inedges; j = 1; if (el) { e = el->edge; j = 0; if (e->A == *f1) dir = -e->dirA; else if (e->B == *f1) dir = -e->dirB; else j = 1; } if (j) { el = (*f2)->outedges; if (!el) fatal("Two continuous fragmenets not adjacent"); e = el->edge; if (e->A == *f1) dir = -e->dirA; else dir = -e->dirB; } } else { ++i; if (i >= cp->size) {if (!cp->last) cold = NULL;return 0;} *f2 = cp->fr[i]; if (dir == IN) { if ((*f1)->inedges) e = (*f1)->inedges->edge; else e = (*f1)->outedges->edge; } else { if ((*f1)->outedges) e = (*f1)->outedges->edge; else e = (*f1)->inedges->edge; } if (e->A == *f1) dir = -e->dirB; else dir = -e->dirA; } last_f2 = *f2; *n1 = nn-1; flist[nn] = last_f2; return 1; } cold = cp; if (edge_of(last_f2, cp->fr[0])) { in_or_de = 0; i = 0; } else { in_or_de = 1; i = cp->size-1; } /* if (dir == OUT) { if (cp->fr[0]->indegree >= 0) { in_or_de = 0; i = 0; } else { in_or_de = 1; i = cp->size-1; } } else { if (cp->fr[0]->outdegree >= 0) { in_or_de = 0; i = 0; } else { in_or_de = 1; i = cp->size-1; } } */ *f2 = cp->fr[i]; *f1 = last_f2; *n1 = nn-1; flist[nn] = last_f2 = *f2; return 1; } int member(FRAGMENT *f, CLUMP_NODE *clump) { CLUMP_NODE *cp; int i; for (cp = clump; cp; cp = cp->last) { for (i = 0; i < cp->size; i++) { if (f == cp->fr[i]) return 1; } } return 0; } FRAGMENT *contained(FRAGMENT *f, CLUMP_NODE *clump) { ELIST *l; EDGE *e; l = f->inedges; if (!l) return NULL; do { e = l->edge; if (e->otype == FA_CONTAIN) { if (member(e->A, clump)) return e->A; } l = l->esucc; } while (l != f->inedges); return NULL; } int num_frag(CLUMP_NODE *clump) { FRAGMENT *fp; CLUMP_NODE *cp; int i = 0; for (cp = clump; cp; cp = cp->last) i += cp->size; for (fp = cl; fp; fp = fp->fpred) if (contained(fp, clump)) i++; return i; } int order(FRAGMENT *f) { int i = 0; while (1) { if (f == flist[i]) return i; i++; } } int A, C, G, T, D, CO; char CH; void clear() { A = C= G = T = D = CO = 0; CH ='N'; } void add_one(char a) { int x; if (a ==' ') return; if (a == 'A') { x=++A; } else if (a == 'C') { x=++C; } else if (a == 'G') { x = ++G; } else if (a == 'T') { x = ++T; } else x = ++D; if (x > CO) { CO = x; CH = a;} } char con() { return CH; } void remove_start_enddash(ALIGN *al, int n) { ALIGN *a; char state[1000]; int j, i, k; for (k = 0; k < 2; k++) { j = 0; a = (k == 0) ? al->prec : al; for (i = 0; i < n; i++) state[i] = 0; while (1) { for (i = 0; i < n; i++) { if (state[i]) continue; if (a->col[i] == '-' || a->col[i] == ' ') a->col[i] = ' '; else { j++; state[i] = 1;} } if (j == n) break; a = (k == 0) ? a->prec : a->succ; } } } char all_dash[] = "--------------------------------------------------------------------------------"; print_align(ALIGN *al, int n) { char line[1000][81], cons[81]; int i, j; ALIGN *a, *b; remove_start_enddash(al, n); i = 0; a = al; cons[80] = '\0'; do { clear(); for (j = 0; j < n; j++) { line[j][i] = a->col[j]; add_one(a->col[j]); } cons[i] = con(); i++; if (i == 80) { for (j = 0; j < n; j++) { line[j][80] = '\0'; printf("%s\n", line[j]); } i = 0; printf(all_dash); printf("\n\n\n\n"); /*printf("%s\n\n", cons);*/ } b = a; a = a->succ; free(b->col); if (b != al) free(b); } while (a != al); for (j = 0; j < n; j++) { line[j][i] = '\0'; printf("%s\n", line[j]); } cons[i] = '\0'; printf("%s\n%s\n", all_dash, all_dash); free(al); } void produce_align(CLUMP_NODE *clump) { int n, i, n1, nn; ALIGN *a; block *b; CLUMP_NODE *cp; FRAGMENT *fp, *f1, *f2, *f; char *ckalloc(); n = num_frag(clump); /*printf("n = %d\n", n);*/ ap = (ALIGN **) ckalloc(sizeof(ALIGN *)*n); flist = (FRAGMENT **) ckalloc(n*sizeof(FRAGMENT *)); stack = (char **) ckalloc(n*sizeof(char *)); a = NULL; nn = 1; stack_top = 0; for (cp = clump; cp; cp = cp->last) { /*printf("cp->fr[0]->extid=%d\n", cp->fr[0]->extid);*/ while (get_pair(clump, cp, &f1, &f2, &n1, nn, n, &a)) { b = sim(f1, f2, &n1); combine(&a, b, n1, nn++, n); } } /* treat contained fragment */ for (fp = cl; fp; fp = fp->fpred) { if (fp->extid < 0) continue; if ((f = contained(fp, clump)) == NULL) continue; /*printf("%d fp->extid=%d\n", nn, fp->extid);*/ n1 = order(f); b = sim(f, fp, &n1); flist[nn] = fp; combine(&a, b, n1, nn++, n); fp->extid *= -1; } if (nn >= mini_size) for (i = 0; i < nn; i++) printf("%% (%c) > %d len=%d\n", (is_rev(get_seq(abs(flist[i]->extid))))? '-':'+', abs(flist[i]->extid), flist[i]->length); roll_back(); free(stack); if (nn >= mini_size) { print_align(a, nn); fprintf(stderr, " %d sequences in contig %d\n", nn, num_contig++); } free(flist); free(ap); } CLUMP_NODE *reverse(CLUMP_NODE *clump) { CLUMP_NODE *cp, *cq, *cl; cp = clump; cl = NULL; do { cq = cp->last; cp->last = cl; cl = cp; cp = cq; } while (cp); return cl; } void path(CLUMP_NODE *clump, int dir) { c_elist *ep, *ep1; CLUMP_NODE *cp; clump->visit = 1; if (dir == IN) ep1 = clump->inedges; else ep1 = clump->outedges; for (ep = ep1; ep; ep = ep->next) { if (ep->edge->A == clump) { cp = ep->edge->B; dir = -ep->edge->dirB; } else { cp = ep->edge->A; dir = -ep->edge->dirA; } if (cp->visit) continue; cp->last = clump; path(cp, dir); return; } clump = reverse(clump); produce_align(clump); clump = reverse(clump); } enum_path(int num, CLUMP_NODE *clvec) { int i; FRAGMENT *fp; for (i = 0; i < num; i++) { if (clvec[i].visit) continue; if (clvec[i].inedges == NULL) { clvec[i].last = NULL; DIR = OUT; path(&clvec[i], OUT); } else { if (clvec[i].outedges == NULL) { clvec[i].last = NULL; DIR = IN; path(&clvec[i], IN); } } } DIR = OUT; for (i = 0; i < num; i++) if (clvec[i].visit == 0) { clvec[i].last = NULL; path(&clvec[i], OUT); } for (fp = cl; fp; fp = fp->fpred) fp->extid = abs(fp->extid); } free_seq() { FRAGMENT *f; f = fl; do { free(get_seq(f->extid)); f = f->fsucc; } while (f != fl); } align(int num_clumps, CLUMP *cvec, FRAGMENT *flist, FRAGMENT *clist, int edges) { void get_seqs(); CLUMP_NODE *clvec; get_seqs(seq_file); fl = flist; cl = clist; /*if (edges > 0) {*/ /*printf("num_of_clump=%d\n", num_clumps);*/ build_clump_graph(num_clumps, cvec, &clvec); enum_path(num_clumps, clvec); /*}*/ free_seq(); } FILE *lav_fp; int seq_no, first_line; void lav_read_init(FILE *fp) { lav_fp = fp; seq_no = 0; first_line = 1; } /* get_block - get a block from an alignment file */ int get_block(block **bl) { char buf[200], tbuf[200], *s, *fgets(); char *strtok(), *strncpy(); int i, j, atoi(); block *bp; char lflag; lflag = 0; if (!fgets(buf, 200, lav_fp)) return 0; *bl = bp = (block *) malloc(sizeof(block)); sscanf(buf, "%d/%d/%d", &bp->a, &bp->b, &bp->len); bp->a++; bp->b++; return 1; } #define HASHSIZE 10001 struct node { int id; int len; char *s; struct node *next; } *hashtab[HASHSIZE]; #define MAX_SEQLEN 100000 int hashval(int n) { return n % HASHSIZE; } char *get_seq(int id) { struct node *n; for (n = hashtab[hashval(id)]; n; n = n->next) if (n->id == id) break; if (!n) fatalf("Could not find sequence with id %d", id); return n->s; } int getid(char *s) { if (*s != '>') return 0; return atoi(s+2); } void get_seqs(char *file) { FILE *fp, *ckopen(); char line[1000], buf[1000+MAX_SEQLEN], *p, *q, *strsave(), *ckalloc(); int more, h; struct node *new; static first = 1; if (!first) return; first = 0; fp = ckopen(file, "r"); if (fgets(line, 1000, fp) == NULL || line[0] != '>') fatalf("%s not a file of sequences", file); for (more = 1; more; ) { new = (struct node *)ckalloc(sizeof(*new)); new->id = getid(line); p = buf; for ( ; ; ) { if (fgets(line, 1000, fp) == NULL) { more = 0; break; } if (line[0] == '>') break; for (q = line; *q && *q != '\n'; ++q) ; *q = '\0'; strcpy(p, line); p += strlen(p); if (p - buf >= MAX_SEQLEN) fatalf("Sequence %d is too long.", new->id); } new->len = p - buf; new->s = strsave(buf); h = hashval(new->id); new->next = hashtab[h]; hashtab[h] = new; } }