/* * BLAST - Search two DNA sequences for locally maximal segment pairs. The basic * command syntax is * * BLAST sequence1 sequence2 * * where sequence1 and sequence2 name files containing DNA sequences. Lines * at the beginnings of the files that don't start with 'A', 'C', 'T' or 'G' * are discarded. Thus a typical sequence file might begin: * * >BOVHBPP3I Bovine beta-globin psi-3 pseudogene, 5' end. * GGAGAATAAAGTTTCTGAGTCTAGACACACTGGATCAGCCAATCACAGATGAAGGGCACT * GAGGAACAGGAGTGCATCTTACATTCCCCCAAACCAATGAACTTGTATTATGCCCTGGGC * * If sequence1 and sequence2 are identical file names, then matches are * computed without "mirror image" pairs and without the trivial long match. * * The BLAST command permits optional additional arguments, such as X=50, * that reset certain internal parameters. The available parameters are: * * M or m gives the score for a match. * I or i gives the score for a transition (A <--> G, C <--> T). * V or v gives the score for a tranversion (non-transition change). * W or w gives the word size. * X or x gives the value for terminating word extensions. * K or k give the cutoff. * (defaults: M = 1, I = -1, V = -1, W = 8, X = 10*M, * and K = 5% significance level) * * If W is set to 0, then all locally maximum segment pairs (LMSPs) are * computed; in this case the value of X is irrelevant. * * In addition, the word "noalign" requests that no alignments be printed * (summary statistics for each locally maximal segment pair are reported) * and the word "stats" requests that statistics concerning the performance * of BLAST be printed. Thus a typical command is * * BLAST SEQ1 SEQ2 M=1 I=0 V=-1 noalign */ #include #include #include #define max(x,y) ((x > y) ? (x) : (y)) #define min(x,y) ((x < y) ? (x) : (y)) #define SUBSTITUTION_SCORE(x,y) S[x][y] #define DEFAULT_M 1 /* default score for match */ #define DEFAULT_I -1 /* default score for transition */ #define DEFAULT_V -1 /* default score for transversion */ #define DEFAULT_W 8 /* default word size */ #define DEFAULT_SIG 0.05 /* default significance level for setting K */ char S[128][128], *s1, *seq1, *s2, *seq2; int alignments, print_stats, K, /* cutoff score */ M, /* score for a match */ I, /* score for a transition */ V, /* score for a transversion */ W, /* word length */ X, /* cutoff for hit extensions */ sig99, sig90, sig50, /* number of MPSs above given significance */ numMSPs, /* total number of MSPs recorded */ numhits, /* number of word hits */ missW[15][3], missX[25][3]; double param_K, param_lambda; long *diag_lev, /* extent of discovered matches on a given diagonal */ len1, len2; /* sequence lengths */ typedef struct msp { long len, pos1, pos2; int score; struct msp *next_msp; } Msp, *Msp_ptr; Msp_ptr msp_list, *msp; main(argc, argv) int argc; char *argv[]; { int i, v; double x, log(), sqrt(), significance(); char *ckalloc(); if (argc < 3) fatalf("%s seq1 seq2 [M=] [I=] [V=] [K=] [W=] [X=] [noalign] [stats]", argv[0]); M = DEFAULT_M; I = DEFAULT_I; V = DEFAULT_V; W = DEFAULT_W; X = -1; alignments = 1; print_stats = 0; while (--argc > 2) { if (strcmp(argv[argc],"noalign") == 0) { alignments = 0; continue; } if (strcmp(argv[argc],"stats") == 0) { print_stats = 1; continue; } if (argv[argc][1] != '=') fatalf("argument %d has improper form", argc); v = atoi(argv[argc] + 2); switch (argv[argc][0]) { case 'M': case 'm': M = v; break; case 'I': case 'i': I = v; break; case 'V': case 'v': V = v; break; case 'K': case 'k': K = v; break; case 'W': case 'w': W = v; break; case 'X': case 'x': X = v; break; default: fatal("Options are M, I, V, K, W, X."); } } if (X == -1) X = 10*M; if (V > I) fatal("transversions score higher than transitions?"); len1 = get_seq(argv[1], &seq1); if (strcmp(argv[1],argv[2]) == 0) { if (W == 0) fatal("file1 = file2 not implemented when W=0"); seq2 = seq1; len2 = len1; } else len2 = get_seq(argv[2], &seq2); s1 = seq1 - 1; /* so subscripts start with 1 */ s2 = seq2 - 1; diag_lev = (long *)ckalloc((len1+len2+1)*sizeof(long)) + len1; /* set up scoring matrix */ S['A']['A'] = S['C']['C'] = S['G']['G'] = S['T']['T'] = M; S['A']['G'] = S['G']['A'] = S['C']['T'] = S['T']['C'] = I; S['A']['C'] = S['A']['T'] = S['C']['A'] = S['C']['G'] = S['G']['C'] = S['G']['T'] = S['T']['A'] = S['T']['G'] = V; get_params(); if (K == 0) { x = log(DEFAULT_SIG)/(-param_K*len1*len2); K = 2*log(sqrt(x))/(-param_lambda); while (significance(K-1) >= DEFAULT_SIG) --K; while (significance(K) < DEFAULT_SIG) ++K; } if (alignments) { printf("#:lav\n\n"); printf("d {\n\"BLAST output with parameters:\n"); printf("M = %d, I = %d, V = %d\n", M, I, V); if (W > 0) printf("W = %d, X = %d\"", W, X); else printf("W = 0 (computes all LMSPs)\""); printf("\n}\n"); printf("s {\n \"%s\" 1 %d\n \"%s\" 1 %d\n}\n", argv[1], len1, argv[2], len2); printf("k {\n \"significance\"\n}\n"); } if (W > 0) { bld_table(); search(); } else get_msps(); sort_msps(); if (!alignments) printf("\n\n Seq. 1 Seq. 2 len score signif W X\n"); /* print the matches for each sequence */ for (i = 0; i < numMSPs; ++i) print_msp(i); if (W == 0 && print_stats) { printf("\n%3d MSPs above significance level 0.99\n", sig99); printf("%3d MSPs above significance level 0.90\n", sig90); printf("%3d MSPs above significance level 0.50\n", sig50); printf("\nPercent of MSPs missed by BLAST at various"); printf(" W and significance levels:\n"); printf(" W: 0.99 0.90 0.50\n"); for (i = 4; i <= 12; ++i) printf("%2d: %4.1f%% %4.1f%% %4.1f%%\n", i, 100*missW[i][0]/((float)sig99), 100*missW[i][1]/((float)sig90), 100*missW[i][2]/((float)sig50)); printf("\nPercent of MSPs missed by BLAST at various"); printf(" X and significance levels:\n"); printf(" X: 0.99 0.90 0.50\n"); for (i = 1; i <= 20; ++i) printf("%2d: %4.1f%% %4.1f%% %4.1f%%\n", i, 100*missX[i][0]/((float)sig99), 100*missX[i][1]/((float)sig90), 100*missX[i][2]/((float)sig50)); } if (W > 0 && print_stats) { printf("\n\nStatistics:\n"); printf(" %s: M = %d, I = %d, V = %d, K = %d, W = %d, X = %d\n", argv[0], M, I, V, K, W, X); printf(" # of word hits = %d\n", numhits); printf(" # of matches found = %d\n", numMSPs); } if (strcmp(argv[1], argv[2]) == 0) { /* self-comparison; insert trivial diagonal */ printf("a {\n s 0.0\n b 1 1\n e %d %d\n", len1, len1); printf(" l 1 1 %d %d 0.0\n}\n", len1, len1); } exit (0); } /* get_seq - read a sequence */ int get_seq(file_name, seqptr) char *file_name, **seqptr; { FILE *qp, *ckopen(); char *p, *fgets(), *strchr(), *ckalloc(); int c; long n; qp = ckopen(file_name, "r"); for (n = 0; (c = getc(qp)) != EOF; ) if (c != '\n') ++n; *seqptr = ckalloc(n+1); rewind(qp); if (n == 0) return 0; for (p = *seqptr; ; ) { if (fgets(p, (int)n+1, qp) == NULL) { fclose(qp); *p = '\0'; break; } if (p == *seqptr && strchr("ACTG", *p) == NULL) continue; while (isupper(*p)) ++p; if (*p != '\n' && *p != '\0') fatalf("Illegal character %c in query sequence.", *p); } return p - *seqptr; } /* ------------- build table of W-tuples in the first sequence ------------*/ /* The data structure could be simplified if we limited W to, say, W <= 8. */ struct hash_node { long ecode; /* integer encoding of the word */ int pos; /* positions where word hits query sequence */ struct hash_node *link; /* next word with same last 7.5 letters */ }; #define HASH_SIZE 32767 /* 2**15 - 1 */ struct hash_node *hashtab[HASH_SIZE+1]; int encoding[128]; int mask; long *next_pos; bld_table() { long ecode; int i; char *q, *ckalloc(); /* perform initializations */ encoding['C'] = 1; encoding['G'] = 2; encoding['T'] = 3; mask = (1 << (W+W-2)) - 1; next_pos = (long *)ckalloc(len1*sizeof(long)); q = seq1 - 1; ecode = 0L; for (i = 1; i < W; ++i) ecode = (ecode << 2) + encoding[*++q]; while (*++q) { ecode = ((ecode & mask) << 2) + encoding[*q]; add_word(ecode, (long)(q - seq1 +1)); } } /* add_word - add a word to the table of critical words */ add_word(ecode, pos) long ecode, pos; { struct hash_node *h; long hval; char *ckalloc(); hval = ecode & HASH_SIZE; for (h = hashtab[hval]; h; h = h->link) if (h->ecode == ecode) break; if (h == NULL) { h = (struct hash_node *) ckalloc (sizeof(struct hash_node)); h->link = hashtab[hval]; hashtab[hval] = h; h->ecode = ecode; h->pos = -1; } next_pos[pos] = h->pos; h->pos = pos; } /* ----------------------- search the second sequence --------------------*/ search() { register struct hash_node *h; register char *s; register long ecode, hval; int i; long p; s = seq2 - 1; ecode = 0L; for (i = 1; i < W; ++i) ecode = (ecode << 2) + encoding[*++s]; while (*++s) { ecode = ((ecode & mask) << 2) + encoding[*s]; hval = ecode & HASH_SIZE; for (h = hashtab[hval]; h; h = h->link) if (h->ecode == ecode) { for (p = h->pos; p >= 0; p = next_pos[p]) extend_hit(p, (long)(s+1-seq2)); break; } } } /* extend_hit - extend a word-sized hit to a longer match */ extend_hit(pos1, pos2) long pos1, pos2; { char *beg2, *beg1, *end1, *q, *s; long min_sum, max_sum, sum, diag, score; Msp_ptr mp; if (seq1 == seq2 && pos1 >= pos2) return; numhits++; diag = pos2 - pos1; if (diag_lev[diag] > pos1) return; /* extend to the right */ max_sum = sum = 0; end1 = q = seq1 + pos1; s = seq2 + pos2; while (*s != '\0' && *q != '\0' && sum >= max_sum - X) { sum += SUBSTITUTION_SCORE(*s++,*q++); if (sum > max_sum) { max_sum = sum; end1 = q; } } /* extend to the left */ min_sum = sum = 0; beg1 = q = (seq1 + pos1) - W; beg2 = s = seq2 + pos2 - W; while (s > seq2 && q > seq1 && sum >= min_sum - X) { sum += SUBSTITUTION_SCORE(*--s,*--q); if (sum > min_sum) { min_sum = sum; beg2 = s; beg1 = q; } } score = M*W + max_sum + min_sum; if (score >= K) { ++numMSPs; mp = (Msp_ptr)ckalloc(sizeof(Msp)); mp->len = end1 - beg1; mp->pos1 = beg1 - seq1; mp->pos2 = beg2 - seq2; mp->score = score; mp->next_msp = msp_list; msp_list = mp; } diag_lev[diag] = (end1 - seq1) + W; } /* ---------------- compute locally maximal sequence pairs ---------------*/ get_msps() { long d, i, score, best_score, end1, beg1, lower, upper; for (d = 1-len1; d < len2; ++d) { beg1 = best_score = score = 0; lower = max(1, 1-d); upper = min(len1, len2-d); for (i = lower; i <= upper; ++i) if (score > 0) { score += SUBSTITUTION_SCORE(s1[i], s2[i+d]); if (score > best_score) { best_score = score; end1 = i; } } else { check_score(best_score, d, beg1, end1); best_score = score = SUBSTITUTION_SCORE(s1[i], s2[i+d]); beg1 = end1 = i; } check_score(best_score, d, beg1, end1); } } check_score(score, d, beg1, end1) long score, d, beg1, end1; { char *ckalloc(); Msp_ptr mp; if (score >= K) { ++numMSPs; mp = (Msp_ptr)ckalloc(sizeof(Msp)); mp->len = end1 - beg1 + 1; mp->pos1 = beg1 - 1; mp->pos2 = beg1 + d - 1; mp->score = score; mp->next_msp = msp_list; msp_list = mp; } } /* sort_msps - order database sequence for printing */ sort_msps() { char *ckalloc(); int i; Msp_ptr mp; msp = (Msp_ptr *) ckalloc(numMSPs*sizeof(Msp_ptr)); for (i = 0, mp = msp_list; i < numMSPs; ++i, mp = mp->next_msp) msp[i] = mp; for (i = (numMSPs/2) - 1; i >= 0; --i) heapify(i, (int) numMSPs-1); for (i = numMSPs-1; i > 0; --i) { mp = msp[0]; msp[0] = msp[i]; msp[i] = mp; if (i > 1) heapify(0, i-1); } } /* heapify - core procedure for heapsort */ heapify(i, last) int i, last; { int lim = (last-1)/2, left_son, small_son; Msp_ptr mp; while (i <= lim) { left_son = 2*i + 1; if (left_son == last) small_son = left_son; else small_son = smaller(left_son, left_son+1); if (smaller(i, small_son) == small_son) { mp = msp[i]; msp[i] = msp[small_son]; msp[small_son] = mp; i = small_son; } else break; } } /* smaller - determine which of two sequences should be printed first */ int smaller(i, j) int i, j; { Msp_ptr ki = msp[i], kj = msp[j]; if (ki->score == kj->score) return ((ki->pos1 >= kj->pos1) ? i : j); /* print one with larger score first */ return ((ki->score <= kj->score) ? i : j); } /* -------------------------- compute significance parameters ---------------*/ get_params() { double _p[1000], *p , log(), sqrt(), pMatch, pTransit, pTransver, N; long A1, A2, C1, C2, G1, G2, T1, T2; int c, i; A1 = A2 = C1 = C2 = G1 = G2 = T1 = T2 = 0; for (i = 0; i < len1; ++i) if ((c = seq1[i]) == 'A') ++A1; else if (c == 'C') ++C1; else if (c == 'G') ++G1; else if (c == 'T') ++T1; for (i = 0; i < len2; ++i) if ((c = seq2[i]) == 'A') ++A2; else if (c == 'C') ++C2; else if (c == 'G') ++G2; else if (c == 'T') ++T2; N = ((double)len1)*((double)len2); pMatch = ((double)A1)*((double)A2) + ((double)C1)*((double)C2) + ((double)G1)*((double)G2) + ((double)T1)*((double)T2); pTransit = ((double)A1)*((double)G2) + ((double)C1)*((double)T2) + ((double)G1)*((double)A2) + ((double)T1)*((double)C2); pTransver = N - pMatch - pTransit; for (i = 0; i < V+M; ++i) _p[i] = 0.0; p = _p - V; p[V] = pTransver/N; p[I] += (pTransit/N); p[M] += (pMatch/N); if (karlin(V, M, _p, ¶m_lambda, ¶m_K) == 0) fatal("compution of significance levels failed"); } /**************** Statistical Significance Parameter Subroutine **************** Version 1.0 February 2, 1990 Version 1.1 July 5, 1990 Program by: Stephen Altschul Address: National Center for Biotechnology Information National Library of Medicine National Institutes of Health Bethesda, MD 20894 Internet: altschul@ncbi.nlm.nih.gov See: Karlin, S. & Altschul, S.F. "Methods for Assessing the Statistical Significance of Molecular Sequence Features by Using General Scoring Schemes," Proc. Natl. Acad. Sci. USA 87 (1990), 2264-2268. Computes the parameters lambda and K for use in calculating the statistical significance of high-scoring segments or subalignments. The scoring scheme must be integer valued. A positive score must be possible, but the expected (mean) score must be negative. A program that calls this routine must provide the value of the lowest possible score, the value of the greatest possible score, and a pointer to an array of probabilities for the occurence of all scores between these two extreme scores. For example, if score -2 occurs with probability 0.7, score 0 occurs with probability 0.1, and score 3 occurs with probability 0.2, then the subroutine must be called with low = -2, high = 3, and pr pointing to the array of values { 0.7, 0.0, 0.1, 0.0, 0.0, 0.2 }. The calling program must also provide pointers to lambda and K; the subroutine will then calculate the values of these two parameters. In this example, lambda=0.330 and K=0.154. The parameters lambda and K can be used as follows. Suppose we are given a length N random sequence of independent letters. Associated with each letter is a score, and the probabilities of the letters determine the probability for each score. Let S be the aggregate score of the highest scoring contiguous segment of this sequence. Then if N is sufficiently large (greater than 100), the following bound on the probability that S is greater than or equal to x applies: P( S >= x ) <= 1 - exp [ - KN exp ( - lambda * x ) ]. In other words, the p-value for this segment can be written as 1-exp[-KN*exp(-lambda*S)]. This formula can be applied to pairwise sequence comparison by assigning scores to pairs of letters (e.g. amino acids), and by replacing N in the formula with N*M, where N and M are the lengths of the two sequences being compared. In addition, letting y = KN*exp(-lambda*S), the p-value for finding m distinct segments all with score >= S is given by: 2 m-1 -y 1 - [ 1 + y + y /2! + ... + y /(m-1)! ] e Notice that for m=1 this formula reduces to 1-exp(-y), which is the same as the previous formula. *******************************************************************************/ #define MAXIT 150 /* Maximum number of iterations used in calculating K */ karlin(low,high,pr,lambda,K) int low; /* Lowest score (must be negative) */ int high; /* Highest score (must be positive) */ double *pr; /* Probabilities for various scores */ double *lambda; /* Pointer to parameter lambda */ double *K; /* Pointer to parmeter K */ { int i,j,range,lo,hi,first,last; double up,new,Sum,av; register double sum; double *p,*P,*ptrP,exp(); register double *ptr1,*ptr2,*ptr1e; char *calloc(); /* Check that scores and their associated probabilities are valid */ if (low>=0) { fprintf(stderr,"Lowest score must be negative.\n"); return 0; } for (i=range=high-low;i> -low && !pr[i];--i); if (i<= -low) { fprintf(stderr,"A positive score must be possible.\n"); return 0; } for (sum=i=0;i<=range;sum+=pr[i++]) if (pr[i]<0) { fprintf(stderr,"Negative probabilities not allowed.\n"); return 0; } if (sum<0.99995 || sum>1.00005) fprintf(stderr,"Probabilities sum to %.4f. Normalizing.\n",sum); p= (double *) calloc(range+1,sizeof(double)); for (Sum=low,i=0;i<=range;++i) Sum+=i*(p[i]=pr[i]/sum); if (Sum>=0) { fprintf(stderr,"Invalid (non-negative) expected score: %.3f\n",Sum); free(p); return 0; } /* Calculate the parameter lambda */ up=0.5; do { up*=2; ptr1=p; for (sum=0,i=low;i<=high;++i) sum+= *ptr1++ * exp(up*i); } while (sum<1.0); for (*lambda=j=0;j<25;++j) { new=(*lambda+up)/2.0; ptr1=p; for (sum=0,i=low;i<=high;++i) sum+= *ptr1++ * exp(new*i); if (sum>1.0) up=new; else *lambda=new; } /* Calculate the pamameter K */ ptr1=p; for (av=0,i=low;i<=high;++i) av+= *ptr1++ *i*exp(*lambda*i); if (low == -1 || high == 1) { *K = high==1 ? av : Sum*Sum/av; *K *= 1.0 - exp(- *lambda); free(p); return 1; /* Parameters calculated successfully */ } Sum=lo=hi=0; P= (double *) calloc(MAXIT*range+1,sizeof(double)); for (*P=sum=j=1;j<=MAXIT && sum>0.00001;Sum+=sum/=j++) { first=last=range; for (ptrP=P+(hi+=high)-(lo+=low);ptrP>=P;*ptrP-- =sum) { ptr1=ptrP-first; ptr1e=ptrP-last; ptr2=p+first; for (sum=0;ptr1>=ptr1e;) sum+= *ptr1-- * *ptr2++; if (first) --first; if (ptrP-P<=range) --last; } for (sum=0,i=lo;i;++i) sum+= *++ptrP * exp(*lambda*i); for (;i<=hi;++i) sum+= *++ptrP; } if (j>MAXIT) fprintf(stderr,"Value for K may be too large due to insufficient iterations.\n"); for (i=low;!p[i-low];++i); for (j= -i;i1;) if (p[++i-low]) j=gcd(j,i); *K = (j*exp(-2*Sum))/(av*(1.0-exp(- *lambda*j))); free(p); free(P); return 1; /* Parameters calculated successfully */ } gcd(a,b) int a,b; { int c; if (b<0) b= -b; if (b>a) { c=a; a=b; b=c; } for (;b;b=c) { c=a%b; a=b; } return a; } /* ---------------------------------------------------------------------------*/ /* print_msp - display the i-th MSP */ print_msp(i) int i; { long len, pos1, pos2; Msp_ptr mp = msp[i]; int j, start_j, W, X, sc; double prob, significance(); pos1 = mp->pos1; pos2 = mp->pos2; len = mp->len; prob = significance(mp->score); if (!alignments || print_stats) { if (prob > .99) ++sig99; if (prob > .90) ++sig90; if (prob > .50) ++sig50; for (W = start_j = j = 0; j < len; ++j) if (seq1[pos1+j] != seq2[pos2+j]) { W = max(W, j-start_j); start_j = j+1; } W = max(W, len-start_j); for (j = 4; j <= 12; ++j) { if (prob > .99 && W < j) ++missW[j][0]; if (prob > .90 && W < j) ++missW[j][1]; if (prob > .50 && W < j) ++missW[j][2]; } for (sc = X = j = 0; j < len; ++j) if (sc < 0) { sc += SUBSTITUTION_SCORE(seq1[pos1+j], seq2[pos2+j]); if (sc < X) X = sc; } else sc = SUBSTITUTION_SCORE(seq1[pos1+j], seq2[pos2+j]); X = -X; for (j = 1; j <= 20; ++j) { if (prob > .99 && X > j) ++missX[j][0]; if (prob > .90 && X > j) ++missX[j][1]; if (prob > .50 && X > j) ++missX[j][2]; } } if (!alignments) { printf("%4d: %6d %7d %5d %6d %5.1lf%% %4d %4d\n", i+1, pos1+1, pos2+1, len, mp->score, 100*prob, W, X); return; } printf("a {\n s %d\n b %d %d\n e %d %d\n l %d %d %d %d %1.1lf\n}\n", mp->score, pos1+1, pos2+1, pos1+len, pos2+len, pos1+1, pos2+1, pos1+len, pos2+len, 100*prob); } double significance(score) int score; { double y, exp(), N; N = ((double)len1)*((double)len2); y = -param_lambda*(score); y = param_K*N*exp(y); return exp(-y); } /* ---------------------------- utility routines -------------------------*/ /* fatal - print message and die */ fatal(msg) char *msg; { fprintf(stderr, "%s\n", msg); exit(1); } /* fatalf - format message, print it, and die */ fatalf(msg, val) char *msg, *val; { fprintf(stderr, msg, val); putc('\n', stderr); exit(1); } /* ckopen - open file; check for success */ FILE *ckopen(name, mode) char *name, *mode; { FILE *fopen(), *fp; if ((fp = fopen(name, mode)) == NULL) fatalf("Cannot open %s.", name); return(fp); } /* ckalloc - allocate space; check for success */ char *ckalloc(amount) int amount; { char *malloc(), *p; if ((p = malloc( (unsigned) amount)) == NULL) fatal("Ran out of memory."); return(p); }