/* * PBLAST - Search two protein sequences for locally maximal segment pairs. * The basic command syntax is * * PBLAST sequence1 sequence2 * * where sequence1 and sequence2 name files containing protein sequences. Lines * at the beginnings of the files that don't start with an upper case letter * are discarded. Thus a typical sequence file might begin: * * >CYAA$YEAST ADENYLATE CYCLASE (EC 4.6.1.1) (ATP * MSSKPDTGSEISGPQRQEEQEQQIEQSSPTEANDRSIHDEVPKVKKRHEQNSGHKSRRNS * AYSYYSPRSLSMTKSRESITPNGMDDVSISNVEHPRPTEPKIKRGPYLLKKTLSSLSMTS * * If sequence1 and sequence2 are identical file names, then matches are * computed without "mirror image" pairs and without the trivial long match. * * The PBLAST command permits optional additional arguments, such as X=50, * that reset certain internal parameters. The available parameters are: * * T or t gives the threshold for word hits. * X or x gives the value for terminating word extensions. * K or k give the cutoff. * (defaults: T = 15, X = 20, and K = 20% significance level) */ #include #include #include /* WARNING: on some machines the flowing "char" may have to be replaced by "signed char" */ #define SMALL_VALS char /* type to hold values -128 to 127 */ #define max(x,y) ((x > y) ? (x) : (y)) #define min(x,y) ((x < y) ? (x) : (y)) #define WORD_SIZE 4 /* length of critical word (do not change!) */ #define DEFAULT_T 15 /* default extension termination cutoff */ #define DEFAULT_X 20 /* default extension termination cutoff */ #define DEFAULT_SIG 0.20 /* default significance level for setting K */ SMALL_VALS S[128][128]; char *s1, *seq1, *s2, *seq2; int numwords, /* number of critical words in hash table */ K, /* cutoff score */ T, /* threshold for word hits */ X, /* cutoff for hit extensions */ numMSPs, /* total number of MSPs recorded */ numhits; /* number of word hits */ double param_K, param_lambda; long *diag_lev, /* extent of discovered matches on a given diagonal */ len1, len2; /* sequence lengths */ typedef struct msp { long len, pos1, pos2; int score; struct msp *next_msp; } Msp, *Msp_ptr; Msp_ptr msp_list, *msp; char achars[] = "ARNDCQEGHILKMFPSTWYVBZX"; /* amino acid names */ int alpha = 23; /* alphabet size */ int wgts[23][23] = { /* the PAM200 matrix */ { 2,-2, 0, 0,-2,-1, 0, 1,-1,-1,-2,-1,-1,-3, 1, 1, 1,-5,-3, 0, 1, 1, 0}, {-2, 5, 0,-1,-3, 1,-1,-3, 1,-2,-2, 2, 0,-3, 0, 0,-1, 1,-4,-2, 0, 1, 0}, { 0, 0, 2, 2,-3, 0, 1, 0, 1,-1,-2, 1,-2,-2, 0, 1, 0,-4,-1,-2, 3, 2, 0}, { 0,-1, 2, 3,-4, 1, 3, 0, 0,-2,-3, 0,-2,-4,-1, 0, 0,-6,-3,-2, 4, 3, 0}, {-2,-3,-3,-4, 8,-4,-4,-3,-3,-2,-5,-4,-4,-4,-2, 0,-2,-6, 0,-2,-3,-3, 0}, {-1, 1, 0, 1,-4, 4, 2,-1, 2,-2,-1, 0,-1,-4, 0,-1,-1,-4,-3,-2, 2, 4, 0}, { 0,-1, 1, 3,-4, 2, 3, 0, 0,-2,-3, 0,-2,-4, 0, 0, 0,-6,-3,-2, 3, 4, 0}, { 1,-3, 0, 0,-3,-1, 0, 4,-2,-2,-3,-2,-3,-3,-1, 1, 0,-5,-4,-1, 1, 0, 0}, {-1, 1, 1, 0,-3, 2, 0,-2, 5,-2,-2, 0,-2,-1, 0,-1,-1,-3, 0,-2, 2, 2, 0}, {-1,-2,-1,-2,-2,-2,-2,-2,-2, 4, 2,-1, 2, 1,-2,-1, 0,-5,-1, 3,-1,-1, 0}, {-2,-2,-2,-3,-5,-1,-3,-3,-2, 2, 4,-2, 3, 1,-2,-2,-1,-4,-1, 1,-2,-1, 0}, {-1, 2, 1, 0,-4, 0, 0,-2, 0,-1,-2, 4, 1,-4,-1, 0, 0,-3,-4,-2, 1, 1, 0}, {-1, 0,-2,-2,-4,-1,-2,-3,-2, 2, 3, 1, 5, 0,-2,-1, 0,-4,-2, 1,-1, 0, 0}, {-3,-3,-2,-4,-4,-4,-4,-3,-1, 1, 1,-4, 0, 7,-4,-2,-2, 0, 5,-1,-2,-3, 0}, { 1, 0, 0,-1,-2, 0, 0,-1, 0,-2,-2,-1,-2,-4, 5, 1, 0,-5,-4,-1, 0, 1, 0}, { 1, 0, 1, 0, 0,-1, 0, 1,-1,-1,-2, 0,-1,-2, 1, 2, 1,-2,-2,-1, 1, 1, 0}, { 1,-1, 0, 0,-2,-1, 0, 0,-1, 0,-1, 0, 0,-2, 0, 1, 3,-4,-2, 0, 1, 0, 0}, {-5, 1,-4,-6,-6,-4,-6,-5,-3,-5,-4,-3,-4, 0,-5,-2,-4,12, 0,-6,-3,-4, 0}, {-3,-4,-1,-3, 0,-3,-3,-4, 0,-1,-1,-4,-2, 5,-4,-2,-2, 0, 7,-2,-1,-2, 0}, { 0,-2,-2,-2,-2,-2,-2,-1,-2, 3, 1,-2, 1,-1,-1,-1, 0,-6,-2, 4,-1,-1, 0}, { 1, 0, 3, 4,-3, 2, 3, 1, 2,-1,-2, 1,-1,-2, 0, 1, 1,-3,-1,-1, 4, 4, 0}, { 1, 1, 2, 3,-3, 4, 4, 0, 2,-1,-1, 1, 0,-3, 1, 1, 0,-4,-2,-1, 4, 5, 0}, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, }; main(argc, argv) int argc; char *argv[]; { int i, j, v; double x, log(), sqrt(), significance(); char *ckalloc(); if (argc < 3) fatalf("%s seq1 seq2 [K=] [T=] [X=]", argv[0]); T = DEFAULT_T; X = DEFAULT_X; while (--argc > 2) { if (argv[argc][1] != '=') fatalf("argument %d has improper form", argc); v = atoi(argv[argc] + 2); switch (argv[argc][0]) { case 'K': case 'k': K = v; break; case 'T': case 't': T = v; break; case 'X': case 'x': X = v; break; default: fatal("Options are K, T, X."); } } len1 = get_seq(argv[1], &seq1); if (strcmp(argv[1],argv[2]) == 0) { seq2 = seq1; len2 = len1; } else len2 = get_seq(argv[2], &seq2); s1 = seq1 - 1; /* so subscripts start with 1 */ s2 = seq2 - 1; diag_lev = (long *)ckalloc((len1+len2+1)*sizeof(long)) + len1; /* set up scoring matrix */ for (i = 0; i < alpha; ++i) for (j = 0; j < alpha; ++j) S[achars[i]][achars[j]] = wgts[i][j]; get_params(); if (K == 0) { x = log(DEFAULT_SIG)/(-param_K*len1*len2); K = 2*log(sqrt(x))/(-param_lambda); while (significance(K-1) >= DEFAULT_SIG) --K; while (significance(K) < DEFAULT_SIG) ++K; } get_weights(); printf("#:lav\n\n"); printf("d {\n\"PBLAST output with parameters:\n"); printf("T = %d, X = %d\"", T, X); printf("\n}\n"); printf("s {\n \"%s\" 1 %d\n \"%s\" 1 %d\n}\n", argv[1], len1, argv[2], len2); printf("k {\n \"significance\"\n}\n"); bld_table(); search(); sort_msps(); /* print the matches for each sequence */ for (i = 0; i < numMSPs; ++i) print_msp(i); if (strcmp(argv[1], argv[2]) == 0) { /* self-comparison; insert trivial diagonal */ printf("a {\n s 0.0\n b 1 1\n e %d %d\n", len1, len1); printf(" l 1 1 %d %d 0.0\n}\n", len1, len1); } exit (0); } /* get_seq - read a sequence */ int get_seq(file_name, seqptr) char *file_name, **seqptr; { FILE *qp, *ckopen(); char *p, *fgets(), *strchr(), *ckalloc(); int c; long n; qp = ckopen(file_name, "r"); for (n = 0; (c = getc(qp)) != EOF; ) if (c != '\n') ++n; *seqptr = ckalloc(n+1); rewind(qp); for (p = *seqptr; ; ) { if (fgets(p, (int)n, qp) == NULL) { fclose(qp); *p = '\0'; break; } if (p == *seqptr && !isupper(*p)) continue; while (isupper(*p)) ++p; if (*p != '\n' && *p != '\0') fatalf("Illegal character %c in query sequence.", *p); } return p - *seqptr; } /* ------------- build table of W-tuples in the first sequence ------------*/ SMALL_VALS maxcost[128], /* max cost of substituting for a character */ order[128][26]; /* order[i][j] = j-th best char to sub. for i */ get_weights() { int i, j; SMALL_VALS *m, *o, *p, *q, t; alpha = strlen(achars); /* for illegal characters, set weight to lowest possible value */ for (i = 0; i < 128; i++) { m = S[i]; for (j = 0; j < 128; j++) m[j] = -128; maxcost[i] = -128; } /* read substitution weights for each residue */ for (i = 0; i < alpha; i++) { m = S[achars[i]]; for (j = 0; j < alpha; j++) { m[achars[j]] = wgts[i][j]; } o = order[achars[i]]; for (j = 0; j < alpha; j++) o[j] = achars[j]; /* bubble-sort residue names by cost of substitution for i-th residue */ for (q = o + (alpha-1); q > o; --q) for (p = o; p < q; ++p) if (m[*p] < m[*q]) { t = *p; *p = *q; *q = t; } maxcost[achars[i]] = m[*o]; } } /* ----------------------- build table of critical words -----------------*/ struct hash_node { int ch; /* fourth letter of word */ struct match *matches; /* positions where word hits query sequence */ struct hash_node *link; /* next word with same first three letters */ }; struct match { int pos; /* position in query sequence */ int score; /* score where word hits */ struct match *succ; /* next hit for this word */ }; /* The hash value of a four-letter word is simply an encoding of its first three letters: hval = (word[0] - 'A')*1024 + (word[1] - 'A')*32 + word[2] - 'A'; This is space-inefficient since some hash values are unreachable, but the hash value is quickly updated as we march along a database sequence. If s points to the last letter of the new word, then the new hash value is: hval = ((hval & 01777) << 5) + (*s - 'A'); */ struct hash_node *hashtab[26426]; /* hval of "ZZZ" is 26425 */ SMALL_VALS best[WORD_SIZE]; /* best[i] = smallest score of word[0]..word[i] that can be extended to a critical word */ char word[WORD_SIZE], /* candidate for critical word */ *query; /* pointer into query sequence */ bld_table() { int i; for (query = seq1; query[3] != '\0'; query++) { best[3] = T; for (i = 3; i > 0; --i) best[i-1] = best[i] - maxcost[query[i]]; enumerate(0, 0); } } /* enumerate - try all letters in position i of candidate for a critical word; score is for word[0]..word[i-1] */ enumerate(i, score) int i, score; { SMALL_VALS *o = order[query[i]]; /* possible letters by decreasing weight */ int c, new_score; for (c = 0; c < alpha; ++c) { word[i] = o[c]; new_score = score + S[query[i]][word[i]]; if (new_score < best[i]) break; if (i < WORD_SIZE-1) enumerate(i+1, new_score); else add_word(query - seq1 + WORD_SIZE, new_score); } } /* add_word - add a word to the table of critical words */ add_word(pos, score) int pos, score; { struct hash_node *h; struct match *m; long hval; char *ckalloc(); numwords++; hval = (word[0] - 'A')*1024 + (word[1] - 'A')*32 + word[2] - 'A'; for (h = hashtab[hval]; h; h = h->link) if (h->ch == word[3]) break; if (h == NULL) { h = (struct hash_node *) ckalloc (sizeof(struct hash_node)); h->link = hashtab[hval]; hashtab[hval] = h; h->ch = word[3]; h->matches = NULL; } m = (struct match *) ckalloc (sizeof(struct match)); m->succ = h->matches; h->matches = m; m->pos = pos; m->score = score; } /* ----------------------- search the second sequence --------------------*/ search() { register struct hash_node *h; struct match *m; register char *s; register long hval; s = seq2; if (s[0] == '\0' || s[1] == '\0' || s[2] == '\0') return; hval = (s[0] - 'A')*1024 + (s[1] - 'A')*32 + s[2] - 'A'; for (s += 3; *s; ++s) { for (h = hashtab[hval]; h; h = h->link) if (h->ch == *s) { for (m = h->matches; m; m = m->succ) extend_hit(s+1, m->pos, m->score); break; } hval = ((hval & 01777) << 5) + (*s - 'A'); } } /* extend_hit - extend a word-sized hit to a longer match */ extend_hit(s_pos, q_pos, score) char *s_pos; int q_pos, score; { char *s_beg, *q_beg, *q_end, *q, *s; int min_sum, max_sum, sum, diag; Msp_ptr mp; if (seq1 == seq2 && q_pos >= (s_pos - seq2)) return; numhits++; diag = (s_pos-seq2) - q_pos; if (diag_lev[diag] > q_pos) return; /* extend to the right */ max_sum = sum = 0; s = s_pos; q_end = q = seq1 + q_pos; while (*s != '\0' && *q != '\0' && sum >= max_sum - X) if ((sum += S[*q++][*s++]) > max_sum) { max_sum = sum; q_end = q; } /* extend to the left */ min_sum = sum = 0; s_beg = s = s_pos - WORD_SIZE; q_beg = q = (seq1 + q_pos) - WORD_SIZE; while (s > seq2 && q > seq1 && sum >= min_sum - X) if ((sum += S[*--q][*--s]) > min_sum) { min_sum = sum; s_beg = s; q_beg = q; } score += max_sum + min_sum; if (score >= K) { ++numMSPs; mp = (Msp_ptr)ckalloc(sizeof(Msp)); mp->len = q_end - q_beg; mp->pos1 = q_beg - seq1; mp->pos2 = s_beg - seq2; mp->score = score; mp->next_msp = msp_list; msp_list = mp; } diag_lev[diag] = (q_end - seq1) + WORD_SIZE; } /* sort_msps - order database sequence for printing */ sort_msps() { char *ckalloc(); int i; Msp_ptr mp; msp = (Msp_ptr *) ckalloc(numMSPs*sizeof(Msp_ptr)); for (i = 0, mp = msp_list; i < numMSPs; ++i, mp = mp->next_msp) msp[i] = mp; for (i = (numMSPs/2) - 1; i >= 0; --i) heapify(i, (int) numMSPs-1); for (i = numMSPs-1; i > 0; --i) { mp = msp[0]; msp[0] = msp[i]; msp[i] = mp; if (i > 1) heapify(0, i-1); } } /* heapify - core procedure for heapsort */ heapify(i, last) int i, last; { int lim = (last-1)/2, left_son, small_son; Msp_ptr mp; while (i <= lim) { left_son = 2*i + 1; if (left_son == last) small_son = left_son; else small_son = smaller(left_son, left_son+1); if (smaller(i, small_son) == small_son) { mp = msp[i]; msp[i] = msp[small_son]; msp[small_son] = mp; i = small_son; } else break; } } /* smaller - determine which of two sequences should be printed first */ int smaller(i, j) int i, j; { Msp_ptr ki = msp[i], kj = msp[j]; if (ki->score == kj->score) return ((ki->pos1 >= kj->pos1) ? i : j); /* print one with larger score first */ return ((ki->score <= kj->score) ? i : j); } /* -------------------------- compute significance parameters ---------------*/ get_params() { double _p[1000], *p , log(), sqrt(), N; int lo, hi, i, j, v, count1[128], count2[128]; lo = 10000; hi = -10000; for (i = 0; i < alpha; ++i) for (j = 0; j < alpha; ++j) { v = wgts[i][j]; lo = min(lo, v); hi = max(hi, v); } for (j = 0; j < 128; ++j) count1[j] = 0; for (i = 0; i < len1; ++i) ++count1[seq1[i]]; for (j = 0; j < 128; ++j) count2[j] = 0; for (i = 0; i < len2; ++i) ++count2[seq2[i]]; for (i = 0; i < lo+hi; ++i) _p[i] = 0.0; p = _p - lo; N = ((double)len1)*((double)len2); for (i = 0; i < alpha; ++i) for (j = 0; j < alpha; ++j) p[wgts[i][j]] += ((double)count1[achars[i]])*((double)count2[achars[j]])/N; if (karlin(lo, hi, _p, ¶m_lambda, ¶m_K) == 0) fatal("compution of significance levels failed"); } /**************** Statistical Significance Parameter Subroutine **************** Version 1.0 February 2, 1990 Version 1.1 July 5, 1990 Program by: Stephen Altschul Address: National Center for Biotechnology Information National Library of Medicine National Institutes of Health Bethesda, MD 20894 Internet: altschul@ncbi.nlm.nih.gov See: Karlin, S. & Altschul, S.F. "Methods for Assessing the Statistical Significance of Molecular Sequence Features by Using General Scoring Schemes," Proc. Natl. Acad. Sci. USA 87 (1990), 2264-2268. Computes the parameters lambda and K for use in calculating the statistical significance of high-scoring segments or subalignments. The scoring scheme must be integer valued. A positive score must be possible, but the expected (mean) score must be negative. A program that calls this routine must provide the value of the lowest possible score, the value of the greatest possible score, and a pointer to an array of probabilities for the occurence of all scores between these two extreme scores. For example, if score -2 occurs with probability 0.7, score 0 occurs with probability 0.1, and score 3 occurs with probability 0.2, then the subroutine must be called with low = -2, high = 3, and pr pointing to the array of values { 0.7, 0.0, 0.1, 0.0, 0.0, 0.2 }. The calling program must also provide pointers to lambda and K; the subroutine will then calculate the values of these two parameters. In this example, lambda=0.330 and K=0.154. The parameters lambda and K can be used as follows. Suppose we are given a length N random sequence of independent letters. Associated with each letter is a score, and the probabilities of the letters determine the probability for each score. Let S be the aggregate score of the highest scoring contiguous segment of this sequence. Then if N is sufficiently large (greater than 100), the following bound on the probability that S is greater than or equal to x applies: P( S >= x ) <= 1 - exp [ - KN exp ( - lambda * x ) ]. In other words, the p-value for this segment can be written as 1-exp[-KN*exp(-lambda*S)]. This formula can be applied to pairwise sequence comparison by assigning scores to pairs of letters (e.g. amino acids), and by replacing N in the formula with N*M, where N and M are the lengths of the two sequences being compared. In addition, letting y = KN*exp(-lambda*S), the p-value for finding m distinct segments all with score >= S is given by: 2 m-1 -y 1 - [ 1 + y + y /2! + ... + y /(m-1)! ] e Notice that for m=1 this formula reduces to 1-exp(-y), which is the same as the previous formula. *******************************************************************************/ #define MAXIT 150 /* Maximum number of iterations used in calculating K */ karlin(low,high,pr,lambda,K) int low; /* Lowest score (must be negative) */ int high; /* Highest score (must be positive) */ double *pr; /* Probabilities for various scores */ double *lambda; /* Pointer to parameter lambda */ double *K; /* Pointer to parmeter K */ { int i,j,range,lo,hi,first,last; double up,new,Sum,av; register double sum; double *p,*P,*ptrP,exp(); register double *ptr1,*ptr2,*ptr1e; char *calloc(); /* Check that scores and their associated probabilities are valid */ if (low>=0) { fprintf(stderr,"Lowest score must be negative.\n"); return 0; } for (i=range=high-low;i> -low && !pr[i];--i); if (i<= -low) { fprintf(stderr,"A positive score must be possible.\n"); return 0; } for (sum=i=0;i<=range;sum+=pr[i++]) if (pr[i]<0) { fprintf(stderr,"Negative probabilities not allowed.\n"); return 0; } if (sum<0.99995 || sum>1.00005) fprintf(stderr,"Probabilities sum to %.4f. Normalizing.\n",sum); p= (double *) calloc(range+1,sizeof(double)); for (Sum=low,i=0;i<=range;++i) Sum+=i*(p[i]=pr[i]/sum); if (Sum>=0) { fprintf(stderr,"Invalid (non-negative) expected score: %.3f\n",Sum); free(p); return 0; } /* Calculate the parameter lambda */ up=0.5; do { up*=2; ptr1=p; for (sum=0,i=low;i<=high;++i) sum+= *ptr1++ * exp(up*i); } while (sum<1.0); for (*lambda=j=0;j<25;++j) { new=(*lambda+up)/2.0; ptr1=p; for (sum=0,i=low;i<=high;++i) sum+= *ptr1++ * exp(new*i); if (sum>1.0) up=new; else *lambda=new; } /* Calculate the pamameter K */ ptr1=p; for (av=0,i=low;i<=high;++i) av+= *ptr1++ *i*exp(*lambda*i); if (low == -1 || high == 1) { *K = high==1 ? av : Sum*Sum/av; *K *= 1.0 - exp(- *lambda); free(p); return 1; /* Parameters calculated successfully */ } Sum=lo=hi=0; P= (double *) calloc(MAXIT*range+1,sizeof(double)); /* Changed from 0.00001 to 0.0001 by Webb Miller on 8/20/91, following the suggestion of Stephen Altschul. */ for (*P=sum=j=1;j<=MAXIT && sum>0.0001;Sum+=sum/=j++) { first=last=range; for (ptrP=P+(hi+=high)-(lo+=low);ptrP>=P;*ptrP-- =sum) { ptr1=ptrP-first; ptr1e=ptrP-last; ptr2=p+first; for (sum=0;ptr1>=ptr1e;) sum+= *ptr1-- * *ptr2++; if (first) --first; if (ptrP-P<=range) --last; } for (sum=0,i=lo;i;++i) sum+= *++ptrP * exp(*lambda*i); for (;i<=hi;++i) sum+= *++ptrP; } if (j>MAXIT) fprintf(stderr,"Value for K may be too large due to insufficient iterations.\n"); for (i=low;!p[i-low];++i); for (j= -i;i1;) if (p[++i-low]) j=gcd(j,i); *K = (j*exp(-2*Sum))/(av*(1.0-exp(- *lambda*j))); free(p); free(P); return 1; /* Parameters calculated successfully */ } gcd(a,b) int a,b; { int c; if (b<0) b= -b; if (b>a) { c=a; a=b; b=c; } for (;b;b=c) { c=a%b; a=b; } return a; } /* ---------------------------------------------------------------------------*/ /* print_msp - display the i-th MSP */ print_msp(i) int i; { long len, pos1, pos2; Msp_ptr mp = msp[i]; double prob, significance(); pos1 = mp->pos1; pos2 = mp->pos2; len = mp->len; prob = significance(mp->score); printf("a {\n s %d\n b %d %d\n e %d %d\n l %d %d %d %d %1.1lf\n}\n", mp->score, pos1+1, pos2+1, pos1+len, pos2+len, pos1+1, pos2+1, pos1+len, pos2+len, 100*prob); } double significance(score) int score; { double y, exp(), N; N = ((double)len1)*((double)len2); y = -param_lambda*(score); y = param_K*N*exp(y); return exp(-y); } /* ---------------------------- utility routines -------------------------*/ /* fatal - print message and die */ fatal(msg) char *msg; { fprintf(stderr, "%s\n", msg); exit(1); } /* fatalf - format message, print it, and die */ fatalf(msg, val) char *msg, *val; { fprintf(stderr, msg, val); putc('\n', stderr); exit(1); } /* ckopen - open file; check for success */ FILE *ckopen(name, mode) char *name, *mode; { FILE *fopen(), *fp; if ((fp = fopen(name, mode)) == NULL) fatalf("Cannot open %s.", name); return(fp); } /* ckalloc - allocate space; check for success */ char *ckalloc(amount) int amount; { char *malloc(), *p; if ((p = malloc( (unsigned) amount)) == NULL) fatal("Ran out of memory."); return(p); }