/*
 * sim3.driver.c - a sample program for calling sim3.
 *
 * Nov. 4, 1995.
 *
 * The command syntax is
 *     sim3 file1 file2 [dist_limit]
 * where file1 and file2 contain arbitrary sequences of characters.  It is
 * assumed that the two character strings are very similar.  Sim3 reports
 * an alignment of the two sequences.
 *
 * If dist_limit > 0, a heuristic phase 1 is used.
 * If dist_limit <= 0, an exact phase 1 is used.
 * Absolute(dist_limit) specifies the tolerable distance limit.
 * (Zero means a default distance limit is used.)
 *
 * For example,
 *     sim3 file1 file2       (A heuristic phase 1 is used with default limit.)
 *     sim3 file1 file2 500   (A heuristic phase 1 is used with limit=500.)
 *     sim3 file1 file2 0     (An exact phase 1 is used with default limit.)
 *     sim3 file1 file2 -500  (An exact phase 1 is used with limit=500.)
 */

#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

#define SUB 0
#define INS 1
#define DEL 2

#define DEF_LIMIT 1000

/* Edit_script is the internal representation of an alignment. */
typedef struct edit_script {
    char op_type;               /* SUB, INS, or DEL */
    int num;                    /* Number of operations */
    struct edit_script *next;
} edit_script;

/* Provided by the sim3 implementation (separate translation unit). */
extern edit_script *sim3(char *, int, char *, int, int, int *);
extern void Print_Script(edit_script *, int, int);
extern void Script_free(edit_script *);

int get_seq(char *file_name, char **seqptr);
void fatal(char *msg);
void fatalf(char *msg, char *val);
FILE *ckopen(char *name, char *mode);
char *ckalloc(int amount);

/*
 * parse_limit - convert the dist_limit command-line argument to an int.
 *
 * Unlike atoi(), malformed or out-of-range input terminates the program
 * instead of silently becoming 0 (which would select the exact phase 1).
 * INT_MIN is rejected because the absolute value of the limit is what
 * matters and -INT_MIN is not representable.
 */
static int parse_limit(char *arg)
{
    char *end;
    long value;

    errno = 0;
    value = strtol(arg, &end, 10);
    if (end == arg || *end != '\0' || errno == ERANGE ||
        value > INT_MAX || value < -INT_MAX)
        fatalf("Invalid dist_limit: %s", arg);
    return (int)value;
}

/*
 * main - read the two sequence files, align them with sim3 and print
 * the sequence lengths, the distance and the resulting edit script.
 */
int main(int argc, char *argv[])
{
    char *A, *B;
    int M, N;
    int limit = DEF_LIMIT;
    int dist = 0;
    edit_script *head;

    if (argc < 3)
        fatalf("%s sequence1 sequence2 [dist_limit]", argv[0]);

    M = get_seq(argv[1], &A);
    N = get_seq(argv[2], &B);
    if (argc > 3)
        limit = parse_limit(argv[3]);

    head = sim3(A, M, B, N, limit, &dist);

    printf("seq1=%s, length=%d\n", argv[1], M);
    printf("seq2=%s, length=%d\n", argv[2], N);
    printf("distance=%d\n", dist);

    /* Print out edit scripts */
    printf("\n************ Edit Scripts ************\n");
    Print_Script(head, M, N);
    printf("**************************************\n");

    Script_free(head);
    free(A);
    free(B);
    return 0;
}

/*
 * get_seq - read a sequence.
 *
 * Opens file_name, stores the sequence it contains in a freshly
 * allocated, NUL-terminated buffer returned through *seqptr (the caller
 * owns it and releases it with free()), and returns the sequence length.
 *
 * Accepted input:
 *   - Newlines are ignored; the sequence may span several lines.
 *   - Until the first sequence character has been stored, any line that
 *     does not start with an uppercase letter is skipped entirely
 *     (presumably to tolerate header lines in front of the data).
 *   - Every other character must be an uppercase letter; anything else
 *     is reported on stderr and terminates the program with status 1.
 *
 * The file is read one character at a time.  The previous implementation
 * called fgets() with the full buffer size after the write pointer had
 * advanced, which could write past the end of the allocation and could
 * loop forever on very short files.
 */
int get_seq(char *file_name, char **seqptr)
{
    FILE *qp;
    char *p, *end;
    int c, n;
    int at_line_start = 1;
    int skipping = 0;

    qp = ckopen(file_name, "r");

    /* First pass: count the non-newline characters to size the buffer. */
    n = 0;
    while ((c = getc(qp)) != EOF) {
        if (c == '\n')
            continue;
        if (n == INT_MAX - 1)       /* n + 1 must still fit in an int */
            fatalf("Sequence file %s is too large.", file_name);
        ++n;
    }

    *seqptr = ckalloc(n + 1);
    rewind(qp);

    /* Second pass: copy the sequence characters. */
    p = *seqptr;
    end = p + n;                    /* one slot past here holds the '\0' */
    while ((c = getc(qp)) != EOF) {
        if (c == '\n') {
            at_line_start = 1;
            skipping = 0;
            continue;
        }
        if (at_line_start) {
            at_line_start = 0;
            /* Lines ahead of the sequence that are not data are dropped. */
            if (p == *seqptr && !isupper(c))
                skipping = 1;
        }
        if (skipping)
            continue;
        if (!isupper(c)) {
            fprintf(stderr, "Illegal character %c in file %s.\n",
                    c, file_name);
            exit(1);
        }
        /* Only reachable if the file grew between the two passes. */
        if (p == end)
            fatalf("File %s changed while it was being read.", file_name);
        *p++ = (char)c;
    }
    fclose(qp);

    *p = '\0';
    return (int)(p - *seqptr);
}

/* fatal - print message and die */
void fatal(char *msg)
{
    fprintf(stderr, "%s\n", msg);
    exit(1);
}

/*
 * fatalf - format message, print it, and die
 *
 * msg is used directly as a printf format and must therefore be a
 * trusted string containing exactly one %s conversion, which is filled
 * in with val.
 */
void fatalf(char *msg, char *val)
{
    fprintf(stderr, msg, val);
    putc('\n', stderr);
    exit(1);
}

/* ckopen - open file; check for success (dies via fatalf on failure) */
FILE *ckopen(char *name, char *mode)
{
    FILE *fp;

    if ((fp = fopen(name, mode)) == NULL)
        fatalf("Cannot open %s.", name);
    return fp;
}

/*
 * ckalloc - allocate space; check for success
 *
 * Returns a pointer to amount bytes of uninitialised storage and never
 * returns NULL: on failure, or if amount is negative, the program is
 * terminated via fatal().
 */
char *ckalloc(int amount)
{
    char *p;

    if (amount < 0)
        fatal("Ran out of memory.");
    /* Ask for at least one byte so malloc(0) returning NULL is not an error. */
    if ((p = malloc(amount > 0 ? (size_t)amount : 1)) == NULL)
        fatal("Ran out of memory.");
    return p;
}