/* Reads a file of lines of the form * c id1 id2 i j len1 len2 * where "c" is either 'S' (same) or 'O' (opposite), "id1" and "id2" * are each integers in the range 1 to 999999, "i" and "j"" are integer of * magnitude at most max(len1,len2) and len1 and len2 are positive ints. * This program computes the connected components of a graph with vertices * 1-999999 and the given edges. Command syntax is: * genedata filename */ #include /* #include "overlap.h" */ #define HASHSIZE 10001 #define NSEQS 18657 struct neighbor { struct seq *nhbr; char orientation; int i, j; struct neighbor *next; }; struct seq { int id; int len; struct neighbor *nhbrs; struct seq *next; } *hashtab[HASHSIZE]; #define MILLION 1000000 #define MAX_COMPONENT 10000 int v[MILLION]; int nedges; struct seq *component[MAX_COMPONENT]; layout(comp_size) int comp_size; { int i, id; struct neighbor *n; for (i = 0; i < comp_size; ++i) printf("F %d %d\n", component[i]->id, component[i]->len); for (i = 0; i < comp_size; ++i) { id = component[i]->id; for (n = component[i]->nhbrs; n; n = n->next) printf("%c %d %d %d %d\n", n->orientation, id, n->nhbr->id, n->i, n->j); } printf("E\n"); } components() { int i, k, j, size, rt; /* Make every tree have depth 1. */ for (i = 1; i < MILLION; ++i) if (v[i] > 0 && v[v[i]] > 0) compress(i, root(i)); /* Arrange each component as an increasing chain, where v[root]-MILLION * points to last entry. Assumes root is not the smallest entry. */ for (i = 1; i < MILLION; ++i) if (v[i] > 0 && v[i] < MILLION) { /* skip roots */ if ((k = v[v[i]]) > 0) /* not smallest entry */ v[k-MILLION] = i; v[v[i]] = MILLION + i; } /* Insert root in each chain and process the component. */ for (i = 1; i < MILLION; ++i) if (v[i] > 0) { /* point k to last entry in chain before the root */ for (size = 2, k = i; v[v[k]] < MILLION; k = v[k]) ++size; rt = v[k]; if (rt > k) v[rt] = 0; else { v[k] = 0; for (j = i; v[j] < rt; j = v[j]) ; v[rt] = v[j]; v[j] = rt; } /* PROCESS THE COMPONENT */ /* print_chain(i, size); */ build_layout(i, size); /* Mark the component as processed. */ for (j = i; (k = v[j]) != 0; j = k) v[j] = 0; } } build_layout(i, size) int size; { int j; struct seq *find_seq(); if (size > MAX_COMPONENT) fatal("Component too big."); for (j = 0; i != 0; i = v[i], ++j) if ((component[j] = find_seq(i)) == NULL) fatal("Did not find component entry."); if (j != size) fatal("Impossible layout."); layout(size); } /* print_chain(i, size) int i, size; { int j, k; printf("%6d:", size); for (j = 0, k = i; k != 0; k = v[k], ++j) { if (j == 9) { putchar('\n'); j = 0; } printf("\t%6d", k); } putchar('\n'); } */ report() { int k, m, size, maxsize, total, ncomponents, norphs; static int n[] = {1, 3, 5, 10, 20, 50, 100, 250, 1000, 10000, MILLION}; int many[(sizeof n) / (sizeof n[0])]; /* one for each value in n[] */ int nbr_n = (sizeof n) / (sizeof n[0]); for (m = 1; m < nbr_n; ++m) many[m] = 0; ncomponents = total = maxsize = 0; for (k = 1; k < MILLION; ++k) if (v[k] < 0) { ++ncomponents; size = -v[k] + 1; total += size; if (size > maxsize) maxsize = size; for (m = 1; m < nbr_n; ++m) if (n[m] >= size) { ++many[m]; break; } } /* fprintf(stderr, "%d edges and %d non-trivial components totaling %d entries.\n", nedges, ncomponents, total); norphs = NSEQS - total; fprintf(stderr, "%d orphans, for a total of %d bins\n", norphs, norphs + ncomponents); fprintf(stderr, "The maximum component size is %d.\n", maxsize); for (m = 1; m < nbr_n; ++m) if (many[m]) fprintf(stderr,"%d components of size between %d and %d\n", many[m], n[m-1]+1, n[m]); */ } main(argc, argv) int argc; char **argv; { if (argc != 2) fatalf("%s filename", argv[0]); get_input(argv[1]); /* { int i; struct seq *s; struct neighbor *n; for (i = 0; i < HASHSIZE; ++i) for (s = hashtab[i]; s; s = s->next) { printf("%d %d ->", s->id, s->len); for (n = s->nhbrs; n; n = n->next) printf(" %d(%c,%d,%d)", n->nhbr->id, n->orientation, n->i, n->j); putchar('\n'); } exit(0); } */ report(); components(); exit (0); } edge(i, j) int i, j; { int iroot, jroot, r; iroot = root(i); jroot = root(j); /* make the root of the smaller tree a son of the root of the larger */ if (iroot == jroot) r = iroot; else if (v[jroot] < v[iroot] /* these v-values are negative */ || (v[jroot] == v[iroot] && jroot > iroot)) { /* * The previous line guarantees that a root is never the * smallest entry of a component. This helps us in delivering * the components in order of smallest entry. */ v[jroot] += (v[iroot]-1); v[iroot] = r = jroot; } else { v[iroot] += (v[jroot]-1); v[jroot] = r = iroot; } compress(i, r); compress(j, r); } int root(i) int i; { while (v[i] > 0) i = v[i]; return i; } compress(i, root) int i, root; { int k; if (i != root) for ( ; (k = v[i]) != root; i = k) v[i] = root; } get_input(filename) char *filename; { FILE *fp, *ckopen(); int id1, id2, i, j, len1, len2, maxlen; char orientation; struct seq *s1, *s2, *add_seq(); fp = ckopen(filename, "r"); while (fscanf(fp, "%c %d %d %d %d %d %d\n", &orientation, &id1, &id2, &i, &j, &len1, &len2) != EOF) { if (orientation != 'S' && orientation != 'O') fatalf("%c is an improper orientation.", orientation); if (id1 <= 0 || id1 >= MILLION) fatalf("%d is an improper ID.", id1); if (id2 <= 0 || id2 >= MILLION) fatalf("%d is an improper ID.", id2); maxlen = (len1 > len2) ? len1 : len2; if (i < -maxlen || i > maxlen) fatalf("i = %d is out of range", i); if (j < -maxlen || j > maxlen) fatalf("j = %d is out of range", j); if (len1 <= 0 || len2 <= 0) fatal("sequence lengths must be positive."); edge(id1, id2); s1 = add_seq(id1, len1); s2 = add_seq(id2, len2); add_edge(s1, s2, orientation, i, j); } } add_edge(s1, s2, orientation, i, j) struct seq *s1, *s2; char orientation; int i, j; { struct neighbor *n; char *ckalloc(); ++nedges; n = (struct neighbor *)ckalloc(sizeof *n); n->orientation = orientation; n->i = i; n->j = j; n->nhbr = s2; n->next = s1->nhbrs; s1->nhbrs = n; } int hashval(n) int n; { return n % HASHSIZE; } struct seq *add_seq(id, len) { struct seq *s, *find_seq(); char *ckalloc(); int h; if ( (s = find_seq(id)) != NULL) if (len != s->len) fatalf("sequence %d has inconsistent lengths", id); else return s; s = (struct seq *)ckalloc(sizeof *s); s->id = id; s->len = len; s->nhbrs = NULL; h = hashval(id); s->next = hashtab[h]; hashtab[h] = s; return s; } struct seq *find_seq(id) int id; { struct seq *n; for (n = hashtab[hashval(id)]; n; n = n->next) if (n->id == id) break; return n; }