Exercise 6.3 - Cross Referencer

Question

Write a cross-referencer that prints a list of all words in a document, and for each word, a list of the line numbers on which it occurs. Remove noise words like the, and and so on.

/*
 * Write a cross-referencer that prints a list of all words in a document, and
 * for each word, a list of the line numbers on which it occurs. Remove noise
 * words like "the" and "and" so on.
 *
 */

/*
 * 1. Add all the word structures (word structure will have word and line
 * numbers to the tree.
 * 2. A word can occur in more than one line, if the same word is found and the
 * new line number to the line numbers.
 * 3. So line numbers should be a linked list of numbers.
 * 4. Print it.
 * */

#include <ctype.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAXWORD 1000       /* longest word that can be read by mgetword */
#define DEFAULT_COMP_LEN 8 /* default length to compare */

/*
 * tnode: Tree node from K&R2 page 140.  Words are initially read into
 * the tree by getword.
 */
struct tnode {
    char *word;
    int count;
    struct linenumber *linenumbers;
    struct tnode *left;
    struct tnode *right;
};

struct linenumber {
    int *number;
    struct linenumber *nextnumber;
};

/*
 * simroot: Part of a linked list of pointers to simword lists with
 * a common root.
 */
struct simroot {
    struct simword *firstword; /* points to the list of words */
    struct simroot *nextroot;  /* points to the next node in the list */
};

/*
 * simword: Part of a linked list of words with a common root.  Points
 * to the word in the tnodes.
 */
struct simword {
    char *word;               /* points to the word in the tree */
    int count;                /* copied from the tree */
    int linenumber;           /* copied from the tree */
    struct simword *nextword; /* next node */
};

struct tnode *addtree(struct tnode *, char *, int);

void treeprint(const struct tnode *);

int mgetword(char *, int, int *);

struct linenumber *lnumberalloc(void);

struct linenumber *addlinenumber(struct linenumber *, int);

int main(int argc, char *argv[]) {
    struct tnode *root;
    char word[MAXWORD];
    int len;
    int lineno = 0;

    /* get all the words */
    root = NULL;
    while (mgetword(word, MAXWORD, &lineno) != 'x')
        if (isalpha(word[0]))
            root = addtree(root, word, lineno);

    if (argc == 1)
        len = DEFAULT_COMP_LEN;
    else if (argc == 2)
        len = atoi(argv[1]);
    else {
        printf("Incorrect number of arguments.\n");
        return 1;
    }

    printf("Words with line numbers\n");

    treeprint(root); /* prints all the words */

    return 0;
} /* end of main() */

struct tnode *talloc(void);

char *mstrdup(char *);

/* mgetword from Ex6.1 */

#define IN 1
#define OUT 0

int mgetword(char *word, int lim, int *lineno_addr) {
    int c, d, getch(void), comment, string, directive;
    void ungetch(int);
    char *w = word;

    comment = string = directive = OUT;

    while (isspace(c = getch())) {
        if (c == '\n') {

            *lineno_addr = *lineno_addr + 1;
        }
    }

    /* Check if inside a comment */

    if (c == '/') {
        if ((d = getch()) == '*') {
            comment = IN;
        } else {
            comment = OUT;
            ungetch(d);
        }
    }

    /* Check if inside a quote */

    if (c == '\"') {
        string = IN;
    }

    /* Check if inside a directive */

    if (c == '#') {
        directive = IN;
    }

    if (c == '\\') {
        c = getch(); /* ignore the \\ character */
    }

    if (comment == OUT && string == OUT && directive == OUT) {

        if (c != EOF)
            *w++ = c;

        if (!isalnum(c) && c != '_') {
            *w = '\0';
            return c;
        }

        for (; --lim > 0; w++) {
            *w = getch();
            if (!isalnum(*w) && *w != '_') {
                ungetch(*w);
                break;
            }
        }
        *w = '\0';
        return word[0];
    } else if (comment == IN) {
        *w++ = c;
        *w++ = d;

        while ((*w++ = c = getch())) {
            if (c == '*') {
                if ((c = getch()) == '/') {
                    *w++ = c;
                    comment = OUT;
                    break;
                } else {
                    ungetch(c);
                }
            }
        }
        *w = '\0';

    } else if (string == IN) {
        *w++ = c;
        while ((*w++ = getch()) != '\"') {
            if (*w == '\\') /* Take care of escaped quotes */
                *w++ = getch();
        }
        string = OUT;
        *w = '\0';
    } else if (directive == IN) {
        *w++ = c;
        while ((*w++ = getch()) != '\n') {
            if (c == '\\') { /* Take care of continuation line escape */
                *w++ = getch();
            }
        }
        directive = OUT;
        *w = '\0';
    }

    return c;
}

/***************************************************************************
 *                    All code below here is from K&R2.                    *
 ***************************************************************************/

/*
 * addtree: From K&R2 page 141.
 * Adds a node containing w, at or below node p.
 */
struct tnode *addtree(struct tnode *p, char *w, int linenumber) {
    int cond;

    if (p == NULL) { /* new word */
        p = talloc();
        p->word = mstrdup(w);
        p->count = 1;
        p->linenumbers = NULL;
        p->linenumbers = addlinenumber(p->linenumbers, linenumber);
        p->left = p->right = NULL;
    } else if ((cond = strcmp(w, p->word)) == 0) {
        p->count++;
        p->linenumbers = addlinenumber(p->linenumbers, linenumber);
    } else if (cond < 0)
        p->left = addtree(p->left, w, linenumber);
    else
        p->right = addtree(p->right, w, linenumber);
    return p;
}

struct linenumber *addlinenumber(struct linenumber *p, int linenumber) {
    if (p == NULL) {
        p = lnumberalloc();
        p->number = linenumber;
        p->nextnumber = NULL;
    } else {
        p->nextnumber = addlinenumber(p->nextnumber, linenumber);
    }

    return p;
}

struct linenumber *lnumberalloc(void) {
    return (struct linenumber *) malloc(sizeof(struct linenumber));
}

/* treeprint: From K&R2 page 142. Prints tree p in-order. */
void treeprint(const struct tnode *p) {
    if (p != NULL) {
        treeprint(p->left);
        printf("\n%s :", p->word);
        printnumbers(p->linenumbers);
        treeprint(p->right);
    }
}

void printnumbers(const struct linenumber *p) {
    if (p != NULL) {
        printf("%d,", p->number);
        printnumbers(p->nextnumber);
    }
}

/* talloc: From K&R2 page 142. Makes a tnode. */
struct tnode *talloc(void) {
    return (struct tnode *) malloc(sizeof(struct tnode));
}

/* strdup: From K&R2 page 143. Makes a duplicate of s. */
char *mstrdup(char *s) {
    char *p;
    p = (char *) malloc(strlen(s) + 1);
    if (p != NULL)
        strcpy(p, s);
    return p;
}

/*
 * getch and ungetch are from K&R2, page 79
 */
#define BUFSIZE 100

char buf[BUFSIZE]; /* buffer for ungetch() */
int bufp = 0;      /* next free position in buf */

int getch(void) { /* get a (possibly pushed back) character */
    return (bufp > 0) ? buf[--bufp] : getchar();
}

void ungetch(int c) { /* push character back on input */
    if (bufp >= BUFSIZE)
        printf("ungetch: too many characters\n");
    else
        buf[bufp++] = c;
    return;
}

Explanation

Here is an example execution of this program.

This is a
cross reference
word
document
creator
lists words and their line numbers.
Gets the word and puts their line numbers.
x

Words with line numbers

Gets :6,
This :0,
a :0,
and :5,6,
creator :4,
cross :1,
document :3,
is :0,
line :5,6,
lists :5,
numbers :5,6,
puts :6,
reference :1,
the :6,
their :5,6,
word :2,6,
words :5,