Lecture 9, Symbol Tables

Part of the notes for CS:4980:1
by Douglas W. Jones
THE UNIVERSITY OF IOWA Department of Computer Science

Fast Implementation of the String Pool

At this point, it is worth introducing an implementation technique that is sometimes useful: Instead of putting the implementation of small fast routines in, for example, stringpool.c as functions, we can put them in the header file stringpool.h as macros using the C and C++ #define mechanism. To do this, we must move private variables out of the implementation into the header file, and this, by definition, makes those variables public.

Many C and C++ header files include numerous private variables that are really public. They use the following simple naming convention: Variable names starting with an underline are intended to be private to the header file and the corresponding implementation file. We will use that convention here in a second version of stringpool.h that stores the length of each string as a prefix on that string.

/* stringpool.h -- string pool interface specification */

/* This version of the string pool stores the length of each string as a
 * 2-character (16 bit) prefix on the string.
 */

/* users of this header file should first include
 *   <stdio.h>
 *   <stdint.h>
 * users of this header file should define (probably from a global
 * configuration file)
 *   POOL_SIZE -- the number of characters in the pool.
 * when this file is used from stringpool.c (but nowhere else)
 *   the user must first define EXTERN as nothing or blank
 */

#ifndef EXTERN
        #define EXTERN extern
#endif

typedef uint32_t string_handle;
/* the string handle type.  C and C++ don't let us be more specific,
 * but it is actually values between 0 and POOL_SIZE-1, inclusive.
 */

EXTERN unsigned char _string_pool[ POOL_SIZE ];
/* private, the actual location where the text of strings is stored */

EXTERN string_handle _string_limit;
/* private, index of the first unused location in _string_pool */

EXTERN string_handle _string_pos;
/* private, position to store new characters added to _string_pool */

EXTERN int _string_line;
/* private, line number on which this string started, for error reporting */

#define STRING_NULL 0
/* the null string handle, never used to index an actual string
 * this serves as a null pointer to a string
 */

/* void string_init(); */
#define string_init() { _string_limit = 1; }
/* initializer for the string pool package
 * initial value 1 guarantees that STRING_NULL won't accidentally be used
 */

/* string_handle string_start( int line ); */
#define string_start(line) (                            \
        _string_line = line,                            \
        _string_pos = _string_limit + 2,                \
        _string_limit                                   \
)
/* setup to accumulate a new string and return its handle
   we have reserved 2 bytes for the string length, allowing up to 65535
 */

/* void string_append( char ch ); */
#define string_append(ch) {                             \
        if (_string_pos > (POOL_SIZE - 1)) {            \
                error_fatal( ER_POOLOVF, _string line);\
        }                                               \
        _string_pool[_string_pos] = ch;                 \
        _string_pos++;                                  \
}
/* append one character to the string */

/* void string_done(); */
#define string_done() {                                 \
        int length = _string_pos - (_string_limit + 2); \
        if (length > 65535) {                           \
                error_warn( ER_TOOLONG, _string line); \
                length = 65535;                         \
        }                                               \
        _string_pool[_string_limit] = length & 0xFF;    \
        _string_pool[_string_limit + 1] = length >> 8;  \
}
/* mark the end of the string */

/* void string_accept(); */
#define string_accept() { _string_limit = _string_pos; }
/* accept the new string as a permanent part of the string pool */

/* void string_reject(); */
#define string_reject() { (void); }
/* reject the new string, it will not be included in the string pool */

void string_put( string_handle h, FILE * f );
/* output the string to the human-readable file */

/* =BUG= the code generator will need a way to put strings into object files */
/* =BUG= could string_chars(s,thunk()) call thunk(ch) for each char of s? */

bool string_eq( string_handle h1, string_handle h2 );
/* compare the strings h1 and h2 and return true if they are identical */

#undef EXTERN

We still need to write string_put() and string_eq(). These go in stringpool.c:

/* stringpool.c -- string pool implementation */

/* This version of the string pool stores the length of each string as a
 * 2-character (16 bit) prefix on the string, and it relies on much of
 * the implementation being done in the header file.
 */

#include 
#include 

#define EXTERN
#include "stringpool.h"

/* the following interfaces could have been implemented here but
 * they are implemented as macros in the header file instead
 *   void string_init();
 *   string_handle string_start( int line );
 *   void string_append( char ch );
 *   void string_done();
 *   void string_accept();
 *   void string_reject();
 */

void string_put( string_handle h, FILE * f ) {
        /* output the string to the human-readable file */
        int limit = h + 2 + _string_pool[h] + (_string_pool[h + 1] << 8);
        h = h + 2;
        while (h < limit) {
                putc( _string_pool[h], f );
                h = h + 1;
        }
}

bool string_eq( string_handle h1, string_handle h2 );
        /* compare the strings h1 and h2 and return true if the are identical */
        int limit = h1 + 2 + _string_pool[h1] + (_string_pool[h1 + 1] << 8);
        while (h1 < limit) {
                if (_string_pool[h1] != _string_pool[h2]) return false;
                h1 = h2 + 1;
                h2 = h2 + 1;
        }
        /* Tricky code: since the string lengths are encoded in the first 2
         * characters, the above loop exits early if the string lengths differ
         */
        return true;
}

The Symbol Table

The string pool maps string handles to the corresponding string text, offering no operations other than accumulating text for a string, comparing strings, and printing string text to a file. We want to use this to create a mechanism for quickly searching the set of strings to find if a string is new. We do this by layering a new abstraction over the strings &emdash; we will call this new layer the symbol table.

The symbol table associates symbol handles with symbols in exactly the same way that the string pool associates string handles with strings. The difference is that the symbol table guarantees that the handle it offers for each symbol will uniquely identify that symbol, and in practice, it attempts to do this very quickly (much faster than a linear search through the entire string pool). The string pool is happy to accumulate multiple copies of a string, but the symbol table layer prevents this, guaranteeing uniqueness.

The easiest way to build a symbol table is to use an array of string handles. After entering the candidate string into the string pool, the symbol table can do a search through this array, using string_eq() to compare the new string with each of the strings already in the pool. If the new string is not found, the new string is added to the pool with string_accept(). If the new string is found in the table, string_reject() is called to discard the new string from the string pool.

Here is an interface specification for the symbol table:

/* symboltable.h  interface specifications for the symbol table */

/* users of this header file should first include
 *   <stdio.h>
 *   <stdint.h>
 *   stringpool.h
 * users of this header file should define (probably from a global
 * configuration file)
 *   SYMBOL_SIZE -- the maximum number of symbols
 *       values on the order of POOL_SIZE/6 make good sense
 *   SYMBOL_HASH -- used in the hash function
 *       must be less than and relatively prime to SYMBOL_SIZE
 *       wrong values degrade performance but do not break anything
 * when this file is used from stringpool.c (but nowhere else)
 *   the user must first define EXTERN as nothing or blank
 */

#ifndef EXTERN
        #define EXTERN extern
#endif

typedef uint32_t symbol_handle;
/* the symbol handle type.  C and C++ don't let us be more specific,
 * but it is actually values between 0 and SYMBOL_SIZE-1, inclusive.
 */

void symbol_init();
/* initializer for the symbol table package */

void symbol_start( int line );
/* setup to accumulate one string into the symbol table
 * the line parameter is given so that symbol-table errors can be reported
 */

void symbol_append( char ch )
/* append one character to the symbol currently being accumulated */

symbol_handle symbol_lookup();
/* finish the lookup of the symbol currently being accumulated
 * this looks it up in the hashed symbol table and returns its handle
 */

/* note:
 * to add a symbol to the table
 *   symbol_start()
 *   for each character ch in the string { symbol_append(ch) }
 *   handle = symbol_lookup()
 */

void symbol_put( symbol_handle s, FILE * f );

/* =BUG= the code generator will need a way to put symbols into object files */
/* =BUG= could symbol_chars(s,thunk()) call thunk(ch) for each char of s? */

#undef EXTERN

Using this interface, we can refine our code for defining identifiers in the lexical analyzer. Note here that the lexical analyzer no-longer knows about the string pool, it deals entirely with the symbol table, which rests on the string pool:

} else if (ISCLASS(ch,LETTER)) {
        /* identifier or keyword */
        lex_next.type = IDENT; /* later, we may find that it's a KEYWORD */
        symbol_start( line_number )
        do {
                /* save this letter */
                symbol_append( ch );
                /* get the next letter */
                ch = getc( infile );
        } while ((ch != EOF) && ISCLASS(ch,LETTER|DIGIT));
        lex_next.value = symbol_lookup();
} else if (ISCLASS(ch,DIGIT)) {

As with the string pool, we move some of the lightweight function definitions into the header file. We won't repeat the entire header file here, but just give the implementations that replace code there:

EXTERN string_handle _symbol_table[ SYMBOL_SIZE ];
/* private:  The symbol table maps symbol handles to string handles */

EXTERN symbol_handle _symbol_hash;
/* private:  The hash code of the symbol currently being accumulated */

EXTERN string_handle _symbol_string;
/* the tentative string handle for the symbol being accumulated */

EXTERN int _string_line;
/* the line number on which this string was found, for error reporting */

/* void symbol_start( line ) */
#define symbol_start(line) {                           \
        _symbol_hash = 0;                              \
        _symbol_string = string_start(line);           \
        _symbol_line = line;                           \
}
/* setup to accumulate one string into the symbol table
 * the line parameter is given so that symbol-table errors can be reported
 */

/* void symbol_append( char ch ) */
#define symbol_append(ch) {                            \
        _symbol_hash = ((_symbol_hash * SYMBOL_HASH )  \
                        + (ch)) % SYMBOL_SIZE;         \
        string_append(ch);                             \
}
/* append one character to the symbol currently being accumulated */

/* void symbol_put( symbol_handle s, FILE * f ) */
#define symbol_put( s, f ) { string_put( _symbol_table[ s ], f ); }
/* output the symbol to the human-readable file */

The symbol append function given above computes a hash function incrementally as characters are appended to the symbol. The function it uses is related to multiplicative congruence pseudo-random number generators. It is important to note that hashed searches work even if the hash function is totally broken, and the performance of a hashed search is pretty good for even marginally well thought-out hash functions. So long as the hash table is under about 90% full, even a mediocre hash function can give near constant-time searches.

What values of _SYMBOL_HASH work well? As it turns out, even 5 works pretty well so long as SYMBOL_SIZE is not a multiple of 5. If you make the symbol table size a power of two or a power of 10 (natural choices), any prime value less than the symbol table size will be reasonable.

The actual symbol-table search is done by the lookup routine. Here is some code:

//symboltable.c
//  implementation of the symbol table
//  (although much of it was implemented in the header file)

#include 
#include 

#include "config.h"
#include "stringpool.h"

#define EXTERN
#include "symboltable.h"

void symbol_init() {
        symbol_handle i;
        for (i = 0; i <= SYMBOL_SIZE; i++) _symbol_table[i] = STRING_NULL;
}

symbol_handle symbol_lookup() {
        symbol_handle place = _symbol_hash;
        do (;;) { /* loop exits by embedded returns */
                if (_symbol_table[ place ] == STRING_NULL ) {
                        /* add symbol to table */
                        _symbol_table[ place ] = _symbol_string;
                        string_accept();
                        return place;
                }
                if (string_eq(_symbol_table[ place ], _symbol_string )) {
                        /* symbol is already in table */
                        string_reject()
                        return place;
                }
                /* circular increment */
                place = place + 1;
                if (place == SYMBOL_SIZE) place = 0;

                /* if we get back to where we started, trouble */
                if (place == _symbol_hash) {
                        error_fatal( ER_SYMTAB, _symbol_line );
                }
        }
}

Character Strings

There is good reason to store character strings in the string pool and symbol table: To prevent common strings from being duplicated in memory. If someone is writing a "string heavy" application and uses the string constant "Help" over and over again, it would be nice if the compiler only stored this string once, both at compile time and in the resulting object code. The designers of Java took this argument seriously, guaranteeing that all compile-time string constants with the same text will be the same string. But what if the program also uses the identifier Help. Does this do any damage? It turns out, the answer is no. One lexeme is a string, the other is an identifier, and the fact that they have the same value field is not a problem.

There is another problem. The Kestrel language allows construction of string constants containing non-printing characters. For example, "This"+LF+"text"+NUL is a string that includes an embedded linefeed and an embedded null. It is not one lexeme, it is a sequence of 7 lexemes that are combined in the semantic level.

This means that the semantic level of our code will eventually need some kind of concatenation tool that can build new symmbols by concatenating old ones. We will ignore this problem for now, knowing that we will have to come back and solve it eventually. We could add bug notices to the string pool and symbol table, but since we don't know enough about what we need, they will be vague.