Logo Search packages:      
Sourcecode: aime version File versions  Download package

lexer.cpp

/******************************************************************************
 **
 ** Lexer - a package with procedures to get tokens from a file and place 
 **         all pertinent information about it in the token record. The 
 **         reason this is used is to provide some abstraction between the 
 **         text of the file and the parser. Only these functions have to 
 **         worry about file IO and string formats, while the parser gets 
 **         all file information through the lexer. This provides some 
 **         abstraction between the file and the parser, which will make 
 **         the parser less complex in the long run. All token records 
 **         should be freed from memory after they are used to save on memory 
 **         usage by the generate function
 **
 **         There are a few things you should know when using this package.
 **         First thing it does is check the first character at the file
 **         pointer. If it finds an alpha, it will assume it is some sort
 **         of identifier defined in the Token_Names table. It will keep
 **         getting characters that fit the identifier definition defined
 **         below. If the first character is a number, it will assume that
 **         it is a numeric and keep getting characters until it doesn't find
 **         a numerical character anymore. (It will not get floating point
 **         or any other non-integer numbers) If it finds any other sort of
 **         character it will just grab that one character. If you want the
 **         lexer to grab more than one, you will have to hard code in a
 **         look-ahead yourself. An example of this would be wanting to pass
 **         back '++' as a single token.
 **
 **         If the token is not a number, it will look up the token type. First
 **         thing it does is look in TokenNames. In this table you can
 **         specify strings and associate them with numbers using defines.
 **         for instance, you could do something like this:
 **
 **         char *TokenNames[] = {"ident1", "ident2"};
 **
 **         #define T_IDENT1   1
 **         #define T_IDENT2   2
 **         #define T_LAST     3
 **
 **         T_LAST must be one plus the total number of elements in the
 **         table or you will get a core dump or something worse!
 **         If it does not find the element in any of the tables, it will check
 **         to see if it is one of these listed below:
 **         
 **         Identifiers - must start with an alpha character. After that 
 **                       it can have an alphanumeric or an underscore
 **         Numeric - just numeric characters
 **         String - all between quotes, apostrophes, or carets. Will only 
 **                  grab this if you specify an "until" char for get_token
 **         
 **
 ** Functions provided:
 **       get_token  - function that gets a token from a file and returns a 
 **                    token record,
 **                    defined in gen.h
 **       xfopen     - used to open a file for lexing
 **       free_token - removes the token or strings of tokens (in the case of 
 **                    a description or examine) from memory 
 **
 **
 **
 ** Copyright (C) 2000 George Noel (Slate)
 **
 **   This program is free software; you can redistribute it and/or modify
 **   it under the terms of the GNU General Public License as
 **   published by the Free Software Foundation; either version 2 of the 
 **   License, or any later version. 
 **
 **   This program is distributed in the hope that it will be useful, but 
 **   WITHOUT ANY WARRANTY; without even the implied warranty of 
 **   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 
 **   General Public License for more details. 
 ** 
 **   You should have received a copy of the GNU General Public License 
 **   along with this program (in the docs dir); if not, write to the Free
 **   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
 **    
 ******************************************************************************/

#define LEXER_C

#ifndef CONVERTER
#include "config.h"
#include "sysdep.h"
#include "mudtypes.h"
#include "lexer.h"
#include "newfuncts.h"
#include "memchk.h"
#else
#include "../../include/sysdep.h"
#include "convconfig.h"
#endif

/* Token bookkeeping: a two-generation list so that every token handed out by
   get_token is eventually freed. add_to_stack appends each token to
   new_stack and counts it in num_in_stack. When get_token finds the count
   at 10 it frees everything on old_stack, moves new_stack over to old_stack
   and starts a fresh, empty new_stack. free_all releases both lists. */

token_record *new_stack;    /* head of the list of most recently created
                               tokens, linked through next_tok */
token_record *old_stack;    /* head of the previous generation of tokens; this
                               is the list freed when new_stack fills up */
int           num_in_stack; /* the number of tokens currently in new_stack */
/* root of the binary tree of token names; built on demand by create_tree
   from TokenNames and searched by find_token_data */
token_data    *token_tree = NULL;


/********************************************************************************
 **
 ** xlowercase - builds a newly allocated, lowercase copy of a string. The
 **              original string is not modified.
 **
 ** Parameters: str     - the string to copy (NULL is treated as "")
 **             thesize - the number of chars to allocate for the copy,
 **                       terminator included. Input that does not fit is
 **                       truncated to thesize-1 characters instead of being
 **                       written past the end of the buffer.
 **
 ** ret - a buffer allocated with new char[]; the caller owns it and must
 **       release it with delete[]
 **
 ******************************************************************************/

char *xlowercase (char *str, int thesize)
{
  /* always leave room for at least the terminator */
  size_t capacity = (thesize > 0) ? (size_t) thesize : 1;
  char  *newstring = new char[capacity];
  size_t i = 0;

  if (str != NULL)
  {
    /* the cast keeps negative char values away from tolower, which only
       accepts unsigned char values or EOF */
    for ( ; (str[i] != '\0') && ((i + 1) < capacity); i++)
      newstring[i] = (char) tolower ((unsigned char) str[i]);
  }

  newstring[i] = '\0';
  return newstring;
}


/********************************************************************************
 **
 ** uppercase - converts a string to entirely uppercase, in place. Not really
 **             used anymore much but ill leave it in just in case
 **
 ** Parameters: str - the string to convert (may be NULL)
 **
 ** ret - the same pointer passed in, now in uppercase; NULL if str was NULL
 **
 *******************************************************************************/


char * uppercase (char *str)
{ char *p;

  /* same guard as lowercase() so both helpers tolerate a missing string */
  if (str == NULL)
     return NULL;

  /* cast through unsigned char: the ctype functions are undefined for
     negative char values */
  for (p = str; *p; p++)
    *p = (char) toupper ((unsigned char) *p);
  return str;
}

/********************************************************************************
 **
 ** lowercase - converts a string to entirely lowercase, in place. Unlike
 **             xlowercase it does not allocate a copy.
 **
 ** Parameters: str - the string to convert (may be NULL)
 **
 ** ret - the same pointer passed in, now in lowercase; NULL if str was NULL
 **
 *******************************************************************************/

char * lowercase (char *str)
{ char *p;

  if (str == NULL)
     return NULL;

  /* cast through unsigned char: the ctype functions are undefined for
     negative char values */
  for (p = str; *p; p++)
    *p = (char) tolower ((unsigned char) *p);
  return str;
}


/*******************************************************************************
 **
 ** free_token - frees up the memory a token or string of tokens were occupying
 **
 ** Parameters: tokenptr - the first token record to free; every record
 **                        reachable through its "more" links is freed too.
 **                        NULL is accepted and ignored.
 **
 ** The list is walked with a loop rather than by recursion so that a very
 ** long chain of "more" records cannot exhaust the call stack.
 **
 *******************************************************************************/

void free_token(token_record *tokenptr)
{
   token_record *next_in_list;

   while (tokenptr != NULL)
   {
      /* read the link before the record holding it is released */
      next_in_list = tokenptr->more;
      delete_token_record(tokenptr);
      tokenptr = next_in_list;
   }
}

/*******************************************************************************
 **
 ** free_token_data - releases a token data node together with everything
 **                   below it in the tree
 **
 ** Parameters: tokenptr - the root of the (sub)tree to free; NULL is ignored
 **
 *******************************************************************************/

void free_token_data(token_data *tokenptr)
{
   if (tokenptr != NULL)
   {
      /* children go first (left, then right), the node itself last */
      free_token_data(tokenptr->left);
      free_token_data(tokenptr->right);
      delete_token_data(tokenptr);
   }
}

/*******************************************************************************
 **
 ** free_stack - frees every token on a stack (a list linked by next_tok)
 **
 ** Parameters: tokenptr - the first token record of the stack to free
 **
 ** The caller's own pointer is not changed; callers reset it themselves.
 **
 *******************************************************************************/

void free_stack(token_record *tokenptr)
{
   token_record *current = tokenptr;

   while (current != NULL)
   {
      /* grab the link first, the record is gone after free_token */
      token_record *following = current->next_tok;

      free_token(current);
      current = following;
   }
}

/*****************************************************************************
 **
 ** free_all - frees all existing tokens
 **
 ** Parameters: tokenptr - the token record to add
 **
 **
 *****************************************************************************/

void free_all(void)
{
   free_stack(old_stack);
   free_stack(new_stack);
   old_stack = new_stack = NULL;
   free_token_data(token_tree);
   token_tree = NULL;
}


/*******************************************************************************
 **
 ** add_to_stack - appends a token to the end of the new stack
 **
 ** Parameters: tokenptr - the token record to add
 **
 ** num_in_stack becomes 1 when the stack was empty, otherwise it is
 ** incremented.
 **
 *******************************************************************************/

void add_to_stack(token_record *tokenptr)
{
   token_record **tail_link = &new_stack;
   int            was_empty = (new_stack == NULL);

   /* walk to the NULL link that terminates the list */
   while (*tail_link != NULL)
      tail_link = &((*tail_link)->next_tok);

   /* hook the token in as the new last element */
   tokenptr->next_tok = NULL;
   *tail_link = tokenptr;

   if (was_empty)
      num_in_stack = 1;
   else
      num_in_stack++;
}


/*****************************************************************************
 **
 ** find_token_data - looks a token name up in the binary tree
 **
 **    Params: branch - the root of the (sub)tree to search
 **            the_name - the token name we are seeking
 **
 **    returns: the token_type stored in the matching node, or -1 if no node
 **             compares equal under STRCASECMP
 **
 ****************************************************************************/

int find_token_data(token_data *branch, char *the_name)
{
   token_data *node = branch;

   while (node != NULL)
   {
      int cmp = STRCASECMP(node->token_name, the_name);

      if (cmp == 0)
         return node->token_type;

      /* same direction rule add_token_data uses when it inserts */
      node = (cmp < 0) ? node->left : node->right;
   }

   return -1;
}

/****************************************************************************
 **
 ** get_ttype - returns the token type given a certain string. In order for
 **             this to work right, T_BOOLEAN, T_OTHER, and T_IDENTIFIER must
 **             be set.
 **
 ** Parameters: tokenstr - the pointer to the token string
 **
 ** ret - the type found for the string in the token tree; otherwise T_OTHER
 **       if the first character is not alphanumeric, T_BOOLEAN for
 **       "true"/"false", and T_IDENTIFIER for anything else
 **
 ***************************************************************************/

int get_ttype(char *tokenstr)
{
   char *tempstr; /* holds the lowercase form of tokenstr */
   int results;

   tempstr = xlowercase(tokenstr, TOKENSTRLEN);

   /* build the lookup tree the first time it is needed */
   if (token_tree == NULL)
      create_tree();

   /* search the table for a match */
   results = find_token_data(token_tree, tempstr);

   /* xlowercase allocates with new char[], so the array form of delete is
      the one that matches */
   delete[] tempstr;

   if (results != -1)
      return results;

   /* if it wasnt in the table, and it isnt an alphanumeric, then mark it
      as OTHER (cast: ctype functions are undefined for negative chars) */
   if (!isalnum((unsigned char) *tokenstr))
      return T_OTHER;

   /* if its a boolean, mark it as such */
   if (!STRCASECMP(tokenstr, "true") || !STRCASECMP(tokenstr, "false"))
      return T_BOOLEAN;

   /* if all conditions above fail, it must be an identifier since it has to
      be an alpha first character */
   return T_IDENTIFIER;
}


/****************************************************************************
 **
 ** goto_next_char - positions the file pointer right at the next character
 **                  that is not a \r, \t, \n or space
 **
 ** Parameters: thefile - the file to advance
 **
 ** ret - 1 for success (the character found is pushed back and will be the
 **       next one read), -1 if the end of the file or a read error was hit
 **       before any such character was found
 **
 ****************************************************************************/

int goto_next_char(FILE *thefile)
{
   int thechar;  /* int, not char, so EOF can be told apart from data */

   do
   {
      thechar = fgetc(thefile);

      /* nothing but whitespace was left in the file */
      if (thechar == EOF)
         return -1;
   }
   while ((thechar == '\r') || (thechar == '\t') || (thechar == ' ') ||
          (thechar == '\n'));

   /* unget the last character, so we can get it later */
   ungetc(thechar, thefile);

   return 1;
}

/*****************************************************************************
 **
 ** check_next_char - returns the next char that isnt \t, \r, \n, or space. 
 **                   This function also puts the char back in the stream 
 **                   to avoid disturbing it (the whitespace skipped on the
 **                   way is consumed)
 **
 ** Parameters: thefile - the file to look ahead in
 **
 ** ret - the character found, or space if the end of the file (or a read
 **       error) is reached first
 **
 ****************************************************************************/

char check_next_char(FILE *thefile)
{
   int thechar;  /* int, not char, so EOF can be told apart from data */

   for ( ; ; )
   {
      thechar = fgetc(thefile);

      /* if we get an end of file, return a space */
      if (thechar == EOF)
         return ' ';

      if ((thechar != '\r') && (thechar != '\t') && (thechar != ' ') &&
          (thechar != '\n'))
         break;
   }

   /* unget the character since we just want to pass it back, not use it up */
   ungetc(thechar, thefile);
   return (char) thechar;
}


/*******************************************************************************
 **
 ** xend_of_line - checks if the next char that is not a \r, \t or space is a
 **                newline
 **
 ** Parameters: thefile - the file to look ahead in
 **
 ** ret - 1 for it is end of line, 0 for not, -1 if it finds eof (or a read
 **       error) first. In the 1 and 0 cases the character that was examined
 **       is pushed back onto the stream.
 **
 ******************************************************************************/

int xend_of_line(FILE *thefile)
{
   int thechar;  /* int, not char, so EOF can be told apart from data */

   /* move along until we find a real character */
   do
   {
      thechar = fgetc(thefile);
      if (thechar == EOF)
         return -1;
   }
   while ((thechar == '\r') || (thechar == '\t') || (thechar == ' '));

   /* put the last character back, whatever it was */
   ungetc(thechar, thefile);

   return (thechar == '\n') ? 1 : 0;
}

/*****************************************************************************
 **
 ** next_line - positions the file pointer at the beginning of the next line, 
 **             ignoring all it reads in
 **
 ** Parameters: thefile - the file pointer for the next
 **
 ** ret - a pointer to a string struct if success, a NULL if we found a string
 **       that was too long
 **
 **
 *****************************************************************************/

string_record *get_str_until(FILE *thefile, char until1, char until2)
{ 
   string_record *temp_str;  /* temporary holder of a string */
   string_record *str_ptr;   /* points to a string as well */
   int           count = 0;  /* counts the length of the strings */
   char          nextchar;   /* holds the char gotten from the file */

   /* creates a new string record */
   temp_str = new_string_record();
   temp_str->the_string = new char[TOKENSTRLEN];
   temp_str->the_string[0] = '\0';
   str_ptr = temp_str;

   nextchar = (char) fgetc(thefile);

   if (feof(thefile))
      return temp_str;

   /* while we dont hit an until character */
   while ((nextchar != until1) && (nextchar != until2)) 
   {  
      /* get the next character and put it in the string */ 
      str_ptr->the_string[count] = nextchar;
      nextchar = (char) fgetc(thefile);
      if (feof(thefile))
          return temp_str;

      /* if we hit a newline, start a new string structure and get the
         next string */
      if (nextchar == '\n')
      {
         str_ptr->the_string[count+1] = '\0';
         str_ptr->the_string = 
          (char *)realloc(str_ptr->the_string,strlen(str_ptr->the_string) + 1);
         str_ptr->nextstr = new_string_record();
         str_ptr = str_ptr->nextstr;
         str_ptr->the_string = new char[TOKENSTRLEN];
         str_ptr->the_string[0] = '\0';
         count = -1;
         nextchar = (char) fgetc(thefile);
         if (feof(thefile))
            return temp_str;
      }
      count++;

      /* if we have a string that is too long, return null */
      if (count >= TOKENSTRLEN)
      {
         delete_string_record(temp_str);
         return NULL;
      }

   } 
   /* clean up and return the string */
   ungetc((int) nextchar, thefile);
   str_ptr->the_string = (char *)realloc(str_ptr->the_string,
                                         strlen(str_ptr->the_string) + 1);
   return temp_str;
}

/*****************************************************************************
 **
 ** next_line - positions the file pointer at the beginning of the next line, 
 **             ignoring all it reads in
 **
 ** Parameters: thefile - the file to advance
 **
 ** ret: 1 for success (a newline was consumed), -1 if the end of the file
 **      or a read error was reached before a newline
 **
 ****************************************************************************/

int next_line(FILE *thefile)
{
   int thechar;  /* int, not char, so EOF can be told apart from data */

   /* testing the value fgetc returns (rather than feof alone) also ends the
      loop on a read error instead of spinning forever */
   while ((thechar = fgetc(thefile)) != EOF)
   {
      if (thechar == '\n')
         return 1;
   }

   return -1;
}


/*****************************************************************************
 **
 ** add_token_data - inserts the entry for one TokenNames element into the
 **                  binary tree
 **
 **    Params: branch - address of the root pointer of the (sub)tree to
 **                     insert into
 **            token_num - the index into TokenNames of the token to add; the
 **                        node is given token_type token_num + 1
 **
 **    returns: 1 when a node was added, -1 when the tree already holds a
 **             name that STRCASECMP reports as equal
 **
 ****************************************************************************/

int add_token_data(token_data **branch, int token_num)
{
   token_data **slot = branch;
   token_data  *fresh;

   /* descend until we reach the empty link where the new node belongs */
   while (*slot != NULL)
   {
      int cmp = STRCASECMP((*slot)->token_name, TokenNames[token_num]);

      if (cmp == 0)
         return -1;

      if (cmp < 0)
         slot = &((*slot)->left);
      else
         slot = &((*slot)->right);
   }

   fresh = new_token_data();
   fresh->token_name = TokenNames[token_num];
   fresh->token_type = token_num+1;
   fresh->left = NULL;
   fresh->right = NULL;

   *slot = fresh;
   return 1;
}


/*****************************************************************************
 **
 ** create_tree - builds the token lookup tree by adding the TokenNames
 **               entries 0 .. T_LAST-2, in that order
 **
 ** returns: 1 once the tree is built, -1 if a tree already exists (nothing
 **          is changed in that case)
 **
 ****************************************************************************/

int create_tree(void)
{
   int token_num = 0;

   /* never build on top of an existing tree */
   if (token_tree != NULL)
      return -1;

   while (token_num < ( T_LAST - 1 ))
   {
      add_token_data(&token_tree, token_num);
      token_num++;
   }

   return 1;
}


/*****************************************************************************
 **
 ** get_token - gets the next token or description from the file and returns a 
 **             token record containing the following:
 **               - token_type - the type of the token
 **               - the_string - the string of the token
 **               - more       - if until is defined and it gets more than one 
 **                              line, this will point to the next token 
 **                              record holding the next string
 **
 ** Parameters: thefile - the file pointer, indicating where to get a token 
 **                       from
 **             until -   '\0' to read a single token; any other character
 **                       to read lines of text up to that character, like
 **                       for descriptions 
 **
 ** ret - a pointer to a token record. The record is placed on the new stack
 **       and is freed by the lexer later, callers must not free it. On
 **       failure token_type contains one of the following numbers:
 **          -1 = reached end of file
 **          -2 = number of strings too long (greater than 100) for the room 
 **               description which probably signifies they forgot to close 
 **               off a string
 **          -3 = invalid token - started with an integer, but had an alpha 
 **               character in it such as 12bob. A number must be only a 
 **               number.
 **          -4 = a token or a line was too long to fit in TOKENSTRLEN chars
 **
 ** NOTE(review): for a multi-line string only the LAST record of the "more"
 ** list gets token_type T_STRING; the head record that is returned keeps
 ** token_type 0. This is the historical behaviour and is left unchanged
 ** here - confirm against the callers before altering it.
 **
 ** NOTE(review): the length checks assume the_string holds TOKENSTRLEN
 ** chars, which is also what the existing until-mode limit relied on.
 **
 *****************************************************************************/

/* stamps the type on a finished token, registers it for later freeing and
   hands it back */
static token_record *lexer_finish_token(token_record *tokenptr, int the_type)
{
   tokenptr->token_type = the_type;
   add_to_stack(tokenptr);
   return tokenptr;
}

/* ends a failed multi-line read: releases any continuation records so they
   are not leaked, then finishes the head token with the error code */
static token_record *lexer_fail_string(token_record *tokenptr, int the_error)
{
   free_token(tokenptr->more);
   tokenptr->more = NULL;
   return lexer_finish_token(tokenptr, the_error);
}

token_record *get_token(FILE *thefile, char until)
{
   token_record *tokenptr;  /* holds the struct to store the token info in */
   token_record *strptr;    /* Used to str. together tokens for a desc. list */
   int nextchar;            /* next char from the file; int so that EOF can
                               be told apart from data */
   int counter = 0;         /* counts the num. of chars in the next token */
   int size_list = 0;       /* number of continuation lines read so far,
                               checked for > 100, i.e. a problem */
   int num_chars = 0;       /* number of chars gotten in the current line */

   /* if the stack is large enough, remove the items */
   if (num_in_stack == 10)
   {
      free_stack(old_stack);
      old_stack = new_stack;
      new_stack = NULL;
      num_in_stack = 0;
   }

   /* creates a new token structure */
   tokenptr = new_token_record();
   tokenptr->the_string[0] = '\0';
   tokenptr->token_type = 0;
   tokenptr->more = NULL;

   /* if they passed in a file that was already at the end */
   if (feof(thefile))
      return lexer_finish_token(tokenptr, -1);

   /* get a character */
   nextchar = fgetc(thefile);

   if (until == '\0')
   {
      for ( ; ; )
      {
         /* ran out of file before a token started. Testing the value from
            fgetc (not just feof) keeps EOF from being lexed as a token */
         if (nextchar == EOF)
            return lexer_finish_token(tokenptr, -1);

         /* if it is a tab, carriage return, newline, or space ignore it */
         if ((nextchar == '\r') || (nextchar == '\t') || (nextchar == ' ') ||
             (nextchar == '\n'))
         {
            counter = 0;
            nextchar = fgetc(thefile);
            continue;
         }

         /* if it is an identifier */
         if (isalpha(nextchar))
         {
            while (isalnum(nextchar) || (nextchar == '_') || (nextchar == '@'))
            {
               /* no room left for this char plus the terminator */
               if (counter >= (TOKENSTRLEN - 1))
               {
                  ungetc(nextchar, thefile);
                  tokenptr->the_string[counter] = '\0';
                  return lexer_finish_token(tokenptr, -4);
               }

               tokenptr->the_string[counter] = (char) nextchar;
               counter++;
               nextchar = fgetc(thefile);
            }

            /* ungetc ignores EOF, so no special case is needed here */
            ungetc(nextchar, thefile);
            tokenptr->the_string[counter] = '\0';
            return lexer_finish_token(tokenptr,
                                      get_ttype(tokenptr->the_string));
         }

         /* if it is a digit */
         if (isdigit(nextchar))
         {
            while (isdigit(nextchar))
            {
               /* no room left for this char plus the terminator */
               if (counter >= (TOKENSTRLEN - 1))
               {
                  ungetc(nextchar, thefile);
                  tokenptr->the_string[counter] = '\0';
                  return lexer_finish_token(tokenptr, -4);
               }

               tokenptr->the_string[counter] = (char) nextchar;
               counter++;
               nextchar = fgetc(thefile);
            }
            tokenptr->the_string[counter] = '\0';

            /* make sure we dont have a number first with alpha following 
               somewhere, since if the first character of a token is a 
               number, we dont want any following characters to be an alpha, 
               like 12bob would be invalid. bob12 would be fine. As before,
               the offending character is consumed, not pushed back */
            if (isalpha(nextchar))
               return lexer_finish_token(tokenptr, -3);

            ungetc(nextchar, thefile);
            return lexer_finish_token(tokenptr, T_NUMERICAL);
         }

         /* it must be some sort of non-identifer, grab just this char */
         tokenptr->the_string[0] = (char) nextchar;
         tokenptr->the_string[1] = '\0';
         return lexer_finish_token(tokenptr, get_ttype(tokenptr->the_string));
      }
   }

   /* they have specified an until character, get all strings until the until 
      char */

   /* set strptr to the first token structure */
   strptr = tokenptr;

   /* while we dont get the until or end of file, keep getting characters */
   while ((nextchar != EOF) && ((char) nextchar != until))
   {
      /* if we hit a newline create a new token struct and attach it to 
         the more element, hence we have a linked list of strings */
      if (nextchar == '\n')
      {
         num_chars = 0;
         nextchar = fgetc(thefile);
         if ((nextchar != EOF) && ((char) nextchar == until))
         {
            /* the text ends right after this newline; the loop condition
               stops us and the current record is terminated below */
            strptr->more = NULL;
         }
         else
         {
            size_list++;
            strptr->the_string[counter] = '\0';
            counter = 0;
            strptr->more = new_token_record();
            strptr = strptr->more;

            /* start the new record off empty and unlinked so the list is
               always well formed, even if we bail out early */
            strptr->the_string[0] = '\0';
            strptr->more = NULL;

            /* the description is waaay too large, probably a forgotten 
               'until' character */
            if (size_list > 100)
               return lexer_fail_string(tokenptr, -2);
         }
      }
      else
      {
         strptr->the_string[counter] = (char) nextchar;
         counter++;
         num_chars++;
         nextchar = fgetc(thefile);

         /* the line is too long for the_string */
         if (num_chars >= (TOKENSTRLEN-1))
         {
            strptr->the_string[counter] = '\0';
            return lexer_fail_string(tokenptr, -4);
         }
      }
   }

   /* if we hit the end of file, we allow it and just stop getting strings */
   strptr->the_string[counter] = '\0';
   strptr->token_type = T_STRING;
   strptr->more = NULL;

   add_to_stack(tokenptr);
   return tokenptr;
}

#undef LEXER_C

Generated by  Doxygen 1.6.0   Back to index