#include #include #include #include #include #include using namespace std; // These are some macros that can be used to generate lex functions // from abstract lex rules. #define START() Coord start= getCoord(); string result; #define FAIL() { setCoord(start); return string(""); } #define KEEP(c) result += c; #define SUCCESS() return result; namespace Prs { // A coordinate in the input. // It is used by the lexer to remember its current input position. // It can also be stored by lex functions or syntax functions, // to backtrack to this position when the parsing process gets stuck. struct Coord { int m_position; // Position in the input stream, counting from 0. unsigned int m_lineNumber; // Line number in input file, counting from 1. unsigned int m_columnNumber; // Column number in current line, counting from 1. Coord() : m_position(0), m_lineNumber(1), m_columnNumber(1) { } // No need for a copy constructor or assignment operator: // the C++ compiler provides implicit ones which work fine for us. }; // Access to an input stream, regardless of the kind of stream it is // (string in memory, text file, network connection, ...). // You have to inherit from this class and fill its character buffer // (e.g. by reading it from a file). Everything else is already provided // by this base class. class InputStream { protected: char* m_data; // Buffer containing all the input data. int m_size; // Size of the buffer in bytes. Coord m_coord; // Current location in input data. public: // ... void report_error(const char* msg) { printf("InputStream error: %s\n", msg); abort(); } const Coord& getCoord() { return m_coord; } void setCoord(const Coord& c) { m_coord= c; } // Return current character and move 1 position to the right. char getChar() { if(isEof()) report_error("Reached end of file."); char c= m_data[m_coord.m_position++]; // Move right. // Update the coordinates automatically. if(c == '\n') { m_coord.m_columnNumber= 1; m_coord.m_lineNumber++; } else { m_coord.m_columnNumber++; } return c; } // Return current character without moving. char peekChar() { if(isEof()) report_error("Reached end of file."); return m_data[m_coord.m_position]; // Do NOT move right. } bool isEof() { return (m_coord.m_position >= m_size); } // Alternative if the input cannot contain zeros: // return (m_data[m_coord.m_position] == 0); bool isEol() { return isEof() || (peekChar() == '\n'); } }; // Access to a file: just copy its contents into the character buffer. class InputFile : public InputStream { private: std::string m_name; // Path name of the input file. public: InputFile(const char* filename) { // Remember the file's name. Could be useful for error messages. m_name= filename; // First figure out how long the file is. // This code was tested on linux; it might be slightly different on windows. struct stat fileStat; lstat(filename, &fileStat); m_size= fileStat.st_size; // Then read the entire file into memory. // Note that we inherit 'm_data' and 'm_size' from the InputStream base class. m_data= (char*)malloc((m_size+1)*sizeof(char)); // Provide space for the trailing '\0'. FILE* fid= fopen(filename, "rb"); fread(m_data, sizeof(char), m_size, fid); m_data[m_size]= 0; // Trailing '\0' to mark end of file. fclose(fid); } }; ///////////////////////////////////////////////////////////////////////////// // Turns a stream of characters into a stream of tokens. // The tokens are simply created on demand and returned as strings; // they are not explicitly stored in a list of tokens. class Lexer { private: InputStream* m_input; // Assume only 1 input stream, not a stack of them. public: Lexer(const char* filename) { m_input= new InputFile(filename); } ~Lexer() { delete m_input; } // Helper functions. int getLineNumber() { return m_input->getCoord().m_lineNumber; } int getColumnNumber() { return m_input->getCoord().m_columnNumber; } // Access to the underlying input stream: just dispatch to 'm_input'. // When using a stack of input streams: dispatch to top of stack. const Coord& getCoord() { return m_input->getCoord(); } void setCoord(const Coord& c) { m_input->setCoord(c); } char getChar() { return m_input->getChar(); } char peekChar() { return m_input->peekChar(); } bool isEol() { return m_input->isEol(); } bool isEof() { return m_input->isEof(); } // ... virtual int parse_white_space(); virtual string parse_int_literal(); virtual string parse_real_literal(); virtual string parse_identifier(); bool match_identifier(const char* str); static bool isDigit(char c) { return (c>='0' && c<='9'); } static bool isLowercaseLetter(char c) { return (c>='a' && c<='z'); } static bool isUppercaseLetter(char c) { return (c>='A' && c<='Z'); } static bool isLetter(char c) { return (isLowercaseLetter(c) || isUppercaseLetter(c)); } static bool isLetterOrDigit(char c) { return (isLetter(c) || isDigit(c)); } }; int Lexer::parse_white_space() { int start_col= getColumnNumber(); while(peekChar() == ' ' || peekChar() == '\n' || peekChar() == '\r') { getChar(); // Note that 'getChar' skips the character, but 'peekChar' does not. } return getColumnNumber() - start_col; // Return number of characters skipped. } string Lexer::parse_identifier() { string result; if(!isLetter(peekChar())) { return string(""); } result += getChar(); while(true) { if(!isLetterOrDigit(peekChar())) break; result += getChar(); } return result; } string Lexer::parse_int_literal() { string result; while(true) { if(!isDigit(peekChar())) break; result += getChar(); } return result; } string Lexer::parse_real_literal() { Coord start= getCoord(); // Store the starting coordinate. string result= parse_int_literal(); // Use our earlier function. if(result.size() == 0) return string(""); // No digits found. if(getChar() != '.') { setCoord(start); // Backtrack to the beginning. return string(""); } result += '.'; string part2= parse_int_literal(); // Our earlier function again. if(part2.size() == 0) { setCoord(start); // Backtrack to the beginning. return string(""); } result.append(part2); return result; } bool Lexer::match_identifier(const char* str) { Coord start= getCoord(); string id= parse_identifier(); if(id == "") { return false; } if(id != str) { setCoord(start); return false; } return true; } ///////////////////////////////////////////////////////////////////////////// /////// // Many tools generate lex functions from a high-level description // (typically using something like regular expressions). // This is what the generated code might look like. // The macros are defined at the top of this file. class GeneratedLexer : public Lexer { public: GeneratedLexer(const char* filename) : Lexer(filename) {} /* // Expression for integer tokens: int_literal= [0-9]+ */ // Resulting code (details omitted): string parse_int_literal() { START(); if(!isDigit(peekChar())) FAIL(); while(isDigit(peekChar())) KEEP(getChar()); SUCCESS(); } /* // Expression for identifiers: identifier= [a-zA-Z][a-zA-Z0-9]+ */ // Resulting code (details omitted): string parse_identifier() { START(); if(!isLetter(peekChar())) FAIL(); while(isLetterOrDigit(peekChar())) KEEP(getChar()); SUCCESS(); } /* // Expression for real literals: real_literal= int_literal '.' int_literal */ // Resulting code (details omitted): string parse_real_literal() { START(); { string result1= parse_int_literal(); if(result1.size() == 0) FAIL(); result.append(result1); } if(peekChar() != '.') FAIL(); KEEP(getChar()); { string result3= parse_int_literal(); if(result3.size() == 0) FAIL(); result.append(result3); } SUCCESS(); } }; ///////////////////////////////////////////////////////////////////////////// /////// // Syntax nodes. // A structure that represents a function call. // Below, we will parse something like 'f(g(h()))' and turn it into // a tree of these structs. struct FunctionCall { string m_name; list m_arguments; void dump() { string indent= ""; dump(indent); } void dump(const string& indent) { printf("%s- %s\n", indent.c_str(), m_name.c_str()); string subindent= indent; subindent += " "; for(list::iterator iter= m_arguments.begin(); iter != m_arguments.end(); ++iter) { (*iter)->dump(subindent); } } }; // A structure that represents a statement. struct Statement { void dump() { string indent= ""; dump(indent); } virtual void dump(const string& indent) = 0; }; // We support 'if' statements and assignment statements. struct IfStatement : public Statement { FunctionCall* m_condition; list m_body; virtual void dump(const string& indent) { printf("%s- IF\n", indent.c_str()); string subindent= indent; subindent += " "; m_condition->dump(subindent); for(list::iterator iter= m_body.begin(); iter != m_body.end(); ++iter) { (*iter)->dump(subindent); } } }; struct AssignmentStatement : public Statement { string m_variableName; FunctionCall* m_value; virtual void dump(const string& indent) { printf("%s- ASSIGNMENT to '%s'\n", indent.c_str(), m_variableName.c_str()); string subindent= indent; subindent += " "; m_value->dump(subindent); } }; /////// // Parser. // Turns a stream of tokens (from the lexer) into a tree structure. // Uses recursive calls of syntax methods to create the tree top-down. // This is known as "recursive descent parsing" and is the easiest // technique for hand-written parsers. class Parser { private: Lexer* m_lexer; public: Parser(const char* filename) { m_lexer= new Lexer(filename); } virtual ~Parser() { delete m_lexer; } // ... FunctionCall* parse_function_call(); Statement* parse_statement(); AssignmentStatement* parse_assignment_statement(); IfStatement* parse_if_statement(); bool parse_int_list(std::list& lst); }; FunctionCall* Parser::parse_function_call() { Coord start= m_lexer->getCoord(); // Remember our position for backtracking. // Try to parse the function name. m_lexer->parse_white_space(); string name= m_lexer->parse_identifier(); if(name == "") return NULL; // Fail by returning a NULL pointer. // Now look for the opening '(' of the call. m_lexer->parse_white_space(); if(m_lexer->getChar() != '(') { m_lexer->setCoord(start); // Backtrack to original position! return NULL; // Fail by returning a NULL pointer. } // Create the resulting node instance. // From now on, we can't fail anymore! FunctionCall* result= new FunctionCall; // Fill the structure with the data we have so far (i.e. the function name). result->m_name= name; // Now parse the arguments, separated by a comma. char separator= ','; while(true) { // Each argument is in turn a function call. // We call 'parse_function_call' recursively, // hence the name "recursive descent parsing". FunctionCall* arg= parse_function_call(); // This also skips white space. if(!arg) break; // No more arguments => stop the loop. // Store the parsed argument as a subtree of our FunctionCall tree. result->m_arguments.push_back(arg); // Find a separator between the arguments. m_lexer->parse_white_space(); if(m_lexer->peekChar() != separator) break; m_lexer->getChar(); // Skip the separator (we only peeked it). } // After the arguments, find the closing ')'. m_lexer->parse_white_space(); if(m_lexer->getChar() != ')') { // Syntax error: Missing ')'. Do NOT fail! } return result; } Statement* Parser::parse_statement() { // First try an 'if' statement. It takes care of white space, // so we don't have to worry about that here. IfStatement* st1 = parse_if_statement(); if(st1) return st1; // If the 'if' fails, it leaves the input unchanged (because it follows // the rules of good parsing behavior), so we can simply try the assignment // statement right after it. AssignmentStatement* st2 = parse_assignment_statement(); if(st2) return st2; // ... (try other kinds of statements next) // The attempt at parsing any kind of statement failed. // They all left the input unchanged. We now fail in turn, // leaving everything unchanged and returning NULL. return NULL; } AssignmentStatement* Parser::parse_assignment_statement() { Coord start= m_lexer->getCoord(); // Remember our position for backtracking. // First, look for the name of the variable // on the left-hand-side of the assignment. m_lexer->parse_white_space(); string variableName= m_lexer->parse_identifier(); if(variableName == "") { // No variable name => no assignment statement. return NULL; } // Now look for the assignment operator. // In a more robust parser, we would verify that this is not // some other operator like '=>'. m_lexer->parse_white_space(); if(m_lexer->peekChar() != '=') { m_lexer->setCoord(start); // Backtrack, to put the identifier back in the input. return NULL; } m_lexer->getChar(); // Skip the '=' (we only peeked it). // At this point, we *know* that we have an assignment statement. // We create the object, and agree not to fail anymore beyond this point. AssignmentStatement* result= new AssignmentStatement; result->m_variableName= variableName; // Now parse the right-hand-side expression (just call the other syntax function). result->m_value= parse_function_call(); // Also takes care of leading white space. if(!result->m_value) { // Syntax error: Missing expression. Do NOT fail! } // Now look for the semicolon that terminates the statement. m_lexer->parse_white_space(); if(m_lexer->peekChar() != ';') { // Syntax error: Missing ';'. Do NOT fail! } m_lexer->getChar(); // Skip the ';' (we only peeked it). return result; } IfStatement* Parser::parse_if_statement() { Coord start= m_lexer->getCoord(); // Remember our position for backtracking. // First look for the 'if' keyword, which we here treat as a normal identifier for simplicity. m_lexer->parse_white_space(); if(!m_lexer->match_identifier("if")) { return NULL; } // At this point, we *know* that we have an 'if' statement. IfStatement* result= new IfStatement; // Now look for the condition expression. // Note that we do not require parentheses around the condition, for simplicity. result->m_condition= parse_function_call(); // This also takes care of white space. if(!result->m_condition) { // Syntax error: Missing expression. } // Now look for the opening brace of the 'if' body. m_lexer->parse_white_space(); if(m_lexer->peekChar() != '{') { // Syntax error: Missing '{'. } m_lexer->getChar(); // Skip the '{' (we only peeked it). // And now for the body: a sequence of statements. while(true) { Statement* st= parse_statement(); if(!st) break; // Body is done. result->m_body.push_back(st); } // Now look for the closing brace of the 'if' body. m_lexer->parse_white_space(); if(m_lexer->peekChar() != '}') { // Syntax error: Missing '}'. } m_lexer->getChar(); // Skip the '}' (we only peeked it). // This simple example does not support 'else' yet. return result; } // Parses a list of integers into the 'lst' argument. bool Parser::parse_int_list(std::list& lst) { // First look for the opening '['. m_lexer->parse_white_space(); if(m_lexer->peekChar() != '[') { return false; // Fail. This is not an int-list. } m_lexer->getChar(); // Skip the peeked character. // Parse the first integer element. // If it's not present, we just have an empty list, which is fine. string s= m_lexer->parse_int_literal(); if(s.length() > 0) { lst.push_back(s); // The following loop looks for the other elements in the list, // each one preceded by a comma. while(true) { m_lexer->parse_white_space(); if(m_lexer->peekChar() == ',') { m_lexer->getChar(); // Skip the peeked character. } else { break; // No comma => end of list. } m_lexer->parse_white_space(); s= m_lexer->parse_int_literal(); if(s.length() <= 0) { // Syntax error: comma should be followed by element. break; // Stop the loop. } lst.push_back(s); } } if(m_lexer->peekChar() != ']') { // Syntax error: missing ']'. } m_lexer->getChar(); // Skip the peeked character. return true; } } // End of namespace Prs ///////////////////////////////////////////////////////////////////////////// // A simple test function for lexers. void test(Prs::Lexer& lex) { { string s= lex.parse_identifier(); // This should find 'The'. printf("ID: '%s'\n", s.c_str()); } { int i= lex.parse_white_space(); printf("WS: %d\n", i); } { string s= lex.parse_int_literal(); // This should fail. printf("INT: '%s'\n", s.c_str()); } { string s= lex.parse_identifier(); // This should find 'number'. printf("ID: '%s'\n", s.c_str()); } { int i= lex.parse_white_space(); printf("WS: %d\n", i); } { string s= lex.parse_identifier(); // This should fail. printf("ID: '%s'\n", s.c_str()); } { string s= lex.parse_real_literal(); // This should fail. printf("REAL: '%s'\n", s.c_str()); } { string s= lex.parse_int_literal(); // This should find '42'. printf("INT: '%s'\n", s.c_str()); } { int i= lex.parse_white_space(); printf("WS: %d\n", i); } { string s= lex.parse_identifier(); // This should find 'or'. printf("ID: '%s'\n", s.c_str()); } { int i= lex.parse_white_space(); printf("WS: %d\n", i); } { string s= lex.parse_real_literal(); // This should find the real literal. printf("REAL: '%s'\n", s.c_str()); } for(int i= 0; i<10; ++i) { Prs::Coord coord= lex.getCoord(); char c1= lex.peekChar(); char c2= lex.getChar(); printf("Found character: '%c' = '%c' at %d,%d\n", c1, c2, coord.m_lineNumber, coord.m_columnNumber); } } int main() { { printf("\n------- Normal lexer:\n"); Prs::Lexer lex("input_01.txt"); test(lex); } { printf("\n------- Generated lexer:\n"); Prs::GeneratedLexer lex("input_01.txt"); test(lex); } { printf("\n------- Function calls:\n"); Prs::Parser prs("input_02.txt"); Prs::FunctionCall* f= prs.parse_function_call(); f->dump(); delete f; } { printf("\n------- Statements:\n"); Prs::Parser prs("input_03.txt"); Prs::Statement* f= prs.parse_statement(); f->dump(); delete f; } { printf("\n------- Int list:\n"); Prs::Parser prs("input_04.txt"); std::list lst; prs.parse_int_list(lst); printf("Parsed %d elements:\n", lst.size()); for(std::list::iterator iter= lst.begin(); iter != lst.end(); ++iter) { printf("- '%s'\n", (*iter).c_str()); } } }