Mercurial > lcfOS
view python/libs/compiler/parsergen.py @ 88:f3fe557be5ed
Split off of items to reduce file size
author | windel |
---|---|
date | Tue, 27 Nov 2012 18:00:13 +0100 |
parents | 60cc36ef5a50 |
children |
line wrap: on
line source
""" This module parses source code into an abstract syntax tree (AST) """ from .symboltable import SymbolTable from .nodes import * from .errors import CompilerException, Error from .modules import loadModule from .display import printNode from .builtin import * from . import assembler class Parser: def __init__(self, tokens): """ provide the parser with the tokens iterator from the lexer. """ self.tokens = tokens self.NextToken() self.errorlist = [] def Error(self, msg): raise CompilerException(msg, self.token.row, self.token.col) # Lexer helpers: def Consume(self, typ=''): if self.token.typ == typ or typ == '': v = self.token.val self.NextToken() return v else: self.Error('Excected: "{0}", got "{1}"'.format(typ, self.token.val)) def hasConsumed(self, typ): if self.token.typ == typ: self.Consume(typ) return True return False def NextToken(self): self.token = self.tokens.__next__() # TODO: store filename in location? self.location = (self.token.row, self.token.col) # Helpers to find location of the error in the code: def setLocation(self, obj, location): obj.location = location return obj def getLocation(self): return self.location """ Recursive descent parser functions: A set of mutual recursive functions. Starting symbol is the Module. """ def parseModule(self): self.imports = [] loc = self.getLocation() self.Consume('module') modname = self.Consume('ID') self.Consume(';') mod = Module(modname) # Construct a symbol table for this program mod.symtable = SymbolTable() # Add built in types and functions: for x in [real, integer, boolean, char, chr_func]: mod.symtable.addSymbol(x) self.cst = mod.symtable self.parseImportList() self.parseDeclarationSequence() # Procedures only allowed in this scope self.parseProcedureDeclarations() if self.hasConsumed('begin'): mod.initcode = self.parseStatementSequence() else: mod.initcode = EmptyStatement() self.Consume('end') endname = self.Consume('ID') if endname != modname: self.Error('end denoter must be module name') self.Consume('.') mod.imports = self.imports return self.setLocation(mod, loc) # Import part def parseImportList(self): if self.hasConsumed('import'): self.parseImport() while self.hasConsumed(','): self.parseImport() self.Consume(';') def parseImport(self): loc = self.getLocation() modname = self.Consume('ID') mod = loadModule(modname) self.setLocation(mod, loc) self.cst.addSymbol(mod) # Helper to parse an identifier defenitions def parseIdentDef(self): loc = self.getLocation() name = self.Consume('ID') ispublic = self.hasConsumed('*') # Make a node of this thing: i = Id(name) i.ispublic = ispublic return self.setLocation(i, loc) def parseIdentList(self): ids = [ self.parseIdentDef() ] while self.hasConsumed(','): ids.append( self.parseIdentDef() ) return ids def parseQualIdent(self): """ Parse a qualified identifier """ name = self.Consume('ID') if self.cst.has(Module, name): modname = name mod = self.cst.get(Module, modname) self.Consume('.') name = self.Consume('ID') # Try to find existing imported symbol: for imp in self.imports: if imp.modname == modname and imp.name == name: return imp # Try to find the symbol in the modules exports: for sym in mod.exports: if sym.name == name: impsym = ImportedSymbol(modname, name) impsym.typ = sym.typ impsym.signature = mod.signature self.imports.append(impsym) return impsym self.Error("Cannot find symbol {0}".format(name)) else: return self.cst.getSymbol(name) # Helper to parse a designator def parseDesignator(self): """ A designator designates an object. The base location in memory is denoted by the qualified identifier The actual address depends on the selector. """ loc = self.getLocation() obj = self.parseQualIdent() typ = obj.typ selectors = [] while self.token.typ in ['.', '[', '^']: if self.hasConsumed('.'): field = self.Consume('ID') if typ is PointerType: selectors.append(Deref()) typ = typ.pointedType if not type(typ) is RecordType: self.Error("field reference, type not record but {0}".format(typ)) typ = typ.fields[field] selectors.append(Field(field)) elif self.hasConsumed('['): indexes = self.parseExpressionList() self.Consume(']') for idx in indexes: if not type(typ) is ArrayType: self.Error('Cannot index non array type') if not isType(idx.typ, integer): self.Error('Only integer expressions can be used as an index') selectors.append(Index(idx, typ)) typ = typ.elementType elif self.hasConsumed('^'): selectors.append(Deref()) typ = typ.pointedType return self.setLocation(Designator(obj, selectors, typ), loc) # Declaration sequence def parseDeclarationSequence(self): """ 1. constants, 2. types, 3. variables """ self.parseConstantDeclarations() self.parseTypeDeclarations() self.parseVariableDeclarations() # Constants def evalExpression(self, expr): if type(expr) is Binop: a = self.evalExpression(expr.a) b = self.evalExpression(expr.b) if expr.op == '+': return a + b elif expr.op == '-': return a - b elif expr.op == '*': return a * b elif expr.op == '/': return float(a) / float(b) elif expr.op == 'mod': return int(a % b) elif expr.op == 'div': return int(a / b) elif expr.op == 'or': return a or b elif expr.op == 'and': return a and b else: self.Error('Cannot evaluate expression with {0}'.format(expr.op)) elif type(expr) is Constant: return expr.value elif type(expr) is Designator: if type(expr.obj) is Constant: return self.evalExpression(expr.obj) else: self.Error('Cannot evaluate designated object {0}'.format(expr.obj)) elif type(expr) is Unop: a = self.evalExpression(expr.a) if expr.op == 'not': return not a elif expr.op == '-': return -a else: self.Error('Unimplemented unary operation {0}'.format(expr.op)) else: self.Error('Cannot evaluate expression {0}'.format(expr)) def parseConstExpression(self): e = self.parseExpression() return self.evalExpression(e), e.typ def parseConstantDeclarations(self): """ Parse const part of a module """ if self.hasConsumed('const'): while self.token.typ == 'ID': i = self.parseIdentDef() self.Consume('=') constvalue, typ = self.parseConstExpression() self.Consume(';') c = Constant(constvalue, typ, name=i.name, public=i.ispublic) self.setLocation(c, i.location) self.cst.addSymbol(c) # Type system def parseTypeDeclarations(self): if self.hasConsumed('type'): while self.token.typ == 'ID': typename, export = self.parseIdentDef() self.Consume('=') typ = self.parseStructuredType() self.Consume(';') t = DefinedType(typename, typ) self.cst.addSymbol(t) def parseType(self): if self.token.typ == 'ID': typename = self.Consume('ID') if self.cst.has(Type, typename): typ = self.cst.get(Type, typename) while type(typ) is DefinedType: typ = typ.typ return typ else: self.Error('Cannot find type {0}'.format(typename)) else: return self.parseStructuredType() def parseStructuredType(self): if self.hasConsumed('array'): dimensions = [] dimensions.append( self.parseConstExpression() ) while self.hasConsumed(','): dimensions.append( self.parseConstExpression() ) self.Consume('of') arr = self.parseType() for dimension, consttyp in reversed(dimensions): if not isType(consttyp, integer): self.Error('array dimension must be an integer type (not {0})'.format(consttyp)) if dimension < 2: self.Error('array dimension must be bigger than 1 (not {0})'.format(dimension)) arr = ArrayType(dimension, arr) return arr elif self.hasConsumed('record'): fields = {} while self.token.typ == 'ID': # parse a fieldlist: identifiers = self.parseIdentList() self.Consume(':') typ = self.parseType() self.Consume(';') for i in identifiers: if i.name in fields.keys(): self.Error('record field "{0}" multiple defined.'.format(i.name)) fields[i.name] = typ # TODO store this in another way, symbol table? self.Consume('end') return RecordType(fields) elif self.hasConsumed('pointer'): self.Consume('to') typ = self.parseType() return PointerType(typ) elif self.hasConsumed('procedure'): parameters, returntype = self.parseFormalParameters() return ProcedureType(parameters, returntype) else: self.Error('Unknown structured type "{0}"'.format(self.token.val)) # Variable declarations: def parseVariableDeclarations(self): if self.hasConsumed('var'): if self.token.typ == 'ID': while self.token.typ == 'ID': ids = self.parseIdentList() self.Consume(':') typename = self.parseType() self.Consume(';') for i in ids: v = Variable(i.name, typename, public=i.ispublic) self.setLocation(v, i.location) self.cst.addSymbol(v) else: self.Error('Expected ID, got'+str(self.token)) # Procedures def parseFPsection(self): if self.hasConsumed('const'): kind = 'const' elif self.hasConsumed('var'): kind = 'var' else: kind = 'value' names = [ self.Consume('ID') ] while self.hasConsumed(','): names.append( self.Consume('ID') ) self.Consume(':') typ = self.parseType() parameters = [Parameter(kind, name, typ) for name in names] return parameters def parseFormalParameters(self): parameters = [] self.Consume('(') if not self.hasConsumed(')'): parameters += self.parseFPsection() while self.hasConsumed(';'): parameters += self.parseFPsection() self.Consume(')') if self.hasConsumed(':'): returntype = self.parseQualIdent() else: returntype = void return ProcedureType(parameters, returntype) def parseProcedureDeclarations(self): procedures = [] while self.token.typ == 'procedure': p = self.parseProcedureDeclaration() procedures.append(p) self.Consume(';') return procedures def parseProcedureDeclaration(self): loc = self.getLocation() self.Consume('procedure') i = self.parseIdentDef() procname = i.name proctyp = self.parseFormalParameters() procsymtable = SymbolTable(parent = self.cst) self.cst = procsymtable # Switch symbol table: # Add parameters as variables to symbol table: for parameter in proctyp.parameters: vname = parameter.name vtyp = parameter.typ if parameter.kind == 'var': vtyp = PointerType(vtyp) variable = Variable(vname, vtyp, False) if parameter.kind == 'const': variable.isReadOnly = True variable.isParameter = True self.cst.addSymbol(variable) self.Consume(';') self.parseDeclarationSequence() # Mark all variables as local: for variable in self.cst.getAllLocal(Variable): variable.isLocal = True if self.hasConsumed('begin'): block = self.parseStatementSequence() if self.hasConsumed('return'): returnexpression = self.parseExpression() else: returnexpression = None if proctyp.returntype.isType(void): if not returnexpression is None: self.Error('Void procedure cannot return a value') else: if returnexpression is None: self.Error('Procedure must return a value') if not isType(returnexpression.typ, proctyp.returntype): self.Error('Returned type {0} does not match function return type {1}'.format(returnexpression.typ, proctyp.returntype)) self.Consume('end') endname = self.Consume('ID') if endname != procname: self.Error('endname should match {0}'.format(name)) self.cst = procsymtable.parent # Switch back to parent symbol table proc = Procedure(procname, proctyp, block, procsymtable, returnexpression) self.setLocation(proc, loc) self.cst.addSymbol(proc) proc.public = i.ispublic return proc # Statements: def parseAssignment(self, lval): loc = self.getLocation() self.Consume(':=') rval = self.parseExpression() if isType(lval.typ, real) and isType(rval.typ, integer): rval = Unop(rval, 'INTTOREAL', real) if type(rval.typ) is NilType: if not type(lval.typ) is ProcedureType and not type(lval.typ) is PointerType: self.Error('Can assign nil only to pointers or procedure types, not {0}'.format(lval)) elif not isType(lval.typ, rval.typ): self.Error('Type mismatch {0} != {1}'.format(lval.typ, rval.typ)) return self.setLocation(Assignment(lval, rval), loc) def parseExpressionList(self): expressions = [ self.parseExpression() ] while self.hasConsumed(','): expressions.append( self.parseExpression() ) return expressions def parseProcedureCall(self, procedure): self.Consume('(') if self.token.typ != ')': args = self.parseExpressionList() else: args = [] self.Consume(')') parameters = procedure.typ.parameters if len(args) != len(parameters): self.Error("Procedure requires {0} arguments, {1} given".format(len(parameters), len(args))) for arg, param in zip(args, parameters): if not arg.typ.isType(param.typ): print(arg.typ, param.typ) self.Error('Mismatch in parameter') return ProcedureCall(procedure, args) def parseIfStatement(self): loc = self.getLocation() self.Consume('if') ifs = [] condition = self.parseExpression() if not isType(condition.typ, boolean): self.Error('condition of if statement must be boolean') self.Consume('then') truestatement = self.parseStatementSequence() ifs.append( (condition, truestatement) ) while self.hasConsumed('elsif'): condition = self.parseExpression() if not isType(condition.typ, boolean): self.Error('condition of if statement must be boolean') self.Consume('then') truestatement = self.parseStatementSequence() ifs.append( (condition, truestatement) ) if self.hasConsumed('else'): statement = self.parseStatementSequence() else: statement = None self.Consume('end') for condition, truestatement in reversed(ifs): statement = IfStatement(condition, truestatement, statement) return self.setLocation(statement, loc) def parseCase(self): # TODO pass def parseCaseStatement(self): self.Consume('case') expr = self.parseExpression() self.Consume('of') self.parseCase() while self.hasConsumed('|'): self.parseCase() self.Consume('end') def parseWhileStatement(self): loc = self.getLocation() self.Consume('while') condition = self.parseExpression() self.Consume('do') statements = self.parseStatementSequence() if self.hasConsumed('elsif'): self.Error('elsif in while not yet implemented') self.Consume('end') return self.setLocation(WhileStatement(condition, statements), loc) def parseRepeatStatement(self): self.Consume('repeat') stmt = self.parseStatementSequence() self.Consume('until') cond = self.parseBoolExpression() def parseForStatement(self): loc = self.getLocation() self.Consume('for') variable = self.parseDesignator() if not variable.typ.isType(integer): self.Error('loop variable of for statement must have integer type') assert(variable.typ.isType(integer)) self.Consume(':=') begin = self.parseExpression() if not begin.typ.isType(integer): self.Error('begin expression of a for statement must have integer type') self.Consume('to') end = self.parseExpression() if not end.typ.isType(integer): self.Error('end expression of a for statement must have integer type') if self.hasConsumed('by'): increment, typ = self.parseConstExpression() if not typ.isType(integer): self.Error('Increment must be integer') else: increment = 1 assert(type(increment) is int) self.Consume('do') statements = self.parseStatementSequence() self.Consume('end') return self.setLocation(ForStatement(variable, begin, end, increment, statements), loc) def parseAsmcode(self): # TODO: move this to seperate file def parseOpcode(): return self.Consume('ID') def parseOperand(): if self.hasConsumed('['): memref = [] memref.append(parseOperand()) self.Consume(']') return memref else: if self.token.typ == 'NUMBER': return self.Consume('NUMBER') else: ID = self.Consume('ID') if self.cst.has(Variable, ID): return self.cst.get(Variable, ID) else: return ID def parseOperands(n): operands = [] if n > 0: operands.append( parseOperand() ) n = n - 1 while n > 0: self.Consume(',') operands.append(parseOperand()) n = n - 1 return operands self.Consume('asm') asmcode = [] while self.token.typ != 'end': opcode = parseOpcode() func, numargs = assembler.opcodes[opcode] operands = parseOperands(numargs) asmcode.append( (opcode, operands) ) #print('opcode', opcode, operands) self.Consume('end') return AsmCode(asmcode) def parseStatement(self): try: # Determine statement type based on the pending token: if self.token.typ == 'if': return self.parseIfStatement() elif self.token.typ == 'case': return self.parseCaseStatement() elif self.token.typ == 'while': return self.parseWhileStatement() elif self.token.typ == 'repeat': return self.parseRepeatStatement() elif self.token.typ == 'for': return self.parseForStatement() elif self.token.typ == 'asm': return self.parseAsmcode() elif self.token.typ == 'ID': # Assignment or procedure call designator = self.parseDesignator() if self.token.typ == '(' and type(designator.typ) is ProcedureType: return self.parseProcedureCall(designator) elif self.token.typ == ':=': return self.parseAssignment(designator) else: self.Error('Unknown statement following designator: {0}'.format(self.token)) else: # TODO: return empty statement??: return EmptyStatement() self.Error('Unknown statement {0}'.format(self.token)) except CompilerException as e: print(e) self.errorlist.append( (e.row, e.col, e.msg)) # Do error recovery by skipping all tokens until next ; or end while not (self.token.typ == ';' or self.token.typ == 'end'): self.Consume(self.token.typ) return EmptyStatement() def parseStatementSequence(self): """ Sequence of statements seperated by ';' """ statements = [ self.parseStatement() ] while self.hasConsumed(';'): statements.append( self.parseStatement() ) return StatementSequence( statements ) # Parsing expressions: """ grammar of expressions: expression = SimpleExpression [ reloperator SimpleExpression ] reloperator = '=' | '<=' | '>=' | '<>' Simpleexpression = [ '+' | '-' ] term { addoperator term } addoperator = '+' | '-' | 'or' term = factor { muloperator factor } muloperator = '*' | '/' | 'div' | 'mod' | 'and' factor = number | nil | true | false | "(" expression ")" | designator [ actualparameters ] | 'not' factor """ def parseExpression(self): """ The connector between the boolean and expression domain """ expr = self.parseSimpleExpression() if self.token.typ in ['>=','<=','<','>','<>','=']: relop = self.Consume() expr2 = self.parseSimpleExpression() # Automatic type convert to reals: if isType(expr.typ, real) and isType(expr2.typ, integer): expr2 = Unop(expr2, 'INTTOREAL', real) if isType(expr2.typ, real) and isType(expr.typ, integer): expr = Unop(expr, 'INTTOREAL', real) # Type check: if not isType(expr.typ, expr2.typ): self.Error('Type mismatch in relop') if isType(expr.typ, real) and relop in ['<>', '=']: self.Error('Cannot check real values for equality') expr = Relop(expr, relop, expr2, boolean) return expr # Parsing arithmatic expressions: def parseTerm(self): a = self.parseFactor() while self.token.typ in ['*', '/', 'mod', 'div', 'and']: loc = self.getLocation() op = self.Consume() b = self.parseTerm() # Type determination and checking: if op in ['mod', 'div']: if not isType(a.typ, integer): self.Error('First operand should be integer, not {0}'.format(a.typ)) if not isType(b.typ, integer): self.Error('Second operand should be integer, not {0}'.format(b.typ)) typ = integer elif op == '*': if isType(a.typ, integer) and isType(b.typ, integer): typ = integer elif isType(a.typ, real) or isType(b.typ, real): if isType(a.typ, integer): # Automatic type cast a = Unop(a, 'INTTOREAL', real) if isType(b.typ, integer): b = Unop(b, 'INTTOREAL', real) if not isType(a.typ, real): self.Error('first operand must be a real!') if not isType(b.typ, real): self.Error('second operand must be a real!') typ = real else: self.Error('Unknown operands for multiply: {0}, {1}'.format(a, b)) elif op == '/': # Division always yields a real result, for integer division use div if isType(a.typ, integer): # Automatic type cast a = Unop(a, 'INTTOREAL', real) if isType(b.typ, integer): b = Unop(b, 'INTTOREAL', real) if not isType(a.typ, real): self.Error('first operand must be a real!') if not isType(b.typ, real): self.Error('second operand must be a real!') typ = real elif op == 'and': if not isType(a.typ, boolean): self.Error('First operand of and must be boolean') if not isType(b.typ, boolean): self.Error('Second operand of and must be boolean') typ = boolean else: self.Error('Unknown operand {0}'.format(op)) a = self.setLocation(Binop(a, op, b, typ), loc) return a def parseFactor(self): if self.hasConsumed('('): e = self.parseExpression() self.Consume(')') return e elif self.token.typ == 'NUMBER': loc = self.getLocation() val = self.Consume('NUMBER') return self.setLocation(Constant(val, integer), loc) elif self.token.typ == 'REAL': loc = self.getLocation() val = self.Consume('REAL') return self.setLocation(Constant(val, real), loc) elif self.token.typ == 'CHAR': val = self.Consume('CHAR') return Constant(val, char) elif self.token.typ == 'STRING': txt = self.Consume('STRING') return StringConstant(txt) elif self.token.typ in ['true', 'false']: val = self.Consume() val = True if val == 'true' else False return Constant(val, boolean) elif self.hasConsumed('nil'): return Constant(0, NilType()) elif self.hasConsumed('not'): f = self.parseFactor() if not isType(f.typ, boolean): self.Error('argument of boolean negation must be boolean type') return Unop(f, 'not', boolean) elif self.token.typ == 'ID': designator = self.parseDesignator() # TODO: handle functions different here? if self.token.typ == '(' and type(designator.typ) is ProcedureType: return self.parseProcedureCall(designator) else: return designator else: self.Error('Expected NUMBER, ID or ( expr ), got'+str(self.token)) def parseSimpleExpression(self): """ Arithmatic expression """ if self.token.typ in ['+', '-']: # Handle the unary minus op = self.Consume() a = self.parseTerm() typ = a.typ if not isType(typ,real) and not isType(typ, integer): self.Error('Unary minus or plus can be only applied to real or integers') if op == '-': a = Unop(a, op, typ) else: a = self.parseTerm() while self.token.typ in ['+', '-', 'or']: loc = self.getLocation() op = self.Consume() b = self.parseTerm() if op in ['+', '-']: if isType(a.typ, real) or isType(b.typ, real): typ = real if isType(a.typ, integer): # Automatic type cast a = Unop(a, 'INTTOREAL', real) if not isType(a.typ, real): self.Error('first operand must be a real!') if isType(b.typ, integer): b = Unop(b, 'INTTOREAL', real) if not isType(b.typ, real): self.Error('second operand must be a real!') elif isType(a.typ, integer) and isType(b.typ, integer): typ = integer else: self.Error('Invalid types {0} and {1}'.format(a.typ, b.typ)) elif op == 'or': if not isType(a.typ, boolean): self.Error('first operand must be boolean for or operation') if not isType(b.typ, boolean): self.Error('second operand must be boolean for or operation') typ = boolean else: self.Error('Unknown operand {0}'.format(op)) a = self.setLocation(Binop(a, op, b, typ), loc) return a