Mercurial > lcfOS
diff python/libs/compiler/parsergen.py @ 69:60cc36ef5a50
Rework in compiler
author | windel |
---|---|
date | Sat, 27 Oct 2012 14:31:58 +0200 |
parents | python/libs/compiler/parser.py@32078200cdd6 |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/python/libs/compiler/parsergen.py Sat Oct 27 14:31:58 2012 +0200 @@ -0,0 +1,787 @@ +""" + This module parses source code into an abstract syntax tree (AST) +""" + +from .symboltable import SymbolTable +from .nodes import * +from .errors import CompilerException, Error +from .modules import loadModule +from .display import printNode +from .builtin import * +from . import assembler + +class Parser: + def __init__(self, tokens): + """ provide the parser with the tokens iterator from the lexer. """ + self.tokens = tokens + self.NextToken() + self.errorlist = [] + + def Error(self, msg): + raise CompilerException(msg, self.token.row, self.token.col) + + # Lexer helpers: + def Consume(self, typ=''): + if self.token.typ == typ or typ == '': + v = self.token.val + self.NextToken() + return v + else: + self.Error('Excected: "{0}", got "{1}"'.format(typ, self.token.val)) + + def hasConsumed(self, typ): + if self.token.typ == typ: + self.Consume(typ) + return True + return False + + def NextToken(self): + self.token = self.tokens.__next__() + # TODO: store filename in location? + self.location = (self.token.row, self.token.col) + + # Helpers to find location of the error in the code: + def setLocation(self, obj, location): + obj.location = location + return obj + def getLocation(self): + return self.location + + """ + Recursive descent parser functions: + A set of mutual recursive functions. + Starting symbol is the Module. + """ + def parseModule(self): + self.imports = [] + loc = self.getLocation() + self.Consume('module') + modname = self.Consume('ID') + self.Consume(';') + mod = Module(modname) + + # Construct a symbol table for this program + mod.symtable = SymbolTable() + # Add built in types and functions: + for x in [real, integer, boolean, char, chr_func]: + mod.symtable.addSymbol(x) + + self.cst = mod.symtable + self.parseImportList() + + self.parseDeclarationSequence() + # Procedures only allowed in this scope + self.parseProcedureDeclarations() + + if self.hasConsumed('begin'): + mod.initcode = self.parseStatementSequence() + else: + mod.initcode = EmptyStatement() + + self.Consume('end') + endname = self.Consume('ID') + if endname != modname: + self.Error('end denoter must be module name') + self.Consume('.') + + mod.imports = self.imports + return self.setLocation(mod, loc) + + # Import part + def parseImportList(self): + if self.hasConsumed('import'): + self.parseImport() + while self.hasConsumed(','): + self.parseImport() + self.Consume(';') + + def parseImport(self): + loc = self.getLocation() + modname = self.Consume('ID') + mod = loadModule(modname) + self.setLocation(mod, loc) + self.cst.addSymbol(mod) + + # Helper to parse an identifier defenitions + def parseIdentDef(self): + loc = self.getLocation() + name = self.Consume('ID') + ispublic = self.hasConsumed('*') + # Make a node of this thing: + i = Id(name) + i.ispublic = ispublic + return self.setLocation(i, loc) + + def parseIdentList(self): + ids = [ self.parseIdentDef() ] + while self.hasConsumed(','): + ids.append( self.parseIdentDef() ) + return ids + + def parseQualIdent(self): + """ Parse a qualified identifier """ + name = self.Consume('ID') + if self.cst.has(Module, name): + modname = name + mod = self.cst.get(Module, modname) + self.Consume('.') + name = self.Consume('ID') + # Try to find existing imported symbol: + for imp in self.imports: + if imp.modname == modname and imp.name == name: + return imp + # Try to find the symbol in the modules exports: + for sym in mod.exports: + if sym.name == name: + impsym = ImportedSymbol(modname, name) + impsym.typ = sym.typ + impsym.signature = mod.signature + self.imports.append(impsym) + return impsym + self.Error("Cannot find symbol {0}".format(name)) + else: + return self.cst.getSymbol(name) + + # Helper to parse a designator + def parseDesignator(self): + """ A designator designates an object. + The base location in memory is denoted by the qualified identifier + The actual address depends on the selector. + """ + loc = self.getLocation() + obj = self.parseQualIdent() + typ = obj.typ + selectors = [] + while self.token.typ in ['.', '[', '^']: + if self.hasConsumed('.'): + field = self.Consume('ID') + if typ is PointerType: + selectors.append(Deref()) + typ = typ.pointedType + if not type(typ) is RecordType: + self.Error("field reference, type not record but {0}".format(typ)) + typ = typ.fields[field] + selectors.append(Field(field)) + elif self.hasConsumed('['): + indexes = self.parseExpressionList() + self.Consume(']') + for idx in indexes: + if not type(typ) is ArrayType: + self.Error('Cannot index non array type') + if not isType(idx.typ, integer): + self.Error('Only integer expressions can be used as an index') + selectors.append(Index(idx, typ)) + typ = typ.elementType + elif self.hasConsumed('^'): + selectors.append(Deref()) + typ = typ.pointedType + return self.setLocation(Designator(obj, selectors, typ), loc) + + # Declaration sequence + def parseDeclarationSequence(self): + """ 1. constants, 2. types, 3. variables """ + self.parseConstantDeclarations() + self.parseTypeDeclarations() + self.parseVariableDeclarations() + + # Constants + def evalExpression(self, expr): + if type(expr) is Binop: + a = self.evalExpression(expr.a) + b = self.evalExpression(expr.b) + if expr.op == '+': + return a + b + elif expr.op == '-': + return a - b + elif expr.op == '*': + return a * b + elif expr.op == '/': + return float(a) / float(b) + elif expr.op == 'mod': + return int(a % b) + elif expr.op == 'div': + return int(a / b) + elif expr.op == 'or': + return a or b + elif expr.op == 'and': + return a and b + else: + self.Error('Cannot evaluate expression with {0}'.format(expr.op)) + elif type(expr) is Constant: + return expr.value + elif type(expr) is Designator: + if type(expr.obj) is Constant: + return self.evalExpression(expr.obj) + else: + self.Error('Cannot evaluate designated object {0}'.format(expr.obj)) + elif type(expr) is Unop: + a = self.evalExpression(expr.a) + if expr.op == 'not': + return not a + elif expr.op == '-': + return -a + else: + self.Error('Unimplemented unary operation {0}'.format(expr.op)) + else: + self.Error('Cannot evaluate expression {0}'.format(expr)) + + def parseConstExpression(self): + e = self.parseExpression() + return self.evalExpression(e), e.typ + + def parseConstantDeclarations(self): + """ Parse const part of a module """ + if self.hasConsumed('const'): + while self.token.typ == 'ID': + i = self.parseIdentDef() + self.Consume('=') + constvalue, typ = self.parseConstExpression() + self.Consume(';') + c = Constant(constvalue, typ, name=i.name, public=i.ispublic) + self.setLocation(c, i.location) + self.cst.addSymbol(c) + + # Type system + def parseTypeDeclarations(self): + if self.hasConsumed('type'): + while self.token.typ == 'ID': + typename, export = self.parseIdentDef() + self.Consume('=') + typ = self.parseStructuredType() + self.Consume(';') + t = DefinedType(typename, typ) + self.cst.addSymbol(t) + + def parseType(self): + if self.token.typ == 'ID': + typename = self.Consume('ID') + if self.cst.has(Type, typename): + typ = self.cst.get(Type, typename) + while type(typ) is DefinedType: + typ = typ.typ + return typ + else: + self.Error('Cannot find type {0}'.format(typename)) + else: + return self.parseStructuredType() + + def parseStructuredType(self): + if self.hasConsumed('array'): + dimensions = [] + dimensions.append( self.parseConstExpression() ) + while self.hasConsumed(','): + dimensions.append( self.parseConstExpression() ) + self.Consume('of') + arr = self.parseType() + for dimension, consttyp in reversed(dimensions): + if not isType(consttyp, integer): + self.Error('array dimension must be an integer type (not {0})'.format(consttyp)) + if dimension < 2: + self.Error('array dimension must be bigger than 1 (not {0})'.format(dimension)) + arr = ArrayType(dimension, arr) + return arr + elif self.hasConsumed('record'): + fields = {} + while self.token.typ == 'ID': + # parse a fieldlist: + identifiers = self.parseIdentList() + self.Consume(':') + typ = self.parseType() + self.Consume(';') + for i in identifiers: + if i.name in fields.keys(): + self.Error('record field "{0}" multiple defined.'.format(i.name)) + fields[i.name] = typ + # TODO store this in another way, symbol table? + self.Consume('end') + return RecordType(fields) + elif self.hasConsumed('pointer'): + self.Consume('to') + typ = self.parseType() + return PointerType(typ) + elif self.hasConsumed('procedure'): + parameters, returntype = self.parseFormalParameters() + return ProcedureType(parameters, returntype) + else: + self.Error('Unknown structured type "{0}"'.format(self.token.val)) + + # Variable declarations: + def parseVariableDeclarations(self): + if self.hasConsumed('var'): + if self.token.typ == 'ID': + while self.token.typ == 'ID': + ids = self.parseIdentList() + self.Consume(':') + typename = self.parseType() + self.Consume(';') + for i in ids: + v = Variable(i.name, typename, public=i.ispublic) + self.setLocation(v, i.location) + self.cst.addSymbol(v) + else: + self.Error('Expected ID, got'+str(self.token)) + + # Procedures + def parseFPsection(self): + if self.hasConsumed('const'): + kind = 'const' + elif self.hasConsumed('var'): + kind = 'var' + else: + kind = 'value' + names = [ self.Consume('ID') ] + while self.hasConsumed(','): + names.append( self.Consume('ID') ) + self.Consume(':') + typ = self.parseType() + parameters = [Parameter(kind, name, typ) + for name in names] + return parameters + + def parseFormalParameters(self): + parameters = [] + self.Consume('(') + if not self.hasConsumed(')'): + parameters += self.parseFPsection() + while self.hasConsumed(';'): + parameters += self.parseFPsection() + self.Consume(')') + if self.hasConsumed(':'): + returntype = self.parseQualIdent() + else: + returntype = void + return ProcedureType(parameters, returntype) + + def parseProcedureDeclarations(self): + procedures = [] + while self.token.typ == 'procedure': + p = self.parseProcedureDeclaration() + procedures.append(p) + self.Consume(';') + return procedures + + def parseProcedureDeclaration(self): + loc = self.getLocation() + self.Consume('procedure') + i = self.parseIdentDef() + procname = i.name + proctyp = self.parseFormalParameters() + procsymtable = SymbolTable(parent = self.cst) + self.cst = procsymtable # Switch symbol table: + # Add parameters as variables to symbol table: + for parameter in proctyp.parameters: + vname = parameter.name + vtyp = parameter.typ + if parameter.kind == 'var': + vtyp = PointerType(vtyp) + variable = Variable(vname, vtyp, False) + if parameter.kind == 'const': + variable.isReadOnly = True + variable.isParameter = True + self.cst.addSymbol(variable) + self.Consume(';') + self.parseDeclarationSequence() + # Mark all variables as local: + for variable in self.cst.getAllLocal(Variable): + variable.isLocal = True + + if self.hasConsumed('begin'): + block = self.parseStatementSequence() + if self.hasConsumed('return'): + returnexpression = self.parseExpression() + else: + returnexpression = None + + if proctyp.returntype.isType(void): + if not returnexpression is None: + self.Error('Void procedure cannot return a value') + else: + if returnexpression is None: + self.Error('Procedure must return a value') + if not isType(returnexpression.typ, proctyp.returntype): + self.Error('Returned type {0} does not match function return type {1}'.format(returnexpression.typ, proctyp.returntype)) + + self.Consume('end') + endname = self.Consume('ID') + if endname != procname: + self.Error('endname should match {0}'.format(name)) + self.cst = procsymtable.parent # Switch back to parent symbol table + proc = Procedure(procname, proctyp, block, procsymtable, returnexpression) + self.setLocation(proc, loc) + self.cst.addSymbol(proc) + proc.public = i.ispublic + return proc + + # Statements: + def parseAssignment(self, lval): + loc = self.getLocation() + self.Consume(':=') + rval = self.parseExpression() + if isType(lval.typ, real) and isType(rval.typ, integer): + rval = Unop(rval, 'INTTOREAL', real) + if type(rval.typ) is NilType: + if not type(lval.typ) is ProcedureType and not type(lval.typ) is PointerType: + self.Error('Can assign nil only to pointers or procedure types, not {0}'.format(lval)) + elif not isType(lval.typ, rval.typ): + self.Error('Type mismatch {0} != {1}'.format(lval.typ, rval.typ)) + return self.setLocation(Assignment(lval, rval), loc) + + def parseExpressionList(self): + expressions = [ self.parseExpression() ] + while self.hasConsumed(','): + expressions.append( self.parseExpression() ) + return expressions + + def parseProcedureCall(self, procedure): + self.Consume('(') + if self.token.typ != ')': + args = self.parseExpressionList() + else: + args = [] + self.Consume(')') + parameters = procedure.typ.parameters + if len(args) != len(parameters): + self.Error("Procedure requires {0} arguments, {1} given".format(len(parameters), len(args))) + for arg, param in zip(args, parameters): + if not arg.typ.isType(param.typ): + print(arg.typ, param.typ) + self.Error('Mismatch in parameter') + return ProcedureCall(procedure, args) + + def parseIfStatement(self): + loc = self.getLocation() + self.Consume('if') + ifs = [] + condition = self.parseExpression() + if not isType(condition.typ, boolean): + self.Error('condition of if statement must be boolean') + self.Consume('then') + truestatement = self.parseStatementSequence() + ifs.append( (condition, truestatement) ) + while self.hasConsumed('elsif'): + condition = self.parseExpression() + if not isType(condition.typ, boolean): + self.Error('condition of if statement must be boolean') + self.Consume('then') + truestatement = self.parseStatementSequence() + ifs.append( (condition, truestatement) ) + if self.hasConsumed('else'): + statement = self.parseStatementSequence() + else: + statement = None + self.Consume('end') + for condition, truestatement in reversed(ifs): + statement = IfStatement(condition, truestatement, statement) + return self.setLocation(statement, loc) + + def parseCase(self): + # TODO + pass + + def parseCaseStatement(self): + self.Consume('case') + expr = self.parseExpression() + self.Consume('of') + self.parseCase() + while self.hasConsumed('|'): + self.parseCase() + self.Consume('end') + + def parseWhileStatement(self): + loc = self.getLocation() + self.Consume('while') + condition = self.parseExpression() + self.Consume('do') + statements = self.parseStatementSequence() + if self.hasConsumed('elsif'): + self.Error('elsif in while not yet implemented') + self.Consume('end') + return self.setLocation(WhileStatement(condition, statements), loc) + + def parseRepeatStatement(self): + self.Consume('repeat') + stmt = self.parseStatementSequence() + self.Consume('until') + cond = self.parseBoolExpression() + + def parseForStatement(self): + loc = self.getLocation() + self.Consume('for') + variable = self.parseDesignator() + if not variable.typ.isType(integer): + self.Error('loop variable of for statement must have integer type') + assert(variable.typ.isType(integer)) + self.Consume(':=') + begin = self.parseExpression() + if not begin.typ.isType(integer): + self.Error('begin expression of a for statement must have integer type') + self.Consume('to') + end = self.parseExpression() + if not end.typ.isType(integer): + self.Error('end expression of a for statement must have integer type') + if self.hasConsumed('by'): + increment, typ = self.parseConstExpression() + if not typ.isType(integer): + self.Error('Increment must be integer') + else: + increment = 1 + assert(type(increment) is int) + self.Consume('do') + statements = self.parseStatementSequence() + self.Consume('end') + return self.setLocation(ForStatement(variable, begin, end, increment, statements), loc) + + def parseAsmcode(self): + # TODO: move this to seperate file + def parseOpcode(): + return self.Consume('ID') + def parseOperand(): + if self.hasConsumed('['): + memref = [] + memref.append(parseOperand()) + self.Consume(']') + return memref + else: + if self.token.typ == 'NUMBER': + return self.Consume('NUMBER') + else: + ID = self.Consume('ID') + if self.cst.has(Variable, ID): + return self.cst.get(Variable, ID) + else: + return ID + + def parseOperands(n): + operands = [] + if n > 0: + operands.append( parseOperand() ) + n = n - 1 + while n > 0: + self.Consume(',') + operands.append(parseOperand()) + n = n - 1 + return operands + self.Consume('asm') + asmcode = [] + while self.token.typ != 'end': + opcode = parseOpcode() + func, numargs = assembler.opcodes[opcode] + operands = parseOperands(numargs) + asmcode.append( (opcode, operands) ) + #print('opcode', opcode, operands) + self.Consume('end') + return AsmCode(asmcode) + + def parseStatement(self): + try: + # Determine statement type based on the pending token: + if self.token.typ == 'if': + return self.parseIfStatement() + elif self.token.typ == 'case': + return self.parseCaseStatement() + elif self.token.typ == 'while': + return self.parseWhileStatement() + elif self.token.typ == 'repeat': + return self.parseRepeatStatement() + elif self.token.typ == 'for': + return self.parseForStatement() + elif self.token.typ == 'asm': + return self.parseAsmcode() + elif self.token.typ == 'ID': + # Assignment or procedure call + designator = self.parseDesignator() + if self.token.typ == '(' and type(designator.typ) is ProcedureType: + return self.parseProcedureCall(designator) + elif self.token.typ == ':=': + return self.parseAssignment(designator) + else: + self.Error('Unknown statement following designator: {0}'.format(self.token)) + else: + # TODO: return empty statement??: + return EmptyStatement() + self.Error('Unknown statement {0}'.format(self.token)) + except CompilerException as e: + print(e) + self.errorlist.append( (e.row, e.col, e.msg)) + # Do error recovery by skipping all tokens until next ; or end + while not (self.token.typ == ';' or self.token.typ == 'end'): + self.Consume(self.token.typ) + return EmptyStatement() + + def parseStatementSequence(self): + """ Sequence of statements seperated by ';' """ + statements = [ self.parseStatement() ] + while self.hasConsumed(';'): + statements.append( self.parseStatement() ) + return StatementSequence( statements ) + + # Parsing expressions: + """ + grammar of expressions: + expression = SimpleExpression [ reloperator SimpleExpression ] + reloperator = '=' | '<=' | '>=' | '<>' + Simpleexpression = [ '+' | '-' ] term { addoperator term } + addoperator = '+' | '-' | 'or' + term = factor { muloperator factor } + muloperator = '*' | '/' | 'div' | 'mod' | 'and' + factor = number | nil | true | false | "(" expression ")" | + designator [ actualparameters ] | 'not' factor + """ + def parseExpression(self): + """ The connector between the boolean and expression domain """ + expr = self.parseSimpleExpression() + if self.token.typ in ['>=','<=','<','>','<>','=']: + relop = self.Consume() + expr2 = self.parseSimpleExpression() + # Automatic type convert to reals: + if isType(expr.typ, real) and isType(expr2.typ, integer): + expr2 = Unop(expr2, 'INTTOREAL', real) + if isType(expr2.typ, real) and isType(expr.typ, integer): + expr = Unop(expr, 'INTTOREAL', real) + # Type check: + if not isType(expr.typ, expr2.typ): + self.Error('Type mismatch in relop') + if isType(expr.typ, real) and relop in ['<>', '=']: + self.Error('Cannot check real values for equality') + + expr = Relop(expr, relop, expr2, boolean) + return expr + + # Parsing arithmatic expressions: + def parseTerm(self): + a = self.parseFactor() + while self.token.typ in ['*', '/', 'mod', 'div', 'and']: + loc = self.getLocation() + op = self.Consume() + b = self.parseTerm() + # Type determination and checking: + if op in ['mod', 'div']: + if not isType(a.typ, integer): + self.Error('First operand should be integer, not {0}'.format(a.typ)) + if not isType(b.typ, integer): + self.Error('Second operand should be integer, not {0}'.format(b.typ)) + typ = integer + elif op == '*': + if isType(a.typ, integer) and isType(b.typ, integer): + typ = integer + elif isType(a.typ, real) or isType(b.typ, real): + if isType(a.typ, integer): + # Automatic type cast + a = Unop(a, 'INTTOREAL', real) + if isType(b.typ, integer): + b = Unop(b, 'INTTOREAL', real) + if not isType(a.typ, real): + self.Error('first operand must be a real!') + if not isType(b.typ, real): + self.Error('second operand must be a real!') + typ = real + else: + self.Error('Unknown operands for multiply: {0}, {1}'.format(a, b)) + elif op == '/': + # Division always yields a real result, for integer division use div + if isType(a.typ, integer): + # Automatic type cast + a = Unop(a, 'INTTOREAL', real) + if isType(b.typ, integer): + b = Unop(b, 'INTTOREAL', real) + if not isType(a.typ, real): + self.Error('first operand must be a real!') + if not isType(b.typ, real): + self.Error('second operand must be a real!') + typ = real + elif op == 'and': + if not isType(a.typ, boolean): + self.Error('First operand of and must be boolean') + if not isType(b.typ, boolean): + self.Error('Second operand of and must be boolean') + typ = boolean + else: + self.Error('Unknown operand {0}'.format(op)) + + a = self.setLocation(Binop(a, op, b, typ), loc) + return a + + def parseFactor(self): + if self.hasConsumed('('): + e = self.parseExpression() + self.Consume(')') + return e + elif self.token.typ == 'NUMBER': + loc = self.getLocation() + val = self.Consume('NUMBER') + return self.setLocation(Constant(val, integer), loc) + elif self.token.typ == 'REAL': + loc = self.getLocation() + val = self.Consume('REAL') + return self.setLocation(Constant(val, real), loc) + elif self.token.typ == 'CHAR': + val = self.Consume('CHAR') + return Constant(val, char) + elif self.token.typ == 'STRING': + txt = self.Consume('STRING') + return StringConstant(txt) + elif self.token.typ in ['true', 'false']: + val = self.Consume() + val = True if val == 'true' else False + return Constant(val, boolean) + elif self.hasConsumed('nil'): + return Constant(0, NilType()) + elif self.hasConsumed('not'): + f = self.parseFactor() + if not isType(f.typ, boolean): + self.Error('argument of boolean negation must be boolean type') + return Unop(f, 'not', boolean) + elif self.token.typ == 'ID': + designator = self.parseDesignator() + # TODO: handle functions different here? + if self.token.typ == '(' and type(designator.typ) is ProcedureType: + return self.parseProcedureCall(designator) + else: + return designator + else: + self.Error('Expected NUMBER, ID or ( expr ), got'+str(self.token)) + + def parseSimpleExpression(self): + """ Arithmatic expression """ + if self.token.typ in ['+', '-']: + # Handle the unary minus + op = self.Consume() + a = self.parseTerm() + typ = a.typ + if not isType(typ,real) and not isType(typ, integer): + self.Error('Unary minus or plus can be only applied to real or integers') + if op == '-': + a = Unop(a, op, typ) + else: + a = self.parseTerm() + while self.token.typ in ['+', '-', 'or']: + loc = self.getLocation() + op = self.Consume() + b = self.parseTerm() + if op in ['+', '-']: + if isType(a.typ, real) or isType(b.typ, real): + typ = real + if isType(a.typ, integer): + # Automatic type cast + a = Unop(a, 'INTTOREAL', real) + if not isType(a.typ, real): + self.Error('first operand must be a real!') + if isType(b.typ, integer): + b = Unop(b, 'INTTOREAL', real) + if not isType(b.typ, real): + self.Error('second operand must be a real!') + elif isType(a.typ, integer) and isType(b.typ, integer): + typ = integer + else: + self.Error('Invalid types {0} and {1}'.format(a.typ, b.typ)) + elif op == 'or': + if not isType(a.typ, boolean): + self.Error('first operand must be boolean for or operation') + if not isType(b.typ, boolean): + self.Error('second operand must be boolean for or operation') + typ = boolean + else: + self.Error('Unknown operand {0}'.format(op)) + a = self.setLocation(Binop(a, op, b, typ), loc) + return a +