Mercurial > lcfOS
view python/ppci/frontends/ks/lexer.py @ 99:6efbeb903777
movage
author | windel |
---|---|
date | Mon, 24 Dec 2012 15:03:30 +0100 |
parents | 3f772feb12ef |
children | af0d7913677a |
line wrap: on
line source
"""
Lexical analyzer part. Splits the input character stream into tokens.
"""

import collections
import re

from .errors import CompilerException

# Token is used in the lexical analyzer:
#   typ - token kind (keyword text, symbol text, or a category such as 'ID')
#   val - token value (converted for numbers, unquoted for strings)
#   row - 1-based line number
#   col - 0-based offset of the token within its line
Token = collections.namedtuple('Token', 'typ val row col')

keywords = [
    'and', 'array', 'begin', 'by', 'case', 'const', 'div', 'do',
    'else', 'elsif', 'end', 'false', 'for', 'if', 'import', 'in', 'is',
    'mod', 'module', 'nil', 'not', 'of', 'or', 'pointer', 'procedure',
    'record', 'repeat', 'return', 'then', 'to', 'true', 'type', 'until',
    'var', 'while', 'asm',
]


def tokenize(s):
    """Generate the tokens found in the source text ``s``.

    The approach is adapted from the tokenizer example in the Python ``re``
    module documentation: every token category is a named group in one big
    alternation, and the name of the matching group identifies the category.

    Token kinds produced:

    * keywords: ``typ`` is the keyword itself (e.g. ``'begin'``).
    * ``'ID'``: identifiers that are not keywords.
    * ``'NUMBER'``: decimal or ``0x`` hexadecimal integers, ``val`` is an int.
    * ``'REAL'``: ``val`` is a float.
    * ``'STRING'``: ``val`` is the text without the surrounding quotes.
    * symbols: ``typ`` is the symbol itself (e.g. ``':='`` or ``'<='``).
    * ``'END'``: always yielded last, with an empty value and column 0.

    Whitespace, newlines and ``{...}`` comments produce no tokens.

    Args:
        s: The complete source text to tokenize.

    Yields:
        Token: named tuples ``(typ, val, row, col)``; ``row`` starts at 1 and
        ``col`` is the 0-based offset of the token inside its line.

    Raises:
        CompilerException: when a character cannot start any token; it is
            constructed with the message, the line and the column.
    """
    # Order matters: the first alternative that matches wins, so REAL and
    # HEXNUMBER must be tried before NUMBER.
    tok_spec = [
        ('REAL', r'\d+\.\d+'),
        ('HEXNUMBER', r'0x[\da-fA-F]+'),
        ('NUMBER', r'\d+'),
        ('ID', r'[A-Za-z][A-Za-z\d_]*'),
        ('NEWLINE', r'\n'),
        ('SKIP', r'[ \t]'),
        # Non-greedy, so two comments on one line do not swallow the code
        # that sits between them.
        ('COMMENTS', r'\{.*?\}'),
        ('LEESTEKEN', r':=|[\.,=:;\-+*\[\]/\(\)]|>=|<=|<>|>|<'),
        ('STRING', r"'.*?'"),
    ]
    tok_re = '|'.join('(?P<%s>%s)' % pair for pair in tok_spec)
    gettok = re.compile(tok_re).match

    line = 1
    pos = line_start = 0
    mo = gettok(s)
    while mo is not None:
        typ = mo.lastgroup
        val = mo.group(typ)
        if typ == 'NEWLINE':
            # The next line starts right after the newline character, which
            # keeps columns 0-based on every line (not only on the first).
            line_start = mo.end()
            line += 1
        elif typ == 'COMMENTS':
            pass
        elif typ != 'SKIP':
            if typ == 'ID':
                if val in keywords:
                    typ = val
            elif typ == 'LEESTEKEN':
                # Symbols are identified by their own text.
                typ = val
            elif typ == 'NUMBER':
                val = int(val)
            elif typ == 'HEXNUMBER':
                # Strip the '0x' prefix; hex literals become plain NUMBERs.
                val = int(val[2:], 16)
                typ = 'NUMBER'
            elif typ == 'REAL':
                val = float(val)
            elif typ == 'STRING':
                # Drop the surrounding single quotes.
                val = val[1:-1]
            yield Token(typ, val, line, mo.start() - line_start)
        pos = mo.end()
        mo = gettok(s, pos)

    # Matching stopped before the end of the input: s[pos] starts no token.
    if pos != len(s):
        col = pos - line_start
        raise CompilerException(
            'Unexpected character {0}'.format(s[pos]), line, col)
    yield Token('END', '', line, 0)