import io

from ply import lex
from ply.lex import LexError


# Definition
class LexerDef():
    """PLY lexer definition for null, boolean, integer and string values.

    Every attribute whose name starts with ``t_`` is a lexer rule.  For rule
    *functions* the docstring IS the regular expression handed to PLY, so
    those docstrings must never be replaced by prose; rule documentation
    lives in ``#`` comments instead.  The order in which the rule functions
    are defined is significant and must be preserved.
    """

    # Spacing and comments
    t_ignore_COMMENT = r"\#[^\n\r]*"
    t_ignore = " \t"

    def t_NEWLINES(self, token):
        r"\n|\r\n?"
        # Nothing is returned, so the line break itself produces no token;
        # only the line counter is advanced.
        token.lexer.lineno += 1

    # Values rules
    def t_VALUE_NULL(self, token):
        r"null"
        # The literal ``null`` becomes Python ``None``.
        token.value = None
        return token

    def t_VALUE_BOOL(self, token):
        r"true|false"
        # The literal text becomes a Python bool.
        token.value = token.value == "true"
        return token

    # Radix selected by the (case-insensitive) letter following a leading "0".
    _INT_PREFIX_BASES = {"x": 16, "o": 8, "b": 2}

    def t_VALUE_NUMBER(self, token):
        r"([1-9][0-9]*)|(0([0-9]+|(o|O)[0-7]+|(x|X)[0-9a-fA-F]+|(b|B)[0-1]+)?)"
        # Converts decimal, 0x (hex), 0o (octal) and 0b (binary) literals to
        # int.  Raises LexError for a decimal literal with a leading zero
        # (the regex matches it on purpose so it can be rejected here).
        text = token.value
        if text.startswith("0") and len(text) > 1:
            marker = text[1]
            if "0" <= marker <= "9":
                raise LexError("Int can't start with a leading zero %s." % (str(token),), token.value)
            base = self._INT_PREFIX_BASES.get(marker.lower())
            if base is None:
                raise LexError("Unknown int encoding %s." % (repr(token.value),), token.value)
            token.value = int(text[2:], base)
        else:
            # Default int
            token.value = int(text)
        return token

    def __int_to_unicode(self, number: int):
        """Return the one-character string for code point ``number``.

        Raises:
            LexError: if ``number`` is outside ``[0, 2**32)`` or is not a
                decodable code point (a surrogate, or above U+10FFFF).
        """
        if number < 0 or number >= 2 ** 32:
            raise LexError("Can't read unicode char greater then 2^32 or below 0.", "")
        try:
            return number.to_bytes(4, "little").decode("UTF-32-LE")
        except UnicodeDecodeError as err:
            # Report undecodable code points through the lexer's own error
            # type instead of leaking a codec exception.
            raise LexError("Can't decode unicode code point 0x%X." % (number,), "") from err

    _string_replacement = {
        "\\": "\\",
        "'": "'",
        "a": "\x07",
        "b": "\x08",
        "f": "\x0c",
        "n": "\n",
        "r": "\r",
        "t": "\t",
        "v": "\v",
    }

    # Number of hex digits that must follow each hexadecimal escape letter.
    _ESCAPE_HEX_WIDTHS = {"x": 2, "X": 2, "u": 4, "U": 8}

    # Characters accepted as escape digits, per radix.  int() alone is too
    # lenient: it also accepts signs, surrounding whitespace and underscores.
    _DIGITS_BY_BASE = {
        8: frozenset("01234567"),
        16: frozenset("0123456789abcdefABCDEF"),
    }

    def _digits_to_char(self, digits, base, token):
        """Convert the digit string of a numeric escape into its character.

        Raises:
            LexError: if ``digits`` contains anything but digits of ``base``,
                or the resulting code point cannot be decoded.
        """
        if not set(digits) <= self._DIGITS_BY_BASE[base]:
            raise LexError(
                "Invalid escape digits %s in %s." % (repr(digits), str(token)),
                token.value,
            )
        return self.__int_to_unicode(int(digits, base))

    def _read_escape(self, source, token):
        """Decode one escape sequence; ``source`` sits just after the backslash.

        Returns the replacement text and leaves ``source`` positioned after
        the escape.  Raises LexError for unknown or truncated escapes.
        """
        code = source.read(1)
        if not code:
            raise LexError("Can't parse escped string.", token.value)  # TODO: Give line number
        if code in self._string_replacement:
            return self._string_replacement[code]
        if "0" <= code <= "7":
            # Octal escape: the digit just read plus up to two more.
            # NOTE(review): the minimum length of 2 is kept from the original
            # check; three digits may have been intended -- confirm.
            digits = code + source.read(2)
            if len(digits) < 2:
                raise LexError("Can't read hex %s." % (str(token),), token.value)
            return self._digits_to_char(digits, 8, token)
        width = self._ESCAPE_HEX_WIDTHS.get(code)
        if width is None:
            raise LexError("Can't parse escped string %s." % (token,), token.value)  # TODO: Give line number
        digits = source.read(width)
        if len(digits) < width:
            raise LexError("Can't read hex %s." % (str(token),), token.value)
        return self._digits_to_char(digits, 16, token)

    def t_VALUE_STRING(self, token):
        r"'''(\\.|[^\\'])*'''|'(\\.|[^\n\\'])*'"
        # Strips the quotes ('...' or '''...''') and resolves backslash
        # escapes.  Raises LexError for unknown, truncated or malformed
        # escapes.
        raw = token.value
        quote_len = 3 if raw.startswith("'''") else 1

        # Line breaks inside the literal never reach t_NEWLINES, so count
        # them here using the same definition (\n, \r\n or a lone \r).
        token.lexer.lineno += raw.count("\n") + raw.count("\r") - raw.count("\r\n")

        source = io.StringIO(raw[quote_len:-quote_len])
        decoded = io.StringIO()
        char = source.read(1)
        while char:
            if char == "\\":
                decoded.write(self._read_escape(source, token))
            else:
                decoded.write(char)
            char = source.read(1)

        # Output result string
        token.value = decoded.getvalue()
        return token

    # Build lexer
    def __init__(self):
        # Token names are the rule names without their "t_" prefix.
        # NOTE(review): this also yields "error", "ignore" and
        # "ignore_COMMENT"; kept as-is because consumers of ``tokens`` are
        # not visible here -- confirm whether they should be filtered out.
        self.tokens = tuple(name[2:] for name in dir(self) if name.startswith("t_"))

    def t_error(self, token):
        # Any input that no rule matches aborts lexing.
        raise LexError("Can't lex %s." % (str(token),), token.value)

    def build(self):
        """Build and return a PLY lexer that uses this instance's rules."""
        return lex.lex(module=self)


# Gen lexer
def gen_lexer():
    """Create a fresh ``LexerDef`` and return the lexer built from it."""
    return LexerDef().build()