from ply import lex
from ply.lex import LexError
import io
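
# PLY builds the lexer by reflection: every attribute named t_<TOKEN> is a
# rule, a function rule's docstring is its regular expression, and the master
# regex is compiled with re.VERBOSE (hence the escaped '#' in the comment
# rule below).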


# Lexer definition
class LexerDef:
    # Spacing and comments
    t_ignore_COMMENT = r"\#[^\n\r]*"
    t_ignore = " \t"
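
    # Note: t_ignore is a plain set of characters to skip, not a regular
    # expression, while t_ignore_COMMENT is an ordinary rule whose matches
    # are discarded because of the ignore_ prefix.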

    def t_NEWLINES(self, token):
        r"\n|\r\n?"
        # Track line numbers for error reporting; returning nothing
        # discards the newline token itself.
        token.lexer.lineno += 1

    # Value rules
    def t_VALUE_NULL(self, token):
        r"null"
        token.value = None
        return token

    def t_VALUE_BOOL(self, token):
        r"true|false"
        token.value = token.value == "true"
        return token

    def t_VALUE_NUMBER(self, token):
        r"([1-9][0-9]*)|(0([0-9]+|(o|O)[0-7]+|(x|X)[0-9a-fA-F]+|(b|B)[0-1]+)?)"
        if token.value.startswith("0") and len(token.value) > 1:
            if "0" <= token.value[1] <= "9":
                raise LexError("Int can't start with a leading zero: %s." % (str(token),), token.value)
            elif token.value[1] in ("x", "X"):  # Hexadecimal
                token.value = int(token.value[2:].lower(), 16)
            elif token.value[1] in ("o", "O"):  # Octal
                token.value = int(token.value[2:].lower(), 8)
            elif token.value[1] in ("b", "B"):  # Binary
                token.value = int(token.value[2:].lower(), 2)
            else:
                raise LexError("Unknown int encoding %s." % (repr(token.value),), token.value)
        else:  # Plain decimal int
            token.value = int(token.value)
        return token
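
    # For illustration, the values this rule produces (derived from the
    # branches above):
    #   "42"    -> 42        "0"     -> 0
    #   "0x1F"  -> 31        "0o17"  -> 15
    #   "0b101" -> 5         "0123"  -> LexError (leading zero)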

    def __int_to_unicode(self, number: int):
        # Assemble the code point byte-by-byte and decode it as UTF-32-LE;
        # invalid code points (e.g. surrogates) raise UnicodeDecodeError.
        if number < 0 or number >= 2 ** 32:
            raise LexError("Can't read unicode char greater than or equal to 2^32 or below 0.", "")
        return bytes([(number >> 0) % 256, (number >> 8) % 256, (number >> 16) % 256, (number >> 24) % 256]).decode("UTF-32-LE")
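
    # Note: for code points UTF-32 accepts, the helper above is equivalent
    # to chr(number); the byte-level detour mainly changes which exception
    # out-of-range input raises.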

    # Single-character escapes, mirroring Python's own string escapes.
    _string_replacement = {"\\": "\\",
                           "'": "'",
                           "a": "\x07",  # bell
                           "b": "\x08",  # backspace
                           "f": "\x0c",  # form feed
                           "n": "\n",
                           "r": "\r",
                           "t": "\t",
                           "v": "\v"}

    def t_VALUE_STRING(self, token):
        r"'''(\\.|[^\\'])*'''|'(\\.|[^\n\\'])*'"

        # Initialize: strip the surrounding quotes.
        if token.value.startswith("'''"):
            input_buffer = io.StringIO(token.value[3:-3])
        else:
            input_buffer = io.StringIO(token.value[1:-1])
        result_buffer = io.StringIO()

        # Resolve escape sequences.
        read = input_buffer.read(1)
        while read:
            if read == "\\":
                read = input_buffer.read(1)
                if not read:
                    raise LexError("Can't parse escaped string.", token.value)  # TODO: Give line number
                elif read in self._string_replacement:
                    result_buffer.write(self._string_replacement[read])
                elif "0" <= read <= "7":  # 3 octal digits
                    tmp = read + input_buffer.read(2)
                    if len(tmp) < 3:  # require all three digits (the original `< 2` accepted truncated escapes)
                        raise LexError("Can't read octal escape %s." % (str(token),), token.value)
                    result_buffer.write(self.__int_to_unicode(int(tmp, 8)))
                elif read in ("x", "X"):  # 2 hex digits
                    tmp = input_buffer.read(2)
                    if len(tmp) < 2:
                        raise LexError("Can't read hex escape %s." % (str(token),), token.value)
                    result_buffer.write(self.__int_to_unicode(int(tmp, 16)))
                elif "u" == read:  # 4 hex digits
                    tmp = input_buffer.read(4)
                    if len(tmp) < 4:
                        raise LexError("Can't read hex escape %s." % (str(token),), token.value)
                    result_buffer.write(self.__int_to_unicode(int(tmp, 16)))
                elif "U" == read:  # 8 hex digits
                    tmp = input_buffer.read(8)
                    if len(tmp) < 8:
                        raise LexError("Can't read hex escape %s." % (str(token),), token.value)
                    result_buffer.write(self.__int_to_unicode(int(tmp, 16)))
                else:
                    raise LexError("Can't parse escaped string %s." % (token,), token.value)  # TODO: Give line number
            else:
                result_buffer.write(read)
            read = input_buffer.read(1)

        # Output the unescaped string.
        token.value = result_buffer.getvalue()
        return token
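
    # For illustration, assuming the escape handling above:
    #   "'a\\tb'"   -> "a\tb" (real tab)
    #   "'\\x41'"   -> "A"
    #   "'\\u00e9'" -> "\u00e9"
    #   "'''...'''" may span lines; escapes are resolved the same way.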

    # Build lexer
    def __init__(self):
        # Derive the token list from the t_* rules, skipping PLY's special
        # attributes (t_error, t_ignore, t_ignore_*), which are not tokens.
        tokens = []
        for name in filter(lambda x: x.startswith("t_"), dir(self)):
            if name == "t_error" or name.startswith("t_ignore"):
                continue
            tokens.append(name[2:])
        self.tokens = tuple(tokens)
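
    # Precedence note: PLY tries function rules in source-definition order
    # (not tuple order), so VALUE_NULL and VALUE_BOOL are matched before
    # VALUE_NUMBER and VALUE_STRING.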

    def t_error(self, token):
        # Fail hard on any character no rule matches.
        raise LexError("Can't lex %s." % (str(token),), token.value)

    def build(self):
        # PLY inspects this instance for the tokens tuple and the t_* rules.
        return lex.lex(module=self)


# Generate a lexer from the definition above
def gen_lexer():
    return LexerDef().build()
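

# A minimal smoke test (an illustrative addition, not part of the grammar
# spec): run each literal form through the lexer and print the tokens.
if __name__ == "__main__":
    demo_lexer = gen_lexer()
    demo_lexer.input(r"null true 0x1F 'a\tb'  # trailing comment")
    for tok in demo_lexer:
        print(tok.type, repr(tok.value))
    # Expected output, derived from the rules above:
    #   VALUE_NULL None
    #   VALUE_BOOL True
    #   VALUE_NUMBER 31
    #   VALUE_STRING 'a\tb'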