# looplang/looplang/lexer.py (119 lines, 4.7 KiB, Python)

from ply import lex
from ply.lex import LexError
import io
# Definition
class LexerDef:
    """PLY lexer definition for comments, line breaks and literal values.

    Every ``t_``-prefixed attribute is a rule handed to ``lex.lex`` by
    ``build()``.  For function rules the docstring *is* the token's regular
    expression, so those docstrings must stay exactly as written; all
    explanations for rule functions live in ``#`` comments instead.
    """

    # Spacing and comments
    t_ignore_COMMENT = r"\#[^\n\r]*"
    t_ignore = " \t"

    # Rule-name suffixes that configure the lexer instead of naming a token.
    _special_rules = frozenset(("error", "ignore", "eof"))

    def t_NEWLINES(self, token):
        r"\n|\r\n?"
        # Nothing is returned, so no token is produced; only the line counter
        # advances.
        token.lexer.lineno += 1

    # Values rules
    def t_VALUE_NULL(self, token):
        r"null"
        token.value = None
        return token

    def t_VALUE_BOOL(self, token):
        r"true|false"
        token.value = token.value == "true"
        return token

    # Radix selected by the (lower-cased) letter following a leading "0".
    _int_bases = {"x": 16, "o": 8, "b": 2}

    def t_VALUE_NUMBER(self, token):
        r"([1-9][0-9]*)|(0([0-9]+|(o|O)[0-7]+|(x|X)[0-9a-fA-F]+|(b|B)[0-1]+)?)"
        # Converts the matched text to an int.  A bare "0" and plain decimals
        # go straight through int(); "0x"/"0o"/"0b" prefixes select the radix;
        # a "0" followed by another decimal digit is rejected with LexError.
        text = token.value
        if len(text) > 1 and text.startswith("0"):
            marker = text[1]
            if "0" <= marker <= "9":
                raise LexError("Int can't start with a leading zero %s." % (str(token),), token.value)
            base = self._int_bases.get(marker.lower())
            if base is None:
                raise LexError("Unknown int encoding %s." % (repr(token.value),), token.value)
            token.value = int(text[2:], base)
        else:  # Default int
            token.value = int(text)
        return token

    def __int_to_unicode(self, number: int):
        """Return the one-character string whose code point is ``number``.

        Raises:
            LexError: if ``number`` is outside ``0 .. 2**32 - 1`` or is not a
                code point the UTF-32 decoder accepts (surrogates, values
                above U+10FFFF).
        """
        if number < 0 or number >= 2 ** 32:
            raise LexError("Can't read unicode char greater then 2^32 or below 0.", "")
        try:
            return number.to_bytes(4, "little").decode("UTF-32-LE")
        except UnicodeDecodeError as exc:
            # Report undecodable code points as a lexing error instead of
            # leaking the codec's exception to the caller.
            raise LexError("Can't read unicode char %s: not a valid code point." % (hex(number),), "") from exc

    # Single-character escapes: the key follows the backslash, the value is
    # what gets written to the decoded string.
    _string_replacement = {"\\": "\\",
                           "'": "'",
                           "a": "\x07",
                           "b": "\x08",
                           "f": "\x0c",
                           "n": "\n",
                           "r": "\r",
                           "t": "\t",
                           "v": "\v"}
    # Number of hex digits required after each hexadecimal escape letter.
    _hex_escape_widths = {"x": 2, "X": 2, "u": 4, "U": 8}
    _octal_digits = frozenset("01234567")
    _hex_digits = frozenset("0123456789abcdefABCDEF")

    def _decode_escape(self, source, token):
        """Decode one escape sequence; the backslash is already consumed.

        Args:
            source: text stream positioned right after the backslash.
            token: token being lexed, used only for error reporting.

        Returns:
            The string the escape sequence stands for.

        Raises:
            LexError: for an unknown escape letter, a truncated sequence, or
                digits that are not valid for the escape's radix.
        """
        kind = source.read(1)
        if not kind:
            raise LexError("Can't parse escped string.", token.value)  # TODO: Give line number
        if kind in self._string_replacement:
            return self._string_replacement[kind]
        if "0" <= kind <= "7":
            # Octal: the digit plus the next two characters (fewer only when
            # the string ends), at least two digits in total.
            digits = kind + source.read(2)
            # int() tolerates whitespace and underscores, so check the digits
            # explicitly rather than trusting the conversion.
            if len(digits) < 2 or not self._octal_digits.issuperset(digits):
                raise LexError("Can't read octal %s." % (str(token),), token.value)
            return self.__int_to_unicode(int(digits, 8))
        width = self._hex_escape_widths.get(kind)
        if width is None:
            raise LexError("Can't parse escped string %s." % (token,), token.value)  # TODO: Give line number
        digits = source.read(width)
        if len(digits) < width or not self._hex_digits.issuperset(digits):
            raise LexError("Can't read hex %s." % (str(token),), token.value)
        return self.__int_to_unicode(int(digits, 16))

    def t_VALUE_STRING(self, token):
        r"'''(\\.|[^\\'])*'''|'(\\.|[^\n\\'])*'"
        # Strips the quotes (''' or ') and resolves backslash escapes, leaving
        # the decoded text in token.value.
        raw = token.value
        # Line breaks inside the literal never reach t_NEWLINES, so account
        # for them here using the same convention (\n, \r\n or a lone \r).
        line_breaks = raw.count("\n") + raw.count("\r") - raw.count("\r\n")
        if line_breaks:
            token.lexer.lineno += line_breaks
        quote_len = 3 if raw.startswith("'''") else 1
        source = io.StringIO(raw[quote_len:-quote_len])
        decoded = io.StringIO()
        char = source.read(1)
        while char:
            if char == "\\":
                decoded.write(self._decode_escape(source, token))
            else:
                decoded.write(char)
            char = source.read(1)
        # Output result string
        token.value = decoded.getvalue()
        return token

    # Build lexer
    def __init__(self):
        """Collect the token names declared by this definition's ``t_`` rules.

        ``error``, ``ignore``, ``eof`` and ``ignore_*`` rules configure the
        lexer rather than name a token type, so they are left out of
        ``self.tokens``.
        """
        names = (attr[2:] for attr in dir(self) if attr.startswith("t_"))
        self.tokens = tuple(
            name for name in names
            if name not in self._special_rules and not name.startswith("ignore_")
        )

    def t_error(self, token):
        # Invoked when no rule matches; aborts lexing with the offending token.
        raise LexError("Can't lex %s." % (str(token),), token.value)

    def build(self):
        """Return the lexer that ``lex.lex`` builds from this instance."""
        return lex.lex(module=self)
# Gen lexer
def gen_lexer():
    """Instantiate ``LexerDef`` and return the result of its ``build()``."""
    definition = LexerDef()
    return definition.build()