looplang/looplang/lexer.py

from ply import lex
from ply.lex import LexError
import io

# Definition
class LexerDef:
    """PLY lexer definition for looplang literals, comments and whitespace.

    PLY discovers rules through attributes named ``t_*``.  For a function
    rule, the first string literal in the body is the rule's regular
    expression (PLY reads it from ``__doc__``), so those strings must not be
    replaced by ordinary docstrings; rule documentation is therefore written
    as comments.  Function rules are matched in definition order.
    """

    # Spacing and comments
    # A '#' starts a comment running to the end of the line; the "ignore_"
    # prefix makes PLY discard whatever the pattern matches.
    t_ignore_COMMENT = r"\#[^\n\r]*"
    # Characters silently skipped between tokens.
    t_ignore = " \t"

    # Rule names PLY treats specially; they never name a token type.
    _RESERVED_RULES = frozenset(("error", "eof", "ignore"))
    # Second character of a prefixed integer literal -> numeric base.
    _INT_BASES = {"x": 16, "X": 16, "o": 8, "O": 8, "b": 2, "B": 2}
    # Escape introducer -> number of hexadecimal digits that must follow it.
    _HEX_ESCAPE_WIDTHS = {"x": 2, "X": 2, "u": 4, "U": 8}
    _HEX_DIGITS = frozenset("0123456789abcdefABCDEF")
    _OCT_DIGITS = frozenset("01234567")

    def t_NEWLINES(self, token):
        r"\n|\r\n?"
        # One line break (LF, CR or CRLF) per match.  Nothing is returned, so
        # no token is emitted; only the line counter advances.
        token.lexer.lineno += 1

    # Values rules
    def t_VALUE_NULL(self, token):
        r"null"
        token.value = None
        return token

    def t_VALUE_BOOL(self, token):
        r"true|false"
        token.value = token.value == "true"
        return token

    def t_VALUE_NUMBER(self, token):
        r"([1-9][0-9]*)|(0([0-9]+|(o|O)[0-7]+|(x|X)[0-9a-fA-F]+|(b|B)[0-1]+)?)"
        # Converts the matched text to an int.  Literals starting with "0"
        # must be exactly "0" or carry a 0x / 0o / 0b prefix; a plain leading
        # zero ("012") is matched on purpose so it can be rejected here.
        text = token.value
        if len(text) > 1 and text.startswith("0"):
            marker = text[1]
            if "0" <= marker <= "9":
                raise LexError("Int can't start with a leading zero %s." % (str(token),), token.value)
            base = self._INT_BASES.get(marker)
            if base is None:
                raise LexError("Unknown int encoding %s." % (repr(token.value),), token.value)
            token.value = int(text[2:], base)
        else:  # Default int
            token.value = int(text)
        return token

    def __int_to_unicode(self, number: int) -> str:
        """Return the one-character string for the code point *number*.

        Raises:
            LexError: if *number* is negative, does not fit in 32 bits, lies
                above U+10FFFF or falls in the surrogate range
                U+D800..U+DFFF (none of which is a valid character).
        """
        if number < 0 or number >= 2 ** 32:
            raise LexError("Can't read unicode char greater then 2^32 or below 0.", "")
        if number > 0x10FFFF or 0xD800 <= number <= 0xDFFF:
            # Report these as lexing errors instead of leaking a codec error.
            raise LexError("Invalid unicode code point U+%08X." % (number,), "")
        return chr(number)

    # Single-character escapes: the character after the backslash -> result.
    _string_replacement = {"\\": "\\",
                           "'": "'",
                           "a": "\x07",
                           "b": "\x08",
                           "f": "\x0c",
                           "n": "\n",
                           "r": "\r",
                           "t": "\t",
                           "v": "\v"}

    def _unescape(self, body, token):
        """Return *body* with every backslash escape replaced by its character.

        Supported escapes: the single characters in ``_string_replacement``,
        octal ``\\o`` .. ``\\ooo`` (one to three octal digits), ``\\xHH``,
        ``\\uHHHH`` and ``\\UHHHHHHHH``.

        Args:
            body: literal text without its surrounding quotes.
            token: the token being processed; used only in error messages.

        Raises:
            LexError: on an unknown, truncated or malformed escape, or when
                an escape denotes an invalid code point.
        """
        pieces = []
        pos = 0
        end = len(body)
        while pos < end:
            slash = body.find("\\", pos)
            if slash < 0:
                pieces.append(body[pos:])
                break
            pieces.append(body[pos:slash])
            pos = slash + 1
            if pos >= end:
                raise LexError("Can't parse escped string.", token.value) # TODO: Give line number
            kind = body[pos]
            pos += 1
            if kind in self._string_replacement:
                pieces.append(self._string_replacement[kind])
            elif kind in self._OCT_DIGITS:
                # Take at most two further characters, and only while they
                # really are octal digits, so "\12z" is "\n" followed by "z".
                stop = pos
                while stop < end and stop - pos < 2 and body[stop] in self._OCT_DIGITS:
                    stop += 1
                pieces.append(self.__int_to_unicode(int(body[pos - 1:stop], 8)))
                pos = stop
            elif kind in self._HEX_ESCAPE_WIDTHS:
                width = self._HEX_ESCAPE_WIDTHS[kind]
                digits = body[pos:pos + width]
                # int() alone would accept signs, blanks and underscores and
                # raise ValueError on other junk, so check the digits first.
                if len(digits) < width or not self._HEX_DIGITS.issuperset(digits):
                    raise LexError("Can't read hex %s." % (str(token),), token.value)
                pieces.append(self.__int_to_unicode(int(digits, 16)))
                pos += width
            else:
                raise LexError("Can't parse escped string %s." % (token,), token.value) # TODO: Give line number
        return "".join(pieces)

    def t_VALUE_STRING(self, token):
        r"'''(\\.|[^\\'])*'''|'(\\.|[^\n\\'])*'"
        # Replaces the quoted literal by the string it denotes.
        raw = token.value

        # A literal may contain raw line breaks (LF, CR or CRLF), which the
        # NEWLINES rule never sees; count them here so that line numbers of
        # later tokens stay correct.  Each CRLF pair counts once.
        token.lexer.lineno += raw.count("\n") + raw.count("\r") - raw.count("\r\n")

        quote_len = 3 if raw.startswith("'''") else 1
        token.value = self._unescape(raw[quote_len:-quote_len], token)
        return token

    # Build lexer
    def __init__(self):
        """Collect the token names declared by the ``t_*`` rules.

        ``self.tokens`` holds the part after ``t_`` of every rule name,
        except the names PLY reserves (``error``, ``eof``, ``ignore``) and
        discard rules (``ignore_*``): those are not token types, and a token
        called ``error`` is rejected by ``ply.yacc`` as a reserved word.
        """
        names = (attr[2:] for attr in dir(self) if attr.startswith("t_"))
        self.tokens = tuple(
            name for name in names
            if name not in self._RESERVED_RULES and not name.startswith("ignore_")
        )

    def t_error(self, token):
        # Invoked by PLY when no rule matches at the current position.
        raise LexError("Can't lex %s." % (str(token),), token.value)

    def build(self):
        """Return a PLY lexer built from the rules defined on this instance."""
        return lex.lex(module=self)

# Gen lexer
def gen_lexer():
    """Create a fresh ``LexerDef`` and return the lexer it builds."""
    definition = LexerDef()
    return definition.build()