Allow lexing null, boolean, number and string

master
Marko Semet 2020-04-05 01:19:25 +02:00
parent 806a78be4d
commit cb754405b0
10 changed files with 696 additions and 4 deletions

85
doc/syntax.md 100644
View File

@ -0,0 +1,85 @@
# Values
## Null
Null can be used with `null`.
## Booleans
True can be used with `true` and false with `false`.
## Numbers
Integer values can be defined as follows (examples use the value 127):
+----------------+--------------------------------------+
| Encoding type: | Code: |
+----------------+--------------------------------------+
| Binary | `0b1111111` or `0B1111111` |
+----------------+--------------------------------------+
| Octal | `0o0177` or `0O177`. Please use the |
| | lower case for a better readability. |
+----------------+--------------------------------------+
| Decimal | `127` |
+----------------+--------------------------------------+
| Hexadecimal | `0x7f`, `0x7F`, `0X7f` or `0X7F`. |
| | Mixing of upper and lowercase is |
| | allowed. |
+----------------+--------------------------------------+
## Strings
A string can be defined by putting single quotes around it, like `'text'`. These strings aren't allowed to contain line breaks (or unescaped single quotes). For strings with line breaks use three single quotes, like `'''text'''`. Be aware that indentation isn't stripped: when a line break appears inside the string, the leading whitespace of the following lines is written into the string as well.
To escape specific string values (like single quotes):
+--------------------+-------------------------------------------+
| Encoding name: | Encoding inside of the code: |
+--------------------+-------------------------------------------+
| Backslash (`\`) | `\\` |
+--------------------+-------------------------------------------+
| Single Quote (`'`) | `\'` |
+--------------------+-------------------------------------------+
| Bell | `\a` |
+--------------------+-------------------------------------------+
| Backspace | `\b` |
+--------------------+-------------------------------------------+
| Formfeed | `\f` |
+--------------------+-------------------------------------------+
| New line | `\n` |
+--------------------+-------------------------------------------+
| Carriage return | `\r` |
+--------------------+-------------------------------------------+
| Horizontal tab | `\t` |
+--------------------+-------------------------------------------+
| Vertical tab | `\v` |
+--------------------+-------------------------------------------+
| Octal 8-Bit | `\YYY` |
| | |
| | `YYY` has to be filled with the |
| | octal representation of the char. |
| | Allowed are three digits between `0-7`. |
+--------------------+-------------------------------------------+
| 8-Bit Unicode | `\xYY` or `\XYY` |
| | |
| | `YY` have to be filled with the |
| | hexadecimal representation of the char. |
| | Allowed are two values between `0-9`, |
| | `a-f` and `A-F`. Mixing lower and upper |
| | case is allowed. |
+--------------------+-------------------------------------------+
| 16-Bit Unicode | `\uYYYY` |
| | |
| | `YYYY` have to be filled with the |
| | hexadecimal representation of the char. |
| | Allowed are four values between `0-9`, |
| | `a-f` and `A-F`. Mixing lower and upper |
| | case is allowed. |
+--------------------+-------------------------------------------+
| 32-Bit Unicode | `\UYYYYYYYY` |
| | |
| | `YYYYYYYY` have to be filled with |
| | hexadecimal representation of the char. |
| | Allowed are eight values between `0-9`, |
| | `a-f` and `A-F`. Mixing lower and upper |
| | case is allowed. |
+--------------------+-------------------------------------------+

118
looplang/lexer.py 100644
View File

@ -0,0 +1,118 @@
from ply import lex
from ply.lex import LexError
import io
# Definition
class LexerDef():
    """PLY lexer definition for null, boolean, number and string literals.

    PLY finds rules through the ``t_`` prefix and takes the regular
    expression of a function rule from that function's docstring.  The
    ``t_*`` methods therefore keep the pattern as their only docstring and
    are documented with ``#`` comments instead.  Helper attributes must not
    start with ``t_``.
    """

    # Spacing and comments
    t_ignore_COMMENT = r"\#[^\n\r]*"
    t_ignore = " \t"

    # One line break per match (LF, CR or CRLF); produces no token and only
    # advances the line counter.
    def t_NEWLINES(self,token):
        r"\n|\r\n?"
        token.lexer.lineno += 1

    # Values rules

    # ``null`` becomes Python ``None``.
    def t_VALUE_NULL(self, token):
        r"null"
        token.value = None
        return token

    # ``true`` / ``false`` become Python booleans.
    def t_VALUE_BOOL(self, token):
        r"true|false"
        token.value = token.value == "true"
        return token

    # Radix selected by the (case-insensitive) letter after the leading zero.
    _int_bases = {"x": 16, "o": 8, "b": 2}

    # Integer literal in decimal, hexadecimal, octal or binary notation.
    # The pattern deliberately also matches decimals with a leading zero so
    # they can be rejected with a dedicated error message.
    def t_VALUE_NUMBER(self, token):
        r"([1-9][0-9]*)|(0([0-9]+|(o|O)[0-7]+|(x|X)[0-9a-fA-F]+|(b|B)[0-1]+)?)"
        text = token.value
        if text.startswith("0") and len(text) > 1:
            marker = text[1]
            if "0" <= marker <= "9":
                raise LexError("Int can't start with a leading zero %s." % (str(token),), token.value)
            base = self._int_bases.get(marker.lower())
            if base is None:
                raise LexError("Unknown int encoding %s." % (repr(token.value),), token.value)
            token.value = int(text[2:], base)
        else: # Default int
            token.value = int(text)
        return token

    def __int_to_unicode(self, number:int):
        """Return the character with code point ``number``.

        Raises:
            LexError: if ``number`` is negative, does not fit into 32 bits,
                lies above U+10FFFF or is a surrogate (U+D800..U+DFFF).
        """
        if number < 0 or number >= 2 ** 32:
            raise LexError("Can't read unicode char greater then 2^32 or below 0.", "")
        # Report invalid code points as a lexing error instead of letting a
        # codec error escape from the lexer.
        if number > 0x10FFFF or 0xD800 <= number <= 0xDFFF:
            raise LexError("Invalid unicode code point U+%08X." % (number,), "")
        return chr(number)

    # Single-character escapes: character after the backslash -> result.
    _string_replacement = {"\\": "\\",
                           "'": "'",
                           "a": "\x07",
                           "b": "\x08",
                           "f": "\x0c",
                           "n": "\n",
                           "r": "\r",
                           "t": "\t",
                           "v": "\v"}
    _octal_digits = frozenset("01234567")
    _hex_digits = frozenset("0123456789abcdefABCDEF")
    # Escape letter -> number of hexadecimal digits that must follow it.
    _hex_escape_widths = {"x": 2, "X": 2, "u": 4, "U": 8}

    # String literal, either '...' (no line feed allowed) or '''...'''.
    # The token value is the string content with all escapes resolved.
    def t_VALUE_STRING(self, token):
        r"'''(\\.|[^\\'])*'''|'(\\.|[^\n\\'])*'"
        raw = token.value
        # A literal may span several lines; count its line breaks the same
        # way t_NEWLINES does (CRLF is one break) so later tokens get the
        # right line number.
        token.lexer.lineno += raw.count("\n") + raw.count("\r") - raw.count("\r\n")
        body = raw[3:-3] if raw.startswith("'''") else raw[1:-1]
        pieces = []
        pos = 0
        end = len(body)
        while pos < end:
            char = body[pos]
            pos += 1
            if char != "\\":
                pieces.append(char)
                continue
            if pos >= end:
                raise LexError("Can't parse escped string.", token.value) # TODO: Give line number
            kind = body[pos]
            pos += 1
            if kind in self._string_replacement:
                pieces.append(self._string_replacement[kind])
            elif kind in self._octal_digits:
                # Octal escapes consist of exactly three octal digits; the
                # first one is the character that was just consumed.
                digits = body[pos - 1:pos + 2]
                pos += 2
                if len(digits) < 3 or not self._octal_digits.issuperset(digits):
                    raise LexError("Can't read octal %s." % (str(token),), token.value)
                pieces.append(self.__int_to_unicode(int(digits, 8)))
            elif kind in self._hex_escape_widths:
                width = self._hex_escape_widths[kind]
                digits = body[pos:pos + width]
                pos += width
                # Validate explicitly: int() would also accept signs,
                # whitespace and underscores.
                if len(digits) < width or not self._hex_digits.issuperset(digits):
                    raise LexError("Can't read hex %s." % (str(token),), token.value)
                pieces.append(self.__int_to_unicode(int(digits, 16)))
            else:
                raise LexError("Can't parse escped string %s." % (token,), token.value) # TODO: Give line number
        # Output result string
        token.value = "".join(pieces)
        return token

    # ``t_`` names that PLY reserves for special rules; they are not tokens.
    _reserved_rules = frozenset(("error", "ignore", "eof"))

    # Build lexer
    def __init__(self):
        """Derive ``self.tokens`` from the ``t_*`` rule names of this class.

        Reserved rule names (``error``, ``ignore``, ``eof``) and discarded
        ``ignore_*`` rules are left out because they never produce a token.
        """
        names = (attr[2:] for attr in dir(self) if attr.startswith("t_"))
        self.tokens = tuple(name for name in names
                            if name not in self._reserved_rules
                            and not name.startswith("ignore_"))

    # Called by PLY when no rule matches the current input position.
    def t_error(self, token):
        raise LexError("Can't lex %s." % (str(token),), token.value)

    def build(self):
        """Return a PLY lexer built from the rules of this instance."""
        return lex.lex(module=self)
# Gen lexer
def gen_lexer():
    """Create a new ``LexerDef`` and return the PLY lexer built from it."""
    definition = LexerDef()
    return definition.build()

View File

@ -0,0 +1,23 @@
import os
from unittest import main
from . import utils
# Info for big tests
if not utils.RUN_BIG_TESTS:
    print("Skip big tests. To run them set environment variable RUN_BIG_TESTS.")


# Load sub packages
def _load_subpackages(path, package):
    """Import all modules and sub packages below ``path`` into this module.

    Every public (not underscore-prefixed) attribute of the imported modules
    is copied into this module's globals; the loop below does this for each
    module file and, recursively, each sub package directory.

    Args:
        path: Directory on disk that holds the package ``package``.
        package: Dotted import name of the package located at ``path``.
    """
    def export_public_names(module_name):
        # Copy only public names: private and dunder attributes such as
        # __name__, __path__ or __file__ must not overwrite this package's own.
        module = __import__(module_name, fromlist=(package,))
        for name, value in module.__dict__.items():
            if not name.startswith("_"):
                globals()[name] = value

    for entry in os.listdir(path):
        if entry.startswith("_"):
            continue
        entry_path = os.path.join(path, entry)
        if os.path.isfile(entry_path) and entry.endswith(".py") and "." not in entry[:-3]:
            export_public_names("%s.%s" % (package, entry[:-3]))
        elif os.path.isdir(entry_path) and "." not in entry:
            package_name = "%s.%s" % (package, entry)
            export_public_names(package_name)
            _load_subpackages(entry_path, package_name)


_load_subpackages(os.path.split(__file__)[0], "looplang.test")

View File

@ -0,0 +1,82 @@
import unittest
from . import utils
class TestValueNumber(unittest.TestCase):
    """Lexer tests for integer literals in decimal, binary, octal and hex form."""

    __test_ints = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 123, 1234, 12345, 123456, 1234567, 12345678, 123456789, 1234567890]

    def _assert_number(self, code, expected):
        """Lex ``code`` and require a single VALUE_NUMBER token equal to ``expected``."""
        result = utils.list_tokens(code)
        self.assertEqual(len(result), 1)
        self.assertEqual(result[0].type, "VALUE_NUMBER")
        self.assertEqual(result[0].value, expected)

    def _assert_all_encoded(self, encode):
        """Check every sample integer after turning it into source text with ``encode``."""
        for value in self.__test_ints:
            self._assert_number(encode(value), value)

    def test_numbers(self):
        result = utils.list_tokens(" ".join(map(str, range(0, 101))) + " 1000000000000000000000")
        self.assertEqual(len(result), 102)
        for original, i in zip(range(0, 101), result[:-1]):
            self.assertEqual(i.type, "VALUE_NUMBER")
            self.assertEqual(i.value, original)
        self.assertEqual(result[-1].type, "VALUE_NUMBER")
        self.assertEqual(result[-1].value, 1000000000000000000000)

    def test_all_number_chars(self):
        self._assert_number("1234567890123456789", 1234567890123456789)

    def test_no_leading_zero(self):
        # A lone zero is a valid number.
        self._assert_number("0", 0)
        # A decimal with a leading zero must be rejected.
        with self.assertRaises(utils.LexError) as context:
            utils.list_tokens("0123456789")
        # Checked after the with-block: statements following the raising call
        # inside the block would never run.
        self.assertIn("leading zero", str(context.exception))

    def test_dez(self):
        self._assert_all_encoded(str)

    def test_bin_lower(self):
        self._assert_all_encoded(lambda value: bin(value).lower())

    def test_bin_upper(self):
        self._assert_all_encoded(lambda value: bin(value).upper())

    def test_octal_lower(self):
        self._assert_all_encoded(lambda value: oct(value).lower())

    def test_octal_upper(self):
        self._assert_all_encoded(lambda value: oct(value).upper())

    def test_hex_lower(self):
        self._assert_all_encoded(lambda value: hex(value).lower())

    def test_hex_upper(self):
        self._assert_all_encoded(lambda value: hex(value).upper())

View File

@ -0,0 +1,24 @@
import unittest
from . import utils
class TestValueNull(unittest.TestCase):
    """Lexer test for the ``null`` literal."""

    def test_null(self):
        tokens = utils.list_tokens("null")
        self.assertEqual(len(tokens), 1)
        only = tokens[0]
        self.assertEqual(only.type, "VALUE_NULL")
        self.assertIsNone(only.value)
class TestValueBool(unittest.TestCase):
    """Lexer tests for the ``true`` and ``false`` literals."""

    def test_true(self):
        tokens = utils.list_tokens("true")
        self.assertEqual(len(tokens), 1)
        only = tokens[0]
        self.assertEqual(only.type, "VALUE_BOOL")
        self.assertEqual(only.value, True)

    def test_false(self):
        tokens = utils.list_tokens("false")
        self.assertEqual(len(tokens), 1)
        only = tokens[0]
        self.assertEqual(only.type, "VALUE_BOOL")
        self.assertEqual(only.value, False)

View File

@ -0,0 +1,339 @@
import unittest
from . import utils
class TestValueStringSingle(unittest.TestCase):
    """Lexer tests for string literals in ``'...'`` and ``'''...'''`` form."""

    # Delimiters of the single-line and the multi-line string form.
    _SINGLE = "'"
    _MULTI = "'''"

    def _assert_string(self, code, expected):
        """Lex ``code`` and require a single VALUE_STRING token equal to ``expected``."""
        result = utils.list_tokens(code)
        self.assertEqual(len(result), 1)
        self.assertEqual(result[0].type, "VALUE_STRING")
        self.assertEqual(result[0].value, expected)

    def _assert_numeric_escapes(self, quote, prefix, base, width, codes, upper=False):
        """Check the numeric escape ``\\<prefix><digits>`` for every value in ``codes``.

        Args:
            quote: String delimiter placed around the escape.
            prefix: Text between the backslash and the digits ("" for octal).
            base: Radix of the digits, 8 or 16.
            width: Number of digits the escape consumes; shorter values are
                zero padded to this width.
            codes: Iterable of integer values to encode.
            upper: Write hexadecimal digits in upper case.
        """
        spec = "o" if base == 8 else "x"
        for code in codes:
            digits = format(code, spec).rjust(width, "0")
            if upper:
                digits = digits.upper()
            # Only the first ``width`` digits belong to the escape; a surplus
            # digit is ordinary text that follows the escaped character.
            expected = chr(int(digits[:width], base)) + digits[width:]
            self._assert_string("%s\\%s%s%s" % (quote, prefix, digits, quote), expected)

    @staticmethod
    def _scalar_values(limit):
        """Yield all code points below ``limit`` except the surrogates."""
        return (code for code in range(limit) if not 0xd800 <= code < 0xe000)

    def test_empty(self):
        self._assert_string(" '' ", '')
        self._assert_string(" '''''' ", '')

    def test_content(self):
        self._assert_string(" 'abc def\\t1234' ", 'abc def\t1234')
        self._assert_string(" '''abc def\\t1234''' ", 'abc def\t1234')

    def test_no_newline(self):
        with self.assertRaises(utils.LexError):
            utils.list_tokens(" 'a\nb' ")
        self._assert_string(" '''a\nb''' ", 'a\nb')

    def test_valid_octals(self):
        self._assert_numeric_escapes(self._SINGLE, "", 8, 3, range(256))

    def test_valid_octals_multiline(self):
        self._assert_numeric_escapes(self._MULTI, "", 8, 3, range(256))

    def test_valid_hex_lower(self):
        # Values from 0x100 up have three digits, which also covers plain
        # text directly after a two-digit escape.
        for prefix in ("x", "X"):
            self._assert_numeric_escapes(self._SINGLE, prefix, 16, 2, range(0o1000))

    def test_valid_hex_lower_multiline(self):
        for prefix in ("x", "X"):
            self._assert_numeric_escapes(self._MULTI, prefix, 16, 2, range(0o1000))

    def test_valid_hex_upper(self):
        for prefix in ("x", "X"):
            self._assert_numeric_escapes(self._SINGLE, prefix, 16, 2, range(256), upper=True)

    def test_valid_hex_upper_multiline(self):
        for prefix in ("x", "X"):
            self._assert_numeric_escapes(self._MULTI, prefix, 16, 2, range(256), upper=True)

    @utils.big_test
    def test_valid_small_unicode(self):
        # Lower case digits first, then upper case digits.
        for upper in (False, True):
            self._assert_numeric_escapes(self._SINGLE, "u", 16, 4, self._scalar_values(2 ** 16), upper=upper)

    @utils.big_test
    def test_valid_small_unicode_multiline(self):
        for upper in (False, True):
            self._assert_numeric_escapes(self._MULTI, "u", 16, 4, self._scalar_values(2 ** 16), upper=upper)

    @utils.big_test
    def test_valid_big_unicode(self):
        for upper in (False, True):
            self._assert_numeric_escapes(self._SINGLE, "U", 16, 8, self._scalar_values(0x00110000), upper=upper)

    @utils.big_test
    def test_valid_big_unicode_multiline(self):
        for upper in (False, True):
            self._assert_numeric_escapes(self._MULTI, "U", 16, 8, self._scalar_values(0x00110000), upper=upper)

    # Escape text (without the backslash) -> expected character.  Values
    # above 0o377 are spelled with \u because Python deprecates octal
    # escapes larger than that in its own string literals.
    _special_chars = {"\\": "\\",
                      "'": "'",
                      "a": "\x07",
                      "b": "\x08",
                      "f": "\x0c",
                      "n": "\n",
                      "r": "\r",
                      "t": "\t",
                      "v": "\v",
                      "000": "\000",
                      "100": "\100",
                      "200": "\200",
                      "300": "\300",
                      "400": "\u0100",
                      "500": "\u0140",
                      "600": "\u0180",
                      "700": "\u01c0",
                      "x00": "\x00",
                      "X00": "\x00",
                      "u0000": "\x00",
                      "U00000000": "\x00"}

    def test_special_values(self):
        for code, code_result in self._special_chars.items():
            self._assert_string(" '\\%s' " % code, code_result)
            self._assert_string(" '''\\%s''' " % code, code_result)

    def test_not_special_values(self):
        # Every ASCII character that does not start a known escape must be
        # rejected when it follows a backslash.
        escape_starts = {code[0] for code in self._special_chars}
        ascii_chars = {chr(value) for value in range(128)}
        for code in sorted(ascii_chars - escape_starts):
            with self.assertRaises(utils.LexError):
                utils.list_tokens(" '\\%s' " % (code,))
            with self.assertRaises(utils.LexError):
                utils.list_tokens(" '''\\%s''' " % code)

View File

@ -0,0 +1,17 @@
import os
import unittest
from looplang.lexer import gen_lexer, LexError
def list_tokens(code:str):
    """Lex ``code`` with a freshly generated lexer and return all tokens as a list."""
    token_source = gen_lexer()
    token_source.input(code)
    return [token for token in token_source]
# Big tests run only when the environment variable RUN_BIG_TESTS is set
# (to any value, even an empty one).
RUN_BIG_TESTS = os.environ.get("RUN_BIG_TESTS", None) is not None


def big_test(func):
    """Decorator marking ``func`` as a big test.

    Returns ``func`` unchanged when ``RUN_BIG_TESTS`` is true; otherwise
    returns a version that unittest reports as skipped, with a reason that
    names the environment variable enabling it.
    """
    if RUN_BIG_TESTS:
        return func
    return unittest.skip("big test; set the RUN_BIG_TESTS environment variable to run it")(func)

View File

@ -12,6 +12,6 @@ setup(name="looplang",
author="Marko Semet", author="Marko Semet",
author_email="marko@marko10-000.de", author_email="marko@marko10-000.de",
url="https://marko10-000.de/project/looplang", url="https://marko10-000.de/project/looplang",
packages=find_packages("looplang"), packages=find_packages(),
install_requires=["ply>=3.0<4"] install_requires=["ply>=3.0<4"]
) )

4
test.sh 100755
View File

@ -0,0 +1,4 @@
#! /usr/bin/env bash
# Run the looplang unit tests through venv.sh.
# Change into the directory of this script first; give up with cd's exit
# status when that fails.
cd "$(dirname "$0")" || exit
./venv.sh python3 -m unittest looplang.test

View File

@ -3,11 +3,11 @@
cd "$(dirname "$0")" cd "$(dirname "$0")"
python3 -m venv venv && python3 -m venv venv &&
source venv/bin/activate && source venv/bin/activate &&
pip install --upgrade pip && pip install pip &&
pip install --upgrade . && pip install . &&
if [ "$#" -gt 1 ] if [ "$#" -gt 1 ]
then then
exec "$@" exec -- "$@"
else else
exec "$SHELL" exec "$SHELL"
fi fi