author     SoniEx2 <endermoneymod@gmail.com>  2019-04-06 17:42:00 -0300
committer  SoniEx2 <endermoneymod@gmail.com>  2019-04-06 17:42:00 -0300
commit     0118cdcb80c8a299eb156ee669463b84fa44f51d (patch)
tree       06beff5b1c5a437ca14a14d9b9d7661614e7abb7
parent     1f1f6c0732ddb22c7d102a810b6c24724a5b6d3b (diff)
Fix \u
-rw-r--r--  luatokens.lua | 117
-rw-r--r--  parser.lua    |   8
-rw-r--r--  test.lua      |  87
3 files changed, 169 insertions(+), 43 deletions(-)
diff --git a/luatokens.lua b/luatokens.lua
index 28944c1..58a7d09 100644
--- a/luatokens.lua
+++ b/luatokens.lua
@@ -1,4 +1,4 @@
--- Lua tokens
+-- Lua defs
 
 -- we need some stuff from here
 local parser = require "parser"
@@ -8,11 +8,27 @@ local COLLECT = parser.COLLECT
 local collect_fallback = parser.collect_fallback
 
 -- "dummies"
-local TK_STRING = {}
+-- see http://www.lua.org/source/5.3/llex.h.html#RESERVED
+local TK_AND, TK_BREAK,
+    TK_DO, TK_ELSE, TK_ELSEIF, TK_END, TK_FALSE, TK_FOR, TK_FUNCTION,
+    TK_GOTO, TK_IF, TK_IN, TK_LOCAL, TK_NIL, TK_NOT, TK_OR, TK_REPEAT,
+    TK_RETURN, TK_THEN, TK_TRUE, TK_UNTIL, TK_WHILE,
+    TK_IDIV, TK_CONCAT, TK_DOTS, TK_EQ, TK_GE, TK_LE, TK_NE,
+    TK_SHL, TK_SHR,
+    TK_DBCOLON, TK_EOS,
+    TK_FLT, TK_INT, TK_NAME, TK_STRING =
+    {}, {},
+    {}, {}, {}, {}, {}, {}, {},
+    {}, {}, {}, {}, {}, {}, {}, {},
+    {}, {}, {}, {}, {},
+    {}, {}, {}, {}, {}, {}, {},
+    {}, {},
+    {}, {},
+    {}, {}, {}, {}
 
-local tokens = {}
+local defs = {}
 
-tokens.base = {
+defs.base = {
     [" "] = "whitespace",
     ["\n"] = "newline",
     ["\r"] = "newline",
@@ -84,14 +100,15 @@ tokens.base = {
 }
 
 local function linecount(state, token, rule)
+    -- TODO fix: a "\r\n" pair is currently counted as two newlines
     if token == "\n" or token == "\r" then
         state.line = (state.line or 1) + 1
     end
 end
 
 do local tstring = selfify({})
-    tokens.string = tstring
-    tstring.tokens = tokens
+    defs.string = tstring
+    tstring.defs = defs
     do local tsescapes = setmetatable({
             ["'"] = "insertraw",
             ['"'] = "insertraw",
@@ -110,9 +127,9 @@ do local tstring = selfify({})
             ["\r"] = setmetatable({["\n"] = setmetatable({}, {__index=tstring})}, {__index=tstring}),
             [1] = linecount,
             [2] = function(state, token, rule) if token == "\r" or token == "\n" then collect_fallback(state, "\n") end end,
-        }, {__index = tokens.base})
-        tokens.string.escapes = tsescapes
-        tsescapes.string = tokens.string
+        }, {__index = defs.base})
+        defs.string.escapes = tsescapes
+        tsescapes.string = defs.string
 
         function tsescapes.insertraw(state, token)
             collect_fallback(state, token)
@@ -158,7 +175,7 @@ do local tstring = selfify({})
             end
         end
 
-        tsescapes.hex = setmetatable(selfify({string = tokens.string, digit = "hexdigit"}), {__index=tokens.base})
+        tsescapes.hex = setmetatable(selfify({string = defs.string, digit = "hexdigit"}), {__index=defs.base})
         function tsescapes.hex.hexdigit(state, token)
             local digit = string.find("123456789ABCDEF0123456789abcdef0", token, 1, true)
             assert(digit, "this should never be called for non-hex-digits")
@@ -174,14 +191,60 @@ do local tstring = selfify({})
             end
         end
 
+        do local tseunicode = {}
+            tseunicode["{"] = "hex"
+            do local tseuhex = setmetatable(selfify({digit = "hexdigit", string=tstring}), {__index=defs.base})
+                tseunicode.hex = tseuhex
+                function tseuhex.hexdigit(state, token)
+                    local digit = string.find("123456789ABCDEF0123456789abcdef0", token, 1, true)
+                    assert(digit, "this should never be called for non-hex-digits")
+                    state.in_hex = (state.in_hex or 0) * 16 + digit % 16
+                    if state.in_hex <= 2147483647 then
+                        return "self"
+                    end
+                end
+                tseuhex["}"] = function(state, token)
+                    local num = state.in_hex
+                    state.in_hex = nil
+                    if num < 128 then
+                        collect_fallback(state, string.char(num))
+                        return "string"
+                    end
+                    local bytes = ""
+                    while num > 63 do
+                        local v = num % 64
+                        bytes = string.char(128 + v) .. bytes -- yeah, I know, not the most efficient
+                        num = (num - v) / 64
+                    end
+                    if num >= 2^6/(2^#bytes) then
+                        local v = num % 64
+                        bytes = string.char(128 + v) .. bytes
+                        num = (num - v) / 64
+                    end
+                    do
+                        local v = 0
+                        for i=1,#bytes do
+                            v = v + 128 / 2^i
+                        end
+                        v = v + num
+                        assert(v < 126)
+                        bytes = string.char(128 + v) .. bytes
+                    end
+                    collect_fallback(state, bytes)
+                    return "string"
+                end
+            end
+            tsescapes.unicode = tseunicode
+        end
+
         do local tseskipwhitespace = selfify({
-                string = tokens.string,
+                string = defs.string,
                 whitespace = "self",
                 [""] = "string",
                 [1] = collect_fallback,
                 [2] = linecount,
             })
-            local tbase = tokens.base
+            local tbase = defs.base
             local tbasemap = {whitespace = "whitespace", newline = "whitespace"}
             setmetatable(tseskipwhitespace, {__index = function(t, k) return tbasemap[tbase[k]] or tstring[k] end})
 tsescapes.skipwhitespace = tseskipwhitespace
@@ -205,7 +268,7 @@ do local tstring = selfify({})
             state.in_string = nil
             state[#state+1] = table.concat(state[COLLECT])
             state[COLLECT] = nil
-            return "tokens"
+            return "defs"
         else
             collect_fallback(state, token)
             return "self"
@@ -213,13 +276,18 @@ do local tstring = selfify({})
     end
 end
 
-tokens["'"] = "string_open"
-tokens['"'] = "string_open"
-tokens[1] = linecount
+do local tlongstring = {}
+    -- TODO
+end
+
+defs["'"] = "string_open"
+defs['"'] = "string_open"
+defs["["] = "maybe_longstring"
+defs[1] = linecount
 
-setmetatable(tokens, {__index=whitespace})
+setmetatable(defs, {__index=whitespace})
 
-function tokens.string_open(state, token)
+function defs.string_open(state, token)
     if not state.in_string then
         state[#state+1] = TK_STRING
         state[COLLECT] = {}
@@ -230,6 +298,15 @@ function tokens.string_open(state, token)
 end
 
 return {
-    tokens = tokens,
-    TK_STRING = TK_STRING,
+    defs = defs,
+    tokens = {
+        TK_AND = TK_AND, TK_BREAK = TK_BREAK,
+        TK_DO = TK_DO, TK_ELSE = TK_ELSE, TK_ELSEIF = TK_ELSEIF, TK_END = TK_END, TK_FALSE = TK_FALSE, TK_FOR = TK_FOR, TK_FUNCTION = TK_FUNCTION,
+        TK_GOTO = TK_GOTO, TK_IF = TK_IF, TK_IN = TK_IN, TK_LOCAL = TK_LOCAL, TK_NIL = TK_NIL, TK_NOT = TK_NOT, TK_OR = TK_OR, TK_REPEAT = TK_REPEAT,
+        TK_RETURN = TK_RETURN, TK_THEN = TK_THEN, TK_TRUE = TK_TRUE, TK_UNTIL = TK_UNTIL, TK_WHILE = TK_WHILE,
+        TK_IDIV = TK_IDIV, TK_CONCAT = TK_CONCAT, TK_DOTS = TK_DOTS, TK_EQ = TK_EQ, TK_GE = TK_GE, TK_LE = TK_LE, TK_NE = TK_NE,
+        TK_SHL = TK_SHL, TK_SHR = TK_SHR,
+        TK_DBCOLON = TK_DBCOLON, TK_EOS = TK_EOS,
+        TK_FLT = TK_FLT, TK_INT = TK_INT, TK_NAME = TK_NAME, TK_STRING = TK_STRING
+    },
 }
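
Two details of the luatokens.lua changes are worth unpacking. First, the hexdigit handlers lean on string.find("123456789ABCDEF0123456789abcdef0", token, 1, true): the 1-based match position, taken mod 16, is the digit's value ('1'..'9' land at positions 1..9, 'A'..'F' at 10..15, 'a'..'f' at 26..31 which reduce to 10..15, and '0' at 16 and 32, both of which reduce to 0). Second, the "}" handler is the actual \u fix: it serializes the accumulated codepoint as UTF-8, including the pre-2003 extended forms (up to six bytes, for codepoints through 0x7FFFFFFF) that Lua 5.3's \u{...} escape accepts. Pulled out as a standalone function, the same logic reads as follows; encode_utf8 is a hypothetical name, and this is a sketch of the handler minus the collect_fallback plumbing, not code from the repository:

    local function encode_utf8(num)
        if num < 128 then                 -- ASCII: a single byte
            return string.char(num)
        end
        local bytes = ""
        while num > 63 do                 -- peel off 6-bit payloads, low bits first
            local v = num % 64
            bytes = string.char(128 + v) .. bytes
            num = (num - v) / 64
        end
        -- each continuation byte costs the lead byte one payload bit; if the
        -- remainder no longer fits in the lead byte, emit one more continuation
        if num >= 2^6/(2^#bytes) then
            local v = num % 64
            bytes = string.char(128 + v) .. bytes
            num = (num - v) / 64
        end
        local v = 0
        for i = 1, #bytes do              -- lead byte: one marker bit per
            v = v + 128 / 2^i             -- continuation byte, then the payload
        end
        return string.char(128 + v + num) .. bytes
    end

    assert(encode_utf8(0x41) == "A")
    assert(encode_utf8(0x80) == "\194\128")
    assert(encode_utf8(0x7FFFFFFF) == "\253\191\191\191\191\191")

Using modular arithmetic rather than Lua 5.3's bitwise operators keeps the logic runnable on any Lua version.
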
diff --git a/parser.lua b/parser.lua
index beb944e..0cd2853 100644
--- a/parser.lua
+++ b/parser.lua
@@ -72,7 +72,7 @@ local function get_next_common(state, in_pos, token)
 end
 
 local function get_next_table(state, in_pos)
-    if state[DATA] == nil or #state[DATA] == 0 then return in_pos, state end
+    if state[DATA] == nil or #state[DATA] == 0 then return in_pos, state end -- TODO end-of-stream handling
     in_pos = in_pos + 1
     local token = state[DATA][in_pos - state[OFFDATA]]
     if token == nil then
@@ -84,10 +84,10 @@ local function get_next_table(state, in_pos)
 end
 
 local function get_next_string(state, in_pos)
-    if state[DATA] == nil or #state[DATA] == 0 then return in_pos, state end
+    if state[DATA] == nil or #state[DATA] == 0 then return in_pos, state end -- TODO end-of-stream handling
     in_pos = in_pos + 1
-    local token = optimize_lookups[string.byte(state[DATA], in_pos - state[OFFDATA], in_pos - state[OFFDATA])] or ""
-    if token == "" then
+    local token = optimize_lookups[string.byte(state[DATA], in_pos - state[OFFDATA], in_pos - state[OFFDATA])]
+    if token == nil then
         state[OFFDATA] = in_pos - 1
         state[DATA] = state[GEN]()
         return get_next_string(state, state[OFFDATA])
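
The parser.lua change fixes a sentinel clash rather than a cosmetic nit: the defs tables legitimately use the empty string as a key (see the [""] = "string" entry in tseskipwhitespace above), so using "" to mean "no byte available" could collide with a real transition. With the `or ""` dropped, token is nil exactly when string.byte reads past the current chunk (indexing a table with a nil key is legal on reads in Lua), and the function falls through to pull the next chunk from state[GEN]. For context, optimize_lookups itself is not shown in this diff; presumably it is a precomputed array interning the 256 one-byte strings so the hot loop can index instead of calling string.char per byte. A hypothetical reconstruction:

    -- assumption: optimize_lookups maps every byte value to an interned
    -- one-character string; reads past the end of the chunk index as nil
    local optimize_lookups = {}
    for i = 0, 255 do
        optimize_lookups[i] = string.char(i)
    end
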
diff --git a/test.lua b/test.lua
index 076d1e8..283b566 100644
--- a/test.lua
+++ b/test.lua
@@ -56,57 +56,106 @@ do -- trim left spaces
         end
         return "self"
     end
-    for k,v in ipairs({"hello", "    hello", "\t \v \n\r hallo", "I really like this parser thingy if it can be called that"}) do
+    for k,v in ipairs({"hello", "    hello", "\t \v \n\r hello"}) do
         local state, err = parser.parse(defs, v)
+        local case = case()
         if not state then
-            print(case(), err)
+            print(case, err)
         else
-            print(case(), table.concat(state))
+            assert(table.concat(state) == "hello")
         end
     end
 end -- trim left spaces
 
 do -- lua tokens
     local luatokens = require "luatokens"
-    local tokens = luatokens.tokens
+    local tokens = luatokens.defs
     local state, err, etoken, estate = parser.parse(tokens, [["hello world"]])
     local case = case()
-    print(case, "---- IN  TOKENS ----")
     if not state then
+        print(case, "---- IN  TOKENS ----")
         print(case, err, etoken)
         for i,v in pairs(estate) do
             print(case, i, v)
         end
+        print(case, "---- OUT TOKENS ----")
     else
-        for i,v in ipairs(state) do
-            print(case, i, v)
-        end
+        assert(state[1] == luatokens.tokens.TK_STRING)
+        assert(state[2] == "hello world")
     end
-    print(case, "---- OUT TOKENS ----")
 end -- lua tokens
 
 do -- more lua tokens
     local luatokens = require "luatokens"
-    local tokens = luatokens.tokens
+    local tokens = luatokens.defs
     local state, err, etoken, estate = parser.parse(tokens, [["\a\b\f\n\r\t\v\\\"\'\z        \x41\65\
 "]])
     local case = case()
-    print(case, "---- IN  TOKENS ----")
     if not state then
+        print(case, "---- IN  TOKENS ----")
         print(case, err, etoken)
         for i,v in pairs(estate) do
             print(case, i, v)
         end
+        print(case, "---- OUT TOKENS ----")
     else
-        for i,v in ipairs(state) do
+        assert(state[1] == luatokens.tokens.TK_STRING)
+        assert(state[2] == "\7\8\12\10\13\9\11\92\34\39\65\65\10")
+    end
+end -- more lua tokens
+
+do -- even more lua tokens
+    local luatokens = require "luatokens"
+    local tokens = luatokens.defs
+    local state, err, etoken, estate = parser.parse(tokens, [["\u{000000000000000000000000000000000000000000000000000000000000041}"]])
+    local case = case()
+    if not state then
+        print(case, "---- IN  TOKENS ----")
+        print(case, err, etoken)
+        for i,v in pairs(estate) do
             print(case, i, v)
-            if v == luatokens.TK_STRING then
-                in_string = true
-            elseif in_string then
-                print(case, v:gsub(".", function(v) return "\\"..string.byte(v) end))
-                in_string = false
-            end
         end
+        print(case, "---- OUT TOKENS ----")
+    else
+        assert(state[1] == luatokens.tokens.TK_STRING)
+        assert(state[2] == "A")
+    end
+end -- even more lua tokens
+
+do -- even more lua tokens
+    local luatokens = require "luatokens"
+    local tokens = luatokens.defs
+    local state, err, etoken, estate = parser.parse(tokens, [["\u{7F}""\u{80}""\u{7FF}""\u{800}""\u{FFFF}""\u{10000}""\u{1FFFFF}""\u{200000}""\u{3FFFFFF}""\u{4000000}""\u{7FFFFFFF}"]])
+    local case = case()
+    if not state then
+        print(case, "---- IN  TOKENS ----")
+        print(case, err, etoken)
+        for i,v in pairs(estate) do
+            print(case, i, v)
+        end
+        print(case, "---- OUT TOKENS ----")
+    else
+        assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+        assert(table.remove(state, 1) == "\127")
+        assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+        assert(table.remove(state, 1) == "\194\128")
+        assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+        assert(table.remove(state, 1) == "\223\191")
+        assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+        assert(table.remove(state, 1) == "\224\160\128")
+        assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+        assert(table.remove(state, 1) == "\239\191\191")
+        assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+        assert(table.remove(state, 1) == "\240\144\128\128")
+        assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+        assert(table.remove(state, 1) == "\247\191\191\191")
+        assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+        assert(table.remove(state, 1) == "\248\136\128\128\128")
+        assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+        assert(table.remove(state, 1) == "\251\191\191\191\191")
+        assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+        assert(table.remove(state, 1) == "\252\132\128\128\128\128")
+        assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+        assert(table.remove(state, 1) == "\253\191\191\191\191\191")
     end
-    print(case, "---- OUT TOKENS ----")
 end -- lua tokens
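
The last test case probes both sides of every sequence-length boundary in extended UTF-8: 0x7F/0x80 (one byte to two), 0x7FF/0x800 (two to three), 0xFFFF/0x10000 (three to four), 0x1FFFFF/0x200000 (four to five), 0x3FFFFFF/0x4000000 (five to six), and the 0x7FFFFFFF ceiling. As a worked example, take \u{800}: its low six bits are all zero (continuation byte 0x80), the next six are 100000 (continuation byte 0xA0), and nothing remains for the lead byte's payload, so the lead is the bare three-byte marker 0xE0, giving the "\224\160\128" the test expects. The expectations also cross-check against the encode_utf8 sketch given after the luatokens.lua diff above:

    -- boundary cross-check, assuming the encode_utf8 sketch above
    local boundaries = {
        {0x7F,       "\127"},                      -- last 1-byte codepoint
        {0x80,       "\194\128"},                  -- first 2-byte codepoint
        {0x7FF,      "\223\191"},                  -- last 2-byte codepoint
        {0x800,      "\224\160\128"},              -- first 3-byte codepoint
        {0xFFFF,     "\239\191\191"},              -- last 3-byte codepoint
        {0x10000,    "\240\144\128\128"},          -- first 4-byte codepoint
        {0x7FFFFFFF, "\253\191\191\191\191\191"},  -- 6-byte ceiling
    }
    for _, b in ipairs(boundaries) do
        assert(encode_utf8(b[1]) == b[2])
    end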