author    SoniEx2 <endermoneymod@gmail.com>    2019-04-03 17:08:29 -0300
committer SoniEx2 <endermoneymod@gmail.com>    2019-04-03 17:08:29 -0300
commit    5a4b41bd47d999619b0b51052ae99157ac491a01 (patch)
tree      c40faf4b4bcba14ef879b985206bed34d61a2dde
parent    d03d77d28b812244be66763356f24659da769f05 (diff)
Attempted Lua tokenizer; it didn't work

Publishing it anyway because someone might be able to learn from my failure.
-rw-r--r--    luatokens.lua    225
-rw-r--r--    parser.lua        36
-rw-r--r--    test.lua          54
3 files changed, 304 insertions(+), 11 deletions(-)
diff --git a/luatokens.lua b/luatokens.lua
new file mode 100644
index 0000000..81cbc11
--- /dev/null
+++ b/luatokens.lua
@@ -0,0 +1,225 @@
+-- Lua tokens
+
+-- we need selfify (and, later, parser.insert_fallback) from the parser module
+local parser = require "parser"
+local selfify = parser.selfify
+
+-- "dummies"
+local TK_STRING = {}
+
+local tokens = {}
+
+tokens.base = {
+    [" "] = "whitespace",
+    ["\n"] = "newline",
+    ["\r"] = "newline",
+    ["\v"] = "whitespace",
+    ["\t"] = "whitespace",
+    ["\f"] = "whitespace",
+    ["0"] = "digit",
+    ["1"] = "digit",
+    ["2"] = "digit",
+    ["3"] = "digit",
+    ["4"] = "digit",
+    ["5"] = "digit",
+    ["6"] = "digit",
+    ["7"] = "digit",
+    ["8"] = "digit",
+    ["9"] = "digit",
+    ["a"] = "hexdigit",
+    ["b"] = "hexdigit",
+    ["c"] = "hexdigit",
+    ["d"] = "hexdigit",
+    ["e"] = "hexdigit",
+    ["f"] = "hexdigit",
+    ["A"] = "hexdigit",
+    ["B"] = "hexdigit",
+    ["C"] = "hexdigit",
+    ["D"] = "hexdigit",
+    ["E"] = "hexdigit",
+    ["F"] = "hexdigit",
+    ["g"] = "alpha",
+    ["h"] = "alpha",
+    ["i"] = "alpha",
+    ["j"] = "alpha",
+    ["k"] = "alpha",
+    ["l"] = "alpha",
+    ["m"] = "alpha",
+    ["n"] = "alpha",
+    ["o"] = "alpha",
+    ["p"] = "alpha",
+    ["q"] = "alpha",
+    ["r"] = "alpha",
+    ["s"] = "alpha",
+    ["t"] = "alpha",
+    ["u"] = "alpha",
+    ["v"] = "alpha",
+    ["w"] = "alpha",
+    ["x"] = "alpha",
+    ["y"] = "alpha",
+    ["z"] = "alpha",
+    ["G"] = "alpha",
+    ["H"] = "alpha",
+    ["I"] = "alpha",
+    ["J"] = "alpha",
+    ["K"] = "alpha",
+    ["L"] = "alpha",
+    ["M"] = "alpha",
+    ["N"] = "alpha",
+    ["O"] = "alpha",
+    ["P"] = "alpha",
+    ["Q"] = "alpha",
+    ["R"] = "alpha",
+    ["S"] = "alpha",
+    ["T"] = "alpha",
+    ["U"] = "alpha",
+    ["V"] = "alpha",
+    ["W"] = "alpha",
+    ["X"] = "alpha",
+    ["Y"] = "alpha",
+    ["Z"] = "alpha",
+}
+
+local function linecount(state, token, rule)
+    if token == "\n" or token == "\r" then
+        state.line = (state.line or 1) + 1
+    end
+end
+
+do local tstring = selfify({})
+    tokens.string = tstring
+    tstring.tokens = tokens
+    do local tsescapes = setmetatable({
+            ["'"] = "insertraw",
+            ['"'] = "insertraw",
+            ['\\'] = "insertraw",
+            ["a"] = "insertmap",
+            ["b"] = "insertmap",
+            ["f"] = "insertmap",
+            ["n"] = "insertmap",
+            ["r"] = "insertmap",
+            ["t"] = "insertmap",
+            ["v"] = "insertmap",
+            ["z"] = "skipwhitespace",
+            ["u"] = "unicode",
+            ["x"] = "hex",
+            ["\n"] = setmetatable({["\r"] = setmetatable({}, {__index=tstring})}, {__index=tstring}),
+            ["\r"] = setmetatable({["\n"] = setmetatable({}, {__index=tstring})}, {__index=tstring}),
+            [1] = linecount,
+            [2] = print, -- leftover debug hook; prints every transition
+        }, {__index = tokens.base})
+        tokens.string.escapes = tsescapes
+        tsescapes.string = tokens.string
+
+        function tsescapes.insertraw(state, token)
+            state[#state+1] = token
+            return "string"
+        end
+
+        do
+            local map = { ["a"] = "\a", ["b"] = "\b", ["f"] = "\f", ["n"] = "\n", ["r"] = "\r", ["t"] = "\t", ["v"] = "\v" }
+            function tsescapes.insertmap(state, token)
+                state[#state+1] = map[token]
+                return "string"
+            end
+        end
+
+        function tsescapes.digit(state, token)
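+            -- plain find in "1234567890": the position is the digit's
+            -- value, except "0" lands at position 10; `% 10` folds it to 0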
+            local digit = string.find("1234567890", token, 1, true)
+            local num = state.in_digit
+            if digit then
+                num = (num or 0) * 10 + digit % 10
+                state.c = (state.c or 0) + 1
+                if state.c < 3 then
+                    state.in_digit = num
+                    return "digitc"
+                end
+            end
+            if num > 255 then
+                return nil
+            end
+            state[#state+1] = string.char(num)
+            state.in_digit = nil
+            state.c = nil
+            return "string"
+        end
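+        -- digitc: state for the 2nd/3rd digit of a \ddd escape; tokens
+        -- with no named rule (including further digits) are routed back
+        -- through digit() via the "" catch-all, which flushes the byte
+        -- once a non-digit arrives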
+        tsescapes.digitc = setmetatable(selfify({[""] = tsescapes.digit, digitc = "self", string = tstring}), {__index=tstring})
+
+        tsescapes.hex = setmetatable(selfify({string = tokens.string}), {__index=tokens.base})
+        function tsescapes.hex.hexdigit(state, token)
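+            -- same trick in hex: the first match gives 1..15 for
+            -- 1-9/A-F/a-f ("a" is at 26, and 26 % 16 == 10), while "0"
+            -- sits at position 16, so `% 16` folds it to 0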
+            local digit = string.find("123456789ABCDEF0123456789abcdef0", token, 1, true)
+            assert(digit, "this should never be called for non-hex-digits")
+            local num = state.in_hex
+            if num then
+                num = num * 16 + digit % 16
+                state[#state+1] = string.char(num)
+                state.in_hex = nil
+                return "string"
+            else
+                state.in_hex = digit % 16
+                return "self"
+            end
+        end
+
+        do local tseskipwhitespace = selfify({
+                string = tokens.string,
+                whitespace = "self",
+                [""] = "string",
+                [1] = parser.insert_fallback,
+                [2] = linecount,
+            })
+            local tbase = tokens.base
+            local tbasemap = {whitespace = "whitespace", newline = "whitespace"}
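+            -- classify through tokens.base so spaces and newlines both
+            -- count as whitespace for \z; anything else defers to tstring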
+            setmetatable(tseskipwhitespace, {__index = function(t, k) return tbasemap[tbase[k]] or tstring[k] end})
+            tsescapes.skipwhitespace = tseskipwhitespace
+        end
+    end
+
+    tstring['\\'] = "escapes"
+
+    tstring['"'] = "close"
+    tstring["'"] = "close"
+
+    tstring['\n'] = false
+    tstring['\r'] = false
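+    -- false, unlike nil, skips the "" catch-all: a raw newline is a
+    -- hard error inside a short string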
+
+    tstring[""] = "self"
+
+    tstring[1] = parser.insert_fallback
+
+    function tstring.close(state, token)
+        if state.in_string == token then
+            local i = state.string_start
+            state.in_string = nil
+            state.string_start = nil
+            state[i+1] = table.concat(state, '', i+1)
+            for j=i+2, #state do
+                state[j]=nil
+            end
+            return "tokens"
+        else
+            state[#state+1] = token
+            return "self"
+        end
+    end
+end
+
+tokens["'"] = "string_open"
+tokens['"'] = "string_open"
+
+-- BUG in the original: `whitespace` was an undefined global (nil), so this
+-- __index did nothing; tokens.base is presumably what was meant
+setmetatable(tokens, {__index=tokens.base})
+
+function tokens.string_open(state, token)
+    if not state.in_string then
+        state[#state+1] = TK_STRING
+        state.in_string = token
+        state.string_start = #state
+        return "string"
+    end
+    assert("this shouldn't happen")
+end
+
+return {
+    tokens = tokens,
+    TK_STRING = TK_STRING,
+}
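
How this was meant to be driven (a sketch mirroring test.lua further down;
all names come from this commit, and since the tokenizer didn't work, the
result is aspirational):

    local parser = require "parser"
    local luatokens = require "luatokens"
    -- parse returns the final state table, or nil plus error details
    local state, err, etoken = parser.parse(luatokens.tokens, [["hi"]])
    -- on success, state would hold luatokens.TK_STRING followed by "hi"
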
diff --git a/parser.lua b/parser.lua
index 479d80a..ece8a8f 100644
--- a/parser.lua
+++ b/parser.lua
@@ -31,27 +31,34 @@ local type, tostring
 local function get_next_common(state, in_pos, token)
     -- note: must preserve "token" - do not call recursively with a different token
     local transition
-    if state[STATE] ~= nil then
-        transition = state[STATE][token]
-        if not transition then
-            transition = state[STATE][""]
+    if state[STATE] then
+        local st = state[STATE]
+        local rule = st[token]
+        transition = rule
+        if transition == nil then
+            transition = st[""]
         end
         local recheck = true
         while recheck do
             recheck = false
             local tytrans = type(transition)
             if tytrans == "string" then
-                transition = state[STATE][transition]
+                transition = st[transition]
                 recheck = true
             elseif tytrans == "function" then
                 transition = transition(state, token)
                 recheck = true
             end
         end
-        state[STATE] = transition -- may be nil
+        for i, hook in ipairs(st) do
+            if hook then -- allow overriding/disabling hooks
+                hook(state, token, rule)
+            end
+        end
+        state[STATE] = transition -- may be nil or false
     end
-    -- must NOT use elseif here - the above may set state to nil!
-    if state[STATE] == nil then
+    -- must NOT use elseif here - the above may set state to nil or false!
+    if not state[STATE] then
         -- unexpected token. stream consumer may attempt to recover,
         -- but we do this mostly to differentiate it from "end of stream" condition.
         return in_pos - 1, nil, "unexpected token", token, state
@@ -112,8 +119,21 @@ local function parse(defs, data)
     end
 end
 
+-- utility: let a state table name itself via the "self" rule
+local function selfify(t)
+    t.self = t
+    return t
+end
+-- common hook: buffer any token that matched no named rule
+local function insert_fallback(state, token, rule)
+    if not rule then
+        state[#state+1] = token
+    end
+end
+
 return {
     STATE = STATE,
     stream = stream,
     parse = parse,
+    selfify = selfify,
+    insert_fallback = insert_fallback, -- luatokens.lua relies on this export
 }
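
The two new helpers in action (a toy sketch, assuming the "" catch-all and
the numeric hook slots behave as get_next_common above implements them):

    local parser = require "parser"
    local defs = parser.selfify({
        [" "] = "self",               -- named rule: skip spaces
        [""] = "self",                -- catch-all: stay in this state
        [1] = parser.insert_fallback, -- hook: buffer tokens with no named rule
    })
    local state = parser.parse(defs, "a b c")
    if state then print(table.concat(state)) end -- expected: abc
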
diff --git a/test.lua b/test.lua
index 606d60e..1290c97 100644
--- a/test.lua
+++ b/test.lua
@@ -18,6 +18,12 @@
 
 local parser = require "parser"
 
+local caseno = 0
+local function case()
+    caseno = caseno + 1
+    return caseno
+end
+
 do -- trim left spaces
     local defs = {}
     defs.self = defs
@@ -52,7 +58,49 @@ do -- trim left spaces
     end
     for k,v in ipairs({"hello", "    hello", "\t \v \n\r hallo", "I really like this parser thingy if it can be called that"}) do
         local state, err = parser.parse(defs, v)
-        if not state then print(err) end
-        print(table.concat(state))
+        if not state then
+            print(case(), err)
+        else
+            print(case(), table.concat(state))
+        end
     end
-end
+end -- trim left spaces
+
+do -- lua tokens
+    local luatokens = require "luatokens"
+    local tokens = luatokens.tokens
+    local state, err, etoken, estate = parser.parse(tokens, [["hello world"]])
+    local case = case()
+    print(case, "---- IN  TOKENS ----")
+    if not state then
+        print(case, err, etoken)
+        for i,v in pairs(estate) do
+            print(case, i, v)
+        end
+    else
+        for i,v in ipairs(state) do
+            print(case, i, v)
+        end
+    end
+    print(case, "---- OUT TOKENS ----")
+end -- lua tokens
+
+do -- more lua tokens
+    local luatokens = require "luatokens"
+    local tokens = luatokens.tokens
+    local state, err, etoken, estate = parser.parse(tokens, [["\a\b\f\n\r\t\v\\\"\'\z        \x41\65\
+"]])
+    local case = case()
+    print(case, "---- IN  TOKENS ----")
+    if not state then
+        print(case, err, etoken)
+        for i,v in pairs(estate) do
+            print(case, i, v)
+        end
+    else
+        for i,v in ipairs(state) do
+            print(case, i, v)
+        end
+    end
+    print(case, "---- OUT TOKENS ----")
+end -- more lua tokens
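
(For the record: if the escape handling worked, Lua's own rules decode that
second test input to "\a\b\f\n\r\t\v\\\"'AA\n"; \z swallows the run of
spaces, and \x41 and \65 both yield "A".)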