From 5a4b41bd47d999619b0b51052ae99157ac491a01 Mon Sep 17 00:00:00 2001
From: SoniEx2
Date: Wed, 3 Apr 2019 17:08:29 -0300
Subject: Attempted lua tokenizer didn't work

Publishing anyway because someone might be able to learn from my failure
---
 luatokens.lua | 236 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 parser.lua    |  37 +++++++---
 test.lua      |  54 +++++++++++++-
 3 files changed, 316 insertions(+), 11 deletions(-)
 create mode 100644 luatokens.lua

diff --git a/luatokens.lua b/luatokens.lua
new file mode 100644
index 0000000..81cbc11
--- /dev/null
+++ b/luatokens.lua
@@ -0,0 +1,236 @@
+-- Lua tokens
+
+-- we need some stuff from here
+local parser = require "parser"
+local selfify = parser.selfify
+
+-- "dummies": unique marker values pushed into the token list
+local TK_STRING = {}
+
+local tokens = {}
+
+-- character classes; other states reach them through __index
+tokens.base = {
+    [" "] = "whitespace",
+    ["\n"] = "newline",
+    ["\r"] = "newline",
+    ["\v"] = "whitespace",
+    ["\t"] = "whitespace",
+    ["\f"] = "whitespace",
+    ["0"] = "digit",
+    ["1"] = "digit",
+    ["2"] = "digit",
+    ["3"] = "digit",
+    ["4"] = "digit",
+    ["5"] = "digit",
+    ["6"] = "digit",
+    ["7"] = "digit",
+    ["8"] = "digit",
+    ["9"] = "digit",
+    ["a"] = "hexdigit",
+    ["b"] = "hexdigit",
+    ["c"] = "hexdigit",
+    ["d"] = "hexdigit",
+    ["e"] = "hexdigit",
+    ["f"] = "hexdigit",
+    ["A"] = "hexdigit",
+    ["B"] = "hexdigit",
+    ["C"] = "hexdigit",
+    ["D"] = "hexdigit",
+    ["E"] = "hexdigit",
+    ["F"] = "hexdigit",
+    ["g"] = "alpha",
+    ["h"] = "alpha",
+    ["i"] = "alpha",
+    ["j"] = "alpha",
+    ["k"] = "alpha",
+    ["l"] = "alpha",
+    ["m"] = "alpha",
+    ["n"] = "alpha",
+    ["o"] = "alpha",
+    ["p"] = "alpha",
+    ["q"] = "alpha",
+    ["r"] = "alpha",
+    ["s"] = "alpha",
+    ["t"] = "alpha",
+    ["u"] = "alpha",
+    ["v"] = "alpha",
+    ["w"] = "alpha",
+    ["x"] = "alpha",
+    ["y"] = "alpha",
+    ["z"] = "alpha",
+    ["G"] = "alpha",
+    ["H"] = "alpha",
+    ["I"] = "alpha",
+    ["J"] = "alpha",
+    ["K"] = "alpha",
+    ["L"] = "alpha",
+    ["M"] = "alpha",
+    ["N"] = "alpha",
+    ["O"] = "alpha",
+    ["P"] = "alpha",
+    ["Q"] = "alpha",
+    ["R"] = "alpha",
+    ["S"] = "alpha",
+    ["T"] = "alpha",
+    ["U"] = "alpha",
+    ["V"] = "alpha",
+    ["W"] = "alpha",
+    ["X"] = "alpha",
+    ["Y"] = "alpha",
+    ["Z"] = "alpha",
+}
+
+-- hook: advances state.line on every "\n" or "\r" token
+local function linecount(state, token, rule)
+    if token == "\n" or token == "\r" then
+        state.line = (state.line or 1) + 1
+    end
+end
+
+do local tstring = selfify({}) -- state: inside a quoted string
+    tokens.string = tstring
+    tstring.tokens = tokens
+    do local tsescapes = setmetatable({ -- state: right after a backslash
+            ["'"] = "insertraw",
+            ['"'] = "insertraw",
+            ['\\'] = "insertraw",
+            ["a"] = "insertmap",
+            ["b"] = "insertmap",
+            ["f"] = "insertmap",
+            ["n"] = "insertmap",
+            ["r"] = "insertmap",
+            ["t"] = "insertmap",
+            ["v"] = "insertmap",
+            ["z"] = "skipwhitespace",
+            ["u"] = "unicode", -- NOTE: no "unicode" rule is defined in this file
+            ["x"] = "hex",
+            ["\n"] = setmetatable({["\r"] = setmetatable({}, {__index=tstring})}, {__index=tstring}),
+            ["\r"] = setmetatable({["\n"] = setmetatable({}, {__index=tstring})}, {__index=tstring}),
+            [1] = linecount,
+            [2] = print -- debug hook: prints (state, token, rule) for every escape token
+        }, {__index = tokens.base})
+        tokens.string.escapes = tsescapes
+        tsescapes.string = tokens.string
+
+        function tsescapes.insertraw(state, token)
+            state[#state+1] = token
+            return "string"
+        end
+
+        do
+            local map = { ["a"] = "\a", ["b"] = "\b", ["f"] = "\f", ["n"] = "\n", ["r"] = "\r", ["t"] = "\t", ["v"] = "\v" }
+            function tsescapes.insertmap(state, token)
+                state[#state+1] = map[token]
+                return "string"
+            end
+        end
+
+        -- decimal escape (\ddd): collects up to three digits in state.in_digit and
+        -- then appends the resulting byte. Also installed as the "" fallback of
+        -- digitc, so a non-digit token ends the escape early.
+        -- NOTE(review): that terminating token looks like it is dropped - confirm
+        function tsescapes.digit(state, token)
+            local digit = string.find("1234567890", token, 1, true) -- position % 10 is the digit's value
+            local num = state.in_digit
+            if digit then
+                num = (num or 0) * 10 + digit % 10
+                state.c = (state.c or 0) + 1
+                if state.c < 3 then
+                    state.in_digit = num
+                    return "digitc"
+                end
+            end
+            if num > 255 then
+                return nil -- not a byte: a nil state makes the parser report "unexpected token"
+            end
+            state[#state+1] = string.char(num)
+            state.in_digit = nil
+            state.c = nil
+            return "string"
+        end
+        tsescapes.digitc = setmetatable(selfify({[""] = tsescapes.digit, digitc = "self", string = tstring}), {__index=tstring})
+
+        tsescapes.hex = setmetatable(selfify({string = tokens.string, digit = "hexdigit"}), {__index=tokens.base}) -- tokens.base classifies 0-9 as "digit"; route them to hexdigit too
+        -- hex escape (\xXX): the first digit is parked in state.in_hex, the second
+        -- completes the byte and returns to the string state
+        function tsescapes.hex.hexdigit(state, token)
+            local digit = string.find("123456789ABCDEF0123456789abcdef0", token, 1, true) -- position % 16 is the digit's value
+            assert(digit, "this should never be called for non-hex-digits")
+            local num = state.in_hex
+            if num then
+                num = num * 16 + digit % 16
+                state[#state+1] = string.char(num)
+                state.in_hex = nil
+                return "string"
+            else
+                state.in_hex = digit % 16
+                return "self"
+            end
+        end
+
+        do local tseskipwhitespace = selfify({ -- state: after \z, swallows whitespace and newlines
+                string = tokens.string,
+                whitespace = "self",
+                [""] = "string",
+                [1] = parser.insert_fallback,
+                [2] = linecount,
+            })
+            local tbase = tokens.base
+            local tbasemap = {whitespace = "whitespace", newline = "whitespace"}
+            setmetatable(tseskipwhitespace, {__index = function(t, k) return tbasemap[tbase[k]] or tstring[k] end})
+            tsescapes.skipwhitespace = tseskipwhitespace
+        end
+    end
+
+    tstring['\\'] = "escapes"
+
+    tstring['"'] = "close"
+    tstring["'"] = "close"
+
+    tstring['\n'] = false
+    tstring['\r'] = false
+
+    tstring[""] = "self"
+
+    tstring[1] = parser.insert_fallback
+
+    -- quote inside a string: closes it only when it matches the opening quote,
+    -- joining everything collected after string_start into a single string
+    function tstring.close(state, token)
+        if state.in_string == token then
+            local i = state.string_start
+            state.in_string = nil
+            state.string_start = nil
+            state[i+1] = table.concat(state, '', i+1)
+            for j=i+2, #state do
+                state[j]=nil
+            end
+            return "tokens"
+        else
+            state[#state+1] = token
+            return "self"
+        end
+    end
+end
+
+tokens["'"] = "string_open"
+tokens['"'] = "string_open"
+
+setmetatable(tokens, {__index=tokens.base}) -- fall back to the shared character classes
+
+-- opening quote: pushes TK_STRING and records which quote closes the string
+function tokens.string_open(state, token)
+    if not state.in_string then
+        state[#state+1] = TK_STRING
+        state.in_string = token
+        state.string_start = #state
+        return "string"
+    end
+    error("this shouldn't happen") -- raise instead of silently returning nil
+end
+
+return {
+    tokens = tokens,
+    TK_STRING = TK_STRING,
+}
diff --git a/parser.lua b/parser.lua
index 479d80a..ece8a8f 100644
--- a/parser.lua
+++ b/parser.lua
@@ -31,27 +31,34 @@ local type, tostring
 local function get_next_common(state, in_pos, token)
     -- note: must preserve "token" - do not call recursively with a different token
     local transition
-    if state[STATE] ~= nil then
-        transition = state[STATE][token]
-        if not transition then
-            transition = state[STATE][""]
+    if state[STATE] then
+        local st = state[STATE]
+        local rule = st[token]
+        transition = rule
+        if transition == nil then
+            transition = st[""]
         end
         local recheck = true
         while recheck do
             recheck = false
             local tytrans = type(transition)
             if tytrans == "string" then
-                transition = state[STATE][transition]
+                transition = st[transition]
                 recheck = true
             elseif tytrans == "function" then
                 transition = transition(state, token)
                 recheck = true
             end
         end
-        state[STATE] = transition -- may be nil
+        for i, hook in ipairs(st) do -- hooks are the array part of the state table
+            if hook then -- allow overriding/disabling hooks
+                hook(state, token, rule)
+            end
+        end
+        state[STATE] = transition -- may be nil or false
     end
-    -- must NOT use elseif here - the above may set state to nil!
-    if state[STATE] == nil then
+    -- must NOT use elseif here - the above may set state to nil or false!
+    if not state[STATE] then
         -- unexpected token. stream consumer may attempt to recover,
         -- but we do this mostly to differentiate it from "end of stream" condition.
         return in_pos - 1, nil, "unexpected token", token, state
@@ -112,8 +119,22 @@ local function parse(defs, data)
     end
 end
 
+-- utility function that's quite common: lets the "self" transition resolve to t
+local function selfify(t)
+    t.self = t
+    return t
+end
+-- common hook: appends the token to state when the state had no rule for it
+local function insert_fallback(state, token, rule)
+    if not rule then
+        state[#state+1] = token
+    end
+end
+
 return {
     STATE = STATE,
     stream = stream,
     parse = parse,
+    selfify = selfify,
+    insert_fallback = insert_fallback,
 }
diff --git a/test.lua b/test.lua
index 606d60e..1290c97 100644
--- a/test.lua
+++ b/test.lua
@@ -18,6 +18,12 @@
 
 local parser = require "parser"
 
+local caseno = 0
+local function case()
+    caseno = caseno + 1
+    return caseno
+end
+
 do -- trim left spaces
     local defs = {}
     defs.self = defs
@@ -52,7 +58,49 @@ do -- trim left spaces
     end
     for k,v in ipairs({"hello", " hello", "\t \v \n\r hallo", "I really like this parser thingy if it can be called that"}) do
         local state, err = parser.parse(defs, v)
-        if not state then print(err) end
-        print(table.concat(state))
+        if not state then
+            print(case(), err)
+        else
+            print(case(), table.concat(state))
+        end
     end
-end
+end -- trim left spaces
+
+do -- lua tokens
+    local luatokens = require "luatokens"
+    local tokens = luatokens.tokens
+    local state, err, etoken, estate = parser.parse(tokens, [["hello world"]])
+    local case = case()
+    print(case, "---- IN TOKENS ----")
+    if not state then
+        print(case, err, etoken)
+        for i,v in pairs(estate) do
+            print(case, i, v)
+        end
+    else
+        for i,v in ipairs(state) do
+            print(case, i, v)
+        end
+    end
+    print(case, "---- OUT TOKENS ----")
+end -- lua tokens
+
+do -- more lua tokens
+    local luatokens = require "luatokens"
+    local tokens = luatokens.tokens
+    local state, err, etoken, estate = parser.parse(tokens, [["\a\b\f\n\r\t\v\\\"\'\z \x41\65\
+"]])
+    local case = case()
+    print(case, "---- IN TOKENS ----")
+    if not state then
+        print(case, err, etoken)
+        for i,v in pairs(estate) do
+            print(case, i, v)
+        end
+    else
+        for i,v in ipairs(state) do
+            print(case, i, v)
+        end
+    end
+    print(case, "---- OUT TOKENS ----")
+end -- lua tokens
-- 
cgit 1.4.1