From 0118cdcb80c8a299eb156ee669463b84fa44f51d Mon Sep 17 00:00:00 2001 From: SoniEx2 Date: Sat, 6 Apr 2019 17:42:00 -0300 Subject: Fix \u --- luatokens.lua | 117 ++++++++++++++++++++++++++++++++++++++++++++++++---------- parser.lua | 8 ++-- test.lua | 87 +++++++++++++++++++++++++++++++++---------- 3 files changed, 169 insertions(+), 43 deletions(-) diff --git a/luatokens.lua b/luatokens.lua index 28944c1..58a7d09 100644 --- a/luatokens.lua +++ b/luatokens.lua @@ -1,4 +1,4 @@ --- Lua tokens +-- Lua defs -- we need some stuff from here local parser = require "parser" @@ -8,11 +8,27 @@ local COLLECT = parser.COLLECT local collect_fallback = parser.collect_fallback -- "dummies" -local TK_STRING = {} +-- see http://www.lua.org/source/5.3/llex.h.html#RESERVED +local TK_AND, TK_BREAK, + TK_DO, TK_ELSE, TK_ELSEIF, TK_END, TK_FALSE, TK_FOR, TK_FUNCTION, + TK_GOTO, TK_IF, TK_IN, TK_LOCAL, TK_NIL, TK_NOT, TK_OR, TK_REPEAT, + TK_RETURN, TK_THEN, TK_TRUE, TK_UNTIL, TK_WHILE, + TK_IDIV, TK_CONCAT, TK_DOTS, TK_EQ, TK_GE, TK_LE, TK_NE, + TK_SHL, TK_SHR, + TK_DBCOLON, TK_EOS, + TK_FLT, TK_INT, TK_NAME, TK_STRING = + {}, {}, + {}, {}, {}, {}, {}, {}, {}, + {}, {}, {}, {}, {}, {}, {}, {}, + {}, {}, {}, {}, {}, + {}, {}, {}, {}, {}, {}, {}, + {}, {}, + {}, {}, + {}, {}, {}, {} -local tokens = {} +local defs = {} -tokens.base = { +defs.base = { [" "] = "whitespace", ["\n"] = "newline", ["\r"] = "newline", @@ -84,14 +100,15 @@ tokens.base = { } local function linecount(state, token, rule) + -- TODO fix if token == "\n" or token == "\r" then state.line = (state.line or 1) + 1 end end do local tstring = selfify({}) - tokens.string = tstring - tstring.tokens = tokens + defs.string = tstring + tstring.defs = defs do local tsescapes = setmetatable({ ["'"] = "insertraw", ['"'] = "insertraw", @@ -110,9 +127,9 @@ do local tstring = selfify({}) ["\r"] = setmetatable({["\n"] = setmetatable({}, {__index=tstring})}, {__index=tstring}), [1] = linecount, [2] = function(state, token, rule) if token == "\r" or token == "\n" then collect_fallback(state, "\n") end end, - }, {__index = tokens.base}) - tokens.string.escapes = tsescapes - tsescapes.string = tokens.string + }, {__index = defs.base}) + defs.string.escapes = tsescapes + tsescapes.string = defs.string function tsescapes.insertraw(state, token) collect_fallback(state, token) @@ -158,7 +175,7 @@ do local tstring = selfify({}) end end - tsescapes.hex = setmetatable(selfify({string = tokens.string, digit = "hexdigit"}), {__index=tokens.base}) + tsescapes.hex = setmetatable(selfify({string = defs.string, digit = "hexdigit"}), {__index=defs.base}) function tsescapes.hex.hexdigit(state, token) local digit = string.find("123456789ABCDEF0123456789abcdef0", token, 1, true) assert(digit, "this should never be called for non-hex-digits") @@ -174,14 +191,60 @@ do local tstring = selfify({}) end end + do local tseunicode = {} + tseunicode["{"] = "hex" + do local tseuhex = setmetatable(selfify({digit = "hexdigit", string=tstring}), {__index=defs.base}) + tseunicode.hex = tseuhex + function tseuhex.hexdigit(state, token) + local digit = string.find("123456789ABCDEF0123456789abcdef0", token, 1, true) + assert(digit, "this should never be called for non-hex-digits") + state.in_hex = (state.in_hex or 0) * 16 + digit % 16 + if state.in_hex <= 2147483647 then + return "self" + end + end + tseuhex["}"] = function(state, token) + local num = state.in_hex + state.in_hex = nil + if num < 128 then + collect_fallback(state, string.char(num)) + return "string" + end + local bytes = "" + while num > 63 do + local v = num % 64 + bytes = string.char(128 + v) .. bytes -- yeah ik, not the most efficient + num = (num - v) / 64 + end + if num >= 2^6/(2^#bytes) then + local v = num % 64 + bytes = string.char(128 + v) .. bytes + num = (num - v) / 64 + end + do + local v = 0 + for i=1,#bytes do + v = v + 128 / 2^i + end + v = v + num + assert(v < 126) + bytes = string.char(128 + v) .. bytes + end + collect_fallback(state, bytes) + return "string" + end + end + tsescapes.unicode = tseunicode + end + do local tseskipwhitespace = selfify({ - string = tokens.string, + string = defs.string, whitespace = "self", [""] = "string", [1] = collect_fallback, [2] = linecount, }) - local tbase = tokens.base + local tbase = defs.base local tbasemap = {whitespace = "whitespace", newline = "whitespace"} setmetatable(tseskipwhitespace, {__index = function(t, k) return tbasemap[tbase[k]] or tstring[k] end}) tsescapes.skipwhitespace = tseskipwhitespace @@ -205,7 +268,7 @@ do local tstring = selfify({}) state.in_string = nil state[#state+1] = table.concat(state[COLLECT]) state[COLLECT] = nil - return "tokens" + return "defs" else collect_fallback(state, token) return "self" @@ -213,13 +276,18 @@ do local tstring = selfify({}) end end -tokens["'"] = "string_open" -tokens['"'] = "string_open" -tokens[1] = linecount +do local tlongstring = {} + -- TODO +end + +defs["'"] = "string_open" +defs['"'] = "string_open" +defs["["] = "maybe_longstring" +defs[1] = linecount -setmetatable(tokens, {__index=whitespace}) +setmetatable(defs, {__index=whitespace}) -function tokens.string_open(state, token) +function defs.string_open(state, token) if not state.in_string then state[#state+1] = TK_STRING state[COLLECT] = {} @@ -230,6 +298,15 @@ function tokens.string_open(state, token) end return { - tokens = tokens, - TK_STRING = TK_STRING, + defs = defs, + tokens = { + TK_AND = TK_AND, TK_BREAK = TK_BREAK, + TK_DO = TK_DO, TK_ELSE = TK_ELSE, TK_ELSEIF = TK_ELSEIF, TK_END = TK_END, TK_FALSE = TK_FALSE, TK_FOR = TK_FOR, TK_FUNCTION = TK_FUNCTION, + TK_GOTO = TK_GOTO, TK_IF = TK_IF, TK_IN = TK_IN, TK_LOCAL = TK_LOCAL, TK_NIL = TK_NIL, TK_NOT = TK_NOT, TK_OR = TK_OR, TK_REPEAT = TK_REPEAT, + TK_RETURN = TK_RETURN, TK_THEN = TK_THEN, TK_TRUE = TK_TRUE, TK_UNTIL = TK_UNTIL, TK_WHILE = TK_WHILE, + TK_IDIV = TK_IDIV, TK_CONCAT = TK_CONCAT, TK_DOTS = TK_DOTS, TK_EQ = TK_EQ, TK_GE = TK_GE, TK_LE = TK_LE, TK_NE = TK_NE, + TK_SHL = TK_SHL, TK_SHR = TK_SHR, + TK_DBCOLON = TK_DBCOLON, TK_EOS = TK_EOS, + TK_FLT = TK_FLT, TK_INT = TK_INT, TK_NAME = TK_NAME, TK_STRING = TK_STRING + }, } diff --git a/parser.lua b/parser.lua index beb944e..0cd2853 100644 --- a/parser.lua +++ b/parser.lua @@ -72,7 +72,7 @@ local function get_next_common(state, in_pos, token) end local function get_next_table(state, in_pos) - if state[DATA] == nil or #state[DATA] == 0 then return in_pos, state end + if state[DATA] == nil or #state[DATA] == 0 then return in_pos, state end -- TODO end-of-stream handling in_pos = in_pos + 1 local token = state[DATA][in_pos - state[OFFDATA]] if token == nil then @@ -84,10 +84,10 @@ local function get_next_table(state, in_pos) end local function get_next_string(state, in_pos) - if state[DATA] == nil or #state[DATA] == 0 then return in_pos, state end + if state[DATA] == nil or #state[DATA] == 0 then return in_pos, state end -- TODO end-of-stream handling in_pos = in_pos + 1 - local token = optimize_lookups[string.byte(state[DATA], in_pos - state[OFFDATA], in_pos - state[OFFDATA])] or "" - if token == "" then + local token = optimize_lookups[string.byte(state[DATA], in_pos - state[OFFDATA], in_pos - state[OFFDATA])] + if token == nil then state[OFFDATA] = in_pos - 1 state[DATA] = state[GEN]() return get_next_string(state, state[OFFDATA]) diff --git a/test.lua b/test.lua index 076d1e8..283b566 100644 --- a/test.lua +++ b/test.lua @@ -56,57 +56,106 @@ do -- trim left spaces end return "self" end - for k,v in ipairs({"hello", " hello", "\t \v \n\r hallo", "I really like this parser thingy if it can be called that"}) do + for k,v in ipairs({"hello", " hello", "\t \v \n\r hello"}) do local state, err = parser.parse(defs, v) + local case = case() if not state then - print(case(), err) + print(case, err) else - print(case(), table.concat(state)) + assert(table.concat(state) == "hello") end end end -- trim left spaces do -- lua tokens local luatokens = require "luatokens" - local tokens = luatokens.tokens + local tokens = luatokens.defs local state, err, etoken, estate = parser.parse(tokens, [["hello world"]]) local case = case() - print(case, "---- IN TOKENS ----") if not state then + print(case, "---- IN TOKENS ----") print(case, err, etoken) for i,v in pairs(estate) do print(case, i, v) end + print(case, "---- OUT TOKENS ----") else - for i,v in ipairs(state) do - print(case, i, v) - end + assert(state[1] == luatokens.tokens.TK_STRING) + assert(state[2] == "hello world") end - print(case, "---- OUT TOKENS ----") end -- lua tokens do -- more lua tokens local luatokens = require "luatokens" - local tokens = luatokens.tokens + local tokens = luatokens.defs local state, err, etoken, estate = parser.parse(tokens, [["\a\b\f\n\r\t\v\\\"\'\z \x41\65\ "]]) local case = case() - print(case, "---- IN TOKENS ----") if not state then + print(case, "---- IN TOKENS ----") print(case, err, etoken) for i,v in pairs(estate) do print(case, i, v) end + print(case, "---- OUT TOKENS ----") else - for i,v in ipairs(state) do + assert(state[1] == luatokens.tokens.TK_STRING) + assert(state[2] == "\7\8\12\10\13\9\11\92\34\39\65\65\10") + end +end -- lua tokens + +do -- even more lua tokens + local luatokens = require "luatokens" + local tokens = luatokens.defs + local state, err, etoken, estate = parser.parse(tokens, [["\u{000000000000000000000000000000000000000000000000000000000000041}"]]) + local case = case() + if not state then + print(case, "---- IN TOKENS ----") + print(case, err, etoken) + for i,v in pairs(estate) do print(case, i, v) - if v == luatokens.TK_STRING then - in_string = true - elseif in_string then - print(case, v:gsub(".", function(v) return "\\"..string.byte(v) end)) - in_string = false - end end + print(case, "---- OUT TOKENS ----") + else + assert(state[1] == luatokens.tokens.TK_STRING) + assert(state[2] == "A") + end +end -- lua tokens + +do -- even more lua tokens + local luatokens = require "luatokens" + local tokens = luatokens.defs + local state, err, etoken, estate = parser.parse(tokens, [["\u{7F}""\u{80}""\u{7FF}""\u{800}""\u{FFFF}""\u{10000}""\u{1FFFFF}""\u{200000}""\u{3FFFFFF}""\u{4000000}""\u{7FFFFFFF}"]]) + local case = case() + if not state then + print(case, "---- IN TOKENS ----") + print(case, err, etoken) + for i,v in pairs(estate) do + print(case, i, v) + end + print(case, "---- OUT TOKENS ----") + else + assert(table.remove(state, 1) == luatokens.tokens.TK_STRING) + assert(table.remove(state, 1) == "\127") + assert(table.remove(state, 1) == luatokens.tokens.TK_STRING) + assert(table.remove(state, 1) == "\194\128") + assert(table.remove(state, 1) == luatokens.tokens.TK_STRING) + assert(table.remove(state, 1) == "\223\191") + assert(table.remove(state, 1) == luatokens.tokens.TK_STRING) + assert(table.remove(state, 1) == "\224\160\128") + assert(table.remove(state, 1) == luatokens.tokens.TK_STRING) + assert(table.remove(state, 1) == "\239\191\191") + assert(table.remove(state, 1) == luatokens.tokens.TK_STRING) + assert(table.remove(state, 1) == "\240\144\128\128") + assert(table.remove(state, 1) == luatokens.tokens.TK_STRING) + assert(table.remove(state, 1) == "\247\191\191\191") + assert(table.remove(state, 1) == luatokens.tokens.TK_STRING) + assert(table.remove(state, 1) == "\248\136\128\128\128") + assert(table.remove(state, 1) == luatokens.tokens.TK_STRING) + assert(table.remove(state, 1) == "\251\191\191\191\191") + assert(table.remove(state, 1) == luatokens.tokens.TK_STRING) + assert(table.remove(state, 1) == "\252\132\128\128\128\128") + assert(table.remove(state, 1) == luatokens.tokens.TK_STRING) + assert(table.remove(state, 1) == "\253\191\191\191\191\191") end - print(case, "---- OUT TOKENS ----") end -- lua tokens -- cgit 1.4.1