# A Boneless Datastructure Language
# Copyright (C) 2019 Soni L.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""A Boneless Datastructure Language, version 2.0.0.
ABDL expressions are regex-like constructs for matching and validating object structures. They can be used
with JSON and similar formats, and even self-referential data structures.
Language Reference:
ABDL expressions have the ability to iterate, index, validate and filter data structures. This is
done with the use of the syntax elements listed below.
Syntax Elements:
An ABDL expression is a sequence of zero or more sequences starting with arrows followed by zero or
more subvalues.
An arrow is ``->`` and indicates indexing/iteration (Mappings, Sequences, Sets). It must be followed
by a variable, literal, parameter, regex or key match.
A variable is a string of alphanumeric characters, not starting with a digit. It may be followed by a
literal, parameter, regex, key match, or one or more type tests. A ``(key, value)`` tuple containing
the corresponding matched element will be identified by this name in the results dict.
A literal is a string delimited by single quotes (use ``%'`` to escape ``'`` and ``%%`` to escape ``%``).
A literal can be made "non-validating" by appending an ``?`` after it. It may be followed by one or more
type tests. It is exactly equivalent to indexing an object with a string key.
A parameter is the symbol ``$`` followed by a string of alphanumeric characters, not starting with
a digit. A parameter can be made "non-validating" by appending an ``?`` after it. It may be followed by
one or more type tests. It is exactly equivalent to indexing an object with an arbitrary object key.
A regex is an RE, as defined by the ``re`` module, delimited by forward slashes (use ``%/`` to escape
``/`` and ``%%`` to escape ``%``). A regex can be made "non-validating" by appending an ``?`` after it.
It may be followed by one or more type tests. It attempts to match each key in the object.
A type test is ``:`` followed by a parameter. A type test can be made "non-validating" by appending
an ``?`` after the ``:``. It attempts to match the type of each matched value in the object.
A key match is an ABDL expression enclosed in ``[`` and ``]``, optionally prefixed with one or more type
tests. This matches keys (including the type tests).
A subvalue is an ABDL expression enclosed in ``(`` and ``)``. This allows matching multiple values on
the same object.
Some syntax elements can be validating or non-validating. Validating syntax elements will raise a
:py:exc:`abdl.ValidationError` whenever a non-matching element is encountered, whereas non-validating
ones will skip them. Note that it is possible for a validating syntax element to still yield results
before raising a :py:exc:`abdl.ValidationError`, so one needs to be careful when writing code where such
behaviour could result in a security vulnerability.
Examples:
>>> import abdl
>>> for m in abdl.match("->X:?$dict->Y", {"foo": 1, "bar": {"baz": 2}}, {'dict': dict}):
... print(m['X'][0], m['Y'][0], m['Y'][1])
bar baz 2
>>> pat = abdl.compile('''-> 'projects'?
... -> commit /[0-9a-fA-F]{40}|[0-9a-fA-F]{64}/? :?$dict
... -> url :?$dict
... -> branch :?$dict''', {'dict': dict})
>>> data = {"projects": {
... "385e734a52e13949a7a5c71827f6de920dbfea43": {
... "https://soniex2.autistic.space/git-repos/ganarchy.git": {"HEAD": {"active": True}}
... }
... }}
>>> for m in pat.match(data):
... print(m['commit'][0], m['url'][0], m['branch'][0], m['branch'][1])
385e734a52e13949a7a5c71827f6de920dbfea43 https://soniex2.autistic.space/git-repos/ganarchy.git HEAD {'active': True}
(If ``:?$dict`` wasn't present, a TypeError would be raised when trying to iterate the ``1`` from ``"foo": 1``.)
"""
import re
from collections.abc import Mapping, Sequence, Iterator, Set
class DeprecationError(Exception):
"""Raised for deprecated features, if they are disabled.
This class controls warning/error behaviour of deprecated features."""
#enable_key_match_compat = False
#warn_key_match_compat = False
@classmethod
def warn_all(cls):
"""Enables all deprecation warnings."""
pass
class PatternError(Exception):
"""Raised for invalid input or output expressions."""
# TODO implement formatting
def __init__(self, msg, pattern, defs, pos, toks):
self.msg = msg
self.pattern = pattern
self.defs = defs
self.pos = pos
self._toks = toks # internal use
def _normalize(self, pattern, defs):
if pattern is not None:
if self.pattern is not None:
raise ValueError("Attempt to normalize normalized pattern")
else:
self.pattern = pattern
if defs is not None:
if self.defs is not None:
raise ValueError("Attempt to normalize normalized defs")
else:
self.defs = defs
@classmethod
def _str_escape(cls, s, pos, toks):
raise cls("Error in string escape", None, None, pos, toks)
@classmethod
def _str_end(cls, s, pos, toks):
raise cls("Unfinished string", None, None, pos, toks)
@classmethod
def _re_escape(cls, s, pos, toks):
raise cls("Error in regex escape", None, None, pos, toks)
@classmethod
def _re_end(cls, s, pos, toks):
raise cls("Unfinished regex", None, None, pos, toks)
@classmethod
def _unexpected_tok(cls, s, pos, toks):
raise cls("Unexpected token", None, None, pos, toks)
class ValidationError(Exception):
"""Raised when the object tree doesn't validate against the given pattern."""
# FIXME TODO?
class _PatternElement:
def on_not_in_key(self, frame, path, defs):
raise NotImplementedError
def on_in_key(self, frame, path, defs):
raise NotImplementedError
def collect_params(self, res: list):
pass
class _Arrow(_PatternElement):
def on_not_in_key(self, frame, path, defs):
assert not path[-1].empty
path.append(_Holder(key=None, value=None, name=None, parent=path[-1].value, empty=True))
return False
class _StringKey(_PatternElement):
def __init__(self, toks):
self.key = toks[0]
self.skippable = toks[1] == '?'
def on_in_key(self, frame, path, defs):
return self.on_not_in_key(frame, path, defs)
def on_not_in_key(self, frame, path, defs):
path[-1].iterator = self.extract(path[-1].parent)
path[-1].empty = False
return True
def extract(self, obj):
try:
yield (self.key, obj[self.key])
except (TypeError, IndexError, KeyError):
if not self.skippable:
raise ValidationError
class _RegexKey(_PatternElement):
def __init__(self, toks):
self.key = toks[0]
self.compiled = re.compile(self.key)
self.skippable = toks[1] == '?'
def on_in_key(self, frame, path, defs):
return self.on_not_in_key(frame, path, defs)
def on_not_in_key(self, frame, path, defs):
filtered_iterator = self.filter(path[-1].iterator)
del path[-1].iterator
path[-1].iterator = filtered_iterator
del filtered_iterator
path[-1].empty = False
return True
def filter(self, it):
for el in it:
try:
if self.compiled.search(el[0]):
yield el
elif not self.skippable:
raise ValidationError
except TypeError:
if not self.skippable:
raise ValidationError
class _KeySubtree(_PatternElement):
def __init__(self, toks):
self.key = toks[0]
self.skippable = toks[1] == '?'
def on_not_in_key(self, frame, path, defs):
path[-1].subtree = True
filtered_iterator = self.filter(path[-1].iterator, defs)
del path[-1].iterator
path[-1].iterator = filtered_iterator
del filtered_iterator
path[-1].empty = False
return True
def filter(self, it, defs):
for x in it:
for y in _match_helper(self.key, defs, x[0]):
yield (y, x[1])
def collect_params(self, res: list):
for sub in self.key:
sub.collect_params(res)
class _ValueSubtree(_PatternElement):
def __init__(self, toks):
self.key = toks[0]
self.skippable = toks[1] == '?'
def on_not_in_key(self, frame, path, defs):
assert not path[-1].empty
path.append(_Holder(key=None, value=None, name=None, parent=path[-1].value, empty=False, subtree=True))
path[-1].iterator = self.filter(path[-1].parent, defs)
return True
def filter(self, parent, defs):
for x in _match_helper(self.key, defs, parent):
yield (x, parent)
def collect_params(self, res: list):
for sub in self.key:
sub.collect_params(res)
class _Ident(_PatternElement):
def __init__(self, toks):
self.key = toks[0]
def on_not_in_key(self, frame, path, defs):
path[-1].name = self.key
path[-1].empty = False
return True
class _Param(_PatternElement):
def __init__(self, toks):
assert isinstance(toks[1], _Ident)
self.skippable = toks[0] == '?'
self.key = toks[1].key
def on_in_key(self, frame, path, defs):
return self.on_not_in_key(frame, path, defs)
def on_not_in_key(self, frame, path, defs):
path[-1].iterator = self.extract(path[-1].parent, defs[self.key])
path[-1].empty = False
return True
def extract(self, obj, key):
try:
yield (key, obj[key])
except (TypeError, IndexError, KeyError):
if not self.skippable:
raise ValidationError
def collect_params(self, res: list):
res.append(self.key)
def get_value(self, defs):
return defs[self.key]
class _Ty(_PatternElement):
def __init__(self, toks):
assert isinstance(toks[1], _Ident)
self.skippable = toks[0] == '?'
self.key = toks[1].key
def on_in_key(self, frame, path, defs):
filtered_iterator = self.filter(path[-1].iterator, defs[self.key])
del path[-1].iterator
path[-1].iterator = filtered_iterator
del filtered_iterator
path[-1].empty = False
return True
def on_not_in_key(self, frame, path, defs):
assert len(path) == 1
if isinstance(path[-1].value, defs[self.key]):
return False
elif not self.skippable:
raise ValidationError
path.clear()
return False
def filter(self, it, ty):
for el in it:
# this may TypeError if ty is not a type nor a tuple of types
# but that's actually the programmer's error
if isinstance(el[1], ty):
yield el
elif not self.skippable:
# and this one is for actual validation
raise ValidationError
def collect_params(self, res: list):
res.append(self.key)
class _End(_PatternElement):
def on_in_key(self, frame, path, defs):
try:
path[-1].next()
return False
except StopIteration:
path.pop()
while frame.prev() and not isinstance(frame.current_op, _End):
pass
if not frame.prev():
# FIXME?
path.clear()
return True # FIXME?
def _build_syntax():
from pyparsing import Suppress, Literal, Forward, CharsNotIn, StringEnd, Combine, Optional, Group, Word, srange, Empty
# original regex order: arrow, type/parameter/identifier, string, regex, failure
# better syntax: "arrow" changes from "value" to "key" and thus you need at least one key match before an arrow
subtree = Forward()
# where relevant, enforces match behaviour (skippable object tree branch vs required object tree branch)
skippable = Optional("?", default="")
# r"|'(?:%'|%%|%(?P<EES>.|$)|[^%'])*?(?:'|(?P<ES>$))\??" # string literals
str_literal = (Combine(Suppress("'")
+ (Suppress("%") + "'" | Suppress("%") + "%" | Literal("%") + (CharsNotIn("") | StringEnd()).setParseAction(PatternError._str_escape) | CharsNotIn("%'"))[...]
+ (Suppress("'") | StringEnd().setParseAction(PatternError._str_end))) + skippable).setParseAction(lambda toks: [_StringKey(toks)])
# r"|/(?:%/|%%|%(?P<EER>.|$)|[^%/])*?(?:/|(?P<ER>$))\??" # regex
re_literal = (Combine(Suppress("/")
+ (Suppress("%") + "/" | Suppress("%") + "%" | Literal("%") + (CharsNotIn("") | StringEnd()).setParseAction(PatternError._re_escape) | CharsNotIn("%/"))[...]
+ (Suppress("/") | StringEnd().setParseAction(PatternError._re_end))) + skippable).setParseAction(lambda toks: [_RegexKey(toks)])
arrow = Literal("->").setParseAction(lambda: [_Arrow()])
# r"|(?::\??)?\$?[A-Za-z][A-Za-z0-9]*" # identifiers, parameters and type matches
identifier = Word(srange("[A-Za-z_]"), srange("[A-Za-z0-9_]")).setParseAction(lambda toks: [_Ident(toks)])
parameter = (Suppress("$") + skippable + identifier).setParseAction(lambda toks: [_Param(toks)])
ty = (Suppress(":") + skippable + Suppress("$") + identifier).setParseAction(lambda toks: [_Ty(toks)])
# support for objects-as-keys
keysubtree = (Suppress("[") + Group(ty[...] + subtree) + (Suppress("]") | CharsNotIn("").setParseAction(PatternError._unexpected_tok) | StringEnd().setParseAction(PatternError._unexpected_tok)) + Optional("?", default="")).setParseAction(lambda toks: [_KeySubtree(toks)])
# represents key matching - switches from "key" to "value"
tag = (identifier + Optional(parameter | str_literal | re_literal | keysubtree) | parameter | str_literal | re_literal | keysubtree) + ty[...] + Empty().setParseAction(lambda: [_End()])
# multiple value matching
valuesubtree = (Suppress("(") + Group(subtree) + (Suppress(")") | CharsNotIn("").setParseAction(PatternError._unexpected_tok) | StringEnd().setParseAction(PatternError._unexpected_tok)) + Optional("?", default="")).setParseAction(lambda toks: [_ValueSubtree(toks)])
# arrow and tag, value subtree
subtree <<= (arrow + tag)[...] + (valuesubtree + Empty().setParseAction(lambda: [_End()]))[...]
return ((subtree | CharsNotIn("").setParseAction(PatternError._unexpected_tok)) + StringEnd()).parseWithTabs()
_built_syntax = _build_syntax()
def _pairs(o):
if isinstance(o, Mapping):
return iter(o.items())
elif isinstance(o, Sequence):
return iter(enumerate(o, 0))
elif isinstance(o, Set):
return iter(((e, e) for e in o))
else:
# maybe there's more stuff I can implement later
raise TypeError
class _Holder:
def __init__(self, key, value, name, parent=None, it=None, empty=False, subtree=False):
self.name = name
self.key = key
self.value = value
self.empty = empty
self._it = it
self.parent = parent
self.subtree = subtree
@property
def iterator(self):
if self._it is None:
self._it = _pairs(self.parent)
return self._it
@iterator.setter
def iterator(self, value):
assert self._it is None
self._it = value
@iterator.deleter
def iterator(self):
self._it = None
def next(self):
self.key, self.value = next(self.iterator)
class _Frame:
def __init__(self, ops):
self.ops = ops
self.pc = -1
def next(self):
pc = self.pc + 1
if pc >= len(self.ops):
return False
self.pc = pc
return True
@property
def current_op(self):
return self.ops[self.pc]
def prev(self):
pc = self.pc - 1
if pc < 0:
return False
self.pc = pc
return True
def _match_helper(ops, defs, tree):
frame = _Frame(ops)
path = [_Holder(key=None, value=tree, parent=None, it=iter(()), name=None)]
in_key = False
while path:
if not frame.next():
assert not path[-1].empty
res = {}
for h in path:
if h.subtree:
for name, kv in h.key.items():
res[name] = kv
elif h.name is not None:
res[h.name] = (h.key, h.value)
yield res
assert len(path) == 1 or isinstance(frame.current_op, _End)
frame.prev()
in_key = True
else:
op = frame.current_op
if in_key:
in_key = op.on_in_key(frame, path, defs)
else:
in_key = op.on_not_in_key(frame, path, defs)
class Pattern:
"""A compiled pattern.
"""
def __init__(self, pattern, defs):
try:
self._ops = _built_syntax.parseString(pattern)
except PatternError as e:
e._normalize(pattern, defs)
raise
else:
self._params = []
for op in self._ops:
op.collect_params(self._params)
self._defs = {param: defs[param] for param in self._params}
def match(self, obj):
"""Matches this compiled pattern against the given object.
Args:
obj: The object to match against.
Returns:
An iterator. This iterator yields ``(key, value)`` pairs
wrapped in a dict for each variable in the pattern.
"""
return _match_helper(self._ops, self._defs, obj)
def compile(pattern, defs={}):
"""Compiles the pattern and returns a compiled :py:class:`abdl.Pattern` object.
Args:
pattern (str): The pattern. Refer to module-level documentation for
pattern syntax.
defs (dict): The parameter list. Used by parameters in the pattern.
Returns:
Pattern: A compiled pattern object.
"""
# TODO caching
return Pattern(pattern, defs)
def match(pattern, obj, defs={}):
"""Matches the pattern against the given obj.
This method is equivalent to ``abdl.compile(pattern, defs).match(obj)``.
Args:
pattern (str): The pattern. Refer to module-level documentation for
pattern syntax.
obj: The object to match against.
defs (dict): The parameter list. Used by parameters in the pattern.
Returns:
An iterator. This iterator yields ``(key, value)`` pairs
wrapped in a dict for each variable in the pattern.
"""
return compile(pattern, defs).match(obj)