# A Boneless Datastructure Language
# Copyright (C) 2019  Soni L.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

"""A Boneless Datastructure Language, version 2.0.1.

ABDL expressions are regex-like constructs for matching and validating object
structures. They can be used with JSON and similar formats, and even
self-referential data structures.

Language Reference:

    ABDL expressions have the ability to iterate, index, validate and filter
    data structures. This is done with the use of the syntax elements listed
    below.

    Syntax Elements:

        An ABDL expression is a sequence of zero or more sequences starting
        with arrows followed by zero or more subvalues.

        An arrow is ``->`` and indicates indexing/iteration (Mappings,
        Sequences, Sets). It must be followed by a variable, literal,
        parameter, regex or key match.

        A variable is a string of alphanumeric characters, not starting with
        a digit. It may be followed by a literal, parameter, regex, key match,
        or one or more type tests. A ``(key, value)`` tuple containing the
        corresponding matched element will be identified by this name in the
        results dict.

        A literal is a string delimited by single quotes (use ``%'`` to escape
        ``'`` and ``%%`` to escape ``%``). A literal can be made
        "non-validating" by appending an ``?`` after it. It may be followed by
        one or more type tests. It is exactly equivalent to indexing an object
        with a string key.

        A parameter is the symbol ``$`` followed by a string of alphanumeric
        characters, not starting with a digit. A parameter can be made
        "non-validating" by appending an ``?`` after it. It may be followed by
        one or more type tests. It is exactly equivalent to indexing an object
        with an arbitrary object key.

        A regex is an RE, as defined by the ``re`` module, delimited by
        forward slashes (use ``%/`` to escape ``/`` and ``%%`` to escape
        ``%``). A regex can be made "non-validating" by appending an ``?``
        after it. It may be followed by one or more type tests. It attempts to
        match each key in the object.

        A type test is ``:`` followed by a parameter. A type test can be made
        "non-validating" by appending an ``?`` after the ``:``. It attempts to
        match the type of each matched value in the object.

        A key match is an ABDL expression enclosed in ``[`` and ``]``,
        optionally prefixed with one or more type tests. This matches keys
        (including the type tests).

        A subvalue is an ABDL expression enclosed in ``(`` and ``)``. This
        allows matching multiple values on the same object.

        Some syntax elements can be validating or non-validating. Validating
        syntax elements will raise a :py:exc:`abdl.ValidationError` whenever a
        non-matching element is encountered, whereas non-validating ones will
        skip them. Note that it is possible for a validating syntax element to
        still yield results before raising a :py:exc:`abdl.ValidationError`,
        so one needs to be careful when writing code where such behaviour
        could result in a security vulnerability.

    Examples:

        >>> import abdl
        >>> for m in abdl.match("->X:?$dict->Y", {"foo": 1, "bar": {"baz": 2}}, {'dict': dict}):
        ...     print(m['X'][0], m['Y'][0], m['Y'][1])
        bar baz 2

        >>> pat = abdl.compile('''-> 'projects'?
        ...                          -> commit /[0-9a-fA-F]{40}|[0-9a-fA-F]{64}/? :?$dict
        ...                             -> url :?$dict
        ...                                -> branch :?$dict''', {'dict': dict})
        >>> data = {"projects": {
        ...     "385e734a52e13949a7a5c71827f6de920dbfea43": {
        ...         "https://soniex2.autistic.space/git-repos/ganarchy.git": {"HEAD": {"active": True}}
        ...     }
        ... }}
        >>> for m in pat.match(data):
        ...     print(m['commit'][0], m['url'][0], m['branch'][0], m['branch'][1])
        385e734a52e13949a7a5c71827f6de920dbfea43 https://soniex2.autistic.space/git-repos/ganarchy.git HEAD {'active': True}

        (If ``:?$dict`` wasn't present, a TypeError would be raised when
        trying to iterate the ``1`` from ``"foo": 1``.)
"""

import re

from collections.abc import Mapping, Sequence, Iterator, Set


class DeprecationError(Exception):
    """Raised for deprecated features, if they are disabled.

    This class controls warning/error behaviour of deprecated features."""
    #enable_key_match_compat = False
    #warn_key_match_compat = False

    @classmethod
    def warn_all(cls):
        """Enables all deprecation warnings."""
        # Currently a no-op: no deprecation switches are active (see the
        # commented-out flags above).
        pass


class PatternError(Exception):
    """Raised for invalid input or output expressions.

    Attributes:
        msg: Short description of the problem.
        pattern: The offending pattern, or None until ``_normalize`` fills
            it in.
        defs: The parameter dict given with the pattern, or None until
            ``_normalize`` fills it in.
        pos: Position in the pattern, as reported by the parser.
    """
    # TODO implement formatting

    def __init__(self, msg, pattern, defs, pos, toks):
        self.msg = msg
        self.pattern = pattern
        self.defs = defs
        self.pos = pos
        self._toks = toks  # internal use

    def _normalize(self, pattern, defs):
        """Fills in ``pattern`` and ``defs`` after the fact.

        The parse-action constructors below have no access to the defs, so
        they create the error with ``None`` placeholders; ``Pattern`` then
        calls this to attach the real values.

        Raises:
            ValueError: If a value is given for a field that is already set.
        """
        if pattern is not None:
            if self.pattern is not None:
                raise ValueError("Attempt to normalize normalized pattern")
            else:
                self.pattern = pattern
        if defs is not None:
            if self.defs is not None:
                raise ValueError("Attempt to normalize normalized defs")
            else:
                self.defs = defs

    # The classmethods below are installed as pyparsing parse actions (see
    # _build_syntax); each one unconditionally raises.

    @classmethod
    def _str_escape(cls, s, pos, toks):
        """Parse action: raises for a bad ``%`` escape in a string literal."""
        raise cls("Error in string escape", None, None, pos, toks)

    @classmethod
    def _str_end(cls, s, pos, toks):
        """Parse action: raises for an unterminated string literal."""
        raise cls("Unfinished string", None, None, pos, toks)

    @classmethod
    def _re_escape(cls, s, pos, toks):
        """Parse action: raises for a bad ``%`` escape in a regex."""
        raise cls("Error in regex escape", None, None, pos, toks)

    @classmethod
    def _re_end(cls, s, pos, toks):
        """Parse action: raises for an unterminated regex."""
        raise cls("Unfinished regex", None, None, pos, toks)

    @classmethod
    def _unexpected_tok(cls, s, pos, toks):
        """Parse action: raises for input the grammar cannot consume."""
        raise cls("Unexpected token", None, None, pos, toks)


class ValidationError(Exception):
    """Raised when the object tree doesn't validate against the given
    pattern."""
    # FIXME TODO?


class _PatternElement:
    """Base class for the operations a compiled pattern is made of.

    ``_match_helper`` calls ``on_in_key`` when its ``in_key`` flag is set and
    ``on_not_in_key`` otherwise; the value returned becomes the new flag.
    """

    def on_not_in_key(self, frame, path, defs):
        raise NotImplementedError

    def on_in_key(self, frame, path, defs):
        raise NotImplementedError

    def collect_params(self, res: list):
        """Appends the names of the parameters this element uses to ``res``."""
        pass


class _Arrow(_PatternElement):
    """The ``->`` operator: pushes a new, still empty holder on the path."""

    def on_not_in_key(self, frame, path, defs):
        assert not path[-1].empty
        # The new holder iterates over the value of the current one.
        path.append(_Holder(key=None, value=None, name=None, parent=path[-1].value, empty=True))
        return False


class _StringKey(_PatternElement):
    """A string literal: indexes the parent object with a fixed string."""

    def __init__(self, toks):
        self.key = toks[0]
        self.skippable = toks[1] == '?'

    def on_in_key(self, frame, path, defs):
        return self.on_not_in_key(frame, path, defs)

    def on_not_in_key(self, frame, path, defs):
        path[-1].iterator = self.extract(path[-1].parent)
        path[-1].empty = False
        return True

    def extract(self, obj):
        """Yields ``(key, obj[key])``, or nothing if the lookup fails.

        Raises:
            ValidationError: If the lookup fails and this element is
                validating (not skippable).
        """
        try:
            yield (self.key, obj[self.key])
        except (TypeError, IndexError, KeyError):
            if not self.skippable:
                raise ValidationError


class _RegexKey(_PatternElement):
    """A regex: keeps only the entries whose key the regex finds a match in."""

    def __init__(self, toks):
        self.key = toks[0]
        self.compiled = re.compile(self.key)
        self.skippable = toks[1] == '?'

    def on_in_key(self, frame, path, defs):
        return self.on_not_in_key(frame, path, defs)

    def on_not_in_key(self, frame, path, defs):
        filtered_iterator = self.filter(path[-1].iterator)
        # The iterator setter asserts that no iterator is set, so the old one
        # has to be deleted before the filtered one is installed.
        del path[-1].iterator
        path[-1].iterator = filtered_iterator
        del filtered_iterator
        path[-1].empty = False
        return True

    def filter(self, it):
        """Yields the ``(key, value)`` pairs of ``it`` whose key matches.

        Raises:
            ValidationError: On a non-matching key, or a key ``search``
                rejects with TypeError, unless this element is skippable.
        """
        for el in it:
            try:
                if self.compiled.search(el[0]):
                    yield el
                elif not self.skippable:
                    raise ValidationError
            except TypeError:
                if not self.skippable:
                    raise ValidationError


class _KeySubtree(_PatternElement):
    """A key match (``[...]``): matches a sub-pattern against each key."""

    def __init__(self, toks):
        self.key = toks[0]
        # Stored for parity with the other elements; not consulted below.
        self.skippable = toks[1] == '?'

    def on_in_key(self, frame, path, defs):
        # A key match may follow a variable (``->X[...]``), in which case the
        # interpreter is already "in key". Behave like the other key elements
        # instead of falling through to NotImplementedError.
        # Note that _match_helper reports only the sub-pattern's variables
        # for a subtree holder, so ``X`` itself is not part of the result.
        return self.on_not_in_key(frame, path, defs)

    def on_not_in_key(self, frame, path, defs):
        path[-1].subtree = True
        filtered_iterator = self.filter(path[-1].iterator, defs)
        # See _RegexKey.on_not_in_key for why the iterator is deleted first.
        del path[-1].iterator
        path[-1].iterator = filtered_iterator
        del filtered_iterator
        path[-1].empty = False
        return True

    def filter(self, it, defs):
        """Yields ``(matches, value)`` for every sub-pattern match on a key.

        ``matches`` is the result dict produced by the sub-pattern.
        """
        for x in it:
            for y in _match_helper(self.key, defs, x[0]):
                yield (y, x[1])

    def collect_params(self, res: list):
        for sub in self.key:
            sub.collect_params(res)


class _ValueSubtree(_PatternElement):
    """A subvalue (``(...)``): matches a sub-pattern against the same object."""

    def __init__(self, toks):
        self.key = toks[0]
        # Stored for parity with the other elements; not consulted below.
        self.skippable = toks[1] == '?'

    def on_not_in_key(self, frame, path, defs):
        assert not path[-1].empty
        path.append(_Holder(key=None, value=None, name=None, parent=path[-1].value, empty=False, subtree=True))
        path[-1].iterator = self.filter(path[-1].parent, defs)
        return True

    def filter(self, parent, defs):
        """Yields ``(matches, parent)`` for every sub-pattern match."""
        for x in _match_helper(self.key, defs, parent):
            yield (x, parent)

    def collect_params(self, res: list):
        for sub in self.key:
            sub.collect_params(res)


class _Ident(_PatternElement):
    """A variable: names the current holder so it shows up in the results."""

    def __init__(self, toks):
        self.key = toks[0]

    def on_not_in_key(self, frame, path, defs):
        path[-1].name = self.key
        path[-1].empty = False
        return True


class _Param(_PatternElement):
    """A parameter: indexes the parent object with ``defs[name]``."""

    def __init__(self, toks):
        assert isinstance(toks[1], _Ident)
        self.skippable = toks[0] == '?'
        self.key = toks[1].key

    def on_in_key(self, frame, path, defs):
        return self.on_not_in_key(frame, path, defs)

    def on_not_in_key(self, frame, path, defs):
        path[-1].iterator = self.extract(path[-1].parent, defs[self.key])
        path[-1].empty = False
        return True

    def extract(self, obj, key):
        """Yields ``(key, obj[key])``, or nothing if the lookup fails.

        Raises:
            ValidationError: If the lookup fails and this element is
                validating (not skippable).
        """
        try:
            yield (key, obj[key])
        except (TypeError, IndexError, KeyError):
            if not self.skippable:
                raise ValidationError

    def collect_params(self, res: list):
        res.append(self.key)

    def get_value(self, defs):
        """Returns the value bound to this parameter in ``defs``."""
        return defs[self.key]


class _Ty(_PatternElement):
    """A type test: checks values with ``isinstance`` against ``defs[name]``."""

    def __init__(self, toks):
        assert isinstance(toks[1], _Ident)
        self.skippable = toks[0] == '?'
        self.key = toks[1].key

    def on_in_key(self, frame, path, defs):
        filtered_iterator = self.filter(path[-1].iterator, defs[self.key])
        # See _RegexKey.on_not_in_key for why the iterator is deleted first.
        del path[-1].iterator
        path[-1].iterator = filtered_iterator
        del filtered_iterator
        path[-1].empty = False
        return True

    def on_not_in_key(self, frame, path, defs):
        # Reached when the type test is applied to the root object itself
        # (type tests at the start of a key match).
        assert len(path) == 1
        if isinstance(path[-1].value, defs[self.key]):
            return False
        elif not self.skippable:
            raise ValidationError
        # Non-validating mismatch: emptying the path ends the match loop
        # without yielding anything.
        path.clear()
        return False

    def filter(self, it, ty):
        """Yields the pairs of ``it`` whose value is an instance of ``ty``.

        Raises:
            ValidationError: On a mismatch, unless this element is skippable.
        """
        for el in it:
            # this may TypeError if ty is not a type nor a tuple of types
            # but that's actually the programmer's error
            if isinstance(el[1], ty):
                yield el
            elif not self.skippable:
                # and this one is for actual validation
                raise ValidationError

    def collect_params(self, res: list):
        res.append(self.key)


class _End(_PatternElement):
    """Ends a tag or subvalue: advances the current holder or backtracks."""

    def on_in_key(self, frame, path, defs):
        try:
            path[-1].next()
            return False
        except StopIteration:
            # Current level is exhausted: drop it and rewind the program
            # counter to the previous _End (or to the first op).
            path.pop()
            while frame.prev() and not isinstance(frame.current_op, _End):
                pass
            # Step once more so that the following frame.next() lands on that
            # _End; if that is impossible we are at the start and done.
            if not frame.prev():
                # FIXME?
                path.clear()
        return True  # FIXME?


def _build_syntax():
    """Builds and returns the pyparsing grammar for ABDL expressions.

    The parse actions replace the matched tokens with ``_PatternElement``
    instances, so parsing a pattern produces its list of operations.
    """
    from pyparsing import Suppress, Literal, Forward, CharsNotIn, StringEnd, Combine, Optional, Group, Word, srange, Empty
    # original regex order: arrow, type/parameter/identifier, string, regex, failure
    # better syntax: "arrow" changes from "value" to "key" and thus you need at least one key match before an arrow
    subtree = Forward()
    # where relevant, enforces match behaviour (skippable object tree branch vs required object tree branch)
    skippable = Optional("?", default="")
    # NOTE(review): the named groups in the two legacy regexes quoted below
    # appear to have lost their names in this copy of the file.
    #   r"|'(?:%'|%%|%(?P.|$)|[^%'])*?(?:'|(?P$))\??" # string literals
    str_literal = (
        Combine(
            Suppress("'")
            + (
                Suppress("%") + "'"
                | Suppress("%") + "%"
                | Literal("%") + (CharsNotIn("") | StringEnd()).setParseAction(PatternError._str_escape)
                | CharsNotIn("%'")
            )[...]
            + (Suppress("'") | StringEnd().setParseAction(PatternError._str_end))
        )
        + skippable
    ).setParseAction(lambda toks: [_StringKey(toks)])
    #   r"|/(?:%/|%%|%(?P.|$)|[^%/])*?(?:/|(?P$))\??" # regex
    re_literal = (
        Combine(
            Suppress("/")
            + (
                Suppress("%") + "/"
                | Suppress("%") + "%"
                | Literal("%") + (CharsNotIn("") | StringEnd()).setParseAction(PatternError._re_escape)
                | CharsNotIn("%/")
            )[...]
            + (Suppress("/") | StringEnd().setParseAction(PatternError._re_end))
        )
        + skippable
    ).setParseAction(lambda toks: [_RegexKey(toks)])
    arrow = Literal("->").setParseAction(lambda: [_Arrow()])
    #   r"|(?::\??)?\$?[A-Za-z][A-Za-z0-9]*" # identifiers, parameters and type matches
    identifier = Word(srange("[A-Za-z_]"), srange("[A-Za-z0-9_]")).setParseAction(lambda toks: [_Ident(toks)])
    parameter = (Suppress("$") + skippable + identifier).setParseAction(lambda toks: [_Param(toks)])
    ty = (Suppress(":") + skippable + Suppress("$") + identifier).setParseAction(lambda toks: [_Ty(toks)])
    # support for objects-as-keys
    keysubtree = (
        Suppress("[")
        + Group(ty[...] + subtree)
        + (
            Suppress("]")
            | CharsNotIn("").setParseAction(PatternError._unexpected_tok)
            | StringEnd().setParseAction(PatternError._unexpected_tok)
        )
        + Optional("?", default="")
    ).setParseAction(lambda toks: [_KeySubtree(toks)])
    # represents key matching - switches from "key" to "value"
    tag = (
        identifier + Optional(parameter | str_literal | re_literal | keysubtree)
        | parameter
        | str_literal
        | re_literal
        | keysubtree
    ) + ty[...] + Empty().setParseAction(lambda: [_End()])
    # multiple value matching
    valuesubtree = (
        Suppress("(")
        + Group(subtree)
        + (
            Suppress(")")
            | CharsNotIn("").setParseAction(PatternError._unexpected_tok)
            | StringEnd().setParseAction(PatternError._unexpected_tok)
        )
        + Optional("?", default="")
    ).setParseAction(lambda toks: [_ValueSubtree(toks)])
    # arrow and tag, value subtree
    subtree <<= (arrow + tag)[...] + (valuesubtree + Empty().setParseAction(lambda: [_End()]))[...]
    return ((subtree | CharsNotIn("").setParseAction(PatternError._unexpected_tok)) + StringEnd()).parseWithTabs()


_built_syntax = _build_syntax()


def _pairs(o):
    """Returns an iterator of ``(key, value)`` pairs for a container.

    Mappings give their items, Sequences give ``(index, element)`` and Sets
    give ``(element, element)``.

    Raises:
        TypeError: If ``o`` is none of the above.
    """
    if isinstance(o, Mapping):
        return iter(o.items())
    elif isinstance(o, Sequence):
        return iter(enumerate(o, 0))
    elif isinstance(o, Set):
        return iter(((e, e) for e in o))
    else:
        # maybe there's more stuff I can implement later
        raise TypeError


class _Holder:
    """One level of the match path: the current ``(key, value)`` of a level.

    Attributes:
        name: Variable name to report this level under, or None.
        key, value: The pair most recently produced by the iterator.
        empty: True while no key element has been applied to this level yet.
        parent: The object this level iterates over.
        subtree: True if ``key`` holds a sub-pattern's result dict.
    """

    def __init__(self, key, value, name, parent=None, it=None, empty=False, subtree=False):
        self.name = name
        self.key = key
        self.value = value
        self.empty = empty
        self._it = it
        self.parent = parent
        self.subtree = subtree

    @property
    def iterator(self):
        # Lazily default to iterating over every pair of the parent.
        if self._it is None:
            self._it = _pairs(self.parent)
        return self._it

    @iterator.setter
    def iterator(self, value):
        # Replacing an iterator requires deleting the old one first.
        assert self._it is None
        self._it = value

    @iterator.deleter
    def iterator(self):
        self._it = None

    def next(self):
        """Advances to the next pair; StopIteration propagates when done."""
        self.key, self.value = next(self.iterator)


class _Frame:
    """A program counter over a list of pattern operations."""

    def __init__(self, ops):
        self.ops = ops
        self.pc = -1

    def next(self):
        """Steps forward; returns False (without moving) past the last op."""
        pc = self.pc + 1
        if pc >= len(self.ops):
            return False
        self.pc = pc
        return True

    @property
    def current_op(self):
        return self.ops[self.pc]

    def prev(self):
        """Steps backward; returns False (without moving) at the first op."""
        pc = self.pc - 1
        if pc < 0:
            return False
        self.pc = pc
        return True


def _match_helper(ops, defs, tree):
    """Runs the compiled operations ``ops`` against ``tree``.

    Args:
        ops: Sequence of ``_PatternElement`` operations.
        defs: Parameter values, by name.
        tree: The object to match against.

    Yields:
        One dict per match, mapping variable names to ``(key, value)`` pairs.
    """
    frame = _Frame(ops)

    path = [_Holder(key=None, value=tree, parent=None, it=iter(()), name=None)]
    in_key = False
    while path:
        if not frame.next():
            # Ran off the end of the ops: the current path is a match.
            assert not path[-1].empty
            res = {}
            for h in path:
                if h.subtree:
                    # The key of a subtree holder is the sub-pattern's result.
                    for name, kv in h.key.items():
                        res[name] = kv
                elif h.name is not None:
                    res[h.name] = (h.key, h.value)
            yield res
            if len(path) == 1:
                # Only the root holder is left, i.e. no arrow or subvalue
                # pushed a level (empty pattern, or type tests only). There is
                # no iterator to advance, so this was the one and only match;
                # going on would yield the same result forever.
                return
            assert isinstance(frame.current_op, _End)
            # Step back so that the next frame.next() re-runs the final _End,
            # which advances the innermost holder or backtracks.
            frame.prev()
            in_key = True
        else:
            op = frame.current_op
            if in_key:
                in_key = op.on_in_key(frame, path, defs)
            else:
                in_key = op.on_not_in_key(frame, path, defs)


class Pattern:
    """A compiled pattern object.

    Warning:
        Do not create instances of this class manually. Use
        :py:func:`abdl.compile`.
    """

    def __init__(self, pattern, defs):
        try:
            self._ops = _built_syntax.parseString(pattern)
        except PatternError as e:
            # The parse actions cannot see the pattern/defs; attach them here.
            e._normalize(pattern, defs)
            raise
        else:
            self._params = []
            for op in self._ops:
                op.collect_params(self._params)
            # Keep only the parameters the pattern actually uses.
            self._defs = {param: defs[param] for param in self._params}

    def match(self, obj):
        """Matches this compiled pattern against the given object.

        Args:
            obj: The object to match against.

        Returns:
            An iterator. This iterator yields ``(key, value)`` pairs
            wrapped in a dict for each variable in the pattern.
        """
        return _match_helper(self._ops, self._defs, obj)


def compile(pattern, defs=None):
    """Compiles the pattern and returns a compiled :py:class:`abdl.Pattern`
    object.

    Args:
        pattern (str): The pattern. Refer to module-level documentation for
            pattern syntax.
        defs (dict): The parameter list. Used by parameters in the pattern.
            Defaults to an empty dict.

    Returns:
        Pattern: A compiled pattern object.
    """
    # A fresh dict per call: the defs object is attached to PatternError
    # instances, so a shared default could leak between callers.
    if defs is None:
        defs = {}
    # TODO caching
    return Pattern(pattern, defs)


def match(pattern, obj, defs=None):
    """Matches the pattern against the given obj.

    This method is equivalent to ``abdl.compile(pattern, defs).match(obj)``.

    Args:
        pattern (str): The pattern. Refer to module-level documentation for
            pattern syntax.
        obj: The object to match against.
        defs (dict): The parameter list. Used by parameters in the pattern.
            Defaults to an empty dict.

    Returns:
        An iterator. This iterator yields ``(key, value)`` pairs
        wrapped in a dict for each variable in the pattern.
    """
    return compile(pattern, defs).match(obj)