/* * This file is part of Datafu * Copyright (C) 2021 Soni L. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . */ use std::collections::BTreeMap; use std::marker::PhantomData; use regex::Regex; use crate::KVPair; use crate::RefOwn; use crate::PatternTypes; use crate::Predicate; use crate::errors::MatchError; pub(crate) const MAX_CALLS: usize = 250; type Matches<'a, 'b, T> = BTreeMap<&'a str, KVPair<'b, T>>; // TODO: use a builder for this? /// The constant pool for a pattern. pub(crate) struct PatternConstants { // last proto is implicitly the whole pattern. pub(crate) protos: Vec>>, // Note that we can borrow these when creating the output map. // https://play.rust-lang.org/?version=stable&mode=debug&edition=2018&gist=da26f9175e96273fa0b94971a4e6172f pub(crate) strings: Vec, pub(crate) regices: Vec, pub(crate) predicates: Vec>>, // NOTE these are part of the constant pool and so have lifetime analogous // to 'a (consistently used to indicate constant pool lifetime) when used // elsewhere. In particular, they can't be yielded by the iterator. pub(crate) defs: Vec, } impl Default for PatternConstants { fn default() -> Self { Self { protos: Default::default(), strings: Default::default(), regices: Default::default(), predicates: Default::default(), defs: Default::default(), } } } /// A pattern element. pub(crate) enum PatternElement { Arrow, Identifier(usize), StringKey(usize, bool), RegexKey(usize, bool), ParameterKey(usize, bool), KeySubtree(usize, bool), ValueSubtree(usize, bool), ApplyPredicate(usize, bool, PhantomData) -> &Predicate>), End } impl Copy for PatternElement {} impl Clone for PatternElement { fn clone(&self) -> Self { *self } } struct Frame<'a, 'b, T: PatternTypes> { obj: RefOwn<'b, T::Ref, T::Own>, ops: &'a Vec>, iar: Option, depth: usize, path: Vec>, in_key: bool, } impl<'a, 'b, T: PatternTypes> Frame<'a, 'b, T> { /// Advances the instruction address register. /// /// # Returns /// /// `true` if successful, `false` otherwise. fn next(&mut self) -> bool { let new = self.iar.map_or(0, |v| v + 1); new < self.ops.len() && { self.iar = Some(new); true } } /// Returns the current instruction. fn op(&self) -> PatternElement { self.ops[self.iar.expect("ops[iar]")] } /// Rewinds the instruction address register. /// /// # Returns /// /// `true` if successful, `false` otherwise. fn prev(&mut self) -> bool { let new = self.iar.expect("iar").checked_sub(1); new.is_some() && { self.iar = new; true } } } /// Stores a single match. /// /// See also Holder. enum HolderState<'a, 'b, T: PatternTypes> { /// Empty holder, for KV pair. EmptyKey, /// Empty holder, for subtree. EmptySubtree, /// Non-empty KV pair. Key(KVPair<'b, T>), /// Non-empty subtree. Subtree(Matches<'a, 'b, T>, RefOwn<'b, T::Ref, T::Own>), /// Preloaded value. Usually the first holder in a frame. Value(RefOwn<'b, T::Ref, T::Own>), } impl<'a, 'b, T: PatternTypes> Clone for HolderState<'a, 'b, T> { fn clone(&self) -> Self { match self { HolderState::EmptyKey => HolderState::EmptyKey, HolderState::EmptySubtree => HolderState::EmptySubtree, HolderState::Key(v) => HolderState::Key(*v), HolderState::Subtree(m, v) => HolderState::Subtree(m.clone(), *v), HolderState::Value(v) => HolderState::Value(*v), } } } impl<'a, 'b, T: PatternTypes> HolderState<'a, 'b, T> { fn is_empty(&self) -> bool { matches!(self, HolderState::EmptyKey | HolderState::EmptySubtree) } fn has_value(&self) -> bool { !self.is_empty() } fn is_subtree(&self) -> bool { matches!(self, HolderState::EmptySubtree | HolderState::Subtree {..}) } fn value(&self) -> Option> { match *self { HolderState::Key((_, value)) => Some(value), HolderState::Subtree(_, value) => Some(value), HolderState::Value(value) => Some(value), _ => None } } fn key(&self) -> Option> { match *self { HolderState::Key((key, _)) => Some(key), _ => None } } fn clear(&mut self) { match *self { HolderState::Key(_) => *self = HolderState::EmptyKey, HolderState::Subtree(_, _) => *self = HolderState::EmptySubtree, HolderState::Value(_) => unreachable!(), _ => {}, }; assert!(self.is_empty()); } } /// Stores a single match and associated metadata. /// /// A single match is generally a key-value pair, but may be a collection of /// named pairs in the case of subtree matches, or just a value for the initial /// holder. struct Holder<'a, 'b, T: PatternTypes> { name: Option<&'a str>, value: HolderState<'a, 'b, T>, parent: Option>, iterator: Option> + 'b>>, filters: Vec Fn(&'c mut HolderState<'a, 'b, T>) -> Result<(), MatchError>) + 'a>>, } impl<'a, 'b, T: PatternTypes> Holder<'a, 'b, T> { fn next(&mut self) -> Result { self.ensure_iterator()?; match self { Self { value: ref mut v, iterator: Some(ref mut it), ref filters, .. } => { let is_subtree = v.is_subtree(); let mut next_v; loop { next_v = match it.next() { Some(pair) => HolderState::Key(pair), None => return Ok(false) }; for filter in filters { filter(&mut next_v)?; if next_v.is_empty() { break; } } if next_v.has_value() { break; } } assert!(next_v.has_value()); assert!(next_v.is_subtree() == is_subtree); *v = next_v; Ok(true) }, _ => unreachable!() } } /// Ensure `self.iterator.is_some()`, creating an iterator if necessary. fn ensure_iterator(&mut self) -> Result<(), MatchError> { if self.iterator.is_none() { let iter = T::pairs(self.parent.unwrap()); if iter.is_none() { return Err(MatchError::UnsupportedOperation); } self.iterator = iter; } assert!(self.iterator.is_some()); Ok(()) } } impl<'a, 'b, T: PatternTypes> Default for Holder<'a, 'b, T> { fn default() -> Self { Self { name: Default::default(), value: HolderState::EmptyKey, parent: Default::default(), iterator: Default::default(), filters: Default::default(), } } } pub struct Matcher<'a, 'b, T: PatternTypes> { defs: &'a PatternConstants, frame: Frame<'a, 'b, T>, } // TODO: // // [x] Arrow // [x] StringKey // [x] RegexKey // [ ] KeySubtree // [ ] ValueSubtree // [x] Ident // [ ] Param // [x] ApplyPredicate // [x] End /// Helper for `PatternElement::StringKey`. fn on_string_key<'a, 'b, T: PatternTypes>( matcher: &mut Matcher<'a, 'b, T>, id: usize, skippable: bool, ) -> Result { let path = matcher.frame.path.last_mut().unwrap(); assert!(path.iterator.is_none()); let key = &matcher.defs.strings[id]; let iter = T::get(path.parent.unwrap(), RefOwn::Str(key)); match iter { None => Err(MatchError::UnsupportedOperation), Some(opt) => { path.iterator = Some(Box::new(opt.into_iter())); Ok(true) } } } /// Helper for `PatternElement::RegexKey`. fn on_regex_key<'a, 'b, T: PatternTypes>( matcher: &mut Matcher<'a, 'b, T>, id: usize, skippable: bool, ) -> Result { matcher.frame.path.last_mut().unwrap().ensure_iterator()?; let re = &matcher.defs.regices[id]; let path = matcher.frame.path.last_mut().unwrap(); path.filters.push(Box::new(move |value| { let s = T::as_str(value.key().unwrap()); match (s.map_or(false, |s| re.is_match(s)), skippable) { (true, _) => Ok(()), (false, true) => { value.clear(); Ok(()) }, (false, false) => Err(MatchError::ValidationError), } })); Ok(true) } impl<'a, 'b, T: PatternTypes> Matcher<'a, 'b, T> { pub(crate) fn new(obj: RefOwn<'b, T::Ref, T::Own>, defs: &'a PatternConstants, proto: usize, rlimit: usize) -> Result { let depth = rlimit.checked_sub(1).ok_or(MatchError::StackOverflow)?; let ops: &Vec<_> = &defs.protos[proto]; Ok(Self { defs: defs, frame: Frame { obj: obj, ops: ops, iar: None, depth: depth, path: if ops.is_empty() { // TODO decide on empty matcher behaviour todo!() } else { let mut holder = Holder::default(); holder.value = HolderState::Value(obj); holder.iterator = Some(Box::new(std::iter::empty())); vec![holder] }, in_key: false, }, }) } fn on_in_key(&mut self) -> Result { match self.frame.op() { PatternElement::End => { let path = self.frame.path.last_mut().unwrap(); if path.next()? { Ok(false) } else { drop(path); self.frame.path.pop().unwrap(); // stop at previous End, or start of frame while self.frame.prev() { if matches!(self.frame.op(), PatternElement::End) { break; } } // is start of frame? if !self.frame.prev() { self.frame.path.clear(); } Ok(true) } }, PatternElement::ApplyPredicate(id, skippable, _) => { // failing on T::get() is already handled, but we may need a // T::pairs(). construct it here. self.frame.path.last_mut().unwrap().ensure_iterator()?; let pred = &self.defs.predicates[id]; let path = self.frame.path.last_mut().unwrap(); path.filters.push(Box::new(move |value| { match (pred(value.value().unwrap()), skippable) { (true, _) => Ok(()), (false, true) => { value.clear(); Ok(()) }, (false, false) => Err(MatchError::ValidationError), } })); Ok(true) }, PatternElement::StringKey(id, skippable) => { on_string_key(self, id, skippable) }, PatternElement::RegexKey(id, skippable) => { on_regex_key(self, id, skippable) }, _ => unreachable!("on_in_key") } } fn on_not_in_key(&mut self) -> Result { match self.frame.op() { PatternElement::Arrow => { // this *should* always pass. assert!(self.frame.path.last().unwrap().iterator.is_some()); let mut holder = Holder::default(); holder.parent = self.frame.path.last().unwrap().value.value(); assert!(holder.parent.is_some()); self.frame.path.push(holder); Ok(false) }, PatternElement::Identifier(id) => { let name = self.defs.strings.get(id).map(|s| &**s); let path = self.frame.path.last_mut().unwrap(); path.name = name; assert!(path.iterator.is_none()); // we don't actually create the iterator here, // as we may still wanna use T::get() instead. Ok(true) }, PatternElement::ApplyPredicate(id, skippable, _) => { assert!(self.frame.path.len() == 1); let pred = &self.defs.predicates[id]; let value = self.frame.path.last().unwrap().value.value(); match (pred(value.unwrap()), skippable) { (true, _) => Ok(false), (false, true) => { self.frame.path.clear(); // any Ok(_) will do Ok(false) }, (false, false) => Err(MatchError::ValidationError), } }, PatternElement::StringKey(id, skippable) => { on_string_key(self, id, skippable) }, PatternElement::RegexKey(id, skippable) => { on_regex_key(self, id, skippable) }, _ => unreachable!("on_not_in_key") } } fn on_end(&mut self) -> Result<(bool, Matches<'a, 'b, T>), MatchError> { match self.frame.op() { PatternElement::End => { assert!(!self.frame.path.last().expect("path").value.is_empty()); // TODO this for loop is duplicated with ApplyPredicate // TODO figure out how to deduplicate them let mut res: Matches<'a, 'b, T> = Default::default(); for holder in &self.frame.path { match holder.value { HolderState::Subtree(ref matches, _) => { res.extend(matches); }, HolderState::Key(pair) => { if let Some(name) = holder.name { res.insert(name, pair); } }, HolderState::Value(_) => (), _ => unreachable!("on_end (End)"), } } if !self.frame.prev() { // frame.prev() should always be called, even if this gets // replaced with debug_assert!(). assert!(false, "frame.prev()"); } Ok((true, res)) } PatternElement::ApplyPredicate {..} => { assert!(!self.frame.in_key); let mut res: Matches<'a, 'b, T> = Default::default(); for holder in &self.frame.path { match holder.value { HolderState::Subtree(ref matches, _) => { res.extend(matches); }, HolderState::Key(pair) => { if let Some(name) = holder.name { res.insert(name, pair); } }, HolderState::Value(_) => (), _ => unreachable!("on_end (ApplyPredicate)"), } } self.frame.path.clear(); Ok((false, res)) } _ => unreachable!("on_end") } } } impl<'a, 'b, T: PatternTypes> Iterator for Matcher<'a, 'b, T> { type Item = Result>, MatchError>; fn next(&mut self) -> Option { while !self.frame.path.is_empty() { if !self.frame.next() { let (in_key, res) = match self.on_end() { Ok(v) => v, Err(e) => { self.frame.path.clear(); return Some(Err(e)) }, }; self.frame.in_key = in_key; return Some(Ok(res)); } else { let in_key = if self.frame.in_key { self.on_in_key() } else { self.on_not_in_key() }; match in_key { Ok(in_key) => self.frame.in_key = in_key, Err(e) => { self.frame.path.clear(); return Some(Err(e)) }, } } } None } }