/* * This file is part of Datafu * Copyright (C) 2021 Soni L. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . */ use std::borrow::Borrow; use std::collections::BTreeMap; use std::mem::ManuallyDrop; use impl_trait::impl_trait; use regex::Regex; use crate::PatternTypes; use crate::Predicate; use crate::errors::PatternError; use crate::vm::PatternConstants; use crate::vm::PatternElement; /// try! with bools. (the b comes from bool.) macro_rules! bry { ($l:lifetime $e:expr) => { if !$e { break $l false; } }; ($e:expr) => { if !$e { return false; } }; } // the following macros rely on unlabeled-break-through-labeled-block being an // error. // NOTE: always test changes to this module on nightly! #[cfg(not(feature = "stable"))] /// labeled block. on nightly: better compile errors. but also works on stable. macro_rules! lblock { // unfortunately ($l:lifetime : $b:block) => { $l: $b } didn't work. ($($t:tt)+) => { $($t)+ } } #[cfg(feature = "stable")] /// labeled block. on nightly: better compile errors. but also works on stable. macro_rules! lblock { ($l:lifetime : $b:block) => { $l: loop { break $b } } } // can't use Pattern here :( fn strip_prefix(s: &mut &str, prefix: &str) -> bool { s.strip_prefix(prefix).map(|ns| *s = ns).is_some() } fn pos_of<'a>(base: &'a str, sub: &'a str) -> Option { // FIXME Some((sub.as_ptr() as usize) - (base.as_ptr() as usize)) } struct SubtreeHelper<'r, 's, P: Borrow + Ord, O: Borrow + Ord, T: PatternTypes> where Self: 'r { root: &'r mut Parser<'s, P, O, T>, } impl_trait! { impl<'r, 's, P: Borrow + Ord, O: Borrow + Ord, T: PatternTypes> SubtreeHelper<'r, 's, P, O, T> where Self: 'r { fn start(value: &'r mut Parser<'s, P, O, T>) -> Self { value.consts.protos.push(Default::default()); Self { root: value, } } fn commit(self) -> usize { let mut self_ = ManuallyDrop::new(self); let proto = self_.root.consts.protos.pop().unwrap(); let id = self_.root.closed_subtrees.next().unwrap(); self_.root.consts.protos.insert(id, proto); id } impl trait std::ops::Deref { type Target = Parser<'s, P, O, T>; fn deref(&self) -> &Self::Target { &*self.root } } impl trait std::ops::DerefMut { fn deref_mut(&mut self) -> &mut Self::Target { self.root } } impl trait Drop { fn drop(&mut self) { // remove "partial" proto self.root.consts.protos.pop().expect("SubtreeHelper"); } } } } struct TagHelper<'r, 's, P: Borrow + Ord, O: Borrow + Ord, T: PatternTypes> where Self: 'r { root: &'r mut Parser<'s, P, O, T>, len: usize, } impl_trait! { impl<'r, 's, P: Borrow + Ord, O: Borrow + Ord, T: PatternTypes> TagHelper<'r, 's, P, O, T> where Self: 'r { fn start(value: &'r mut Parser<'s, P, O, T>) -> Self { let len = value.consts.protos.last().unwrap().len(); Self { root: value, len : len, } } fn commit(self) { std::mem::forget(self) } impl trait std::ops::Deref { type Target = Parser<'s, P, O, T>; fn deref(&self) -> &Self::Target { &*self.root } } impl trait std::ops::DerefMut { fn deref_mut(&mut self) -> &mut Self::Target { self.root } } impl trait Drop { fn drop(&mut self) { let proto = self.root.consts.protos.last_mut().unwrap(); assert!(proto.len() >= self.len); while proto.len() > self.len { let _ = proto.pop(); } } } } } struct Parser<'s, P: Borrow + Ord, O: Borrow + Ord, T: PatternTypes> { base: &'s str, preds: Option>>>, objs: Option>, pred_ids: BTreeMap, obj_ids: BTreeMap, consts: PatternConstants, closed_subtrees: std::ops::RangeFrom, } // These are documented using LPeg.re syntax // http://www.inf.puc-rio.br/~roberto/lpeg/re.html #[rustfmt::skip] impl<'s, P: Borrow + Ord, O: Borrow + Ord, T: PatternTypes> Parser<'s, P, O, T> { /// str_literal <- sp ( ( "'" str_char* ( "'" / ( !. -> ErrorStrEnd ) ) ( '?' -> MarkSkippable ) ) -> String ) sp /// str_char <- ( str_escape / [^%'] ) /// str_escape <- '%' ( '%' / "'" ) / ( ( '%' .? ) -> ErrorStrEscape ) fn str_literal(&mut self, s: &mut &'s str) -> Result> { const STRING_TOKEN: &str = "'"; let mut cursor = *s; Ok(lblock!('matches: { self.sp(&mut cursor); bry!('matches strip_prefix(&mut cursor, STRING_TOKEN)); let mut string = String::new(); loop { let base = cursor; if strip_prefix(&mut cursor, "%") { if strip_prefix(&mut cursor, STRING_TOKEN) { string.push_str(STRING_TOKEN); } else if strip_prefix(&mut cursor, "%") { string.push_str("%"); } else { return Err(PatternError::StringEscape(pos_of(self.base, base).unwrap(), cursor)); } } else { cursor = cursor.trim_start_matches(|c: char| { !matches!(c, '%' | '\'') }); if base == cursor { // no match? break; } string.push_str(&base[..pos_of(base, cursor).unwrap()]); } } if !strip_prefix(&mut cursor, STRING_TOKEN) { match cursor { "" => return Err(PatternError::StringEnd(self.base.len(), cursor)), _ => unreachable!(), } } let skippable = strip_prefix(&mut cursor, "?"); self.sp(&mut cursor); let id = self.consts.strings.iter().position(|c| c == &string); let self_ = &mut *self; let id = id.unwrap_or_else(move || { string.shrink_to_fit(); self_.consts.strings.push(string); self_.consts.strings.len() - 1 }); let proto = self.consts.protos.last_mut().expect("protos"); proto.push(PatternElement::StringKey(id, skippable)); *s = cursor; true })) } /// re_literal <- sp ( ( "/" re_char* ( "/" / ( !. -> ErrorReEnd ) ) ( '?' -> MarkSkippable ) ) -> Regex ) sp /// re_char <- ( re_escape / [^%/] ) /// re_escape <- '%' ( '%' / "/" ) / ( ( '%' .? ) -> ErrorReEscape ) #[allow(unused_variables)] #[allow(unreachable_code)] fn re_literal(&mut self, s: &mut &'s str) -> Result> { const RE_TOKEN: &str = "/"; let mut cursor = *s; Ok(lblock!('matches: { self.sp(&mut cursor); let orig = cursor; bry!('matches strip_prefix(&mut cursor, RE_TOKEN)); let mut string = String::new(); loop { let base = cursor; if strip_prefix(&mut cursor, "%") { if strip_prefix(&mut cursor, RE_TOKEN) { string.push_str(RE_TOKEN); } else if strip_prefix(&mut cursor, "%") { string.push_str("%"); } else { return Err(PatternError::RegexEscape(pos_of(self.base, base).unwrap(), cursor)); } } else { cursor = cursor.trim_start_matches(|c: char| { !matches!(c, '%' | '/') }); if base == cursor { // no match? break; } string.push_str(&base[..pos_of(base, cursor).unwrap()]); } } if !strip_prefix(&mut cursor, RE_TOKEN) { match cursor { "" => return Err(PatternError::RegexEnd(self.base.len(), cursor)), _ => unreachable!(), } } let end = cursor; let skippable = strip_prefix(&mut cursor, "?"); self.sp(&mut cursor); let re = Regex::new(&string).map_err(|e| { PatternError::Regex(pos_of(self.base, orig).unwrap(), &orig[..pos_of(orig, end).unwrap()], e) })?; let id = self.consts.regices.iter().position(|c| c.as_str() == re.as_str()); let self_ = &mut *self; let id = id.unwrap_or_else(move || { self_.consts.regices.push(re); self_.consts.regices.len() - 1 }); let proto = self.consts.protos.last_mut().expect("protos"); proto.push(PatternElement::RegexKey(id, skippable)); *s = cursor; true })) } /// matcher <- sp ( parameter / str_literal / re_literal / key_subtree ) sp fn matcher(&mut self, s: &mut &'s str) -> Result> { let mut cursor = *s; Ok(lblock!('matches: { self.sp(&mut cursor); match () { _ if self.parameter(&mut cursor)? => {}, _ if self.str_literal(&mut cursor)? => {}, _ if self.re_literal(&mut cursor)? => {}, _ if self.key_subtree(&mut cursor)? => {}, _ => bry!('matches false), } self.sp(&mut cursor); *s = cursor; true })) } /// tag <- sp ( '->' -> Arrow ) sp ( matcher / name sp matcher? ) sp predicate* ( '' -> End ) sp fn tag(&mut self, s: &mut &'s str) -> Result> { let mut cursor = *s; Ok(lblock!('matches: { self.sp(&mut cursor); bry!('matches strip_prefix(&mut cursor, "->")); let mut self_ = TagHelper::start(&mut *self); { let proto = self_.consts.protos.last_mut().expect("protos"); proto.push(PatternElement::Arrow); } self_.sp(&mut cursor); if !self_.matcher(&mut cursor)? { bry!('matches self_.name(&mut cursor)?); self_.sp(&mut cursor); // NOTE: *optional* let _ = self_.matcher(&mut cursor)?; } self_.sp(&mut cursor); while self_.predicate(&mut cursor)? { } { let proto = self_.consts.protos.last_mut().expect("protos"); proto.push(PatternElement::End); } self_.commit(); *s = cursor; true })) } /// sp <- spaces* /// /// (always succeeds, so doesn't return anything) fn sp(&mut self, s: &mut &'s str) { *s = s.trim_start_matches(|c: char| { matches!(c, ' ' | '\n' | '\r' | '\t') }); } /// identifier <- [A-Za-z_][A-Za-z0-9_]* fn identifier(&mut self, s: &mut &'s str) -> Result> { if let Some('A'..='Z') | Some('a'..='z') | Some('_') = s.chars().next() { *s = s.trim_start_matches(|c: char| { matches!(c, 'A'..='Z' | 'a'..='z' | '0'..='9' | '_') }); Ok(true) } else { Ok(false) } } /// name <- sp ( identifier -> Identifier ) sp fn name(&mut self, s: &mut &'s str) -> Result> { let mut cursor = *s; Ok(lblock!('matches: { self.sp(&mut cursor); let start = cursor; bry!('matches self.identifier(&mut cursor)?); let name = &start[..pos_of(start, cursor).unwrap_or(start.len())]; // no processing of `name` is required for this. let id = self.consts.strings.iter().position(|c| c == name); let id = id.unwrap_or_else(|| { self.consts.strings.push(name.into()); self.consts.strings.len() - 1 }); let proto = self.consts.protos.last_mut().expect("protos"); proto.push(PatternElement::Identifier(id)); self.sp(&mut cursor); *s = cursor; true })) } /// parameter <- sp '$' ( '?'? -> MarkSkippable ) ( identifier -> Parameter ) sp fn parameter(&mut self, s: &mut &'s str) -> Result> { let mut cursor = *s; Ok(lblock!('matches: { self.sp(&mut cursor); bry!('matches strip_prefix(&mut cursor, "$")); let skippable = strip_prefix(&mut cursor, "?"); let start = cursor; bry!('matches self.identifier(&mut cursor)?); let name = &start[..pos_of(start, cursor).unwrap_or(start.len())]; // no processing of `name` is required for this. let id = self.obj_ids.get(name).copied().map_or_else( || { let (key, obj) = self.objs.as_mut().and_then(|map| { map.remove_entry(name) }).ok_or_else(|| { let pos = pos_of(self.base, start).unwrap(); PatternError::UnknownParameter(pos, name) })?; let id = self.consts.defs.len(); self.consts.defs.push(obj); self.obj_ids.insert(key, id); Ok(id) }, Ok, )?; let proto = self.consts.protos.last_mut().expect("protos"); proto.push(PatternElement::ParameterKey(id, skippable)); self.sp(&mut cursor); *s = cursor; true })) } /// predicate <- sp ':' ( '?'? -> MarkSkippable ) '$' ( identifier -> Predicate ) sp fn predicate(&mut self, s: &mut &'s str) -> Result> { let mut cursor = *s; Ok(lblock!('matches: { self.sp(&mut cursor); bry!('matches strip_prefix(&mut cursor, ":")); let skippable = strip_prefix(&mut cursor, "?"); bry!('matches strip_prefix(&mut cursor, "$")); let start = cursor; bry!('matches self.identifier(&mut cursor)?); let name = &start[..pos_of(start, cursor).unwrap_or(start.len())]; // no processing of `name` is required for this. let id = self.pred_ids.get(name).copied().map_or_else( || { let (key, pred) = self.preds.as_mut().and_then(|map| { map.remove_entry(name) }).ok_or_else(|| { let pos = pos_of(self.base, start).unwrap(); PatternError::UnknownPredicate(pos, name) })?; let id = self.consts.predicates.len(); self.consts.predicates.push(pred); self.pred_ids.insert(key, id); Ok(id) }, Ok, )?; let proto = self.consts.protos.last_mut().expect("protos"); proto.push(PatternElement::ApplyPredicate(id, skippable)); self.sp(&mut cursor); *s = cursor; true })) } /// key_subtree <- sp '[' sp predicate* sp subtree sp ( ']' / unexpected_token / unexpected_end ) sp ( '?'? -> MarkSkippable ) fn key_subtree(&mut self, s: &mut &'s str) -> Result> { let mut cursor = *s; Ok(lblock!('matches: { self.sp(&mut cursor); bry!('matches strip_prefix(&mut cursor, "[")); self.sp(&mut cursor); let mut subtree = SubtreeHelper::start(&mut *self); while subtree.predicate(&mut cursor)? { } subtree.sp(&mut cursor); bry!('matches subtree.subtree(&mut cursor)?); subtree.sp(&mut cursor); bry!('matches strip_prefix(&mut cursor, "]") || subtree.unexpected_token(&mut cursor)? || subtree.unexpected_end(&mut cursor)? ); subtree.sp(&mut cursor); let skippable = strip_prefix(&mut cursor, "?"); *s = cursor; let id = subtree.commit(); let proto = self.consts.protos.last_mut().expect("protos"); proto.push(PatternElement::KeySubtree(id, skippable)); true })) } /// value_subtree <- sp '(' sp predicate* sp subtree sp ( ')' / unexpected_token / unexpected_end ) sp ( '?'? -> MarkSkippable ) fn value_subtree(&mut self, s: &mut &'s str) -> Result> { let mut cursor = *s; Ok(lblock!('matches: { self.sp(&mut cursor); bry!('matches strip_prefix(&mut cursor, "(")); self.sp(&mut cursor); let mut subtree = SubtreeHelper::start(&mut *self); while subtree.predicate(&mut cursor)? { } subtree.sp(&mut cursor); bry!('matches subtree.subtree(&mut cursor)?); subtree.sp(&mut cursor); bry!('matches strip_prefix(&mut cursor, ")") || subtree.unexpected_token(&mut cursor)? || subtree.unexpected_end(&mut cursor)? ); subtree.sp(&mut cursor); let skippable = strip_prefix(&mut cursor, "?"); *s = cursor; let id = subtree.commit(); let proto = self.consts.protos.last_mut().expect("protos"); proto.push(PatternElement::ValueSubtree(id, skippable)); true })) } /// unexpected_end <- ( !. ) -> ErrorToken fn unexpected_end(&mut self, s: &mut &'s str) -> Result> { match *s { "" => Err(PatternError::Token(self.base.len(), s)), _ => Ok(false), } } /// unexpected_token <- . -> ErrorToken fn unexpected_token(&mut self, s: &mut &'s str) -> Result> { match *s { "" => Ok(false), _ => Err(PatternError::Token(pos_of(self.base, s).unwrap(), s)), } } /// subtree <- sp tag* sp ( value_subtree '' -> End )* sp fn subtree(&mut self, s: &mut &'s str) -> Result> { let mut cursor = *s; self.sp(&mut cursor); while self.tag(&mut cursor)? { } self.sp(&mut cursor); while self.value_subtree(&mut cursor)? { let proto = self.consts.protos.last_mut().expect("protos"); proto.push(PatternElement::End); } self.sp(&mut cursor); *s = cursor; // always matches (may match empty) Ok(true) } /// pattern <- ( subtree / unexpected_token ) ( !. / unexpected_token ) fn pattern(&mut self, s: &mut &'s str) -> Result> { let mut cursor = *s; Ok(lblock!('matches: { let mut subtree = SubtreeHelper::start(&mut *self); bry!('matches subtree.subtree(&mut cursor)? || subtree.unexpected_token(&mut cursor)? ); bry!('matches cursor == "" || subtree.unexpected_token(&mut cursor)? ); subtree.commit(); *s = cursor; true })) } } pub(crate) fn parse<'s, P, O, T>( input: &'s str, preds: Option>>>, objs: Option> ) -> Result, PatternError<'s>> where P: Borrow + Ord, O: Borrow + Ord, T: PatternTypes, { let mut parser = Parser::<'s, P, O, T> { base: input, preds: preds, objs: objs, pred_ids: Default::default(), obj_ids: Default::default(), consts: Default::default(), closed_subtrees: (0..), }; let mut parsed = input; let matched = parser.pattern(&mut parsed)?; assert!(matched); assert_eq!(parsed, ""); assert_eq!(parser.closed_subtrees.next().unwrap(), parser.consts.protos.len()); Ok(parser.consts) } #[cfg(test)] mod tests { use crate::PatternTypes; use crate::RefOwn; use crate::KVPair; use crate::errors::PatternError; use super::Parser; use proptest::prelude::*; struct Dummy; impl PatternTypes for Dummy { type Ref = (); type Own = (); fn pairs<'b>( item: RefOwn<'b, Self::Ref, Self::Own> ) -> Option> + 'b>> { let _ = item; None } fn get<'a, 'b>( item: RefOwn<'b, Self::Ref, Self::Own>, key: RefOwn<'a, Self::Ref, Self::Own> ) -> Option>> { let _ = item; let _ = key; None } fn matches( left: RefOwn<'_, Self::Ref, Self::Own>, right: RefOwn<'_, Self::Ref, Self::Own> ) -> bool { let _ = left; let _ = right; false } fn as_str<'b>( item: RefOwn<'b, Self::Ref, Self::Own> ) -> Option<&'b str> { match item { RefOwn::Str(key) => Some(key), _ => None, } } } #[test] fn test_identifier() { fn identifier_input<'s>(s: &mut &'s str) -> Result> { let mut parser = Parser::<'s, &'static str, &'static str, Dummy> { base: *s, preds: None, objs: None, pred_ids: Default::default(), obj_ids: Default::default(), consts: Default::default(), closed_subtrees: (0..), }; parser.identifier(s) } for mut identifier in vec!["test", "Test", "_test", "_Test", "_123",] { println!("{}", identifier); assert!(identifier_input(&mut identifier).unwrap()); assert_eq!(identifier, ""); } for mut identifier in vec!["123", "1_", " ",] { println!("{}", identifier); assert!(!identifier_input(&mut identifier).unwrap()); assert_ne!(identifier, ""); } } proptest! { #[test] fn test_no_crash(s in ".{0,4096}") { fn prep_parser<'s>(s: &'s str) -> Parser<'s, &'static str, &'static str, Dummy> { let mut parser = Parser::<'s, &'static str, &'static str, Dummy> { base: s, preds: None, objs: None, pred_ids: Default::default(), obj_ids: Default::default(), consts: Default::default(), closed_subtrees: (0..), }; parser.consts.protos.push(Default::default()); parser } let _ = prep_parser(&s).str_literal(&mut &*s); let _ = prep_parser(&s).re_literal(&mut &*s); let _ = prep_parser(&s).matcher(&mut &*s); let _ = prep_parser(&s).tag(&mut &*s); let _ = prep_parser(&s).sp(&mut &*s); let _ = prep_parser(&s).identifier(&mut &*s); let _ = prep_parser(&s).name(&mut &*s); let _ = prep_parser(&s).parameter(&mut &*s); let _ = prep_parser(&s).predicate(&mut &*s); let _ = prep_parser(&s).key_subtree(&mut &*s); let _ = prep_parser(&s).value_subtree(&mut &*s); let _ = prep_parser(&s).unexpected_end(&mut &*s); let _ = prep_parser(&s).unexpected_token(&mut &*s); let _ = prep_parser(&s).subtree(&mut &*s); let _ = prep_parser(&s).pattern(&mut &*s); } } }