// Copyright (C) 2021-2022 Soni L. // SPDX-License-Identifier: MIT OR Apache-2.0 //! The recursive-descent datafu language parser. use std::borrow::Borrow; use std::collections::BTreeMap; use std::mem::ManuallyDrop; use impl_trait::impl_trait; use regex::Regex; use serde::Serialize; use crate::Predicate; use crate::errors::PatternError; use crate::vm; use crate::vm::PatternConstants; use crate::vm::PatternElement; use crate::vm::PatternToken; use crate::vm::Type; /// try! with bools. (the b comes from bool.) macro_rules! bry { ($l:lifetime $e:expr) => { if !$e { break $l false; } }; ($e:expr) => { if !$e { return false; } }; } // the following macros rely on unlabeled-break-through-labeled-block being an // error. // NOTE: always test changes to this module on nightly! // still waiting for label-break-value stabilization... #[cfg(not(feature = "stable"))] /// labeled block. on nightly: better compile errors. but also works on stable. macro_rules! lblock { // unfortunately ($l:lifetime : $b:block) => { $l: $b } didn't work. ($($t:tt)+) => { $($t)+ } } #[cfg(feature = "stable")] /// labeled block. on nightly: better compile errors. but also works on stable. macro_rules! lblock { ($l:lifetime : $b:block) => { $l: loop { break $b } } } /// Attempts to shift `s` forward by removing `prefix`. /// /// Returns whether `s` has had `prefix` removed. // can't use Pattern here :( fn strip_prefix(s: &mut &str, prefix: &str) -> bool { s.strip_prefix(prefix).map(|ns| *s = ns).is_some() } /// Returns the position (index) of `sub` within `base`, in bytes. /// /// Returns bogus results if `base` and `sub` are unrelated. fn pos_of<'a>(base: &'a str, sub: &'a str) -> Option { // FIXME is there any non-UB way to check if `sub` is in `base`? Some((sub.as_ptr() as usize) - (base.as_ptr() as usize)) } /// Collects value-ish PatternTokens into a PatternElement::Value with the /// given already-collected name. fn collect_value( name: Option, tokens: &[PatternToken], ) -> PatternElement { let value = match tokens { &[PatternToken::String(index, skippable)] => { vm::Value::String { index, skippable } }, &[PatternToken::Regex(index, skippable)] => { vm::Value::Regex { index, skippable } }, &[PatternToken::Type(ty, skippable)] => { vm::Value::Type { ty, skippable } }, other => { unreachable!("{other:?}") }, }; PatternElement::Value { name, value: Some(value), } } /// Collects a slice of PatternToken into a PatternElement::Value. fn collect_name_and_value(tokens: &[PatternToken]) -> PatternElement { match tokens { &[PatternToken::Identifier(name)] => { PatternElement::Value { name: Some(name), value: None, } }, &[PatternToken::Identifier(name), ref value @ ..] => { collect_value(Some(name), value) }, value => collect_value(None, value), } } /// Helper to collect "subtree" sections of the pattern. /// /// This is a RAII-like guard which handles cleaning up the parsed pattern when /// dropped. struct SubtreeHelper<'r, 's, PKey, OKey, O> where Self: 'r, PKey: Borrow + Ord, OKey: Borrow + Ord, O: Serialize, { root: &'r mut Parser<'s, PKey, OKey, O>, } impl_trait! { impl<'r, 's, PKey, OKey, O> SubtreeHelper<'r, 's, PKey, OKey, O> where Self: 'r, PKey: Borrow + Ord, OKey: Borrow + Ord, O: Serialize, { fn start(value: &'r mut Parser<'s, PKey, OKey, O>) -> Self { value.consts.protos.push(Default::default()); Self { root: value, } } fn commit(self) -> usize { let mut self_ = ManuallyDrop::new(self); let proto = self_.root.consts.protos.pop().unwrap(); let id = self_.root.closed_subtrees.next().unwrap(); self_.root.consts.protos.insert(id, proto); id } impl trait std::ops::Deref { type Target = Parser<'s, PKey, OKey, O>; fn deref(&self) -> &Self::Target { &*self.root } } impl trait std::ops::DerefMut { fn deref_mut(&mut self) -> &mut Self::Target { self.root } } impl trait Drop { fn drop(&mut self) { // remove "partial" proto self.root.consts.protos.pop().expect("SubtreeHelper"); } } } } /// Helper to collect "tag" sections of the pattern. /// /// This is a RAII-like guard which handles cleaning up the parsed pattern when /// dropped. struct TagHelper<'r, 's, PKey, OKey, O> where Self: 'r, PKey: Borrow + Ord, OKey: Borrow + Ord, O: Serialize, { root: &'r mut Parser<'s, PKey, OKey, O>, len: usize, } impl_trait! { impl<'r, 's, PKey, OKey, O> TagHelper<'r, 's, PKey, OKey, O> where Self: 'r, PKey: Borrow + Ord, OKey: Borrow + Ord, O: Serialize, { fn start(value: &'r mut Parser<'s, PKey, OKey, O>) -> Self { let len = value.tokens.len(); Self { root: value, len : len, } } fn commit(self) { let _self = &mut *std::mem::ManuallyDrop::new(self); // we could write a proper parser for the token stream. // // we could also just do this instead. match _self.root.tokens.drain(_self.len..).as_slice() { &[ PatternToken::Arrow, PatternToken::KeySubtree(index), ref name_value @ .., PatternToken::End, ] => { let tag = PatternElement::Tag { key_subtree: Some(index), }; _self.root.consts.protos.last_mut().unwrap().push(tag); let value = collect_name_and_value(name_value); _self.root.consts.protos.last_mut().unwrap().push(value); }, &[ PatternToken::Arrow, ref name_value @ .., PatternToken::End, ] => { let tag = PatternElement::Tag { key_subtree: None, }; _self.root.consts.protos.last_mut().unwrap().push(tag); let value = collect_name_and_value(name_value); _self.root.consts.protos.last_mut().unwrap().push(value); }, other => { unreachable!("{other:?}"); }, }; } impl trait std::ops::Deref { type Target = Parser<'s, PKey, OKey, O>; fn deref(&self) -> &Self::Target { &*self.root } } impl trait std::ops::DerefMut { fn deref_mut(&mut self) -> &mut Self::Target { self.root } } impl trait Drop { fn drop(&mut self) { let proto = &mut self.root.tokens; assert!(proto.len() >= self.len); proto.drain(self.len..); } } } } struct Parser<'s, PKey, OKey, O> where PKey: Borrow + Ord, OKey: Borrow + Ord, O: Serialize, { base: &'s str, preds: Option>>, objs: Option>, pred_ids: BTreeMap, obj_ids: BTreeMap, consts: PatternConstants, tokens: Vec, closed_subtrees: std::ops::RangeFrom, } // These are documented using LPeg.re syntax // http://www.inf.puc-rio.br/~roberto/lpeg/re.html #[rustfmt::skip] impl<'s, PKey, OKey, O> Parser<'s, PKey, OKey, O> where PKey: Borrow + Ord, OKey: Borrow + Ord, O: Serialize, { /// Creates a new `Parser`. fn new( base: &'s str, preds: Option>>, objs: Option>, ) -> Self { Self { base: base, preds: preds, objs: objs, tokens: Default::default(), pred_ids: Default::default(), obj_ids: Default::default(), consts: Default::default(), closed_subtrees: (0..), } } /// str_literal <- sp ( ( "'" str_char* ( "'" / ( !. -> ErrorStrEnd ) ) ( '?' -> MarkSkippable ) ) -> String ) sp /// str_char <- ( str_escape / [^%'] ) /// str_escape <- '%' ( '%' / "'" ) / ( ( '%' .? ) -> ErrorStrEscape ) fn str_literal(&mut self, s: &mut &'s str) -> Result> { const STRING_TOKEN: &str = "'"; let mut cursor = *s; Ok(lblock!('matches: { self.sp(&mut cursor); bry!('matches strip_prefix(&mut cursor, STRING_TOKEN)); let mut string = String::new(); loop { let base = cursor; if strip_prefix(&mut cursor, "%") { if strip_prefix(&mut cursor, STRING_TOKEN) { string.push_str(STRING_TOKEN); } else if strip_prefix(&mut cursor, "%") { string.push_str("%"); } else { return Err(PatternError::StringEscape(pos_of(self.base, base).unwrap(), cursor)); } } else { cursor = cursor.trim_start_matches(|c: char| { !matches!(c, '%' | '\'') }); if base == cursor { // no match? break; } string.push_str(&base[..pos_of(base, cursor).unwrap()]); } } if !strip_prefix(&mut cursor, STRING_TOKEN) { match cursor { "" => return Err(PatternError::StringEnd(self.base.len(), cursor)), _ => unreachable!(), } } let skippable = strip_prefix(&mut cursor, "?"); self.sp(&mut cursor); let id = self.consts.strings.iter().position(|c| c == &string); let self_ = &mut *self; let id = id.unwrap_or_else(move || { string.shrink_to_fit(); self_.consts.strings.push(string); self_.consts.strings.len() - 1 }); let proto = &mut self.tokens; proto.push(PatternToken::String(id, skippable)); *s = cursor; true })) } /// re_literal <- sp ( ( "/" re_char* ( "/" / ( !. -> ErrorReEnd ) ) ( '?' -> MarkSkippable ) ) -> Regex ) sp /// re_char <- ( re_escape / [^%/] ) /// re_escape <- '%' ( '%' / "/" ) / ( ( '%' .? ) -> ErrorReEscape ) #[allow(unused_variables)] #[allow(unreachable_code)] fn re_literal(&mut self, s: &mut &'s str) -> Result> { const RE_TOKEN: &str = "/"; let mut cursor = *s; Ok(lblock!('matches: { self.sp(&mut cursor); let orig = cursor; bry!('matches strip_prefix(&mut cursor, RE_TOKEN)); let mut string = String::new(); loop { let base = cursor; if strip_prefix(&mut cursor, "%") { if strip_prefix(&mut cursor, RE_TOKEN) { string.push_str(RE_TOKEN); } else if strip_prefix(&mut cursor, "%") { string.push_str("%"); } else { return Err(PatternError::RegexEscape(pos_of(self.base, base).unwrap(), cursor)); } } else { cursor = cursor.trim_start_matches(|c: char| { !matches!(c, '%' | '/') }); if base == cursor { // no match? break; } string.push_str(&base[..pos_of(base, cursor).unwrap()]); } } if !strip_prefix(&mut cursor, RE_TOKEN) { match cursor { "" => return Err(PatternError::RegexEnd(self.base.len(), cursor)), _ => unreachable!(), } } let end = cursor; let skippable = strip_prefix(&mut cursor, "?"); self.sp(&mut cursor); let re = Regex::new(&string).map_err(|e| { PatternError::Regex(pos_of(self.base, orig).unwrap(), &orig[..pos_of(orig, end).unwrap()], e) })?; let id = self.consts.regices.iter().position(|c| c.as_str() == re.as_str()); let self_ = &mut *self; let id = id.unwrap_or_else(move || { self_.consts.regices.push(re); self_.consts.regices.len() - 1 }); let proto = &mut self.tokens; proto.push(PatternToken::Regex(id, skippable)); *s = cursor; true })) } /// matcher <- sp ( parameter / str_literal / re_literal / predicate / ty ) sp fn matcher(&mut self, s: &mut &'s str) -> Result> { let mut cursor = *s; Ok(lblock!('matches: { self.sp(&mut cursor); match () { _ if self.parameter(&mut cursor)? => {}, _ if self.str_literal(&mut cursor)? => {}, _ if self.re_literal(&mut cursor)? => {}, _ if self.predicate(&mut cursor)? => {}, _ if self.ty(&mut cursor)? => {}, _ => bry!('matches false), } self.sp(&mut cursor); *s = cursor; true })) } /// tag <- sp ( '->' -> Arrow ) sp key_subtree? sp ( matcher / name sp matcher? ) ( '' -> End ) sp fn tag(&mut self, s: &mut &'s str) -> Result> { let mut cursor = *s; Ok(lblock!('matches: { self.sp(&mut cursor); bry!('matches strip_prefix(&mut cursor, "->")); let mut self_ = TagHelper::start(&mut *self); { let proto = &mut self_.tokens; proto.push(PatternToken::Arrow); } self_.sp(&mut cursor); let _ = self_.key_subtree(&mut cursor)?; self_.sp(&mut cursor); if !self_.matcher(&mut cursor)? { bry!('matches self_.name(&mut cursor)?); self_.sp(&mut cursor); let _ = self_.matcher(&mut cursor)?; } self_.sp(&mut cursor); { let proto = &mut self_.tokens; proto.push(PatternToken::End); } self_.commit(); *s = cursor; true })) } /// sp <- spaces* /// /// (always succeeds, so doesn't return anything) fn sp(&mut self, s: &mut &'s str) { *s = s.trim_start_matches(|c: char| { matches!(c, ' ' | '\n' | '\r' | '\t') }); } /// identifier <- [A-Za-z_][A-Za-z0-9_]* fn identifier(&mut self, s: &mut &'s str) -> Result> { if let Some('A'..='Z') | Some('a'..='z') | Some('_') = s.chars().next() { *s = s.trim_start_matches(|c: char| { matches!(c, 'A'..='Z' | 'a'..='z' | '0'..='9' | '_') }); Ok(true) } else { Ok(false) } } /// name <- sp ( identifier -> Identifier ) sp fn name(&mut self, s: &mut &'s str) -> Result> { let mut cursor = *s; Ok(lblock!('matches: { self.sp(&mut cursor); let start = cursor; bry!('matches self.identifier(&mut cursor)?); let name = &start[..pos_of(start, cursor).unwrap_or(start.len())]; // no processing of `name` is required for this. let id = self.consts.strings.iter().position(|c| c == name); let id = id.unwrap_or_else(|| { self.consts.strings.push(name.into()); self.consts.strings.len() - 1 }); let proto = &mut self.tokens; proto.push(PatternToken::Identifier(id)); self.sp(&mut cursor); *s = cursor; true })) } /// parameter <- sp '$' ( '?'? -> MarkSkippable ) ( identifier -> Parameter ) sp fn parameter(&mut self, s: &mut &'s str) -> Result> { let mut cursor = *s; Ok(lblock!('matches: { self.sp(&mut cursor); bry!('matches strip_prefix(&mut cursor, "$")); let skippable = strip_prefix(&mut cursor, "?"); let start = cursor; bry!('matches self.identifier(&mut cursor)?); let name = &start[..pos_of(start, cursor).unwrap_or(start.len())]; // no processing of `name` is required for this. let id = self.obj_ids.get(name).copied().map_or_else( || { let (key, obj) = self.objs.as_mut().and_then(|map| { map.remove_entry(name) }).ok_or_else(|| { let pos = pos_of(self.base, start).unwrap(); PatternError::UnknownParameter(pos, name) })?; let id = self.consts.defs.len(); self.consts.defs.push(obj); self.obj_ids.insert(key, id); Ok(id) }, Ok, )?; let proto = &mut self.tokens; proto.push(PatternToken::Parameter(id, skippable)); self.sp(&mut cursor); *s = cursor; true })) } /// ty <- sp ':' ( '?'? -> MarkSkippable ) ( identifier -> Type ) sp fn ty(&mut self, s: &mut &'s str) -> Result> { let mut cursor = *s; Ok(lblock!('matches: { self.sp(&mut cursor); bry!('matches strip_prefix(&mut cursor, ":")); let custom = strip_prefix(&mut cursor, "$"); let skippable = strip_prefix(&mut cursor, "?"); let start = cursor; bry!('matches self.identifier(&mut cursor)?); let name = &start[..pos_of(start, cursor).unwrap_or(start.len())]; let proto = &mut self.tokens; proto.push(PatternToken::Type(match name { "bool" => Type::Bool, "i8" => Type::I8, "i16" => Type::I16, "i32" => Type::I32, "i64" => Type::I64, "i128" => Type::I128, "u8" => Type::U8, "u16" => Type::U16, "u32" => Type::U32, "u64" => Type::U64, "u128" => Type::U128, "f32" => Type::F32, "f64" => Type::F64, "char" => Type::Char, "str" => Type::Str, "string" => Type::String, "bytes" => Type::Bytes, "bytebuf" => Type::ByteBuf, "option" => Type::Option, "unit" => Type::Unit, "seq" => Type::Seq, "map" => Type::Map, //"tuple" => Type::Tuple(usize), _ => { let pos = pos_of(self.base, start).unwrap(); return Err(PatternError::UnknownPredicate(pos, name)) } }, skippable)); self.sp(&mut cursor); *s = cursor; true })) } /// predicate <- sp ':' '$' ( '?'? -> MarkSkippable ) ( identifier -> Predicate ) sp fn predicate(&mut self, s: &mut &'s str) -> Result> { let mut cursor = *s; Ok(lblock!('matches: { self.sp(&mut cursor); bry!('matches strip_prefix(&mut cursor, ":")); bry!('matches strip_prefix(&mut cursor, "$")); let skippable = strip_prefix(&mut cursor, "?"); let start = cursor; bry!('matches self.identifier(&mut cursor)?); let name = &start[..pos_of(start, cursor).unwrap_or(start.len())]; // no processing of `name` is required for this. let id = self.pred_ids.get(name).copied().map_or_else( || { let (key, pred) = self.preds.as_mut().and_then(|map| { map.remove_entry(name) }).ok_or_else(|| { let pos = pos_of(self.base, start).unwrap(); PatternError::UnknownPredicate(pos, name) })?; let id = self.consts.predicates.len(); self.consts.predicates.push(pred); self.pred_ids.insert(key, id); Ok(id) }, Ok, )?; let proto = &mut self.tokens; proto.push(PatternToken::ApplyPredicate(id, skippable)); self.sp(&mut cursor); *s = cursor; let pos = pos_of(self.base, start).unwrap(); return Err(PatternError::Unimplemented(pos, cursor)); //true })) } /// key_subtree <- sp '[' sp ( matcher / name sp matcher? ) sp subtree sp ( ']' / unexpected_token / unexpected_end ) sp // ( '?'? -> MarkSkippable ) fn key_subtree(&mut self, s: &mut &'s str) -> Result> { let mut cursor = *s; Ok(lblock!('matches: { self.sp(&mut cursor); bry!('matches strip_prefix(&mut cursor, "[")); self.sp(&mut cursor); let mut subtree = SubtreeHelper::start(&mut *self); // FIXME handle `?` let marker = subtree.tokens.len(); if !subtree.matcher(&mut cursor)? { bry!('matches subtree.name(&mut cursor)?); subtree.sp(&mut cursor); let _ = subtree.matcher(&mut cursor)?; } let value = match subtree.tokens.drain(marker..).as_slice() { name_value => collect_name_and_value(name_value), }; subtree.consts.protos.last_mut().unwrap().push(value); subtree.sp(&mut cursor); bry!('matches subtree.subtree(&mut cursor)?); subtree.sp(&mut cursor); bry!('matches strip_prefix(&mut cursor, "]") || subtree.unexpected_token(&mut cursor)? || subtree.unexpected_end(&mut cursor)? ); subtree.sp(&mut cursor); //let skippable = strip_prefix(&mut cursor, "?"); *s = cursor; let id = subtree.commit(); let proto = &mut self.tokens; proto.push(PatternToken::KeySubtree(id)); true })) } /// value_subtree <- sp '(' sp subtree sp ( ')' / unexpected_token / unexpected_end ) sp ( '?'? -> MarkSkippable ) fn value_subtree(&mut self, s: &mut &'s str) -> Result> { let mut cursor = *s; Ok(lblock!('matches: { self.sp(&mut cursor); bry!('matches strip_prefix(&mut cursor, "(")); self.sp(&mut cursor); let mut subtree = SubtreeHelper::start(&mut *self); bry!('matches subtree.subtree(&mut cursor)?); subtree.sp(&mut cursor); bry!('matches strip_prefix(&mut cursor, ")") || subtree.unexpected_token(&mut cursor)? || subtree.unexpected_end(&mut cursor)? ); subtree.sp(&mut cursor); let skippable = strip_prefix(&mut cursor, "?"); *s = cursor; let id = subtree.commit(); let proto = &mut self.tokens; proto.push(PatternToken::ValueSubtree(id, skippable)); true })) } /// unexpected_end <- ( !. ) -> ErrorToken fn unexpected_end(&mut self, s: &mut &'s str) -> Result> { match *s { "" => Err(PatternError::Token(self.base.len(), s)), _ => Ok(false), } } /// unexpected_token <- . -> ErrorToken fn unexpected_token(&mut self, s: &mut &'s str) -> Result> { match *s { "" => Ok(false), _ => Err(PatternError::Token(pos_of(self.base, s).unwrap(), s)), } } /// subtree <- sp tag* sp ( value_subtree '' -> End )* sp fn subtree(&mut self, s: &mut &'s str) -> Result> { let mut cursor = *s; self.sp(&mut cursor); while self.tag(&mut cursor)? { } self.sp(&mut cursor); while self.value_subtree(&mut cursor)? { let proto = &mut self.tokens; proto.push(PatternToken::End); } self.sp(&mut cursor); *s = cursor; // always matches (may match empty) Ok(true) } /// pattern <- ( matcher? sp subtree / unexpected_token ) ( !. / unexpected_token ) fn pattern(&mut self, s: &mut &'s str) -> Result> { let mut cursor = *s; Ok(lblock!('matches: { let mut subtree = SubtreeHelper::start(&mut *self); // FIXME handle `?` let marker = subtree.tokens.len(); let _ = subtree.matcher(&mut cursor)?; let value = match subtree.tokens.drain(marker..).as_slice() { &[] => { PatternElement::Value { name: None, value: None, } }, value => collect_value(None, value), }; subtree.consts.protos.last_mut().unwrap().push(value); bry!('matches subtree.subtree(&mut cursor)? || subtree.unexpected_token(&mut cursor)? ); bry!('matches cursor == "" || subtree.unexpected_token(&mut cursor)? ); subtree.commit(); *s = cursor; true })) } } /// Parses a DFU expression. /// /// The given pub(crate) fn parse<'s, PKey, OKey, O>( input: &'s str, preds: Option>>, objs: Option> ) -> Result, PatternError<'s>> where PKey: Borrow + Ord, OKey: Borrow + Ord, O: Serialize, { let mut parser = Parser::new(input, preds, objs); let mut parsed = input; let matched = parser.pattern(&mut parsed)?; assert!(matched); assert_eq!(parsed, ""); assert_eq!( parser.closed_subtrees.next().unwrap(), parser.consts.protos.len(), ); assert!(parser.consts.protos.iter().all(|proto| !proto.is_empty())); Ok(parser.consts) } #[cfg(test)] mod tests { use crate::errors::PatternError; use super::Parser; use proptest::prelude::*; fn prep_parser<'s>(s: &'s str) -> Parser<'s, &'static str, &'static str, ()> { let mut parser = Parser::< 's, &'static str, &'static str, () >::new(s, None, None); parser } #[test] fn test_identifier() { fn identifier_input<'s>(s: &mut &'s str) -> Result> { let mut parser = Parser::< 's, &'static str, &'static str, () >::new(s, None, None); parser.identifier(s) } for mut identifier in vec!["test", "Test", "_test", "_Test", "_123",] { println!("{}", identifier); assert!(identifier_input(&mut identifier).unwrap()); assert_eq!(identifier, ""); } for mut identifier in vec!["123", "1_", " ",] { println!("{}", identifier); assert!(!identifier_input(&mut identifier).unwrap()); assert_ne!(identifier, ""); } } proptest! { #[test] fn test_no_crash(s in ".{0,4096}") { let _ = prep_parser(&s).str_literal(&mut &*s); let _ = prep_parser(&s).re_literal(&mut &*s); let _ = prep_parser(&s).matcher(&mut &*s); let _ = prep_parser(&s).tag(&mut &*s); let _ = prep_parser(&s).sp(&mut &*s); let _ = prep_parser(&s).identifier(&mut &*s); let _ = prep_parser(&s).name(&mut &*s); let _ = prep_parser(&s).parameter(&mut &*s); let _ = prep_parser(&s).predicate(&mut &*s); let _ = prep_parser(&s).ty(&mut &*s); let _ = prep_parser(&s).key_subtree(&mut &*s); let _ = prep_parser(&s).value_subtree(&mut &*s); let _ = prep_parser(&s).unexpected_end(&mut &*s); let _ = prep_parser(&s).unexpected_token(&mut &*s); let _ = prep_parser(&s).subtree(&mut &*s); let _ = prep_parser(&s).pattern(&mut &*s); } } #[test] fn test_no_crash_some_patterns() { fn run_pattern(mut s: &str) { let _ = prep_parser(s).pattern(&mut s); } run_pattern("hello"); run_pattern("/test/"); run_pattern("'this'"); run_pattern(":map"); run_pattern(":?map"); run_pattern(":map->[:str]:str"); } }