From 69652efe8ad9738a94fef571c8b81e342f96e7b4 Mon Sep 17 00:00:00 2001 From: SoniEx2 Date: Sun, 7 Feb 2021 22:19:21 -0300 Subject: Finish porting parser --- src/parser.rs | 669 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 665 insertions(+), 4 deletions(-) (limited to 'src/parser.rs') diff --git a/src/parser.rs b/src/parser.rs index e2caa4f..00fbcb1 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -16,13 +16,674 @@ * along with this program. If not, see . */ -// TODO +use std::borrow::Borrow; +use std::collections::BTreeMap; +use std::mem::ManuallyDrop; + +use boolinator::Boolinator; +use regex::Regex; use crate::PatternTypes; +use crate::Predicate; use crate::errors::PatternError; use crate::vm::PatternConstants; -//use crate::vm::PatternElement; +use crate::vm::PatternElement; + +/// try! with bools. (the b comes from bool.) +macro_rules! bry { + ($l:lifetime $e:expr) => { + if !$e { + break $l false; + } + }; + ($e:expr) => { + if !$e { + return false; + } + }; +} + +// the following macros rely on unlabeled-break-through-labeled-block being an +// error. +// NOTE: always test changes to this module on nightly! + +#[cfg(not(feature = "stable"))] +/// labeled block. on nightly: better compile errors. but also works on stable. +macro_rules! lblock { + // unfortunately ($l:lifetime : $b:block) => { $l: $b } didn't work. + ($($t:tt)+) => { + $($t)+ + } +} + +#[cfg(feature = "stable")] +/// labeled block. on nightly: better compile errors. but also works on stable. +macro_rules! lblock { + ($l:lifetime : $b:block) => { + $l: loop { + break $b + } + } +} + +// can't use Pattern here :( +fn strip_prefix(s: &mut &str, prefix: &str) -> bool { + s.strip_prefix(prefix).map(|ns| *s = ns).is_some() +} + +fn pos_of<'a>(base: &'a str, sub: &'a str) -> Option { + // FIXME + Some((sub.as_ptr() as usize) - (base.as_ptr() as usize)) +} + +struct SubtreeHelper<'r, 's, P: Borrow + Ord, O: Borrow + Ord, T: PatternTypes> where Self: 'r { + root: &'r mut Parser<'s, P, O, T>, +} + +impl<'r, 's, P: Borrow + Ord, O: Borrow + Ord, T: PatternTypes> SubtreeHelper<'r, 's, P, O, T> where Self: 'r { + fn start(value: &'r mut Parser<'s, P, O, T>) -> Self { + value.consts.protos.push(Default::default()); + Self { + root: value, + } + } + + fn commit(mut self) -> usize { + let mut self_ = ManuallyDrop::new(self); + let proto = self_.root.consts.protos.pop().unwrap(); + let id = self_.root.closed_subtrees.next().unwrap(); + self_.root.consts.protos.insert(id, proto); + id + } +} + +impl<'r, 's, P: Borrow + Ord, O: Borrow + Ord, T: PatternTypes> std::ops::Deref for SubtreeHelper<'r, 's, P, O, T> where Self: 'r { + type Target = Parser<'s, P, O, T>; + + fn deref(&self) -> &Self::Target { + &*self.root + } +} + +impl<'r, 's, P: Borrow + Ord, O: Borrow + Ord, T: PatternTypes> std::ops::DerefMut for SubtreeHelper<'r, 's, P, O, T> where Self: 'r { + fn deref_mut(&mut self) -> &mut Self::Target { + self.root + } +} + +impl<'r, 's, P: Borrow + Ord, O: Borrow + Ord, T: PatternTypes> Drop for SubtreeHelper<'r, 's, P, O, T> where Self: 'r { + fn drop(&mut self) { + // remove "partial" proto + self.root.consts.protos.pop().expect("SubtreeHelper"); + } +} + +struct TagHelper<'r, 's, P: Borrow + Ord, O: Borrow + Ord, T: PatternTypes> where Self: 'r { + root: &'r mut Parser<'s, P, O, T>, + len: usize, +} + +impl<'r, 's, P: Borrow + Ord, O: Borrow + Ord, T: PatternTypes> TagHelper<'r, 's, P, O, T> where Self: 'r { + fn start(value: &'r mut Parser<'s, P, O, T>) -> Self { + let len = value.consts.protos.last().unwrap().len(); + Self { + root: value, + len : len, + } + } + + fn commit(mut self) { + std::mem::forget(self) + } +} + +impl<'r, 's, P: Borrow + Ord, O: Borrow + Ord, T: PatternTypes> std::ops::Deref for TagHelper<'r, 's, P, O, T> where Self: 'r { + type Target = Parser<'s, P, O, T>; + + fn deref(&self) -> &Self::Target { + &*self.root + } +} + +impl<'r, 's, P: Borrow + Ord, O: Borrow + Ord, T: PatternTypes> std::ops::DerefMut for TagHelper<'r, 's, P, O, T> where Self: 'r { + fn deref_mut(&mut self) -> &mut Self::Target { + self.root + } +} -pub(crate) fn parse(s: &str, defs: Option<()>/*TODO*/) -> Result, PatternError> { - unimplemented!() +impl<'r, 's, P: Borrow + Ord, O: Borrow + Ord, T: PatternTypes> Drop for TagHelper<'r, 's, P, O, T> where Self: 'r { + fn drop(&mut self) { + let proto = self.root.consts.protos.last_mut().unwrap(); + assert!(proto.len() >= self.len); + while proto.len() > self.len { + let _ = proto.pop(); + } + } } + +struct Parser<'s, P: Borrow + Ord, O: Borrow + Ord, T: PatternTypes> { + base: &'s str, + preds: Option>>>, + objs: Option>, + pred_ids: BTreeMap, + obj_ids: BTreeMap, + consts: PatternConstants, + closed_subtrees: std::ops::RangeFrom, +} + +// These are documented using LPeg.re syntax +// http://www.inf.puc-rio.br/~roberto/lpeg/re.html +#[rustfmt::skip] +impl<'s, P: Borrow + Ord, O: Borrow + Ord, T: PatternTypes> Parser<'s, P, O, T> { + /// str_literal <- sp ( ( "'" str_char* ( "'" / ( !. -> ErrorStrEnd ) ) ( '?' -> MarkSkippable ) ) -> String ) sp + /// str_char <- ( str_escape / [^%'] ) + /// str_escape <- '%' ( '%' / "'" ) / ( ( '%' .? ) -> ErrorStrEscape ) + fn str_literal(&mut self, s: &mut &'s str) -> Result> { + const STRING_TOKEN: &str = "'"; + let mut cursor = *s; + Ok(lblock!('matches: { + self.sp(&mut cursor); + bry!('matches strip_prefix(&mut cursor, STRING_TOKEN)); + let mut string = String::new(); + loop { + let base = cursor; + if strip_prefix(&mut cursor, "%") { + if strip_prefix(&mut cursor, STRING_TOKEN) { + string.push_str(STRING_TOKEN); + } else if strip_prefix(&mut cursor, "%") { + string.push_str("%"); + } else { + return Err(PatternError::StringEscape(pos_of(self.base, base).unwrap(), cursor)); + } + } else { + cursor = cursor.trim_start_matches(|c: char| { + !matches!(c, '%' | '\'') + }); + if base == cursor { // no match? + break; + } + string.push_str(&base[..pos_of(base, cursor).unwrap()]); + } + } + if !strip_prefix(&mut cursor, STRING_TOKEN) { + match cursor { + "" => return Err(PatternError::StringEnd(self.base.len(), cursor)), + _ => unreachable!(), + } + } + let skippable = strip_prefix(&mut cursor, "?"); + self.sp(&mut cursor); + let id = self.consts.strings.iter().position(|c| c == &string); + let self_ = &mut *self; + let id = id.unwrap_or_else(move || { + string.shrink_to_fit(); + self_.consts.strings.push(string); + self_.consts.strings.len() - 1 + }); + let proto = self.consts.protos.last_mut().expect("protos"); + proto.push(PatternElement::StringKey(id, skippable)); + *s = cursor; + true + })) + } + + /// re_literal <- sp ( ( "/" re_char* ( "/" / ( !. -> ErrorReEnd ) ) ( '?' -> MarkSkippable ) ) -> Regex ) sp + /// re_char <- ( re_escape / [^%/] ) + /// re_escape <- '%' ( '%' / "/" ) / ( ( '%' .? ) -> ErrorReEscape ) + #[allow(unused_variables)] + #[allow(unreachable_code)] + fn re_literal(&mut self, s: &mut &'s str) -> Result> { + const RE_TOKEN: &str = "/"; + let mut cursor = *s; + Ok(lblock!('matches: { + self.sp(&mut cursor); + let orig = cursor; + bry!('matches strip_prefix(&mut cursor, RE_TOKEN)); + let mut string = String::new(); + loop { + let base = cursor; + if strip_prefix(&mut cursor, "%") { + if strip_prefix(&mut cursor, RE_TOKEN) { + string.push_str(RE_TOKEN); + } else if strip_prefix(&mut cursor, "%") { + string.push_str("%"); + } else { + return Err(PatternError::RegexEscape(pos_of(self.base, base).unwrap(), cursor)); + } + } else { + cursor = cursor.trim_start_matches(|c: char| { + !matches!(c, '%' | '/') + }); + if base == cursor { // no match? + break; + } + string.push_str(&base[..pos_of(base, cursor).unwrap()]); + } + } + if !strip_prefix(&mut cursor, RE_TOKEN) { + match cursor { + "" => return Err(PatternError::RegexEnd(self.base.len(), cursor)), + _ => unreachable!(), + } + } + let end = cursor; + let skippable = strip_prefix(&mut cursor, "?"); + self.sp(&mut cursor); + let re = Regex::new(&string).map_err(|e| { + PatternError::Regex(pos_of(self.base, orig).unwrap(), &orig[..pos_of(orig, end).unwrap()], e) + })?; + let id = self.consts.regices.iter().position(|c| c.as_str() == re.as_str()); + let self_ = &mut *self; + let id = id.unwrap_or_else(move || { + self_.consts.regices.push(re); + self_.consts.regices.len() - 1 + }); + let proto = self.consts.protos.last_mut().expect("protos"); + proto.push(PatternElement::RegexKey(id, skippable)); + *s = cursor; + true + })) + } + + /// matcher <- sp ( parameter / str_literal / re_literal / key_subtree ) sp + fn matcher(&mut self, s: &mut &'s str) -> Result> { + let mut cursor = *s; + Ok(lblock!('matches: { + self.sp(&mut cursor); + match () { + _ if self.parameter(&mut cursor)? => {}, + _ if self.str_literal(&mut cursor)? => {}, + _ if self.re_literal(&mut cursor)? => {}, + _ if self.key_subtree(&mut cursor)? => {}, + _ => bry!('matches false), + } + self.sp(&mut cursor); + *s = cursor; + true + })) + } + + /// tag <- sp ( '->' -> Arrow ) sp ( matcher / name sp matcher? ) sp predicate* ( '' -> End ) sp + fn tag(&mut self, s: &mut &'s str) -> Result> { + let mut cursor = *s; + Ok(lblock!('matches: { + self.sp(&mut cursor); + bry!('matches strip_prefix(&mut cursor, "->")); + let mut self_ = TagHelper::start(&mut *self); + { + let proto = self_.consts.protos.last_mut().expect("protos"); + proto.push(PatternElement::Arrow); + } + self_.sp(&mut cursor); + if !self_.matcher(&mut cursor)? { + bry!('matches self_.name(&mut cursor)?); + self_.sp(&mut cursor); + // NOTE: *optional* + let _ = self_.matcher(&mut cursor)?; + } + self_.sp(&mut cursor); + while self_.predicate(&mut cursor)? { + } + { + let proto = self_.consts.protos.last_mut().expect("protos"); + proto.push(PatternElement::End); + } + self_.commit(); + *s = cursor; + true + })) + } + + /// sp <- spaces* + /// + /// (always succeeds, so doesn't return anything) + fn sp(&mut self, s: &mut &'s str) { + *s = s.trim_start_matches(|c: char| { + matches!(c, ' ' | '\n' | '\r' | '\t') + }); + } + + /// identifier <- [A-Za-z_][A-Za-z0-9_]* + fn identifier(&mut self, s: &mut &'s str) -> Result> { + if let Some('A'..='Z') | Some('a'..='z') | Some('_') = s.chars().next() { + *s = s.trim_start_matches(|c: char| { + matches!(c, 'A'..='Z' | 'a'..='z' | '0'..='9' | '_') + }); + Ok(true) + } else { + Ok(false) + } + } + + /// name <- sp ( identifier -> Identifier ) sp + fn name(&mut self, s: &mut &'s str) -> Result> { + let mut cursor = *s; + Ok(lblock!('matches: { + self.sp(&mut cursor); + let start = cursor; + bry!('matches self.identifier(&mut cursor)?); + let name = &start[..pos_of(start, cursor).unwrap_or(start.len())]; + // no processing of `name` is required for this. + let id = self.consts.strings.iter().position(|c| c == name); + let id = id.unwrap_or_else(|| { + self.consts.strings.push(name.into()); + self.consts.strings.len() - 1 + }); + let proto = self.consts.protos.last_mut().expect("protos"); + proto.push(PatternElement::Identifier(id)); + self.sp(&mut cursor); + *s = cursor; + true + })) + } + + /// parameter <- sp '$' ( '?'? -> MarkSkippable ) ( identifier -> Parameter ) sp + fn parameter(&mut self, s: &mut &'s str) -> Result> { + let mut cursor = *s; + Ok(lblock!('matches: { + self.sp(&mut cursor); + bry!('matches strip_prefix(&mut cursor, "$")); + let skippable = strip_prefix(&mut cursor, "?"); + let start = cursor; + bry!('matches self.identifier(&mut cursor)?); + let name = &start[..pos_of(start, cursor).unwrap_or(start.len())]; + // no processing of `name` is required for this. + let id = self.obj_ids.get(name).copied().map_or_else( + || { + let (key, obj) = self.objs.as_mut().and_then(|map| { + map.remove_entry(name) + }).ok_or_else(|| { + let pos = pos_of(self.base, start).unwrap(); + PatternError::UnknownParameter(pos, name) + })?; + let id = self.consts.defs.len(); + self.consts.defs.push(obj); + self.obj_ids.insert(key, id); + Ok(id) + }, + Ok, + )?; + let proto = self.consts.protos.last_mut().expect("protos"); + proto.push(PatternElement::ParameterKey(id, skippable)); + self.sp(&mut cursor); + *s = cursor; + true + })) + } + + /// predicate <- sp ':' ( '?'? -> MarkSkippable ) '$' ( identifier -> Predicate ) sp + fn predicate(&mut self, s: &mut &'s str) -> Result> { + let mut cursor = *s; + Ok(lblock!('matches: { + self.sp(&mut cursor); + bry!('matches strip_prefix(&mut cursor, ":")); + let skippable = strip_prefix(&mut cursor, "?"); + bry!('matches strip_prefix(&mut cursor, "$")); + let start = cursor; + bry!('matches self.identifier(&mut cursor)?); + let name = &start[..pos_of(start, cursor).unwrap_or(start.len())]; + // no processing of `name` is required for this. + let id = self.pred_ids.get(name).copied().map_or_else( + || { + let (key, pred) = self.preds.as_mut().and_then(|map| { + map.remove_entry(name) + }).ok_or_else(|| { + let pos = pos_of(self.base, start).unwrap(); + PatternError::UnknownPredicate(pos, name) + })?; + let id = self.consts.predicates.len(); + self.consts.predicates.push(pred); + self.pred_ids.insert(key, id); + Ok(id) + }, + Ok, + )?; + let proto = self.consts.protos.last_mut().expect("protos"); + proto.push(PatternElement::ApplyPredicate(id, skippable, std::marker::PhantomData)); + self.sp(&mut cursor); + *s = cursor; + true + })) + } + + /// key_subtree <- sp '[' sp predicate* sp subtree sp ( ']' / unexpected_token / unexpected_end ) sp ( '?'? -> MarkSkippable ) + fn key_subtree(&mut self, s: &mut &'s str) -> Result> { + let mut cursor = *s; + Ok(lblock!('matches: { + self.sp(&mut cursor); + bry!('matches strip_prefix(&mut cursor, "[")); + self.sp(&mut cursor); + let mut subtree = SubtreeHelper::start(&mut *self); + while subtree.predicate(&mut cursor)? { + } + subtree.sp(&mut cursor); + bry!('matches subtree.subtree(&mut cursor)?); + subtree.sp(&mut cursor); + bry!('matches + strip_prefix(&mut cursor, "]") + || + subtree.unexpected_token(&mut cursor)? + || + subtree.unexpected_end(&mut cursor)? + ); + subtree.sp(&mut cursor); + let skippable = strip_prefix(&mut cursor, "?"); + *s = cursor; + let id = subtree.commit(); + let proto = self.consts.protos.last_mut().expect("protos"); + proto.push(PatternElement::KeySubtree(id, skippable)); + true + })) + } + + /// value_subtree <- sp '(' sp predicate* sp subtree sp ( ')' / unexpected_token / unexpected_end ) sp ( '?'? -> MarkSkippable ) + fn value_subtree(&mut self, s: &mut &'s str) -> Result> { + let mut cursor = *s; + Ok(lblock!('matches: { + self.sp(&mut cursor); + bry!('matches strip_prefix(&mut cursor, "(")); + self.sp(&mut cursor); + let mut subtree = SubtreeHelper::start(&mut *self); + while subtree.predicate(&mut cursor)? { + } + subtree.sp(&mut cursor); + bry!('matches subtree.subtree(&mut cursor)?); + subtree.sp(&mut cursor); + bry!('matches + strip_prefix(&mut cursor, ")") + || + subtree.unexpected_token(&mut cursor)? + || + subtree.unexpected_end(&mut cursor)? + ); + subtree.sp(&mut cursor); + let skippable = strip_prefix(&mut cursor, "?"); + *s = cursor; + let id = subtree.commit(); + let proto = self.consts.protos.last_mut().expect("protos"); + proto.push(PatternElement::ValueSubtree(id, skippable)); + true + })) + } + + /// unexpected_end <- ( !. ) -> ErrorToken + fn unexpected_end(&mut self, s: &mut &'s str) -> Result> { + match *s { + "" => Err(PatternError::Token(self.base.len(), s)), + _ => Ok(false), + } + } + + /// unexpected_token <- . -> ErrorToken + fn unexpected_token(&mut self, s: &mut &'s str) -> Result> { + match *s { + "" => Ok(false), + _ => Err(PatternError::Token(pos_of(self.base, s).unwrap(), s)), + } + } + + /// subtree <- sp tag* sp ( value_subtree '' -> End )* sp + fn subtree(&mut self, s: &mut &'s str) -> Result> { + let mut cursor = *s; + self.sp(&mut cursor); + while self.tag(&mut cursor)? { + } + self.sp(&mut cursor); + while self.value_subtree(&mut cursor)? { + let proto = self.consts.protos.last_mut().expect("protos"); + proto.push(PatternElement::End); + } + self.sp(&mut cursor); + *s = cursor; + // always matches (may match empty) + Ok(true) + } + + /// pattern <- ( subtree / unexpected_token ) ( !. / unexpected_token ) + fn pattern(&mut self, s: &mut &'s str) -> Result> { + let mut cursor = *s; + Ok(lblock!('matches: { + let mut subtree = SubtreeHelper::start(&mut *self); + bry!('matches + subtree.subtree(&mut cursor)? + || + subtree.unexpected_token(&mut cursor)? + ); + bry!('matches + cursor == "" + || + subtree.unexpected_token(&mut cursor)? + ); + subtree.commit(); + *s = cursor; + true + })) + } +} + +pub(crate) fn parse<'s, P, O, T>( + input: &'s str, + preds: Option>>>, + objs: Option> +) -> Result, PatternError<'s>> + where + P: Borrow + Ord, + O: Borrow + Ord, + T: PatternTypes, +{ + let mut parser = Parser::<'s, P, O, T> { + base: input, + preds: preds, + objs: objs, + pred_ids: Default::default(), + obj_ids: Default::default(), + consts: Default::default(), + closed_subtrees: (0..), + }; + + let mut parsed = input; + parser.pattern(&mut parsed)?.expect(""); + + assert_eq!(parsed, ""); + assert_eq!(parser.closed_subtrees.next().unwrap(), parser.consts.protos.len()); + + Ok(parser.consts) +} + +#[cfg(test)] +mod tests { + use crate::PatternTypes; + use crate::RefOwn; + use crate::KVPair; + use crate::errors::PatternError; + use super::Parser; + + use proptest::prelude::*; + + struct Dummy; + impl PatternTypes for Dummy { + type Ref = (); + type Own = (); + fn pairs<'b>( + item: RefOwn<'b, Self::Ref, Self::Own> + ) -> Option> + 'b>> { + None + } + + fn get<'a, 'b>( + item: RefOwn<'b, Self::Ref, Self::Own>, + key: RefOwn<'a, Self::Ref, Self::Own> + ) -> Option>> { + None + } + + fn matches( + left: RefOwn<'_, Self::Ref, Self::Own>, + right: RefOwn<'_, Self::Ref, Self::Own> + ) -> bool { + false + } + } + + #[test] + fn test_identifier() { + fn identifier_input<'s>(s: &mut &'s str) -> Result> { + let mut parser = Parser::<'s, &'static str, &'static str, Dummy> { + base: *s, + preds: None, + objs: None, + pred_ids: Default::default(), + obj_ids: Default::default(), + consts: Default::default(), + closed_subtrees: (0..), + }; + parser.identifier(s) + } + for mut identifier in vec!["test", "Test", "_test", "_Test", "_123",] { + println!("{}", identifier); + assert!(identifier_input(&mut identifier).unwrap()); + assert_eq!(identifier, ""); + } + for mut identifier in vec!["123", "1_", " ",] { + println!("{}", identifier); + assert!(!identifier_input(&mut identifier).unwrap()); + assert_ne!(identifier, ""); + } + } + + proptest! { + #[test] + fn test_no_crash(s in ".{0,4096}") { + fn prep_parser<'s>(s: &'s str) -> Parser<'s, &'static str, &'static str, Dummy> { + let mut parser = Parser::<'s, &'static str, &'static str, Dummy> { + base: s, + preds: None, + objs: None, + pred_ids: Default::default(), + obj_ids: Default::default(), + consts: Default::default(), + closed_subtrees: (0..), + }; + parser.consts.protos.push(Default::default()); + parser + } + + let _ = prep_parser(&s).str_literal(&mut &*s); + let _ = prep_parser(&s).re_literal(&mut &*s); + let _ = prep_parser(&s).matcher(&mut &*s); + let _ = prep_parser(&s).tag(&mut &*s); + let _ = prep_parser(&s).sp(&mut &*s); + let _ = prep_parser(&s).identifier(&mut &*s); + let _ = prep_parser(&s).name(&mut &*s); + let _ = prep_parser(&s).parameter(&mut &*s); + let _ = prep_parser(&s).predicate(&mut &*s); + let _ = prep_parser(&s).key_subtree(&mut &*s); + let _ = prep_parser(&s).value_subtree(&mut &*s); + let _ = prep_parser(&s).unexpected_end(&mut &*s); + let _ = prep_parser(&s).unexpected_token(&mut &*s); + let _ = prep_parser(&s).subtree(&mut &*s); + let _ = prep_parser(&s).pattern(&mut &*s); + } + } +} + -- cgit 1.4.1