// Copyright (C) 2021-2022 Soni L.
// SPDX-License-Identifier: MIT OR Apache-2.0
//! The recursive-descent datafu language parser.
use std::borrow::Borrow;
use std::collections::BTreeMap;
use std::mem::ManuallyDrop;
use impl_trait::impl_trait;
use regex::Regex;
use serde::Serialize;
use crate::Predicate;
use crate::errors::PatternError;
use crate::vm::PatternConstants;
use crate::vm::PatternElement;
/// try! with bools. (the b comes from bool.)
macro_rules! bry {
($l:lifetime $e:expr) => {
if !$e {
break $l false;
}
};
($e:expr) => {
if !$e {
return false;
}
};
}
// the following macros rely on unlabeled-break-through-labeled-block being an
// error.
// NOTE: always test changes to this module on nightly!
// still waiting for label-break-value stabilization...
#[cfg(not(feature = "stable"))]
/// labeled block. on nightly: better compile errors. but also works on stable.
macro_rules! lblock {
// unfortunately ($l:lifetime : $b:block) => { $l: $b } didn't work.
($($t:tt)+) => {
$($t)+
}
}
#[cfg(feature = "stable")]
/// labeled block. on nightly: better compile errors. but also works on stable.
macro_rules! lblock {
($l:lifetime : $b:block) => {
$l: loop {
break $b
}
}
}
/// Attempts to shift `s` forward by removing `prefix`.
///
/// Returns whether `s` has had `prefix` removed.
// can't use Pattern here :(
fn strip_prefix(s: &mut &str, prefix: &str) -> bool {
s.strip_prefix(prefix).map(|ns| *s = ns).is_some()
}
/// Returns the position (index) of `sub` within `base`, in bytes.
///
/// Returns bogus results if `base` and `sub` are unrelated.
fn pos_of<'a>(base: &'a str, sub: &'a str) -> Option<usize> {
// FIXME is there any non-UB way to check if `sub` is in `base`?
Some((sub.as_ptr() as usize) - (base.as_ptr() as usize))
}
/// Helper to collect "subtree" sections of the pattern.
///
/// This is a RAII-like guard which handles cleaning up the parsed pattern when
/// dropped.
struct SubtreeHelper<'r, 's, PKey, OKey, O>
where
Self: 'r,
PKey: Borrow<str> + Ord,
OKey: Borrow<str> + Ord,
O: Serialize,
{
root: &'r mut Parser<'s, PKey, OKey, O>,
}
impl_trait! {
impl<'r, 's, PKey, OKey, O> SubtreeHelper<'r, 's, PKey, OKey, O>
where
Self: 'r,
PKey: Borrow<str> + Ord,
OKey: Borrow<str> + Ord,
O: Serialize,
{
fn start(value: &'r mut Parser<'s, PKey, OKey, O>) -> Self {
value.consts.protos.push(Default::default());
Self {
root: value,
}
}
fn commit(self) -> usize {
let mut self_ = ManuallyDrop::new(self);
let proto = self_.root.consts.protos.pop().unwrap();
let id = self_.root.closed_subtrees.next().unwrap();
self_.root.consts.protos.insert(id, proto);
id
}
impl trait std::ops::Deref {
type Target = Parser<'s, PKey, OKey, O>;
fn deref(&self) -> &Self::Target {
&*self.root
}
}
impl trait std::ops::DerefMut {
fn deref_mut(&mut self) -> &mut Self::Target {
self.root
}
}
impl trait Drop {
fn drop(&mut self) {
// remove "partial" proto
self.root.consts.protos.pop().expect("SubtreeHelper");
}
}
}
}
/// Helper to collect "tag" sections of the pattern.
///
/// This is a RAII-like guard which handles cleaning up the parsed pattern when
/// dropped.
struct TagHelper<'r, 's, PKey, OKey, O>
where
Self: 'r,
PKey: Borrow<str> + Ord,
OKey: Borrow<str> + Ord,
O: Serialize,
{
root: &'r mut Parser<'s, PKey, OKey, O>,
len: usize,
}
impl_trait! {
impl<'r, 's, PKey, OKey, O> TagHelper<'r, 's, PKey, OKey, O>
where
Self: 'r,
PKey: Borrow<str> + Ord,
OKey: Borrow<str> + Ord,
O: Serialize,
{
fn start(value: &'r mut Parser<'s, PKey, OKey, O>) -> Self {
let len = value.consts.protos.last().unwrap().len();
Self {
root: value,
len : len,
}
}
fn commit(self) {
std::mem::forget(self)
}
impl trait std::ops::Deref {
type Target = Parser<'s, PKey, OKey, O>;
fn deref(&self) -> &Self::Target {
&*self.root
}
}
impl trait std::ops::DerefMut {
fn deref_mut(&mut self) -> &mut Self::Target {
self.root
}
}
impl trait Drop {
fn drop(&mut self) {
let proto = self.root.consts.protos.last_mut().unwrap();
assert!(proto.len() >= self.len);
while proto.len() > self.len {
let _ = proto.pop();
}
}
}
}
}
struct Parser<'s, PKey, OKey, O>
where
PKey: Borrow<str> + Ord,
OKey: Borrow<str> + Ord,
O: Serialize,
{
base: &'s str,
preds: Option<BTreeMap<PKey, Box<Predicate>>>,
objs: Option<BTreeMap<OKey, O>>,
pred_ids: BTreeMap<PKey, usize>,
obj_ids: BTreeMap<OKey, usize>,
consts: PatternConstants<O>,
closed_subtrees: std::ops::RangeFrom<usize>,
}
// These are documented using LPeg.re syntax
// http://www.inf.puc-rio.br/~roberto/lpeg/re.html
#[rustfmt::skip]
impl<'s, PKey, OKey, O> Parser<'s, PKey, OKey, O>
where
PKey: Borrow<str> + Ord,
OKey: Borrow<str> + Ord,
O: Serialize,
{
/// str_literal <- sp ( ( "'" str_char* ( "'" / ( !. -> ErrorStrEnd ) ) ( '?' -> MarkSkippable ) ) -> String ) sp
/// str_char <- ( str_escape / [^%'] )
/// str_escape <- '%' ( '%' / "'" ) / ( ( '%' .? ) -> ErrorStrEscape )
fn str_literal(&mut self, s: &mut &'s str) -> Result<bool, PatternError<'s>> {
const STRING_TOKEN: &str = "'";
let mut cursor = *s;
Ok(lblock!('matches: {
self.sp(&mut cursor);
bry!('matches strip_prefix(&mut cursor, STRING_TOKEN));
let mut string = String::new();
loop {
let base = cursor;
if strip_prefix(&mut cursor, "%") {
if strip_prefix(&mut cursor, STRING_TOKEN) {
string.push_str(STRING_TOKEN);
} else if strip_prefix(&mut cursor, "%") {
string.push_str("%");
} else {
return Err(PatternError::StringEscape(pos_of(self.base, base).unwrap(), cursor));
}
} else {
cursor = cursor.trim_start_matches(|c: char| {
!matches!(c, '%' | '\'')
});
if base == cursor { // no match?
break;
}
string.push_str(&base[..pos_of(base, cursor).unwrap()]);
}
}
if !strip_prefix(&mut cursor, STRING_TOKEN) {
match cursor {
"" => return Err(PatternError::StringEnd(self.base.len(), cursor)),
_ => unreachable!(),
}
}
let skippable = strip_prefix(&mut cursor, "?");
self.sp(&mut cursor);
let id = self.consts.strings.iter().position(|c| c == &string);
let self_ = &mut *self;
let id = id.unwrap_or_else(move || {
string.shrink_to_fit();
self_.consts.strings.push(string);
self_.consts.strings.len() - 1
});
let proto = self.consts.protos.last_mut().expect("protos");
proto.push(PatternElement::StringKey(id, skippable));
*s = cursor;
true
}))
}
/// re_literal <- sp ( ( "/" re_char* ( "/" / ( !. -> ErrorReEnd ) ) ( '?' -> MarkSkippable ) ) -> Regex ) sp
/// re_char <- ( re_escape / [^%/] )
/// re_escape <- '%' ( '%' / "/" ) / ( ( '%' .? ) -> ErrorReEscape )
#[allow(unused_variables)]
#[allow(unreachable_code)]
fn re_literal(&mut self, s: &mut &'s str) -> Result<bool, PatternError<'s>> {
const RE_TOKEN: &str = "/";
let mut cursor = *s;
Ok(lblock!('matches: {
self.sp(&mut cursor);
let orig = cursor;
bry!('matches strip_prefix(&mut cursor, RE_TOKEN));
let mut string = String::new();
loop {
let base = cursor;
if strip_prefix(&mut cursor, "%") {
if strip_prefix(&mut cursor, RE_TOKEN) {
string.push_str(RE_TOKEN);
} else if strip_prefix(&mut cursor, "%") {
string.push_str("%");
} else {
return Err(PatternError::RegexEscape(pos_of(self.base, base).unwrap(), cursor));
}
} else {
cursor = cursor.trim_start_matches(|c: char| {
!matches!(c, '%' | '/')
});
if base == cursor { // no match?
break;
}
string.push_str(&base[..pos_of(base, cursor).unwrap()]);
}
}
if !strip_prefix(&mut cursor, RE_TOKEN) {
match cursor {
"" => return Err(PatternError::RegexEnd(self.base.len(), cursor)),
_ => unreachable!(),
}
}
let end = cursor;
let skippable = strip_prefix(&mut cursor, "?");
self.sp(&mut cursor);
let re = Regex::new(&string).map_err(|e| {
PatternError::Regex(pos_of(self.base, orig).unwrap(), &orig[..pos_of(orig, end).unwrap()], e)
})?;
let id = self.consts.regices.iter().position(|c| c.as_str() == re.as_str());
let self_ = &mut *self;
let id = id.unwrap_or_else(move || {
self_.consts.regices.push(re);
self_.consts.regices.len() - 1
});
let proto = self.consts.protos.last_mut().expect("protos");
proto.push(PatternElement::RegexKey(id, skippable));
*s = cursor;
true
}))
}
/// matcher <- sp ( parameter / str_literal / re_literal / key_subtree ) sp
fn matcher(&mut self, s: &mut &'s str) -> Result<bool, PatternError<'s>> {
let mut cursor = *s;
Ok(lblock!('matches: {
self.sp(&mut cursor);
match () {
_ if self.parameter(&mut cursor)? => {},
_ if self.str_literal(&mut cursor)? => {},
_ if self.re_literal(&mut cursor)? => {},
_ if self.key_subtree(&mut cursor)? => {},
_ => bry!('matches false),
}
self.sp(&mut cursor);
*s = cursor;
true
}))
}
/// tag <- sp ( '->' -> Arrow ) sp ( matcher / name sp matcher? ) sp predicate* ( '' -> End ) sp
fn tag(&mut self, s: &mut &'s str) -> Result<bool, PatternError<'s>> {
let mut cursor = *s;
Ok(lblock!('matches: {
self.sp(&mut cursor);
bry!('matches strip_prefix(&mut cursor, "->"));
let mut self_ = TagHelper::start(&mut *self);
{
let proto = self_.consts.protos.last_mut().expect("protos");
proto.push(PatternElement::Arrow);
}
self_.sp(&mut cursor);
if !self_.matcher(&mut cursor)? {
bry!('matches self_.name(&mut cursor)?);
self_.sp(&mut cursor);
// NOTE: *optional*
let _ = self_.matcher(&mut cursor)?;
}
self_.sp(&mut cursor);
while self_.predicate(&mut cursor)? {
}
{
let proto = self_.consts.protos.last_mut().expect("protos");
proto.push(PatternElement::End);
}
self_.commit();
*s = cursor;
true
}))
}
/// sp <- spaces*
///
/// (always succeeds, so doesn't return anything)
fn sp(&mut self, s: &mut &'s str) {
*s = s.trim_start_matches(|c: char| {
matches!(c, ' ' | '\n' | '\r' | '\t')
});
}
/// identifier <- [A-Za-z_][A-Za-z0-9_]*
fn identifier(&mut self, s: &mut &'s str) -> Result<bool, PatternError<'s>> {
if let Some('A'..='Z') | Some('a'..='z') | Some('_') = s.chars().next() {
*s = s.trim_start_matches(|c: char| {
matches!(c, 'A'..='Z' | 'a'..='z' | '0'..='9' | '_')
});
Ok(true)
} else {
Ok(false)
}
}
/// name <- sp ( identifier -> Identifier ) sp
fn name(&mut self, s: &mut &'s str) -> Result<bool, PatternError<'s>> {
let mut cursor = *s;
Ok(lblock!('matches: {
self.sp(&mut cursor);
let start = cursor;
bry!('matches self.identifier(&mut cursor)?);
let name = &start[..pos_of(start, cursor).unwrap_or(start.len())];
// no processing of `name` is required for this.
let id = self.consts.strings.iter().position(|c| c == name);
let id = id.unwrap_or_else(|| {
self.consts.strings.push(name.into());
self.consts.strings.len() - 1
});
let proto = self.consts.protos.last_mut().expect("protos");
proto.push(PatternElement::Identifier(id));
self.sp(&mut cursor);
*s = cursor;
true
}))
}
/// parameter <- sp '$' ( '?'? -> MarkSkippable ) ( identifier -> Parameter ) sp
fn parameter(&mut self, s: &mut &'s str) -> Result<bool, PatternError<'s>> {
let mut cursor = *s;
Ok(lblock!('matches: {
self.sp(&mut cursor);
bry!('matches strip_prefix(&mut cursor, "$"));
let skippable = strip_prefix(&mut cursor, "?");
let start = cursor;
bry!('matches self.identifier(&mut cursor)?);
let name = &start[..pos_of(start, cursor).unwrap_or(start.len())];
// no processing of `name` is required for this.
let id = self.obj_ids.get(name).copied().map_or_else(
|| {
let (key, obj) = self.objs.as_mut().and_then(|map| {
map.remove_entry(name)
}).ok_or_else(|| {
let pos = pos_of(self.base, start).unwrap();
PatternError::UnknownParameter(pos, name)
})?;
let id = self.consts.defs.len();
self.consts.defs.push(obj);
self.obj_ids.insert(key, id);
Ok(id)
},
Ok,
)?;
let proto = self.consts.protos.last_mut().expect("protos");
proto.push(PatternElement::ParameterKey(id, skippable));
self.sp(&mut cursor);
*s = cursor;
true
}))
}
/// predicate <- sp ':' ( '?'? -> MarkSkippable ) '$' ( identifier -> Predicate ) sp
fn predicate(&mut self, s: &mut &'s str) -> Result<bool, PatternError<'s>> {
let mut cursor = *s;
Ok(lblock!('matches: {
self.sp(&mut cursor);
bry!('matches strip_prefix(&mut cursor, ":"));
let skippable = strip_prefix(&mut cursor, "?");
bry!('matches strip_prefix(&mut cursor, "$"));
let start = cursor;
bry!('matches self.identifier(&mut cursor)?);
let name = &start[..pos_of(start, cursor).unwrap_or(start.len())];
// no processing of `name` is required for this.
let id = self.pred_ids.get(name).copied().map_or_else(
|| {
let (key, pred) = self.preds.as_mut().and_then(|map| {
map.remove_entry(name)
}).ok_or_else(|| {
let pos = pos_of(self.base, start).unwrap();
PatternError::UnknownPredicate(pos, name)
})?;
let id = self.consts.predicates.len();
self.consts.predicates.push(pred);
self.pred_ids.insert(key, id);
Ok(id)
},
Ok,
)?;
let proto = self.consts.protos.last_mut().expect("protos");
proto.push(PatternElement::ApplyPredicate(id, skippable));
self.sp(&mut cursor);
*s = cursor;
true
}))
}
/// key_subtree <- sp '[' sp name? sp predicate* sp subtree sp ( ']' / unexpected_token / unexpected_end ) sp ( '?'? -> MarkSkippable )
fn key_subtree(&mut self, s: &mut &'s str) -> Result<bool, PatternError<'s>> {
let mut cursor = *s;
Ok(lblock!('matches: {
self.sp(&mut cursor);
bry!('matches strip_prefix(&mut cursor, "["));
self.sp(&mut cursor);
let mut subtree = SubtreeHelper::start(&mut *self);
subtree.name(&mut cursor)?;
subtree.sp(&mut cursor);
while subtree.predicate(&mut cursor)? {
}
subtree.sp(&mut cursor);
bry!('matches subtree.subtree(&mut cursor)?);
subtree.sp(&mut cursor);
bry!('matches
strip_prefix(&mut cursor, "]")
||
subtree.unexpected_token(&mut cursor)?
||
subtree.unexpected_end(&mut cursor)?
);
subtree.sp(&mut cursor);
let skippable = strip_prefix(&mut cursor, "?");
*s = cursor;
let id = subtree.commit();
let proto = self.consts.protos.last_mut().expect("protos");
proto.push(PatternElement::KeySubtree(id, skippable));
true
}))
}
/// value_subtree <- sp '(' sp predicate* sp subtree sp ( ')' / unexpected_token / unexpected_end ) sp ( '?'? -> MarkSkippable )
fn value_subtree(&mut self, s: &mut &'s str) -> Result<bool, PatternError<'s>> {
let mut cursor = *s;
Ok(lblock!('matches: {
self.sp(&mut cursor);
bry!('matches strip_prefix(&mut cursor, "("));
self.sp(&mut cursor);
let mut subtree = SubtreeHelper::start(&mut *self);
while subtree.predicate(&mut cursor)? {
}
subtree.sp(&mut cursor);
bry!('matches subtree.subtree(&mut cursor)?);
subtree.sp(&mut cursor);
bry!('matches
strip_prefix(&mut cursor, ")")
||
subtree.unexpected_token(&mut cursor)?
||
subtree.unexpected_end(&mut cursor)?
);
subtree.sp(&mut cursor);
let skippable = strip_prefix(&mut cursor, "?");
*s = cursor;
let id = subtree.commit();
let proto = self.consts.protos.last_mut().expect("protos");
proto.push(PatternElement::ValueSubtree(id, skippable));
true
}))
}
/// unexpected_end <- ( !. ) -> ErrorToken
fn unexpected_end(&mut self, s: &mut &'s str) -> Result<bool, PatternError<'s>> {
match *s {
"" => Err(PatternError::Token(self.base.len(), s)),
_ => Ok(false),
}
}
/// unexpected_token <- . -> ErrorToken
fn unexpected_token(&mut self, s: &mut &'s str) -> Result<bool, PatternError<'s>> {
match *s {
"" => Ok(false),
_ => Err(PatternError::Token(pos_of(self.base, s).unwrap(), s)),
}
}
/// subtree <- sp tag* sp ( value_subtree '' -> End )* sp
fn subtree(&mut self, s: &mut &'s str) -> Result<bool, PatternError<'s>> {
let mut cursor = *s;
self.sp(&mut cursor);
while self.tag(&mut cursor)? {
}
self.sp(&mut cursor);
while self.value_subtree(&mut cursor)? {
let proto = self.consts.protos.last_mut().expect("protos");
proto.push(PatternElement::End);
}
self.sp(&mut cursor);
*s = cursor;
// always matches (may match empty)
Ok(true)
}
/// pattern <- ( subtree / unexpected_token ) ( !. / unexpected_token )
fn pattern(&mut self, s: &mut &'s str) -> Result<bool, PatternError<'s>> {
let mut cursor = *s;
Ok(lblock!('matches: {
let mut subtree = SubtreeHelper::start(&mut *self);
bry!('matches
subtree.subtree(&mut cursor)?
||
subtree.unexpected_token(&mut cursor)?
);
bry!('matches
cursor == ""
||
subtree.unexpected_token(&mut cursor)?
);
subtree.commit();
*s = cursor;
true
}))
}
}
/// Parses a DFU expression.
///
/// The given
pub(crate) fn parse<'s, PKey, OKey, O>(
input: &'s str,
preds: Option<BTreeMap<PKey, Box<Predicate>>>,
objs: Option<BTreeMap<OKey, O>>
) -> Result<PatternConstants<O>, PatternError<'s>>
where
PKey: Borrow<str> + Ord,
OKey: Borrow<str> + Ord,
O: Serialize,
{
let mut parser = Parser::<'s, PKey, OKey, O> {
base: input,
preds: preds,
objs: objs,
pred_ids: Default::default(),
obj_ids: Default::default(),
consts: Default::default(),
closed_subtrees: (0..),
};
let mut parsed = input;
let matched = parser.pattern(&mut parsed)?;
assert!(matched);
assert_eq!(parsed, "");
assert_eq!(parser.closed_subtrees.next().unwrap(), parser.consts.protos.len());
Ok(parser.consts)
}
#[cfg(test)]
mod tests {
use crate::errors::PatternError;
use super::Parser;
use proptest::prelude::*;
#[test]
fn test_identifier() {
fn identifier_input<'s>(s: &mut &'s str) -> Result<bool, PatternError<'s>> {
let mut parser = Parser::<'s, &'static str, &'static str, ()> {
base: *s,
preds: None,
objs: None,
pred_ids: Default::default(),
obj_ids: Default::default(),
consts: Default::default(),
closed_subtrees: (0..),
};
parser.identifier(s)
}
for mut identifier in vec!["test", "Test", "_test", "_Test", "_123",] {
println!("{}", identifier);
assert!(identifier_input(&mut identifier).unwrap());
assert_eq!(identifier, "");
}
for mut identifier in vec!["123", "1_", " ",] {
println!("{}", identifier);
assert!(!identifier_input(&mut identifier).unwrap());
assert_ne!(identifier, "");
}
}
proptest! {
#[test]
fn test_no_crash(s in ".{0,4096}") {
fn prep_parser<'s>(s: &'s str) -> Parser<'s, &'static str, &'static str, ()> {
let mut parser = Parser::<'s, &'static str, &'static str, ()> {
base: s,
preds: None,
objs: None,
pred_ids: Default::default(),
obj_ids: Default::default(),
consts: Default::default(),
closed_subtrees: (0..),
};
parser.consts.protos.push(Default::default());
parser
}
let _ = prep_parser(&s).str_literal(&mut &*s);
let _ = prep_parser(&s).re_literal(&mut &*s);
let _ = prep_parser(&s).matcher(&mut &*s);
let _ = prep_parser(&s).tag(&mut &*s);
let _ = prep_parser(&s).sp(&mut &*s);
let _ = prep_parser(&s).identifier(&mut &*s);
let _ = prep_parser(&s).name(&mut &*s);
let _ = prep_parser(&s).parameter(&mut &*s);
let _ = prep_parser(&s).predicate(&mut &*s);
let _ = prep_parser(&s).key_subtree(&mut &*s);
let _ = prep_parser(&s).value_subtree(&mut &*s);
let _ = prep_parser(&s).unexpected_end(&mut &*s);
let _ = prep_parser(&s).unexpected_token(&mut &*s);
let _ = prep_parser(&s).subtree(&mut &*s);
let _ = prep_parser(&s).pattern(&mut &*s);
}
}
}