/*
* This file is part of Datafu
* Copyright (C) 2021 Soni L.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
use std::borrow::Borrow;
use std::collections::BTreeMap;
use std::mem::ManuallyDrop;
use regex::Regex;
use crate::PatternTypes;
use crate::Predicate;
use crate::errors::PatternError;
use crate::vm::PatternConstants;
use crate::vm::PatternElement;
/// try! with bools. (the b comes from bool.)
macro_rules! bry {
($l:lifetime $e:expr) => {
if !$e {
break $l false;
}
};
($e:expr) => {
if !$e {
return false;
}
};
}
// the following macros rely on unlabeled-break-through-labeled-block being an
// error.
// NOTE: always test changes to this module on nightly!
#[cfg(not(feature = "stable"))]
/// labeled block. on nightly: better compile errors. but also works on stable.
macro_rules! lblock {
// unfortunately ($l:lifetime : $b:block) => { $l: $b } didn't work.
($($t:tt)+) => {
$($t)+
}
}
#[cfg(feature = "stable")]
/// labeled block. on nightly: better compile errors. but also works on stable.
macro_rules! lblock {
($l:lifetime : $b:block) => {
$l: loop {
break $b
}
}
}
// can't use Pattern here :(
fn strip_prefix(s: &mut &str, prefix: &str) -> bool {
s.strip_prefix(prefix).map(|ns| *s = ns).is_some()
}
fn pos_of<'a>(base: &'a str, sub: &'a str) -> Option<usize> {
// FIXME
Some((sub.as_ptr() as usize) - (base.as_ptr() as usize))
}
struct SubtreeHelper<'r, 's, P: Borrow<str> + Ord, O: Borrow<str> + Ord, T: PatternTypes> where Self: 'r {
root: &'r mut Parser<'s, P, O, T>,
}
impl<'r, 's, P: Borrow<str> + Ord, O: Borrow<str> + Ord, T: PatternTypes> SubtreeHelper<'r, 's, P, O, T> where Self: 'r {
fn start(value: &'r mut Parser<'s, P, O, T>) -> Self {
value.consts.protos.push(Default::default());
Self {
root: value,
}
}
fn commit(self) -> usize {
let mut self_ = ManuallyDrop::new(self);
let proto = self_.root.consts.protos.pop().unwrap();
let id = self_.root.closed_subtrees.next().unwrap();
self_.root.consts.protos.insert(id, proto);
id
}
}
impl<'r, 's, P: Borrow<str> + Ord, O: Borrow<str> + Ord, T: PatternTypes> std::ops::Deref for SubtreeHelper<'r, 's, P, O, T> where Self: 'r {
type Target = Parser<'s, P, O, T>;
fn deref(&self) -> &Self::Target {
&*self.root
}
}
impl<'r, 's, P: Borrow<str> + Ord, O: Borrow<str> + Ord, T: PatternTypes> std::ops::DerefMut for SubtreeHelper<'r, 's, P, O, T> where Self: 'r {
fn deref_mut(&mut self) -> &mut Self::Target {
self.root
}
}
impl<'r, 's, P: Borrow<str> + Ord, O: Borrow<str> + Ord, T: PatternTypes> Drop for SubtreeHelper<'r, 's, P, O, T> where Self: 'r {
fn drop(&mut self) {
// remove "partial" proto
self.root.consts.protos.pop().expect("SubtreeHelper");
}
}
struct TagHelper<'r, 's, P: Borrow<str> + Ord, O: Borrow<str> + Ord, T: PatternTypes> where Self: 'r {
root: &'r mut Parser<'s, P, O, T>,
len: usize,
}
impl<'r, 's, P: Borrow<str> + Ord, O: Borrow<str> + Ord, T: PatternTypes> TagHelper<'r, 's, P, O, T> where Self: 'r {
fn start(value: &'r mut Parser<'s, P, O, T>) -> Self {
let len = value.consts.protos.last().unwrap().len();
Self {
root: value,
len : len,
}
}
fn commit(self) {
std::mem::forget(self)
}
}
impl<'r, 's, P: Borrow<str> + Ord, O: Borrow<str> + Ord, T: PatternTypes> std::ops::Deref for TagHelper<'r, 's, P, O, T> where Self: 'r {
type Target = Parser<'s, P, O, T>;
fn deref(&self) -> &Self::Target {
&*self.root
}
}
impl<'r, 's, P: Borrow<str> + Ord, O: Borrow<str> + Ord, T: PatternTypes> std::ops::DerefMut for TagHelper<'r, 's, P, O, T> where Self: 'r {
fn deref_mut(&mut self) -> &mut Self::Target {
self.root
}
}
impl<'r, 's, P: Borrow<str> + Ord, O: Borrow<str> + Ord, T: PatternTypes> Drop for TagHelper<'r, 's, P, O, T> where Self: 'r {
fn drop(&mut self) {
let proto = self.root.consts.protos.last_mut().unwrap();
assert!(proto.len() >= self.len);
while proto.len() > self.len {
let _ = proto.pop();
}
}
}
struct Parser<'s, P: Borrow<str> + Ord, O: Borrow<str> + Ord, T: PatternTypes> {
base: &'s str,
preds: Option<BTreeMap<P, Box<Predicate<T>>>>,
objs: Option<BTreeMap<O, T::Own>>,
pred_ids: BTreeMap<P, usize>,
obj_ids: BTreeMap<O, usize>,
consts: PatternConstants<T>,
closed_subtrees: std::ops::RangeFrom<usize>,
}
// These are documented using LPeg.re syntax
// http://www.inf.puc-rio.br/~roberto/lpeg/re.html
#[rustfmt::skip]
impl<'s, P: Borrow<str> + Ord, O: Borrow<str> + Ord, T: PatternTypes> Parser<'s, P, O, T> {
/// str_literal <- sp ( ( "'" str_char* ( "'" / ( !. -> ErrorStrEnd ) ) ( '?' -> MarkSkippable ) ) -> String ) sp
/// str_char <- ( str_escape / [^%'] )
/// str_escape <- '%' ( '%' / "'" ) / ( ( '%' .? ) -> ErrorStrEscape )
fn str_literal(&mut self, s: &mut &'s str) -> Result<bool, PatternError<'s>> {
const STRING_TOKEN: &str = "'";
let mut cursor = *s;
Ok(lblock!('matches: {
self.sp(&mut cursor);
bry!('matches strip_prefix(&mut cursor, STRING_TOKEN));
let mut string = String::new();
loop {
let base = cursor;
if strip_prefix(&mut cursor, "%") {
if strip_prefix(&mut cursor, STRING_TOKEN) {
string.push_str(STRING_TOKEN);
} else if strip_prefix(&mut cursor, "%") {
string.push_str("%");
} else {
return Err(PatternError::StringEscape(pos_of(self.base, base).unwrap(), cursor));
}
} else {
cursor = cursor.trim_start_matches(|c: char| {
!matches!(c, '%' | '\'')
});
if base == cursor { // no match?
break;
}
string.push_str(&base[..pos_of(base, cursor).unwrap()]);
}
}
if !strip_prefix(&mut cursor, STRING_TOKEN) {
match cursor {
"" => return Err(PatternError::StringEnd(self.base.len(), cursor)),
_ => unreachable!(),
}
}
let skippable = strip_prefix(&mut cursor, "?");
self.sp(&mut cursor);
let id = self.consts.strings.iter().position(|c| c == &string);
let self_ = &mut *self;
let id = id.unwrap_or_else(move || {
string.shrink_to_fit();
self_.consts.strings.push(string);
self_.consts.strings.len() - 1
});
let proto = self.consts.protos.last_mut().expect("protos");
proto.push(PatternElement::StringKey(id, skippable));
*s = cursor;
true
}))
}
/// re_literal <- sp ( ( "/" re_char* ( "/" / ( !. -> ErrorReEnd ) ) ( '?' -> MarkSkippable ) ) -> Regex ) sp
/// re_char <- ( re_escape / [^%/] )
/// re_escape <- '%' ( '%' / "/" ) / ( ( '%' .? ) -> ErrorReEscape )
#[allow(unused_variables)]
#[allow(unreachable_code)]
fn re_literal(&mut self, s: &mut &'s str) -> Result<bool, PatternError<'s>> {
const RE_TOKEN: &str = "/";
let mut cursor = *s;
Ok(lblock!('matches: {
self.sp(&mut cursor);
let orig = cursor;
bry!('matches strip_prefix(&mut cursor, RE_TOKEN));
let mut string = String::new();
loop {
let base = cursor;
if strip_prefix(&mut cursor, "%") {
if strip_prefix(&mut cursor, RE_TOKEN) {
string.push_str(RE_TOKEN);
} else if strip_prefix(&mut cursor, "%") {
string.push_str("%");
} else {
return Err(PatternError::RegexEscape(pos_of(self.base, base).unwrap(), cursor));
}
} else {
cursor = cursor.trim_start_matches(|c: char| {
!matches!(c, '%' | '/')
});
if base == cursor { // no match?
break;
}
string.push_str(&base[..pos_of(base, cursor).unwrap()]);
}
}
if !strip_prefix(&mut cursor, RE_TOKEN) {
match cursor {
"" => return Err(PatternError::RegexEnd(self.base.len(), cursor)),
_ => unreachable!(),
}
}
let end = cursor;
let skippable = strip_prefix(&mut cursor, "?");
self.sp(&mut cursor);
let re = Regex::new(&string).map_err(|e| {
PatternError::Regex(pos_of(self.base, orig).unwrap(), &orig[..pos_of(orig, end).unwrap()], e)
})?;
let id = self.consts.regices.iter().position(|c| c.as_str() == re.as_str());
let self_ = &mut *self;
let id = id.unwrap_or_else(move || {
self_.consts.regices.push(re);
self_.consts.regices.len() - 1
});
let proto = self.consts.protos.last_mut().expect("protos");
proto.push(PatternElement::RegexKey(id, skippable));
*s = cursor;
true
}))
}
/// matcher <- sp ( parameter / str_literal / re_literal / key_subtree ) sp
fn matcher(&mut self, s: &mut &'s str) -> Result<bool, PatternError<'s>> {
let mut cursor = *s;
Ok(lblock!('matches: {
self.sp(&mut cursor);
match () {
_ if self.parameter(&mut cursor)? => {},
_ if self.str_literal(&mut cursor)? => {},
_ if self.re_literal(&mut cursor)? => {},
_ if self.key_subtree(&mut cursor)? => {},
_ => bry!('matches false),
}
self.sp(&mut cursor);
*s = cursor;
true
}))
}
/// tag <- sp ( '->' -> Arrow ) sp ( matcher / name sp matcher? ) sp predicate* ( '' -> End ) sp
fn tag(&mut self, s: &mut &'s str) -> Result<bool, PatternError<'s>> {
let mut cursor = *s;
Ok(lblock!('matches: {
self.sp(&mut cursor);
bry!('matches strip_prefix(&mut cursor, "->"));
let mut self_ = TagHelper::start(&mut *self);
{
let proto = self_.consts.protos.last_mut().expect("protos");
proto.push(PatternElement::Arrow);
}
self_.sp(&mut cursor);
if !self_.matcher(&mut cursor)? {
bry!('matches self_.name(&mut cursor)?);
self_.sp(&mut cursor);
// NOTE: *optional*
let _ = self_.matcher(&mut cursor)?;
}
self_.sp(&mut cursor);
while self_.predicate(&mut cursor)? {
}
{
let proto = self_.consts.protos.last_mut().expect("protos");
proto.push(PatternElement::End);
}
self_.commit();
*s = cursor;
true
}))
}
/// sp <- spaces*
///
/// (always succeeds, so doesn't return anything)
fn sp(&mut self, s: &mut &'s str) {
*s = s.trim_start_matches(|c: char| {
matches!(c, ' ' | '\n' | '\r' | '\t')
});
}
/// identifier <- [A-Za-z_][A-Za-z0-9_]*
fn identifier(&mut self, s: &mut &'s str) -> Result<bool, PatternError<'s>> {
if let Some('A'..='Z') | Some('a'..='z') | Some('_') = s.chars().next() {
*s = s.trim_start_matches(|c: char| {
matches!(c, 'A'..='Z' | 'a'..='z' | '0'..='9' | '_')
});
Ok(true)
} else {
Ok(false)
}
}
/// name <- sp ( identifier -> Identifier ) sp
fn name(&mut self, s: &mut &'s str) -> Result<bool, PatternError<'s>> {
let mut cursor = *s;
Ok(lblock!('matches: {
self.sp(&mut cursor);
let start = cursor;
bry!('matches self.identifier(&mut cursor)?);
let name = &start[..pos_of(start, cursor).unwrap_or(start.len())];
// no processing of `name` is required for this.
let id = self.consts.strings.iter().position(|c| c == name);
let id = id.unwrap_or_else(|| {
self.consts.strings.push(name.into());
self.consts.strings.len() - 1
});
let proto = self.consts.protos.last_mut().expect("protos");
proto.push(PatternElement::Identifier(id));
self.sp(&mut cursor);
*s = cursor;
true
}))
}
/// parameter <- sp '$' ( '?'? -> MarkSkippable ) ( identifier -> Parameter ) sp
fn parameter(&mut self, s: &mut &'s str) -> Result<bool, PatternError<'s>> {
let mut cursor = *s;
Ok(lblock!('matches: {
self.sp(&mut cursor);
bry!('matches strip_prefix(&mut cursor, "$"));
let skippable = strip_prefix(&mut cursor, "?");
let start = cursor;
bry!('matches self.identifier(&mut cursor)?);
let name = &start[..pos_of(start, cursor).unwrap_or(start.len())];
// no processing of `name` is required for this.
let id = self.obj_ids.get(name).copied().map_or_else(
|| {
let (key, obj) = self.objs.as_mut().and_then(|map| {
map.remove_entry(name)
}).ok_or_else(|| {
let pos = pos_of(self.base, start).unwrap();
PatternError::UnknownParameter(pos, name)
})?;
let id = self.consts.defs.len();
self.consts.defs.push(obj);
self.obj_ids.insert(key, id);
Ok(id)
},
Ok,
)?;
let proto = self.consts.protos.last_mut().expect("protos");
proto.push(PatternElement::ParameterKey(id, skippable));
self.sp(&mut cursor);
*s = cursor;
true
}))
}
/// predicate <- sp ':' ( '?'? -> MarkSkippable ) '$' ( identifier -> Predicate ) sp
fn predicate(&mut self, s: &mut &'s str) -> Result<bool, PatternError<'s>> {
let mut cursor = *s;
Ok(lblock!('matches: {
self.sp(&mut cursor);
bry!('matches strip_prefix(&mut cursor, ":"));
let skippable = strip_prefix(&mut cursor, "?");
bry!('matches strip_prefix(&mut cursor, "$"));
let start = cursor;
bry!('matches self.identifier(&mut cursor)?);
let name = &start[..pos_of(start, cursor).unwrap_or(start.len())];
// no processing of `name` is required for this.
let id = self.pred_ids.get(name).copied().map_or_else(
|| {
let (key, pred) = self.preds.as_mut().and_then(|map| {
map.remove_entry(name)
}).ok_or_else(|| {
let pos = pos_of(self.base, start).unwrap();
PatternError::UnknownPredicate(pos, name)
})?;
let id = self.consts.predicates.len();
self.consts.predicates.push(pred);
self.pred_ids.insert(key, id);
Ok(id)
},
Ok,
)?;
let proto = self.consts.protos.last_mut().expect("protos");
proto.push(PatternElement::ApplyPredicate(id, skippable));
self.sp(&mut cursor);
*s = cursor;
true
}))
}
/// key_subtree <- sp '[' sp predicate* sp subtree sp ( ']' / unexpected_token / unexpected_end ) sp ( '?'? -> MarkSkippable )
fn key_subtree(&mut self, s: &mut &'s str) -> Result<bool, PatternError<'s>> {
let mut cursor = *s;
Ok(lblock!('matches: {
self.sp(&mut cursor);
bry!('matches strip_prefix(&mut cursor, "["));
self.sp(&mut cursor);
let mut subtree = SubtreeHelper::start(&mut *self);
while subtree.predicate(&mut cursor)? {
}
subtree.sp(&mut cursor);
bry!('matches subtree.subtree(&mut cursor)?);
subtree.sp(&mut cursor);
bry!('matches
strip_prefix(&mut cursor, "]")
||
subtree.unexpected_token(&mut cursor)?
||
subtree.unexpected_end(&mut cursor)?
);
subtree.sp(&mut cursor);
let skippable = strip_prefix(&mut cursor, "?");
*s = cursor;
let id = subtree.commit();
let proto = self.consts.protos.last_mut().expect("protos");
proto.push(PatternElement::KeySubtree(id, skippable));
true
}))
}
/// value_subtree <- sp '(' sp predicate* sp subtree sp ( ')' / unexpected_token / unexpected_end ) sp ( '?'? -> MarkSkippable )
fn value_subtree(&mut self, s: &mut &'s str) -> Result<bool, PatternError<'s>> {
let mut cursor = *s;
Ok(lblock!('matches: {
self.sp(&mut cursor);
bry!('matches strip_prefix(&mut cursor, "("));
self.sp(&mut cursor);
let mut subtree = SubtreeHelper::start(&mut *self);
while subtree.predicate(&mut cursor)? {
}
subtree.sp(&mut cursor);
bry!('matches subtree.subtree(&mut cursor)?);
subtree.sp(&mut cursor);
bry!('matches
strip_prefix(&mut cursor, ")")
||
subtree.unexpected_token(&mut cursor)?
||
subtree.unexpected_end(&mut cursor)?
);
subtree.sp(&mut cursor);
let skippable = strip_prefix(&mut cursor, "?");
*s = cursor;
let id = subtree.commit();
let proto = self.consts.protos.last_mut().expect("protos");
proto.push(PatternElement::ValueSubtree(id, skippable));
true
}))
}
/// unexpected_end <- ( !. ) -> ErrorToken
fn unexpected_end(&mut self, s: &mut &'s str) -> Result<bool, PatternError<'s>> {
match *s {
"" => Err(PatternError::Token(self.base.len(), s)),
_ => Ok(false),
}
}
/// unexpected_token <- . -> ErrorToken
fn unexpected_token(&mut self, s: &mut &'s str) -> Result<bool, PatternError<'s>> {
match *s {
"" => Ok(false),
_ => Err(PatternError::Token(pos_of(self.base, s).unwrap(), s)),
}
}
/// subtree <- sp tag* sp ( value_subtree '' -> End )* sp
fn subtree(&mut self, s: &mut &'s str) -> Result<bool, PatternError<'s>> {
let mut cursor = *s;
self.sp(&mut cursor);
while self.tag(&mut cursor)? {
}
self.sp(&mut cursor);
while self.value_subtree(&mut cursor)? {
let proto = self.consts.protos.last_mut().expect("protos");
proto.push(PatternElement::End);
}
self.sp(&mut cursor);
*s = cursor;
// always matches (may match empty)
Ok(true)
}
/// pattern <- ( subtree / unexpected_token ) ( !. / unexpected_token )
fn pattern(&mut self, s: &mut &'s str) -> Result<bool, PatternError<'s>> {
let mut cursor = *s;
Ok(lblock!('matches: {
let mut subtree = SubtreeHelper::start(&mut *self);
bry!('matches
subtree.subtree(&mut cursor)?
||
subtree.unexpected_token(&mut cursor)?
);
bry!('matches
cursor == ""
||
subtree.unexpected_token(&mut cursor)?
);
subtree.commit();
*s = cursor;
true
}))
}
}
pub(crate) fn parse<'s, P, O, T>(
input: &'s str,
preds: Option<BTreeMap<P, Box<Predicate<T>>>>,
objs: Option<BTreeMap<O, T::Own>>
) -> Result<PatternConstants<T>, PatternError<'s>>
where
P: Borrow<str> + Ord,
O: Borrow<str> + Ord,
T: PatternTypes,
{
let mut parser = Parser::<'s, P, O, T> {
base: input,
preds: preds,
objs: objs,
pred_ids: Default::default(),
obj_ids: Default::default(),
consts: Default::default(),
closed_subtrees: (0..),
};
let mut parsed = input;
let matched = parser.pattern(&mut parsed)?;
assert!(matched);
assert_eq!(parsed, "");
assert_eq!(parser.closed_subtrees.next().unwrap(), parser.consts.protos.len());
Ok(parser.consts)
}
#[cfg(test)]
mod tests {
use crate::PatternTypes;
use crate::RefOwn;
use crate::KVPair;
use crate::errors::PatternError;
use super::Parser;
use proptest::prelude::*;
struct Dummy;
impl PatternTypes for Dummy {
type Ref = ();
type Own = ();
fn pairs<'b>(
item: RefOwn<'b, Self::Ref, Self::Own>
) -> Option<Box<dyn Iterator<Item=KVPair<'b, Self>> + 'b>> {
let _ = item;
None
}
fn get<'a, 'b>(
item: RefOwn<'b, Self::Ref, Self::Own>,
key: RefOwn<'a, Self::Ref, Self::Own>
) -> Option<Option<KVPair<'b, Self>>> {
let _ = item;
let _ = key;
None
}
fn matches(
left: RefOwn<'_, Self::Ref, Self::Own>,
right: RefOwn<'_, Self::Ref, Self::Own>
) -> bool {
let _ = left;
let _ = right;
false
}
fn as_str<'b>(
item: RefOwn<'b, Self::Ref, Self::Own>
) -> Option<&'b str> {
match item {
RefOwn::Str(key) => Some(key),
_ => None,
}
}
}
#[test]
fn test_identifier() {
fn identifier_input<'s>(s: &mut &'s str) -> Result<bool, PatternError<'s>> {
let mut parser = Parser::<'s, &'static str, &'static str, Dummy> {
base: *s,
preds: None,
objs: None,
pred_ids: Default::default(),
obj_ids: Default::default(),
consts: Default::default(),
closed_subtrees: (0..),
};
parser.identifier(s)
}
for mut identifier in vec!["test", "Test", "_test", "_Test", "_123",] {
println!("{}", identifier);
assert!(identifier_input(&mut identifier).unwrap());
assert_eq!(identifier, "");
}
for mut identifier in vec!["123", "1_", " ",] {
println!("{}", identifier);
assert!(!identifier_input(&mut identifier).unwrap());
assert_ne!(identifier, "");
}
}
proptest! {
#[test]
fn test_no_crash(s in ".{0,4096}") {
fn prep_parser<'s>(s: &'s str) -> Parser<'s, &'static str, &'static str, Dummy> {
let mut parser = Parser::<'s, &'static str, &'static str, Dummy> {
base: s,
preds: None,
objs: None,
pred_ids: Default::default(),
obj_ids: Default::default(),
consts: Default::default(),
closed_subtrees: (0..),
};
parser.consts.protos.push(Default::default());
parser
}
let _ = prep_parser(&s).str_literal(&mut &*s);
let _ = prep_parser(&s).re_literal(&mut &*s);
let _ = prep_parser(&s).matcher(&mut &*s);
let _ = prep_parser(&s).tag(&mut &*s);
let _ = prep_parser(&s).sp(&mut &*s);
let _ = prep_parser(&s).identifier(&mut &*s);
let _ = prep_parser(&s).name(&mut &*s);
let _ = prep_parser(&s).parameter(&mut &*s);
let _ = prep_parser(&s).predicate(&mut &*s);
let _ = prep_parser(&s).key_subtree(&mut &*s);
let _ = prep_parser(&s).value_subtree(&mut &*s);
let _ = prep_parser(&s).unexpected_end(&mut &*s);
let _ = prep_parser(&s).unexpected_token(&mut &*s);
let _ = prep_parser(&s).subtree(&mut &*s);
let _ = prep_parser(&s).pattern(&mut &*s);
}
}
}