From 6dd30531ac62f6a3a564b7341d43f6cd71b90794 Mon Sep 17 00:00:00 2001 From: SoniEx2 Date: Tue, 15 Nov 2022 08:23:02 -0300 Subject: Rework subtrees again and update docs --- src/lib.rs | 127 ++++++++++++------------- src/parser.rs | 12 +++ src/vm/de.rs | 295 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---- src/vm/mod.rs | 4 +- 4 files changed, 348 insertions(+), 90 deletions(-) (limited to 'src') diff --git a/src/lib.rs b/src/lib.rs index 6d8a7e0..824d54e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -14,89 +14,76 @@ //! //! ## Syntax Elements of Datafu Expressions //! -//! FIXME still need to update these... -//! -//! An arrow is `->` and indicates indexing/iteration. Whether indexing or -//! iteration is used is defined by the elements that follow, with iteration -//! being used by default. -//! -//! A variable is a sequence of alphanumeric characters, not starting with -//! a digit. The value of the matched element will be identified by this name. -//! -//! A literal is a sequence of characters delimited by `'`, optionally -//! followed by `?`, with `%` as the escape character, and defines a -//! string-keyed indexing operation. A literal can contain any character, -//! except unescaped `%` or `'` symbols, which must be escaped as -//! `%%` and `%'`, respectively. The sequence of characters defined by -//! a literal is used as the string object in the indexing operation. -//! -//! A parameter is `$`, optionally followed by `?`, followed by a -//! sequence of alphanumeric characters, not starting with a digit, and -//! defines an object-keyed indexing operation. The sequence of characters -//! defined by a parameter is used to retrieve, from the pattern's -//! definitions, the object to be used in the indexing operation. -//! -//! A regex is a sequence of characters delimited by `/`, optionally -//! followed by `?`, with `%` as the escape character. A regex can -//! contain any character, except unescaped `%` or `/` symbols, which -//! must be escaped as `%%` and `%/`, respectively. The sequence of -//! characters defined by a regex is passed to the `regex` crate, which -//! may apply further restrictions on the characters used, and is used to -//! accept the respective keys processed by the iterator. -//! -//! A predicate is `:`, optionally followed by `?`, followed by an -//! `$` and a sequence of alphanumeric characters, not starting with a -//! digit, and is used to accept values to be processed based on an -//! external [`Predicate`]. -//! -//! A key match is a datafu expression (including, but not limited to, the -//! empty datafu expression) enclosed within `[` and `]`, optionally -//! prefixed with an identifier and zero or more predicates, and applies the -//! enclosed predicates and datafu expression to the key (or index) being -//! processed. A key match enables additional validation of keys and/or -//! extraction of values from keys, and accepts a key if and only if the -//! enclosed predicates accept the key and the enclosed expression matches the -//! key. The matched key is stored in the identifier. -//! -//! A subvalue is a datafu expression (including, but not limited to, the -//! empty datafu expression) enclosed within `(` and `)`, and applies -//! the enclosed datafu expression to the value (or index) being processed. -//! A subvalue enables the ability to match multiple values on the same -//! object, and accepts a value if and only the enclosed expression -//! matches the value. A subvalue can be made optional by the presence of -//! a `?` after the subvalue - in case of no match, it will just omit -//! the relevant keys in the result. Optional subvalues are unrelated to -//! non-validating syntax elements (see below), they just use the same -//! syntax. +//! A datafu pattern starts with an optional value matcher and is otherwise a +//! sequence of iterative elements (see "arrow" below) followed by subvalues. +//! +//! A datafu pattern is composed of the following elements: +//! +//! 1. An arrow, `->` indicates iteration. (Note that datafu operates directly +//! on serde and deserialization is an iterative process.) +//! +//! Always followed by a matcher, or a name and an optional matcher. +//! 2. A name is a sequence of alphanumeric characters, not starting with a +//! digit. A name collects a matched value. The value will be identified by +//! the name. +//! 3. A matcher is one of the following 5 elements. +//! 4. A literal is a quoted string delimited by `'` with `%` as the escape +//! character. A literal can contain any Unicode scalar value, and the only +//! allowed escape codes are `%%` and `%'`, for `%` and `'` respectively, +//! which must be escaped. A literal matches a string value and can be +//! optional. +//! 5. A parameter is `$`, optionally followed by `?`, followed by a sequence +//! of alphanumeric characters, not starting with a digit. This is +//! currently unimplemented. +//! 6. A regex is a quoted string delimited by `/`, with `%` as the escape +//! character. A regex can contain any Unicode scalar value, and the only +//! allowed escape codes are `%%` and `%/`, for `%` and `/` respectively, +//! which must be escaped. A regex element matches a string value against +//! the regex it represents, and can be optional. +//! 7. A predicate is `:`, optionally followed by `?`, followed by `$` and a +//! sequence of alphanumeric characters, not starting with a digit. This is +//! currently unimplemented. +//! 8. A type is `:`, optionally followed by `?`, followed by one of the +//! following keywords: TODO +//! 9. A key match is a sequence of iterative elements followed by subvalues, +//! prefixed by either a matcher or a name and an optional matcher, +//! and enclosed within `[` and `]`, optionally followed by `?`. A key +//! match applies to map keys and sequence indices, and aside from the +//! previously mentioned requirement is otherwise just a datafu pattern. +//! 10. A subvalue is a sequence of iterative elements followed by subvalues, +//! enclosed within `(` and `)`, optionally followed by `?`. A sequence of +//! subvalues enables matching distinct patterns on the same value, like +//! distinct fields of a struct. //! //! Some syntax elements can be validating or non-validating. Validating //! syntax elements will return a [`errors::MatchError::ValidationError`] -//! whenever a non-accepted element is encountered, whereas non-validating -//! ones will skip them. Whether an element is validating is determined by -//! the absence of an optional `?` in the documented position. Note that -//! it is possible for a validating syntax element to still yield results -//! before returning a [`errors::MatchError::ValidationError`], so one -//! needs to be careful when writing code where such behaviour could -//! result in a security vulnerability. +//! whenever a value or subpattern fails to match, whereas non-validating +//! ones will skip them. In general, whether an element is validating is +//! determined by the absence of an optional `?` in the documented position, +//! with the exception of key matches, which instead use it to filter entries +//! that didn't match. //! -//! The empty pattern matches anything, but only does so once. +//! The empty pattern matches anything, but only does so once. Empty subvalues +//! are ignored. //! //! ## Syntax of Datafu Expressions //! //! Datafu Expressions follow the given syntax, in (pseudo-)extended BNF: //! //! ```text -//! expression ::= [type] [predicate] {arrow tag} {subvalue} -//! tag ::= identifier [arg] [predicate] | arg [predicate] -//! arg ::= parameter | literal | regex | keymatch +//! pattern ::= [matcher] tree +//! tree ::= {tag} {subvalue} +//! tag ::= arrow [keymatch] valuematch +//! valuematch ::= matcher | name [matcher] +//! matcher ::= parameter | literal | regex | predicate | type //! //! arrow ::= '->' -//! keymatch ::= '[' [name] expression ']' ['?'] -//! subvalue ::= '(' expression ')' ['?'] +//! keymatch ::= '[' valuematch tree ']' ['?'] +//! subvalue ::= '(' tree ')' ['?'] //! ``` //! -//! For a description of the terminals "parameter", "literal", "regex" and -//! "predicate", see "Syntax Elements of Datafu Expressions" above. +//! For a description of the terminals "parameter", "literal", "regex", "type" +//! and "predicate", see "Syntax Elements of Datafu Expressions" above. //! //! # Examples //! @@ -116,7 +103,7 @@ //! {"a": {"1": {"y": true}, "2": {"x": true, "y": true}}} //! ``` //! -//! Produces the results for the sub-JSON +//! Produces the same results as if matched against the sub-JSON //! //! ```json //! {"a": {"2": {"x": true, "y": true}}} diff --git a/src/parser.rs b/src/parser.rs index a11b68c..eb378ad 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -877,5 +877,17 @@ mod tests { run_pattern(":map->[:str]:str"); run_pattern(":map(->[:str]:str)()"); } + + #[test] + fn test_documented_edge_cases() { + // tests that these patterns parse successfully + fn run_pattern(mut s: &str) { + let _ = prep_parser(s).pattern(&mut s).unwrap(); + } + // Empty pattern. + run_pattern(""); + // Empty subvalue + run_pattern("()"); + } } diff --git a/src/vm/de.rs b/src/vm/de.rs index 493b658..8f2aa8d 100644 --- a/src/vm/de.rs +++ b/src/vm/de.rs @@ -169,6 +169,7 @@ impl<'pat, 'state, 'de, O: Serialize> Packer<'pat, 'state, O> { matches: true, poison: false, }; + debug_assert!(!new_frame.ops.is_empty()); // we want the "newest" frame last, so it is // easier to unwind back. self.interp.frames.insert(at, new_frame); @@ -209,12 +210,13 @@ impl<'pat, 'state, 'de, O: Serialize> Packer<'pat, 'state, O> { pack_index -= 1; } if frame.poison { - if has_pack { - packs.remove(pack_index); - } - frame.matches = false; - has_pack = false; + // FIXME should frame.poison always remove the pack? if frame.is_value() { + if has_pack { + packs.remove(pack_index); + } + frame.matches = false; + has_pack = false; frame.poison = false; } } @@ -260,8 +262,14 @@ impl<'pat, 'state, 'de, O: Serialize> Packer<'pat, 'state, O> { packs[target_pack].merge_breadth(pack); } } else { + //if frame.poison { + // target_frame.poison = true; + //} if !optional { - target_frame.poison = true; + self.interp.error.insert({ + MatchError::ValidationError + }); + return Err(E::custom("subtree failed")); } } if let Some((0, _)) = target_frame.num_subtrees() { @@ -864,10 +872,14 @@ where } } } + let mut poison = false; for (f, m) in self.frames_mut().iter_active_mut().zip(output_matches) { - // FIXME inspect frame.key_subtree() for optional? - // what is this even supposed to do again? f.matches = m; + if !m { + if let Some((_, false)) = f.key_subtree() { + poison = true; + } + } } let obj = SerdeObject::Map(obj_inner); let mut final_packs = self.step_out(output_packs)?; @@ -880,6 +892,7 @@ where | Some((Type::IgnoredAny, _)) | None => { + frame.poison = poison; let matched = std::mem::replace(&mut frame.matches, true); if !matched { final_packs.insert( @@ -1360,6 +1373,36 @@ mod tests { assert_eq!(pack.subpacks.len(), 2); } + #[test] + fn test_empty_subtrees() { + // tests empty subtrees + let consts = crate::parser::parse::<&'static str, &'static str, ()>( + ":map()(())", + None, + None, + ).unwrap(); + dbg!(&consts); + let data = r#"{"hello": "world"}"#; + let mut der = JsonDeserializer::from_str(data); + let mut err = Default::default(); + let mut frames = Default::default(); + let interp = Interpreter::new( + &consts, + &mut err, + &mut frames, + //&mut output, + ); + let result = Packer::new( + interp, + MAX_CALLS, + ).deserialize(&mut der); + let (mut packs, obj) = result.unwrap(); + assert!(obj.is_none()); + assert_eq!(packs.len(), 1); + let pack = packs.pop().unwrap(); + assert_eq!(pack.subpacks.len(), 0); + } + #[test] fn test_parser_subtrees() { // use a parsed pattern with subtrees to test Packer @@ -1402,15 +1445,165 @@ mod tests { } #[test] - fn test_subtree_missing() { - // use a parsed pattern that might actually be used in the real world. + fn test_parser_subtrees_validation() { + // checks that the Packer can validate multiple keys. + let consts = crate::parser::parse::<&'static str, &'static str, ()>( + ":map(->['name'?]name:str)(->['value'?]value:u32)(->[:str]:?ignored_any)", + None, + None, + ).unwrap(); + let data = &[ + 0x02, // map length (2) + 0x04, // string length (4) + 0x6E, 0x61, 0x6D, 0x65, // b'name' + 0x01, // string length (1) + 0x61, // b'a' + 0x05, // string length (5) + 0x76, 0x61, 0x6C, 0x75, 0x65, // b'value' + 0x01, // 1 + ]; + let mut der = PostcardDeserializer::from_bytes(data); + let mut err = Default::default(); + let mut frames = Default::default(); + let interp = Interpreter::new( + &consts, + &mut err, + &mut frames, + //&mut output, + ); + let result = Packer::new( + interp, + MAX_CALLS, + ).deserialize(&mut der); + let (mut packs, obj) = result.unwrap(); + assert!(obj.is_none()); + assert_eq!(packs.len(), 1); + let pack = packs.pop().unwrap(); + assert_eq!(pack.subpacks.len(), 1); + assert_eq!(pack.subpacks[0]["name"].1, SerdeObject::Str(From::from("a"))); + assert_eq!(pack.subpacks[0]["value"].1, SerdeObject::U32(1)); + } + + #[test] + fn test_one_or_more_subtrees_validation() { + // checks that the Packer supports OR-like validation semantics + let consts = crate::parser::parse::<&'static str, &'static str, ()>( + ":map((->['name'?]?name)?(->['value'?]?value)?)(->[:str]:u32)", + None, + None, + ).unwrap(); + let data = &[ + 0x01, // map length (1) + 0x04, // string length (4) + 0x6E, 0x61, 0x6D, 0x65, // b'name' + 0x01, // 1 + ]; + let mut der = PostcardDeserializer::from_bytes(data); + let mut err = Default::default(); + let mut frames = Default::default(); + let interp = Interpreter::new( + &consts, + &mut err, + &mut frames, + //&mut output, + ); + let result = Packer::new( + interp, + MAX_CALLS, + ).deserialize(&mut der); + let (mut packs, obj) = result.unwrap(); + assert!(obj.is_none()); + assert_eq!(packs.len(), 1); + let pack = packs.pop().unwrap(); + assert_eq!(pack.subpacks.len(), 1); + assert_eq!(pack.subpacks[0]["name"].1, SerdeObject::U32(1)); + } + + #[test] + fn test_one_or_more_subtrees_validation_failure() { + // checks that the Packer supports OR-like validation semantics + // (failure) + let consts = crate::parser::parse::<&'static str, &'static str, ()>( + ":map((->['name'?]?name)?(->['value'?]?value)?)(->[:str]:u32)", + None, + None, + ).unwrap(); + let data = &[ + 0x01, // map length (1) + 0x04, // string length (4) + 0x6E, 0x65, 0x6D, 0x65, // b'neme' + 0x01, // 1 + ]; + let mut der = PostcardDeserializer::from_bytes(data); + let mut err = Default::default(); + let mut frames = Default::default(); + let interp = Interpreter::new( + &consts, + &mut err, + &mut frames, + //&mut output, + ); + let result = Packer::new( + interp, + MAX_CALLS, + ).deserialize(&mut der); + assert!(matches!(err, Some(MatchError::ValidationError))); + assert!(result.is_err()); + } + + // FIXME this should test that the map doesn't contain other keys aside + // from the ones requested by the pattern, but this doesn't match existing + // datafu semantics at all. we need new syntax for it. + //#[test] + //fn test_parser_subtrees_no_additional_keys() { + // // use a parsed pattern with subtrees to test Packer + // // also test a non-self-describing format (postcard) + // // also require at least one subtree to match on every iteration. + // // also this checks for validation error + // let consts = crate::parser::parse::<&'static str, &'static str, ()>( + // ":map((->['name'?]name)?(->['value'?]value)?)(->[:str]:u32)", + // None, + // None, + // ).unwrap(); + // let data = &[ + // 0x03, // map length (3) + // 0x04, // string length (4) + // 0x6E, 0x61, 0x6D, 0x65, // b'name' + // 0x01, // 1 + // 0x05, // string length (5) + // 0x76, 0x61, 0x6C, 0x75, 0x65, // b'value' + // 0x01, // 1 + // 0x05, // string length (5) + // 0x76, 0x65, 0x6C, 0x75, 0x65, // b'velue' + // 0x01, // 1 + // ]; + // let mut der = PostcardDeserializer::from_bytes(data); + // let mut err = Default::default(); + // let mut frames = Default::default(); + // let interp = Interpreter::new( + // &consts, + // &mut err, + // &mut frames, + // //&mut output, + // ); + // let result = Packer::new( + // interp, + // MAX_CALLS, + // ).deserialize(&mut der); + // assert!(matches!(err, Some(MatchError::ValidationError))); + // assert!(result.is_err()); + //} + + #[test] + fn test_key_optionally_missing() { + // test a pattern with a missing key let consts = crate::parser::parse::<&'static str, &'static str, ()>( " :map ->['a'?]:map ->[b:?str]:?map - (->['x'?]x:?bool) - (->['y'?]y:?bool)? + (->['x'?]x:?bool)? + (->['y'?]?y:?bool)? ", None, None @@ -1433,7 +1626,6 @@ mod tests { assert!(obj.is_none()); assert_eq!(packs.len(), 1); let pack = &packs[0]; - dbg!(pack); assert_eq!(pack.subpacks.len(), 1); let b = &pack.subpacks[0]["b"]; assert_eq!(b.1, SerdeObject::Str(From::from("2"))); @@ -1442,17 +1634,84 @@ mod tests { assert_eq!(b.0.subpacks[0]["y"].1, SerdeObject::Bool(true)); } + #[test] + fn test_map_without_values() { + // test that the map without values still collects the key + let consts = crate::parser::parse::<&'static str, &'static str, ()>( + " + :map + ->[a:?str]?:?map + ->[b:?str]?:?map + ", + None, + None + ).unwrap(); + let data = r#"{"a": {}}"#; + let mut der = JsonDeserializer::from_str(data); + let mut err = Default::default(); + let mut frames = Default::default(); + let interp = Interpreter::new( + &consts, + &mut err, + &mut frames, + //&mut output, + ); + let result = Packer::new( + interp, + MAX_CALLS, + ).deserialize(&mut der); + let (mut packs, obj) = result.unwrap(); + assert!(obj.is_none()); + assert_eq!(packs.len(), 1); + let pack = &packs[0]; + assert_eq!(pack.subpacks.len(), 1); + let a = &pack.subpacks[0]["a"]; + assert_eq!(a.1, SerdeObject::Str(From::from("a"))); + assert_eq!(a.0.subpacks.len(), 0); + } + + #[test] + fn test_guaranteed_at_least_once() { + // test that at least one match is required before collecting the key + let consts = crate::parser::parse::<&'static str, &'static str, ()>( + " + :map + ->[a:?str]:?map + ->[b:?str]:?map + ", + None, + None + ).unwrap(); + let data = r#"{"a": {}}"#; + let mut der = JsonDeserializer::from_str(data); + let mut err = Default::default(); + let mut frames = Default::default(); + let interp = Interpreter::new( + &consts, + &mut err, + &mut frames, + //&mut output, + ); + let result = Packer::new( + interp, + MAX_CALLS, + ).deserialize(&mut der); + let (mut packs, obj) = result.unwrap(); + assert!(obj.is_none()); + assert_eq!(packs.len(), 0); + } + #[test] fn test_realish_use_case() { // use a parsed pattern that might actually be used in the real world. let consts = crate::parser::parse::<&'static str, &'static str, ()>( " :map - ->['projects'?]?:map - ->[commit:?str]?:?map - ->[url:?str]?:?map - ->[branch:?str]?:?map - (->['active'?]?active:?bool) + ->['projects'?]:map + ->[commit:?str]:?map + ->[url:?str]:?map + ->[branch:?str]:?map + (->['active'?]active:?bool)? (->['federate'?]?federate:?bool)? ", None, diff --git a/src/vm/mod.rs b/src/vm/mod.rs index 81131c0..21fec73 100644 --- a/src/vm/mod.rs +++ b/src/vm/mod.rs @@ -55,7 +55,7 @@ impl Default for PatternConstants { } } -impl std::fmt::Debug for PatternConstants { +impl std::fmt::Debug for PatternConstants { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct( "PatternConstants" @@ -70,7 +70,7 @@ impl std::fmt::Debug for PatternConstants { &format_args!("({} predicates)", self.predicates.len()), ).field( "defs", - &format_args!("FIXME"), + &self.defs, ).finish() } } -- cgit 1.4.1