summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorSoniEx2 <endermoneymod@gmail.com>2022-11-15 08:23:02 -0300
committerSoniEx2 <endermoneymod@gmail.com>2022-11-15 08:23:02 -0300
commit6dd30531ac62f6a3a564b7341d43f6cd71b90794 (patch)
tree576981a864592260da0165c585f66f9406f17d27
parent72397506c3529f4878b5a1cf8205599764ac4088 (diff)
Rework subtrees again and update docs
-rw-r--r--src/lib.rs127
-rw-r--r--src/parser.rs12
-rw-r--r--src/vm/de.rs295
-rw-r--r--src/vm/mod.rs4
4 files changed, 348 insertions, 90 deletions
diff --git a/src/lib.rs b/src/lib.rs
index 6d8a7e0..824d54e 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -14,89 +14,76 @@
 //!
 //! ## Syntax Elements of Datafu Expressions
 //!
-//! FIXME still need to update these...
-//!
-//! An arrow is `->` and indicates indexing/iteration. Whether indexing or
-//! iteration is used is defined by the elements that follow, with iteration
-//! being used by default.
-//!
-//! A variable is a sequence of alphanumeric characters, not starting with
-//! a digit. The value of the matched element will be identified by this name.
-//!
-//! A literal is a sequence of characters delimited by `'`, optionally
-//! followed by `?`, with `%` as the escape character, and defines a
-//! string-keyed indexing operation. A literal can contain any character,
-//! except unescaped `%` or `'` symbols, which must be escaped as
-//! `%%` and `%'`, respectively. The sequence of characters defined by
-//! a literal is used as the string object in the indexing operation.
-//!
-//! A parameter is `$`, optionally followed by `?`, followed by a
-//! sequence of alphanumeric characters, not starting with a digit, and
-//! defines an object-keyed indexing operation. The sequence of characters
-//! defined by a parameter is used to retrieve, from the pattern's
-//! definitions, the object to be used in the indexing operation.
-//!
-//! A regex is a sequence of characters delimited by `/`, optionally
-//! followed by `?`, with `%` as the escape character. A regex can
-//! contain any character, except unescaped `%` or `/` symbols, which
-//! must be escaped as `%%` and `%/`, respectively. The sequence of
-//! characters defined by a regex is passed to the `regex` crate, which
-//! may apply further restrictions on the characters used, and is used to
-//! accept the respective keys processed by the iterator.
-//!
-//! A predicate is `:`, optionally followed by `?`, followed by an
-//! `$` and a sequence of alphanumeric characters, not starting with a
-//! digit, and is used to accept values to be processed based on an
-//! external [`Predicate`].
-//!
-//! A key match is a datafu expression (including, but not limited to, the
-//! empty datafu expression) enclosed within `[` and `]`, optionally
-//! prefixed with an identifier and zero or more predicates, and applies the
-//! enclosed predicates and datafu expression to the key (or index) being
-//! processed. A key match enables additional validation of keys and/or
-//! extraction of values from keys, and accepts a key if and only if the
-//! enclosed predicates accept the key and the enclosed expression matches the
-//! key. The matched key is stored in the identifier.
-//!
-//! A subvalue is a datafu expression (including, but not limited to, the
-//! empty datafu expression) enclosed within `(` and `)`, and applies
-//! the enclosed datafu expression to the value (or index) being processed.
-//! A subvalue enables the ability to match multiple values on the same
-//! object, and accepts a value if and only the enclosed expression
-//! matches the value. A subvalue can be made optional by the presence of
-//! a `?` after the subvalue - in case of no match, it will just omit
-//! the relevant keys in the result. Optional subvalues are unrelated to
-//! non-validating syntax elements (see below), they just use the same
-//! syntax.
+//! A datafu pattern starts with an optional value matcher and is otherwise a
+//! sequence of iterative elements (see "arrow" below) followed by subvalues.
+//!
+//! A datafu pattern is composed of the following elements:
+//!
+//! 1. An arrow, `->` indicates iteration. (Note that datafu operates directly
+//!     on serde and deserialization is an iterative process.)
+//!
+//!     Always followed by a matcher, or a name and an optional matcher.
+//! 2. A name is a sequence of alphanumeric characters, not starting with a
+//!     digit. A name collects a matched value. The value will be identified by
+//!     the name.
+//! 3. A matcher is one of the following 5 elements.
+//! 4. A literal is a quoted string delimited by `'` with `%` as the escape
+//!     character. A literal can contain any Unicode scalar value, and the only
+//!     allowed escape codes are `%%` and `%'`, for `%` and `'` respectively,
+//!     which must be escaped. A literal matches a string value and can be
+//!     optional.
+//! 5. A parameter is `$`, optionally followed by `?`, followed by a sequence
+//!     of alphanumeric characters, not starting with a digit. This is
+//!     currently unimplemented.
+//! 6. A regex is a quoted string delimited by `/`, with `%` as the escape
+//!     character. A regex can contain any Unicode scalar value, and the only
+//!     allowed escape codes are `%%` and `%/`, for `%` and `/` respectively,
+//!     which must be escaped. A regex element matches a string value against
+//!     the regex it represents, and can be optional.
+//! 7. A predicate is `:`, optionally followed by `?`, followed by `$` and a
+//!     sequence of alphanumeric characters, not starting with a digit. This is
+//!     currently unimplemented.
+//! 8. A type is `:`, optionally followed by `?`, followed by one of the
+//!     following keywords: TODO
+//! 9. A key match is a sequence of iterative elements followed by subvalues,
+//!     prefixed by either a matcher or a name and an optional matcher,
+//!     and enclosed within `[` and `]`, optionally followed by `?`. A key
+//!     match applies to map keys and sequence indices, and aside from the
+//!     previously mentioned requirement is otherwise just a datafu pattern.
+//! 10. A subvalue is a sequence of iterative elements followed by subvalues,
+//!     enclosed within `(` and `)`, optionally followed by `?`. A sequence of
+//!     subvalues enables matching distinct patterns on the same value, like
+//!     distinct fields of a struct.
 //!
 //! Some syntax elements can be validating or non-validating. Validating
 //! syntax elements will return a [`errors::MatchError::ValidationError`]
-//! whenever a non-accepted element is encountered, whereas non-validating
-//! ones will skip them. Whether an element is validating is determined by
-//! the absence of an optional `?` in the documented position. Note that
-//! it is possible for a validating syntax element to still yield results
-//! before returning a [`errors::MatchError::ValidationError`], so one
-//! needs to be careful when writing code where such behaviour could
-//! result in a security vulnerability.
+//! whenever a value or subpattern fails to match, whereas non-validating
+//! ones will skip them. In general, whether an element is validating is
+//! determined by the absence of an optional `?` in the documented position,
+//! with the exception of key matches, which instead use it to filter entries
+//! that didn't match.
 //!
-//! The empty pattern matches anything, but only does so once.
+//! The empty pattern matches anything, but only does so once. Empty subvalues
+//! are ignored.
 //!
 //! ## Syntax of Datafu Expressions
 //!
 //! Datafu Expressions follow the given syntax, in (pseudo-)extended BNF:
 //!
 //! ```text
-//! expression ::= [type] [predicate] {arrow tag} {subvalue}
-//! tag ::= identifier [arg] [predicate] | arg [predicate]
-//! arg ::= parameter | literal | regex | keymatch
+//! pattern ::= [matcher] tree
+//! tree ::= {tag} {subvalue}
+//! tag ::= arrow [keymatch] valuematch
+//! valuematch ::= matcher | name [matcher]
+//! matcher ::= parameter | literal | regex | predicate | type
 //!
 //! arrow ::= '->'
-//! keymatch ::= '[' [name] expression ']' ['?']
-//! subvalue ::= '(' expression ')' ['?']
+//! keymatch ::= '[' valuematch tree ']' ['?']
+//! subvalue ::= '(' tree ')' ['?']
 //! ```
 //!
-//! For a description of the terminals "parameter", "literal", "regex" and
-//! "predicate", see "Syntax Elements of Datafu Expressions" above.
+//! For a description of the terminals "parameter", "literal", "regex", "type"
+//! and "predicate", see "Syntax Elements of Datafu Expressions" above.
 //!
 //! # Examples
 //!
@@ -116,7 +103,7 @@
 //! {"a": {"1": {"y": true}, "2": {"x": true, "y": true}}}
 //! ```
 //!
-//! Produces the results for the sub-JSON
+//! Produces the same results as if matched against the sub-JSON
 //!
 //! ```json
 //! {"a": {"2": {"x": true, "y": true}}}
diff --git a/src/parser.rs b/src/parser.rs
index a11b68c..eb378ad 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -877,5 +877,17 @@ mod tests {
         run_pattern(":map->[:str]:str");
         run_pattern(":map(->[:str]:str)()");
     }
+
+    #[test]
+    fn test_documented_edge_cases() {
+        // tests that these patterns parse successfully
+        fn run_pattern(mut s: &str) {
+            let _ = prep_parser(s).pattern(&mut s).unwrap();
+        }
+        // Empty pattern.
+        run_pattern("");
+        // Empty subvalue
+        run_pattern("()");
+    }
 }
 
diff --git a/src/vm/de.rs b/src/vm/de.rs
index 493b658..8f2aa8d 100644
--- a/src/vm/de.rs
+++ b/src/vm/de.rs
@@ -169,6 +169,7 @@ impl<'pat, 'state, 'de, O: Serialize> Packer<'pat, 'state, O> {
                                 matches: true,
                                 poison: false,
                             };
+                            debug_assert!(!new_frame.ops.is_empty());
                             // we want the "newest" frame last, so it is
                             // easier to unwind back.
                             self.interp.frames.insert(at, new_frame);
@@ -209,12 +210,13 @@ impl<'pat, 'state, 'de, O: Serialize> Packer<'pat, 'state, O> {
                     pack_index -= 1;
                 }
                 if frame.poison {
-                    if has_pack {
-                        packs.remove(pack_index);
-                    }
-                    frame.matches = false;
-                    has_pack = false;
+                    // FIXME should frame.poison always remove the pack?
                     if frame.is_value() {
+                        if has_pack {
+                            packs.remove(pack_index);
+                        }
+                        frame.matches = false;
+                        has_pack = false;
                         frame.poison = false;
                     }
                 }
@@ -260,8 +262,14 @@ impl<'pat, 'state, 'de, O: Serialize> Packer<'pat, 'state, O> {
                                 packs[target_pack].merge_breadth(pack);
                             }
                         } else {
+                            //if frame.poison {
+                            //    target_frame.poison = true;
+                            //}
                             if !optional {
-                                target_frame.poison = true;
+                                self.interp.error.insert({
+                                    MatchError::ValidationError
+                                });
+                                return Err(E::custom("subtree failed"));
                             }
                         }
                         if let Some((0, _)) = target_frame.num_subtrees() {
@@ -864,10 +872,14 @@ where
                 }
             }
         }
+        let mut poison = false;
         for (f, m) in self.frames_mut().iter_active_mut().zip(output_matches) {
-            // FIXME inspect frame.key_subtree() for optional?
-            // what is this even supposed to do again?
             f.matches = m;
+            if !m {
+                if let Some((_, false)) = f.key_subtree() {
+                    poison = true;
+                }
+            }
         }
         let obj = SerdeObject::Map(obj_inner);
         let mut final_packs = self.step_out(output_packs)?;
@@ -880,6 +892,7 @@ where
                 | Some((Type::IgnoredAny, _))
                 | None
                 => {
+                    frame.poison = poison;
                     let matched = std::mem::replace(&mut frame.matches, true);
                     if !matched {
                         final_packs.insert(
@@ -1361,6 +1374,36 @@ mod tests {
     }
 
     #[test]
+    fn test_empty_subtrees() {
+        // tests empty subtrees
+        let consts = crate::parser::parse::<&'static str, &'static str, ()>(
+            ":map()(())",
+            None,
+            None,
+        ).unwrap();
+        dbg!(&consts);
+        let data = r#"{"hello": "world"}"#;
+        let mut der = JsonDeserializer::from_str(data);
+        let mut err = Default::default();
+        let mut frames = Default::default();
+        let interp = Interpreter::new(
+            &consts,
+            &mut err,
+            &mut frames,
+            //&mut output,
+        );
+        let result = Packer::new(
+            interp,
+            MAX_CALLS,
+        ).deserialize(&mut der);
+        let (mut packs, obj) = result.unwrap();
+        assert!(obj.is_none());
+        assert_eq!(packs.len(), 1);
+        let pack = packs.pop().unwrap();
+        assert_eq!(pack.subpacks.len(), 0);
+    }
+
+    #[test]
     fn test_parser_subtrees() {
         // use a parsed pattern with subtrees to test Packer
         // also test a non-self-describing format (postcard)
@@ -1402,15 +1445,165 @@ mod tests {
     }
 
     #[test]
-    fn test_subtree_missing() {
-        // use a parsed pattern that might actually be used in the real world.
+    fn test_parser_subtrees_validation() {
+        // checks that the Packer can validate multiple keys.
+        let consts = crate::parser::parse::<&'static str, &'static str, ()>(
+            ":map(->['name'?]name:str)(->['value'?]value:u32)(->[:str]:?ignored_any)",
+            None,
+            None,
+        ).unwrap();
+        let data = &[
+            0x02, // map length (2)
+            0x04, // string length (4)
+            0x6E, 0x61, 0x6D, 0x65, // b'name'
+            0x01, // string length (1)
+            0x61, // b'a'
+            0x05, // string length (5)
+            0x76, 0x61, 0x6C, 0x75, 0x65, // b'value'
+            0x01, // 1
+        ];
+        let mut der = PostcardDeserializer::from_bytes(data);
+        let mut err = Default::default();
+        let mut frames = Default::default();
+        let interp = Interpreter::new(
+            &consts,
+            &mut err,
+            &mut frames,
+            //&mut output,
+        );
+        let result = Packer::new(
+            interp,
+            MAX_CALLS,
+        ).deserialize(&mut der);
+        let (mut packs, obj) = result.unwrap();
+        assert!(obj.is_none());
+        assert_eq!(packs.len(), 1);
+        let pack = packs.pop().unwrap();
+        assert_eq!(pack.subpacks.len(), 1);
+        assert_eq!(pack.subpacks[0]["name"].1, SerdeObject::Str(From::from("a")));
+        assert_eq!(pack.subpacks[0]["value"].1, SerdeObject::U32(1));
+    }
+
+    #[test]
+    fn test_one_or_more_subtrees_validation() {
+        // checks that the Packer supports OR-like validation semantics
+        let consts = crate::parser::parse::<&'static str, &'static str, ()>(
+            ":map((->['name'?]?name)?(->['value'?]?value)?)(->[:str]:u32)",
+            None,
+            None,
+        ).unwrap();
+        let data = &[
+            0x01, // map length (1)
+            0x04, // string length (4)
+            0x6E, 0x61, 0x6D, 0x65, // b'name'
+            0x01, // 1
+        ];
+        let mut der = PostcardDeserializer::from_bytes(data);
+        let mut err = Default::default();
+        let mut frames = Default::default();
+        let interp = Interpreter::new(
+            &consts,
+            &mut err,
+            &mut frames,
+            //&mut output,
+        );
+        let result = Packer::new(
+            interp,
+            MAX_CALLS,
+        ).deserialize(&mut der);
+        let (mut packs, obj) = result.unwrap();
+        assert!(obj.is_none());
+        assert_eq!(packs.len(), 1);
+        let pack = packs.pop().unwrap();
+        assert_eq!(pack.subpacks.len(), 1);
+        assert_eq!(pack.subpacks[0]["name"].1, SerdeObject::U32(1));
+    }
+
+    #[test]
+    fn test_one_or_more_subtrees_validation_failure() {
+        // checks that the Packer supports OR-like validation semantics
+        // (failure)
+        let consts = crate::parser::parse::<&'static str, &'static str, ()>(
+            ":map((->['name'?]?name)?(->['value'?]?value)?)(->[:str]:u32)",
+            None,
+            None,
+        ).unwrap();
+        let data = &[
+            0x01, // map length (1)
+            0x04, // string length (4)
+            0x6E, 0x65, 0x6D, 0x65, // b'neme'
+            0x01, // 1
+        ];
+        let mut der = PostcardDeserializer::from_bytes(data);
+        let mut err = Default::default();
+        let mut frames = Default::default();
+        let interp = Interpreter::new(
+            &consts,
+            &mut err,
+            &mut frames,
+            //&mut output,
+        );
+        let result = Packer::new(
+            interp,
+            MAX_CALLS,
+        ).deserialize(&mut der);
+        assert!(matches!(err, Some(MatchError::ValidationError)));
+        assert!(result.is_err());
+    }
+
+    // FIXME this should test that the map doesn't contain other keys aside
+    // from the ones requested by the pattern, but this doesn't match existing
+    // datafu semantics at all. we need new syntax for it.
+    //#[test]
+    //fn test_parser_subtrees_no_additional_keys() {
+    //    // use a parsed pattern with subtrees to test Packer
+    //    // also test a non-self-describing format (postcard)
+    //    // also require at least one subtree to match on every iteration.
+    //    // also this checks for validation error
+    //    let consts = crate::parser::parse::<&'static str, &'static str, ()>(
+    //        ":map((->['name'?]name)?(->['value'?]value)?)(->[:str]:u32)",
+    //        None,
+    //        None,
+    //    ).unwrap();
+    //    let data = &[
+    //        0x03, // map length (3)
+    //        0x04, // string length (4)
+    //        0x6E, 0x61, 0x6D, 0x65, // b'name'
+    //        0x01, // 1
+    //        0x05, // string length (5)
+    //        0x76, 0x61, 0x6C, 0x75, 0x65, // b'value'
+    //        0x01, // 1
+    //        0x05, // string length (5)
+    //        0x76, 0x65, 0x6C, 0x75, 0x65, // b'velue'
+    //        0x01, // 1
+    //    ];
+    //    let mut der = PostcardDeserializer::from_bytes(data);
+    //    let mut err = Default::default();
+    //    let mut frames = Default::default();
+    //    let interp = Interpreter::new(
+    //        &consts,
+    //        &mut err,
+    //        &mut frames,
+    //        //&mut output,
+    //    );
+    //    let result = Packer::new(
+    //        interp,
+    //        MAX_CALLS,
+    //    ).deserialize(&mut der);
+    //    assert!(matches!(err, Some(MatchError::ValidationError)));
+    //    assert!(result.is_err());
+    //}
+
+    #[test]
+    fn test_key_optionally_missing() {
+        // test a pattern with a missing key
         let consts = crate::parser::parse::<&'static str, &'static str, ()>(
             "
             :map
             ->['a'?]:map
               ->[b:?str]:?map
-                (->['x'?]x:?bool)
-                (->['y'?]y:?bool)?
+                (->['x'?]x:?bool)?
+                (->['y'?]?y:?bool)?
             ",
             None,
             None
@@ -1433,7 +1626,6 @@ mod tests {
         assert!(obj.is_none());
         assert_eq!(packs.len(), 1);
         let pack = &packs[0];
-        dbg!(pack);
         assert_eq!(pack.subpacks.len(), 1);
         let b = &pack.subpacks[0]["b"];
         assert_eq!(b.1, SerdeObject::Str(From::from("2")));
@@ -1443,16 +1635,83 @@ mod tests {
     }
 
     #[test]
+    fn test_map_without_values() {
+        // test that the map without values still collects the key
+        let consts = crate::parser::parse::<&'static str, &'static str, ()>(
+            "
+            :map
+            ->[a:?str]?:?map
+              ->[b:?str]?:?map
+            ",
+            None,
+            None
+        ).unwrap();
+        let data = r#"{"a": {}}"#;
+        let mut der = JsonDeserializer::from_str(data);
+        let mut err = Default::default();
+        let mut frames = Default::default();
+        let interp = Interpreter::new(
+            &consts,
+            &mut err,
+            &mut frames,
+            //&mut output,
+        );
+        let result = Packer::new(
+            interp,
+            MAX_CALLS,
+        ).deserialize(&mut der);
+        let (mut packs, obj) = result.unwrap();
+        assert!(obj.is_none());
+        assert_eq!(packs.len(), 1);
+        let pack = &packs[0];
+        assert_eq!(pack.subpacks.len(), 1);
+        let a = &pack.subpacks[0]["a"];
+        assert_eq!(a.1, SerdeObject::Str(From::from("a")));
+        assert_eq!(a.0.subpacks.len(), 0);
+    }
+
+    #[test]
+    fn test_guaranteed_at_least_once() {
+        // test that at least one match is required before collecting the key
+        let consts = crate::parser::parse::<&'static str, &'static str, ()>(
+            "
+            :map
+            ->[a:?str]:?map
+              ->[b:?str]:?map
+            ",
+            None,
+            None
+        ).unwrap();
+        let data = r#"{"a": {}}"#;
+        let mut der = JsonDeserializer::from_str(data);
+        let mut err = Default::default();
+        let mut frames = Default::default();
+        let interp = Interpreter::new(
+            &consts,
+            &mut err,
+            &mut frames,
+            //&mut output,
+        );
+        let result = Packer::new(
+            interp,
+            MAX_CALLS,
+        ).deserialize(&mut der);
+        let (mut packs, obj) = result.unwrap();
+        assert!(obj.is_none());
+        assert_eq!(packs.len(), 0);
+    }
+
+    #[test]
     fn test_realish_use_case() {
         // use a parsed pattern that might actually be used in the real world.
         let consts = crate::parser::parse::<&'static str, &'static str, ()>(
             "
             :map
-            ->['projects'?]?:map
-              ->[commit:?str]?:?map
-                ->[url:?str]?:?map
-                  ->[branch:?str]?:?map
-                    (->['active'?]?active:?bool)
+            ->['projects'?]:map
+              ->[commit:?str]:?map
+                ->[url:?str]:?map
+                  ->[branch:?str]:?map
+                    (->['active'?]active:?bool)?
                     (->['federate'?]?federate:?bool)?
             ",
             None,
diff --git a/src/vm/mod.rs b/src/vm/mod.rs
index 81131c0..21fec73 100644
--- a/src/vm/mod.rs
+++ b/src/vm/mod.rs
@@ -55,7 +55,7 @@ impl<O: Serialize> Default for PatternConstants<O> {
     }
 }
 
-impl<O: Serialize> std::fmt::Debug for PatternConstants<O> {
+impl<O: Serialize + std::fmt::Debug> std::fmt::Debug for PatternConstants<O> {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         f.debug_struct(
             "PatternConstants"
@@ -70,7 +70,7 @@ impl<O: Serialize> std::fmt::Debug for PatternConstants<O> {
             &format_args!("({} predicates)", self.predicates.len()),
         ).field(
             "defs",
-            &format_args!("FIXME"),
+            &self.defs,
         ).finish()
     }
 }