diff options
-rw-r--r-- | Cargo.toml | 3 | ||||
-rw-r--r-- | src/errors.rs | 6 | ||||
-rw-r--r-- | src/lib.rs | 7 | ||||
-rw-r--r-- | src/pattern.rs | 15 | ||||
-rw-r--r-- | src/type_tree.rs | 70 | ||||
-rw-r--r-- | src/vm/de.rs | 135 | ||||
-rw-r--r-- | src/vm/mod.rs (renamed from src/vm.rs) | 97 |
7 files changed, 317 insertions, 16 deletions
diff --git a/Cargo.toml b/Cargo.toml index 1a85a35..a9e6de1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,6 +14,7 @@ homepage = "https://soniex2.github.io/ganarchy/project/c0b4a8a326a320ac33c5d9d6b [dependencies] regex = "1" impl_trait = "0.1.7" +serde_transmute = "0.1.4" serde = "1.0.140" erased-serde = "0.3.21" @@ -21,7 +22,7 @@ erased-serde = "0.3.21" proptest = "1.0.0" serde_json = "1.0.82" serde = {version = "1.0.140", features = ["derive"]} -charx = "1" +charx = "1.0.0" [features] default = ['stable'] diff --git a/src/errors.rs b/src/errors.rs index b41b225..9b0025e 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -39,11 +39,11 @@ pub enum PatternError<'a> { // /// These are errors that may be returned by the matcher when matching a // /// pattern. // #[derive(Clone, Debug)] -// pub enum MatchError { +pub enum MatchError { // /// Returned if the pattern nests too deeply. // StackOverflow, // /// Returned if the pattern rejects the input. -// ValidationError, + ValidationError, // /// Returned if the pattern attempts an unsupported operation. // /// // /// In particular, if the [`PatternTypes`] doesn't support `get` or `pairs` @@ -52,4 +52,4 @@ pub enum PatternError<'a> { // UnsupportedOperation, // /// Returned if an unspecified non-critical error occurred. // Other -// } +} diff --git a/src/lib.rs b/src/lib.rs index a3a6e1e..f71d81c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,8 @@ // Copyright (C) 2021-2022 Soni L. // SPDX-License-Identifier: MIT OR Apache-2.0 +#![warn(elided_lifetimes_in_paths)] + //! Datafu is a regex-inspired query language. It was primarily //! designed for processing object trees parsed from configuration files, but //! can be used with anything that supports serde. @@ -99,6 +101,7 @@ //! <!-- TODO --> pub mod errors; +pub mod type_tree; mod parser; mod pattern; mod vm; @@ -107,7 +110,7 @@ pub use pattern::Pattern; /// A predicate. 
pub type Predicate = dyn (Fn( - &mut dyn erased_serde::Deserializer<> + &mut dyn erased_serde::Deserializer<'_> ) -> bool) + Send + Sync; /// Helper to build predicates because closure inference is the worst. @@ -133,7 +136,7 @@ pub type Predicate = dyn (Fn( pub fn pred<F>(f: F) -> Box<Predicate> where F: (Fn( - &mut dyn erased_serde::Deserializer<> + &mut dyn erased_serde::Deserializer<'_> ) -> bool) + Send + Sync + 'static, { Box::new(f) diff --git a/src/pattern.rs b/src/pattern.rs index 2e69714..fc3c8a7 100644 --- a/src/pattern.rs +++ b/src/pattern.rs @@ -6,16 +6,17 @@ use std::borrow::Borrow; use std::collections::BTreeMap; -use serde::Deserialize; -use serde::Deserializer; -use serde::Serialize; +use serde::de::Deserialize; +use serde::de::DeserializeSeed; +use serde::de::Deserializer; +use serde::ser::Serialize; use crate::Predicate; use crate::errors::PatternError; use crate::parser::parse; -//use crate::vm::Matcher; +use crate::vm; use crate::vm::PatternConstants; -//use crate::vm::MAX_CALLS; +use crate::vm::MAX_CALLS; /// A compiled Datafu pattern. /// @@ -83,11 +84,13 @@ impl<O: Serialize> Pattern<O> { } /// Matches the pattern against an input. - pub fn deserialize<'de, Der, De>(&self, de: Der) -> Result<De, Der::Error> + pub fn deserialize<'de, Der, De>(&self, der: Der) -> Result<De, Der::Error> where Der: Deserializer<'de>, De: Deserialize<'de>, { + let pack = vm::Packer::new(&self.consts, MAX_CALLS).deserialize(der)?; + let de = De::deserialize(vm::Unpacker::new(pack, MAX_CALLS)); todo!() } } diff --git a/src/type_tree.rs b/src/type_tree.rs new file mode 100644 index 0000000..8e8098b --- /dev/null +++ b/src/type_tree.rs @@ -0,0 +1,70 @@ +// Copyright (C) 2021-2022 Soni L. +// SPDX-License-Identifier: MIT OR Apache-2.0 + +//! Type Tree support. +//! +//! Type Trees are a Datafu feature for extracting types from a `serde`-based +//! `Deserialize` in such a way that it can be used with Datafu patterns. +//! +//! 
They work by matching the `Deserialize` against some data, with the help of +//! `serde_transmute`. Datafu then collects the relevant `Deserialize` calls, +//! and uses them to infer an appropriate type tree for dynamic +//! deserialization. +//! +//! When introspecting the `Deserialize`, all matching parts are extracted, and +//! non-matching parts are ignored. Even if an error occurs, Datafu will gladly +//! infer a type tree for what it could match. +//! +//! For example, given a struct and the corresponding data: +//! +//! ``` +//! struct Foo { +//! bar: i32, +//! } +//! +//! let data = Foo { bar: 0 }; +//! ``` +//! +//! Building a type tree will first inspect the struct like so: +//! +//! 1. call `deserialize()` on `Foo`. +//! 2. inspect the `deserialize_struct` from `Foo`, storing the name and +//! fields. +//! 3. give `Foo` the appropriate visitor (from `data`), through +//! `serde_transmute`. +//! 4. inspect the `deserialize_i32` etc, also storing those. +//! +//! The resulting type tree can then be used in any pattern to effectively +//! match a `Foo`, but more efficiently than with a predicate. Another big +//! difference between predicates and type trees is how predicates are eager, +//! and can consume values that would otherwise be matched by the rest of a +//! pattern. +//! +//! Type trees are pretty flexible. Consider the following example: +//! +//! ``` +//! struct Foo { +//! bar: Vec<u32>, +//! } +//! +//! let data = Foo { bar: vec![1, 2, 3] }; +//! ``` +//! +//! This will actually produce a type tree which checks that the first 3 items +//! are `u32`! Further, when using different types for the predicate and the +//! data, you can get even more flexibility. For example, with the following +//! struct and data: +//! +//! ``` +//! struct Foo { +//! bar: Vec<u32>, +//! } +//! +//! let data = (); +//! ``` +//! +//! Datafu will actually inspect the `deserialize_struct`, and then the +//! struct visitor will error. 
But despite the error, it'll still create a type +//! tree for the `deserialize_struct`! + +// TODO diff --git a/src/vm/de.rs b/src/vm/de.rs index 7039ea9..4d0d097 100644 --- a/src/vm/de.rs +++ b/src/vm/de.rs @@ -1,6 +1,139 @@ // Copyright (C) 2022 Soni L. // SPDX-License-Identifier: MIT OR Apache-2.0 -use crate::vm; +//! Deserialization-related parts of the VM. +use serde::Serialize; +use serde::de::Error as _; +use super::PatternConstants; +use super::PatternElement; +use super::Pack; + +/// A `DeserializeSeed` for Datafu input. +/// +/// This converts from Serde to Datafu's internal representation (a "pack"). +pub struct Packer<'pat, O: Serialize> { + /// The pattern currently being processed. + pat: &'pat PatternConstants<O>, + /// The instructions/function currently being processed. + ops: &'pat [PatternElement], + /// Maximum number of calls. + call_limit: usize, +} + +impl<'pat, O: Serialize> Packer<'pat, O> { + pub(crate) fn new( + pat: &'pat PatternConstants<O>, + call_limit: usize, + ) -> Self { + Self { + pat, call_limit, ops: &pat.protos.last().unwrap()[..], + } + } +} + +impl<'pat, 'de, O> serde::de::DeserializeSeed<'de> for Packer<'pat, O> +where + O: Serialize, +{ + type Value = Pack; + fn deserialize<D>(self, deserializer: D) -> Result<Pack, D::Error> + where + D: serde::Deserializer<'de> + { + // check the first op + let first = self.ops.first(); + match first { + Some(PatternElement::ApplyPredicate(id, skippable)) => { + let predicate = &self.pat.predicates[*id]; + let ok = predicate(todo!()); + match (ok, skippable) { + (true, _) => { + todo!() + }, + (false, false) => { + return Err(D::Error::custom("predicate didn't match")); + }, + (false, true) => { + todo!() + }, + } + }, + _ => { + dbg!(first); + todo!() + }, + } + } +} + +/// A `Deserializer` for Datafu output. +/// +/// This converts from Datafu's internal representation (a "pack") into the +/// desired output type. 
+pub struct Unpacker { + pack: Pack, + call_limit: usize, +} + +impl Unpacker { + /// Unpacks a Datafu "pack". + pub fn new(pack: Pack, call_limit: usize) -> Self { + Self { + pack, call_limit, + } + } +} + +impl<'de> serde::Deserializer<'de> for Unpacker { + // TODO datafu errors + type Error = serde::de::value::Error; + fn deserialize_any<V>(self, _: V) -> Result<V::Value, Self::Error> where V: serde::de::Visitor<'de> { todo!() } + fn deserialize_bool<V>(self, _: V) -> Result<V::Value, Self::Error> where V: serde::de::Visitor<'de> { todo!() } + fn deserialize_i8<V>(self, _: V) -> Result<V::Value, Self::Error> where V: serde::de::Visitor<'de> { todo!() } + fn deserialize_i16<V>(self, _: V) -> Result<V::Value, Self::Error> where V: serde::de::Visitor<'de> { todo!() } + fn deserialize_i32<V>(self, _: V) -> Result<V::Value, Self::Error> where V: serde::de::Visitor<'de> { todo!() } + fn deserialize_i64<V>(self, _: V) -> Result<V::Value, Self::Error> where V: serde::de::Visitor<'de> { todo!() } + fn deserialize_u8<V>(self, _: V) -> Result<V::Value, Self::Error> where V: serde::de::Visitor<'de> { todo!() } + fn deserialize_u16<V>(self, _: V) -> Result<V::Value, Self::Error> where V: serde::de::Visitor<'de> { todo!() } + fn deserialize_u32<V>(self, _: V) -> Result<V::Value, Self::Error> where V: serde::de::Visitor<'de> { todo!() } + fn deserialize_u64<V>(self, _: V) -> Result<V::Value, Self::Error> where V: serde::de::Visitor<'de> { todo!() } + fn deserialize_f32<V>(self, _: V) -> Result<V::Value, Self::Error> where V: serde::de::Visitor<'de> { todo!() } + fn deserialize_f64<V>(self, _: V) -> Result<V::Value, Self::Error> where V: serde::de::Visitor<'de> { todo!() } + fn deserialize_char<V>(self, _: V) -> Result<V::Value, Self::Error> where V: serde::de::Visitor<'de> { todo!() } + fn deserialize_str<V>(self, _: V) -> Result<V::Value, Self::Error> where V: serde::de::Visitor<'de> { todo!() } + fn deserialize_string<V>(self, _: V) -> Result<V::Value, Self::Error> where V: 
serde::de::Visitor<'de> { todo!() } + fn deserialize_bytes<V>(self, _: V) -> Result<V::Value, Self::Error> where V: serde::de::Visitor<'de> { todo!() } + fn deserialize_byte_buf<V>(self, _: V) -> Result<V::Value, Self::Error> where V: serde::de::Visitor<'de> { todo!() } + fn deserialize_option<V>(self, _: V) -> Result<V::Value, Self::Error> where V: serde::de::Visitor<'de> { todo!() } + fn deserialize_unit<V>(self, _: V) -> Result<V::Value, Self::Error> where V: serde::de::Visitor<'de> { todo!() } + fn deserialize_unit_struct<V>(self, _: &'static str, _: V) -> Result<V::Value, Self::Error> where V: serde::de::Visitor<'de> { todo!() } + fn deserialize_newtype_struct<V>(self, _: &'static str, _: V) -> Result<V::Value, Self::Error> where V: serde::de::Visitor<'de> { todo!() } + fn deserialize_seq<V>(self, _: V) -> Result<V::Value, Self::Error> where V: serde::de::Visitor<'de> { todo!() } + fn deserialize_tuple<V>(self, _: usize, _: V) -> Result<V::Value, Self::Error> where V: serde::de::Visitor<'de> { todo!() } + fn deserialize_tuple_struct<V>(self, _: &'static str, _: usize, _: V) -> Result<V::Value, Self::Error> where V: serde::de::Visitor<'de> { todo!() } + fn deserialize_map<V>(self, _: V) -> Result<V::Value, Self::Error> where V: serde::de::Visitor<'de> { todo!() } + fn deserialize_struct<V>( + self, + _: &'static str, + fields: &'static [&'static str], + visitor: V, + ) -> Result<V::Value, Self::Error> + where + V: serde::de::Visitor<'de>, + { + todo!() + } + fn deserialize_enum<V>(self, _: &'static str, _: &'static [&'static str], _: V) -> Result<V::Value, Self::Error> where V: serde::de::Visitor<'de> { todo!() } + fn deserialize_identifier<V>(self, _: V) -> Result<V::Value, Self::Error> where V: serde::de::Visitor<'de> { todo!() } + fn deserialize_ignored_any<V>(self, _: V) -> Result<V::Value, Self::Error> where V: serde::de::Visitor<'de> { todo!() } +} + +/// A Deserializer for collecting matches from [`crate::Predicate`]s. +/// +/// What are we doing? 
+/// +/// We certainly have regrets. +pub struct PredicateCollector { +} diff --git a/src/vm.rs b/src/vm/mod.rs index ac5c95d..92a99d7 100644 --- a/src/vm.rs +++ b/src/vm/mod.rs @@ -13,6 +13,9 @@ use crate::Predicate; mod de; +pub use de::Unpacker; +pub use de::Packer; + /// Max depth for VM/serde recursion. pub(crate) const MAX_CALLS: usize = 250; @@ -46,8 +49,28 @@ impl<O: Serialize> Default for PatternConstants<O> { } } +impl<O: Serialize> std::fmt::Debug for PatternConstants<O> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct( + "PatternConstants" + ).field( + "protos", &self.protos, + ).field( + "strings", &self.strings, + ).field( + "regices", &self.regices, + ).field( + "predicates", + &format_args!("({} predicates)", self.predicates.len()), + ).field( + "defs", + &format_args!("FIXME"), + ).finish() + } +} + /// A pattern element. -#[derive(Copy, Clone)] +#[derive(Copy, Clone, Debug)] pub(crate) enum PatternElement { Arrow, @@ -58,11 +81,79 @@ pub(crate) enum PatternElement { ParameterKey(usize, bool), KeySubtree(usize, bool), ValueSubtree(usize, bool), - ApplyPredicate(usize, bool), - End + /// Represents a predicate which must be applied. + /// + /// These are custom, arbitrary predicates, powered by serde. They're + /// represented by `:$foo` in a pattern. + ApplyPredicate( + /// The predicate index (in `PatternConstants.predicates`). + usize, + /// Whether to skip non-matching values, instead of erroring. + bool, + ), + + /// Represents a type expectation. + /// + /// These are similar to predicates. They're represented by `:foo`, but are + /// built-in and provide functionality not supported by predicates. + /// + /// Specifically, predicates cannot ask serde for a map or a list directly. + /// Instead, they'd be required to parse a whole map/list/etc, which could + /// cause issues which datafu is designed to avoid. (Datafu is designed to + /// resist malicious input more so than arbitrary serde deserializers.) 
+ Type( + /// The expected type. + Type, + /// Whether to skip non-matching values, instead of erroring. + bool, + ), + + End, +} + +/// The types datafu and serde currently support. +/// +/// These are used as expectations for serde (e.g. +/// `Deserializer::deserialize_string`). +#[derive(Copy, Clone, Debug)] +pub(crate) enum Type { + Bool, + I8, + I16, + I32, + I64, + I128, + U8, + U16, + U32, + U64, + U128, + F32, + F64, + Char, + Str, + String, + Bytes, + ByteBuf, + Option, + Unit, + Seq, + Tuple(usize), + Map, + // these aren't really supported: + // UnitStruct, UnitVariant, NewtypeStruct, NewtypeVariant, TupleStruct, + // TupleVariant, Struct, StructVariant + // instead we use type trees for that. + /// Adapter for Type Trees. See `crate::type_tree` for more details. + Of { + /// The type tree index (in `PatternConstants.type_trees`). + type_tree: usize, + }, } +pub struct Pack; + //struct Frame<'a, 'b, T: PatternTypes> { // //obj: RefOwn<'b, T::Ref, T::Own>, // ops: &'a [PatternElement], |