From dcf5620842edb8a4632cde794c8e511e075d3662 Mon Sep 17 00:00:00 2001 From: SoniEx2 Date: Mon, 10 Oct 2022 23:50:37 -0300 Subject: Initial implementation of Packer::deserialize --- src/pattern.rs | 15 +++- src/type_tree.rs | 112 ------------------------ src/vm/de.rs | 259 ++++++++++++++++++++++++++++++++----------------------- src/vm/mod.rs | 181 ++++++++++++++++++++++++++------------ 4 files changed, 285 insertions(+), 282 deletions(-) delete mode 100644 src/type_tree.rs (limited to 'src') diff --git a/src/pattern.rs b/src/pattern.rs index 0fc6acd..a0c0ec3 100644 --- a/src/pattern.rs +++ b/src/pattern.rs @@ -42,9 +42,18 @@ impl Pattern { De: Deserialize<'de>, { let mut err = Default::default(); - let interp = vm::Interpreter::new(&self.consts, &mut err); - let pack = vm::Packer::new(interp, MAX_CALLS).deserialize(der)?; - let de = De::deserialize(vm::Unpacker::new(pack.0, MAX_CALLS)); + let frames = Default::default(); + //let mut output = Default::default(); + let interp = vm::Interpreter::new( + &self.consts, + &mut err, + &frames, + //&mut output, + ); + let (pack, obj) = vm::Packer::new(interp, MAX_CALLS).deserialize(der)?; + // this should always be None + debug_assert!(obj.is_none()); + let de = De::deserialize(vm::Unpacker::new(pack, MAX_CALLS)); todo!() } } diff --git a/src/type_tree.rs b/src/type_tree.rs deleted file mode 100644 index e7389d5..0000000 --- a/src/type_tree.rs +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright (C) 2022 Soni L. -// SPDX-License-Identifier: MIT OR Apache-2.0 - -//! Type Tree support. -//! -//! Type Trees are a Datafu feature for extracting types from a `serde`-based -//! `Deserialize` in such a way that it can be used with Datafu patterns. -//! -//! They work by matching the `Deserialize` against some data, with the help of -//! `serde_transmute`. Datafu then collects the relevant `Deserialize` calls, -//! and uses them to infer an appropriate type tree for dynamic -//! deserialization. -//! -//! When introspecting the `Deserialize`, all matching parts are extracted, and -//! non-matching parts are ignored. Even if an error occurs, Datafu will gladly -//! infer a type tree for what it could match. -//! -//! For example, given a struct and the corresponding data: -//! -//! ``` -//! struct Foo { -//! bar: i32, -//! } -//! -//! let data = Foo { bar: 0 }; -//! ``` -//! -//! Building a type tree will first inspect the struct like so: -//! -//! 1. call `deserialize()` on `Foo`. -//! 2. inspect the `deserialize_struct` from `Foo`, storing the name and -//! fields. -//! 3. give `Foo` the appropriate visitor (from `data`), through -//! `serde_transmute`. -//! 4. inspect the `deserialize_i32` etc, also storing those. -//! -//! The resulting type tree can then be used in any pattern to effectively -//! match a `Foo`, but more efficiently than with a predicate. Another big -//! difference between predicates and type trees is how predicates are eager, -//! and can consume values that would otherwise be matched by the rest of a -//! pattern, like `IgnoredAny`. -//! -//! Type trees are pretty flexible. Consider the following example: -//! -//! ``` -//! struct Foo { -//! bar: Vec, -//! } -//! -//! let data = Foo { bar: vec![1, 2, 3] }; -//! ``` -//! -//! This will actually produce a type tree which checks that the first 3 items -//! are `u32`! Further, when using different types for the predicate and the -//! data, you can get even more flexiblity. For example, with the following -//! struct and data: -//! -//! ``` -//! struct Foo { -//! bar: Vec, -//! } -//! -//! let data = (); -//! ``` -//! -//! Datafu will actually inspect the `deserialize_struct`, and then the -//! struct visitor will error. But despite the error, it'll still create a type -//! tree for the `deserialize_struct`! - -// use serde::Deserializer; - -// /// A deserializer which attempts to fill in a type tree. -// struct TypeTreeDeserializer<'tt, D> { -// inner: D, -// tt: &'tt mut TypeTreeNode, -// } - -/// A Type Tree entry type. -/// -/// This represents a type to be deserialized with Serde, with everything that -/// comes with that. It supports the 29 core Serde types, and 2 self-describing -/// ones. -#[derive(Clone, Debug, PartialEq, Eq, Hash, Default)] -pub enum TypeTreeType { - /// An open type, which can be anything. - /// - /// This represents [`Deserializer::deserialize_any`]. - #[default] - Any, - /// A type for a value which will be ignored. - /// - /// This represents [`Deserializer::deserialize_ignored_any`]. - IgnoredAny, - Bool, -} - -/// A node of a type tree. -#[derive(Clone, Debug, PartialEq, Eq, Hash, Default)] -pub struct TypeTreeNode { - /// The type to be requested for this node. - pub node_type: TypeTreeType, - /// The types for when this node is an enum. - pub enum_nodes: (), // TODO - /// The types for when this node is a map. - pub map_nodes: (), // TODO - /// The types for when this node is a seq. - pub seq_nodes: (), // TODO - /// The type for when this node is a some. - pub some_node: Option>, - /// The type for when this node is a newtype struct. - pub newtype_node: Option>, -} diff --git a/src/vm/de.rs b/src/vm/de.rs index 2282484..640b1aa 100644 --- a/src/vm/de.rs +++ b/src/vm/de.rs @@ -10,11 +10,13 @@ use indexmap::IndexMap; use serde::Serialize; use serde::de::Error as _; +use serde::de::IntoDeserializer as _; use smallvec::SmallVec; use these::These; +use super::Frame; use super::Interpreter; use super::Pack; use super::PatternConstants; @@ -33,56 +35,63 @@ pub(crate) struct Packer<'pat, 'state, O: Serialize> { call_limit: usize, /// Whether we're collecting values. collecting: bool, - /// Instructions currently being processed. - ops: SmallVec<[InstructionReg<'pat>; 1]>, } -/// Instruction currently being processed. -struct InstructionReg<'pat> { - /// The (current) program sequence. - instructions: &'pat [PatternElement], - /// Whether this instruction is required to match. - required: bool, +struct FramesMut<'packer, 'pat> { + frames: std::cell::RefMut<'packer, Vec>>, +} + +struct Frames<'packer, 'pat> { + frames: std::cell::Ref<'packer, Vec>>, +} + +impl<'packer, 'pat> FramesMut<'packer, 'pat> { + fn iter_mut<'a>(&'a mut self) -> impl Iterator> where 'packer: 'a { + self.frames.iter_mut() + } + + fn iter_active_mut<'a>(&'a mut self) -> impl Iterator> where 'packer: 'a { + self.iter_mut().filter(|frame| { + frame.matches + }) + } +} + +impl<'packer, 'pat> Frames<'packer, 'pat> { + fn iter<'a>(&'a self) -> impl Iterator> where 'packer: 'a { + self.frames.iter() + } + + fn iter_active<'a>(&'a self) -> impl Iterator> where 'packer: 'a { + self.iter().filter(|frame| { + frame.matches + }) + } } impl<'pat, 'state, 'de, O: Serialize> Packer<'pat, 'state, O> { + /// Creates a new Packer. pub(crate) fn new( interp: Interpreter<'pat, 'state, O>, call_limit: usize, ) -> Self { - let ops = SmallVec::from_buf([ - InstructionReg { - instructions: &interp.pat.protos.last().unwrap()[..], - required: true, - } - ]); Self { interp: interp, call_limit: call_limit, collecting: false, - ops: ops, } } - /// Extracts the name for this element. - fn get_name(&self) -> SmallVec<[&'pat str; 1]> { - let mut name = SmallVec::<[&'pat str; 1]>::new(); - for reg in &self.ops { - match reg.instructions.first() { - | Some(PatternElement::Tag { name_and_value, .. }) - | Some(PatternElement::Value { name_and_value }) - => { - if let Some(name_key) = name_and_value.here() { - name.push(&self.interp.pat.strings[name_key]); - } - }, - None => { - // FIXME is this correct? - }, - _ => unreachable!(), - } + fn frames_mut(&mut self) -> FramesMut<'_, 'pat> { + FramesMut { + frames: self.interp.frames.borrow_mut(), + } + } + + fn frames(&mut self) -> Frames<'_, 'pat> { + Frames { + frames: self.interp.frames.borrow(), } - name } } @@ -118,67 +127,85 @@ where where D: serde::Deserializer<'de> { - match &*self.ops { - [] => unreachable!(), - [InstructionReg { - instructions: [], - .. - }] => { - // FIXME is this correct? - deserializer.deserialize_ignored_any(self) + self.frames_mut().iter_mut().for_each(|frame| { + if !frame.next() { + frame.matches = false; + } + }); + let pat = self.interp.pat; + let target_type = self.frames().iter_active().fold( + Type::IgnoredAny, + |target_type, frame| { + match (target_type, frame.get_type(pat)) { + (Type::IgnoredAny, Some((ty, _))) => ty, + (ty, Some((Type::IgnoredAny, _))) => ty, + (Type::String, Some((Type::Str, _))) => { + Type::String + }, + (Type::Str, Some((Type::String, _))) => { + Type::String + }, + (Type::Bytes, Some((Type::ByteBuf, _))) => { + Type::ByteBuf + }, + (Type::ByteBuf, Some((Type::Bytes, _))) => { + Type::ByteBuf + }, + (left, Some((right, _))) if left == right => { + left + }, + _ => Type::Any, + } + }, + ); + match target_type { + Type::Any => deserializer.deserialize_any(self), + Type::IgnoredAny => deserializer.deserialize_ignored_any(self), + Type::Bool => deserializer.deserialize_bool(self), + Type::I8 => deserializer.deserialize_i8(self), + Type::I16 => deserializer.deserialize_i16(self), + Type::I32 => deserializer.deserialize_i32(self), + Type::I64 => deserializer.deserialize_i64(self), + Type::I128 => deserializer.deserialize_i128(self), + Type::U8 => deserializer.deserialize_u8(self), + Type::U16 => deserializer.deserialize_u16(self), + Type::U32 => deserializer.deserialize_u32(self), + Type::U64 => deserializer.deserialize_u64(self), + Type::U128 => deserializer.deserialize_u128(self), + Type::F32 => deserializer.deserialize_f32(self), + Type::F64 => deserializer.deserialize_f64(self), + Type::Char => deserializer.deserialize_char(self), + Type::Str if !self.collecting => { + deserializer.deserialize_str(self) + }, + Type::Str | Type::String => deserializer.deserialize_string(self), + Type::Bytes if !self.collecting => { + deserializer.deserialize_bytes(self) + }, + Type::Bytes | Type::ByteBuf => { + deserializer.deserialize_byte_buf(self) + }, + Type::Option => deserializer.deserialize_option(self), + Type::Unit => deserializer.deserialize_unit(self), + Type::Seq => deserializer.deserialize_seq(self), + Type::Map => deserializer.deserialize_map(self), + Type::Identifier => deserializer.deserialize_identifier(self), + Type::Tuple(len) => deserializer.deserialize_tuple(len, self), + Type::UnitStruct(name) => { + deserializer.deserialize_unit_struct(name, self) + }, + Type::NewtypeStruct(name) => { + deserializer.deserialize_newtype_struct(name, self) }, - [InstructionReg { - instructions: [ins, ..], - .. - }] => match ins { - | PatternElement::Tag { name_and_value, .. } - | PatternElement::Value { name_and_value } - => { - match name_and_value.there() { - | Some(Value::String { .. }) - | Some(Value::Regex { .. }) => { - if name_and_value.is_here() { - deserializer.deserialize_string(self) - } else { - deserializer.deserialize_str(self) - } - }, - Some(Value::Type { ty, .. }) => match ty { - Type::Any => deserializer.deserialize_any(self), - Type::IgnoredAny => { - deserializer.deserialize_ignored_any(self) - }, - Type::Bool => deserializer.deserialize_bool(self), - Type::I8 => deserializer.deserialize_i8(self), - Type::I16 => deserializer.deserialize_i16(self), - Type::I32 => deserializer.deserialize_i32(self), - Type::I64 => deserializer.deserialize_i64(self), - Type::I128 => deserializer.deserialize_i128(self), - Type::U8 => deserializer.deserialize_u8(self), - Type::U16 => deserializer.deserialize_u16(self), - Type::U32 => deserializer.deserialize_u32(self), - Type::U64 => deserializer.deserialize_u64(self), - Type::U128 => deserializer.deserialize_u128(self), - Type::F32 => deserializer.deserialize_f32(self), - Type::F64 => deserializer.deserialize_f64(self), - Type::Char => deserializer.deserialize_char(self), - Type::Str => deserializer.deserialize_str(self), - Type::String => deserializer.deserialize_string(self), - Type::Bytes => deserializer.deserialize_bytes(self), - Type::ByteBuf => { - deserializer.deserialize_byte_buf(self) - }, - Type::Option => deserializer.deserialize_option(self), - Type::Unit => deserializer.deserialize_unit(self), - Type::Seq => deserializer.deserialize_seq(self), - Type::Map => deserializer.deserialize_map(self), - }, - None => todo!(), - } - }, - _ => todo!(), + Type::TupleStruct { name, len } => { + deserializer.deserialize_tuple_struct(name, len, self) + }, + Type::Struct { name, fields } => { + deserializer.deserialize_struct(name, fields, self) + }, + Type::Enum { name, variants } => { + deserializer.deserialize_enum(name, variants, self) }, - _ => todo!(), } } } @@ -186,21 +213,25 @@ where /// visit method generator for simple values (primitives). macro_rules! vs { ($visit:ident $obj:ident $t:ty) => { - fn $visit(self, v: $t) -> Result + fn $visit(mut self, v: $t) -> Result where E: serde::de::Error, { - // FIXME subtrees + // FIXME filtering/errors + let pat = self.interp.pat; let mut obj = None; - let mut pack = Pack::default(); if self.collecting { obj = Some(SerdeObject::$obj(v)); } - let mut map = IndexMap::new(); - for name in self.get_name() { - map.insert(name, (Default::default(), SerdeObject::$obj(v))); - } - pack.subpacks.push(map); + let mut pack = Pack::default(); + self.frames_mut().iter_active_mut().try_for_each(|frame| { + let mut map = IndexMap::new(); + if let Some(name) = frame.get_name(pat) { + map.insert(name, (Pack::default(), SerdeObject::$obj(v))); + } + pack.subpacks.push(map); + Ok(()) + })?; Ok((pack, obj)) } } @@ -322,9 +353,9 @@ where obj = Some(SerdeObject::Unit); } let mut map = IndexMap::new(); - for name in self.get_name() { - map.insert(name, (Default::default(), SerdeObject::Unit)); - } + //for name in self.get_name() { + // map.insert(name, (Default::default(), SerdeObject::Unit)); + //} pack.subpacks.push(map); Ok((pack, obj)) } @@ -473,12 +504,14 @@ where SerdeObject::Str(Cow::Borrowed(x)) => v.visit_borrowed_str(x), SerdeObject::Bytes(Cow::Owned(x)) => v.visit_byte_buf(x), SerdeObject::Bytes(Cow::Borrowed(x)) => v.visit_borrowed_bytes(x), - SerdeObject::Some(x) => todo!(), + SerdeObject::Some(x) => v.visit_some(x.into_deserializer()), SerdeObject::None => v.visit_none(), SerdeObject::Unit => v.visit_unit(), SerdeObject::Seq(x) => todo!(), SerdeObject::Map(x) => todo!(), - SerdeObject::NewtypeStruct(x) => todo!(), + SerdeObject::NewtypeStruct(x) => { + v.visit_newtype_struct(x.into_deserializer()) + }, SerdeObject::Enum { variant, data } => todo!(), } } @@ -515,7 +548,8 @@ mod tests { fn test_broken() { let consts = PatternConstants::<()>::default(); let mut err = Default::default(); - let interp = Interpreter::new(&consts, &mut err); + let frames = Default::default(); + let interp = Interpreter::new(&consts, &mut err, &frames); let _ = Packer::new(interp, MAX_CALLS); } @@ -524,7 +558,8 @@ mod tests { let mut consts = PatternConstants::<()>::default(); consts.protos.push(Vec::new()); let mut err = Default::default(); - let interp = Interpreter::new(&consts, &mut err); + let frames = Default::default(); + let interp = Interpreter::new(&consts, &mut err, &frames); let _ = Packer::new(interp, MAX_CALLS); } @@ -534,7 +569,8 @@ mod tests { consts.protos.push(Vec::new()); let mut der = JsonDeserializer::from_str("{}"); let mut err = Default::default(); - let interp = Interpreter::new(&consts, &mut err); + let frames = Default::default(); + let interp = Interpreter::new(&consts, &mut err, &frames); let pack = Packer::new(interp, MAX_CALLS).deserialize(&mut der).unwrap(); } @@ -552,8 +588,11 @@ mod tests { ]); let mut der = JsonDeserializer::from_str("3"); let mut err = Default::default(); - let interp = Interpreter::new(&consts, &mut err); - let pack = Packer::new(interp, MAX_CALLS).deserialize(&mut der).unwrap().0; + let frames = Default::default(); + let interp = Interpreter::new(&consts, &mut err, &frames); + let packed = Packer::new(interp, MAX_CALLS).deserialize(&mut der); + let (pack, obj) = packed.unwrap(); + assert!(obj.is_none()); assert_eq!(pack.subpacks[0]["hello"].1, SerdeObject::U64(3)); } } diff --git a/src/vm/mod.rs b/src/vm/mod.rs index 5f1e86c..190fa3d 100644 --- a/src/vm/mod.rs +++ b/src/vm/mod.rs @@ -7,6 +7,7 @@ use std::borrow::Cow; use std::cell::Cell; +use std::cell::RefCell; use std::collections::BTreeMap; use std::marker::PhantomData; @@ -212,7 +213,7 @@ pub(crate) enum PatternToken { /// /// These are used as expectations for serde (e.g. /// `Deserializer::deserialize_string`). -#[derive(Copy, Clone, Debug)] +#[derive(Copy, Clone, Debug, Eq, PartialEq)] pub(crate) enum Type { Any, IgnoredAny, @@ -237,17 +238,23 @@ pub(crate) enum Type { Option, Unit, Seq, - // Tuple(usize), + Tuple(usize), Map, - // // these aren't really supported: - // // UnitStruct, UnitVariant, NewtypeStruct, NewtypeVariant, TupleStruct, - // // TupleVariant, Struct, StructVariant - // // instead we use type trees for that. - // /// Adapter for Type Trees. See `crate::type_tree` for more details. - // Of { - // /// The type tree index (in `PatternConstants.type_trees`). - // type_tree: usize, - // }, + UnitStruct(&'static str), + NewtypeStruct(&'static str), + TupleStruct { + name: &'static str, + len: usize, + }, + Identifier, + Struct { + name: &'static str, + fields: &'static [&'static str], + }, + Enum { + name: &'static str, + variants: &'static [&'static str], + }, } /// The types which can be deserialized by serde. @@ -278,8 +285,6 @@ pub(crate) enum SerdeObject<'de> { // NOTE: support for multimaps! Map(Vec<(SerdeObject<'de>, SerdeObject<'de>)>), NewtypeStruct(Box>), - // NOTE: currently unused! - #[allow(unused)] Enum { variant: Box>, data: Box>, @@ -351,69 +356,131 @@ pub(crate) struct Interpreter<'pat, 'state, O: Serialize> { pat: &'pat PatternConstants, /// The error override (if any). error: &'state Cell>, + /// The current interpreter frames. + frames: &'state RefCell>>, + ///// The final output. + //output: &'state Cell>, } -struct Frame<'pat, 'de> { +pub(crate) struct Frame<'pat> { /// The instructions/function currently being processed. ops: &'pat [PatternElement], /// The instruction index being processed. iar: Option, - /// Elements collected while processing this frame? - path: Vec>, + /// Whether this frame matches. + matches: bool, + ///// Elements collected while processing this frame? + //path: Pack<'pat, 'de>, } impl<'pat, 'state, O: Serialize> Interpreter<'pat, 'state, O> { pub(crate) fn new( pat: &'pat PatternConstants, error: &'state mut Option, + frames: &'state RefCell>>, + //output: &'state mut Pack<'pat, 'de>, ) -> Self { + let mut mut_frames = frames.borrow_mut(); + debug_assert!(mut_frames.is_empty()); + mut_frames.push(Frame { + ops: &pat.protos[0], + iar: None, + matches: true, + //path: Default::default(), + }); + drop(mut_frames); Self { pat: pat, error: Cell::from_mut(error), - //frames: vec![ - // Frame { - // ops: &pat.protos[0], - // iar: None, - // path: Vec::new(), - // } - //], + frames: frames, + //output: Cell::from_mut(output), } } } -// -//impl<'a, 'b, T: PatternTypes> Frame<'a, 'b, T> { -// /// Advances the instruction address register. -// /// -// /// # Returns -// /// -// /// `true` if successful, `false` otherwise. -// fn next(&mut self) -> bool { -// let new = self.iar.map_or(0, |v| v + 1); -// new < self.ops.len() && { -// self.iar = Some(new); -// true -// } -// } -// -// /// Returns the current instruction. -// fn op(&self) -> PatternElement { -// self.ops[self.iar.expect("ops[iar]")] -// } -// -// /// Rewinds the instruction address register. -// /// -// /// # Returns -// /// -// /// `true` if successful, `false` otherwise. -// fn prev(&mut self) -> bool { -// let new = self.iar.expect("iar").checked_sub(1); -// new.is_some() && { -// self.iar = new; -// true -// } -// } -//} -// + +impl<'pat> Frame<'pat> { + /// Gets the type currently associated with this frame. + /// + /// Returns the type and whether it is required to match. + fn get_type( + &self, + pat: &'pat PatternConstants, + ) -> Option<(Type, bool)> { + match self.op() { + | PatternElement::Value { name_and_value, .. } + | PatternElement::Tag { name_and_value, .. } + if name_and_value.is_there() + => { + match name_and_value.there() { + | Some(Value::String { skippable, .. }) + | Some(Value::Regex { skippable, .. }) + => { + Some((Type::Str, !skippable)) + }, + Some(Value::Type { ty, skippable }) => { + Some((ty, !skippable)) + }, + None => todo!(), + } + }, + _ => None, + } + } + + /// Gets the name currently associated with this frame. + fn get_name( + &self, + pat: &'pat PatternConstants, + ) -> Option<&'pat str> { + let strings = &pat.strings; + match self.op() { + | PatternElement::Value { name_and_value, .. } + | PatternElement::Tag { name_and_value, .. } + if name_and_value.is_here() + => { + Some(&*strings[name_and_value.here().unwrap()]) + }, + _ => None, + } + } + + /// Advances the instruction address register. + /// + /// # Returns + /// + /// `true` if successful, `false` otherwise. + fn next(&mut self) -> bool { + let new = self.iar.map_or(0, |v| v + 1); + new < self.ops.len() && { + self.iar = Some(new); + true + } + } + + /// Returns the current instruction. + /// + /// # Panics + /// + /// Panics if called on a non-matching frame or if iteration hasn't begun. + fn op(&self) -> PatternElement { + assert!(self.matches, "op() called on non-matching frame"); + self.ops[self.iar.expect("ops[iar]")] + } + + /// Rewinds the instruction address register. + /// + /// # Returns + /// + /// `true` if successful, `false` otherwise. + fn prev(&mut self) -> bool { + let new = self.iar.expect("iar").checked_sub(1); + new.is_some() && { + self.iar = new; + true + } + } +} + ///// Stores a single match. ///// ///// See also Holder. -- cgit 1.4.1