From 75eb816a6372518b6c047a266089875a358e9517 Mon Sep 17 00:00:00 2001 From: Profpatsch Date: Fri, 5 Jun 2020 23:54:51 +0200 Subject: pkgs/profpatsch: rename encode to netencode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Less generic, has the spirit of “netstrings, but extended to a structured encoding format”. --- pkgs/profpatsch/default.nix | 6 +- pkgs/profpatsch/encode/default.nix | 42 ---- pkgs/profpatsch/encode/encode.rs | 399 --------------------------------- pkgs/profpatsch/encode/spec.md | 92 -------- pkgs/profpatsch/netencode/default.nix | 42 ++++ pkgs/profpatsch/netencode/netencode.rs | 399 +++++++++++++++++++++++++++++++++ pkgs/profpatsch/netencode/spec.md | 92 ++++++++ 7 files changed, 536 insertions(+), 536 deletions(-) delete mode 100644 pkgs/profpatsch/encode/default.nix delete mode 100644 pkgs/profpatsch/encode/encode.rs delete mode 100644 pkgs/profpatsch/encode/spec.md create mode 100644 pkgs/profpatsch/netencode/default.nix create mode 100644 pkgs/profpatsch/netencode/netencode.rs create mode 100644 pkgs/profpatsch/netencode/spec.md diff --git a/pkgs/profpatsch/default.nix b/pkgs/profpatsch/default.nix index d2a968c9..89f153cd 100644 --- a/pkgs/profpatsch/default.nix +++ b/pkgs/profpatsch/default.nix @@ -215,8 +215,8 @@ in rec { Prelude ; - inherit (import ./encode { inherit pkgs writeRustSimpleLib; }) - encode-rs - encode-rs-tests + inherit (import ./netencode { inherit pkgs writeRustSimpleLib; }) + netencode-rs + netencode-rs-tests ; } diff --git a/pkgs/profpatsch/encode/default.nix b/pkgs/profpatsch/encode/default.nix deleted file mode 100644 index 9661c0ed..00000000 --- a/pkgs/profpatsch/encode/default.nix +++ /dev/null @@ -1,42 +0,0 @@ -{ pkgs, writeRustSimpleLib }: - -let - version-check = pkgs.buildRustCrate { - pname = "version-check"; - version = "0.9.2"; - crateName = "version-check"; - sha256 = "1vwvc1mzwv8ana9jv8z933p2xzgj1533qwwl5zr8mi89azyhq21v"; - }; - memchr = pkgs.buildRustCrate { - 
pname = "memchr"; - version = "2.3.3"; - crateName = "memchr"; - sha256 = "1ivxvlswglk6wd46gadkbbsknr94gwryk6y21v64ja7x4icrpihw"; - }; - nom = pkgs.buildRustCrate { - pname = "nom"; - version = "5.1.1"; - crateName = "nom"; - sha256 = "1gb4r6mjwd645jqh02nhn60i7qkw8cgy3xq1r4clnmvz3cmkv1l0"; - dependencies = [ memchr ]; - buildDependencies = [ version-check ]; - features = [ "std" "alloc" ]; - }; - - encode-rs-common = tests: writeRustSimpleLib "encode" { - dependencies = [ nom ]; - buildTests = tests; - release = false; - verbose = true; - } ./encode.rs ; - - encode-rs-tests = encode-rs-common true; - - encode-rs = encode-rs-common false; - -in { - inherit - encode-rs - encode-rs-tests - ; -} diff --git a/pkgs/profpatsch/encode/encode.rs b/pkgs/profpatsch/encode/encode.rs deleted file mode 100644 index b83aca7b..00000000 --- a/pkgs/profpatsch/encode/encode.rs +++ /dev/null @@ -1,399 +0,0 @@ -extern crate nom; - -use std::collections::HashMap; -use std::io::Write; - -#[derive(Debug, PartialEq, Eq, Clone)] -pub enum T { - // Unit - Unit, - // Naturals - N3(u8), - N6(u64), - N7(u128), - // Integers - I3(i8), - I6(i64), - I7(i128), - // Text - // TODO: make into &str - Text(String), - // Tags - Sum(Tag), - // TODO: make into &str - Record(HashMap>), - List(Box>), -} - -#[derive(Debug, PartialEq, Eq, Clone)] -pub struct Tag { - // TODO: make into &str - pub tag: String, - pub val: Box -} - -fn encode_tag(w: &mut W, tag: String, val: T) -> std::io::Result<()> { - write!(w, "<{}:{}|", tag.len(), tag)?; - encode(w, val)?; - Ok(()) -} - -pub fn encode(w: &mut W, t: T) -> std::io::Result<()> { - match t { - T::Unit => write!(w, "u,"), - T::N3(n) => write!(w, "n3:{},", n), - T::N6(n) => write!(w, "n6:{},", n), - T::N7(n) => write!(w, "n7:{},", n), - T::I3(i) => write!(w, "i3:{},", i), - T::I6(i) => write!(w, "i6:{},", i), - T::I7(i) => write!(w, "i7:{},", i), - T::Text(s) => write!(w, "t{}:{},", s.len(), s), - T::Sum(Tag{tag, val}) => encode_tag(w, tag, *val), - T::Record(m) 
=> { - let mut c = std::io::Cursor::new(vec![]); - for (k, v) in m { - encode_tag(&mut c, k, *v)?; - } - write!(w, "{{{}:", c.get_ref().len())?; - w.write(c.get_ref())?; - write!(w, "}}") - }, - T::List(l) => { - let mut c = std::io::Cursor::new(vec![]); - for v in *l { - encode(&mut c, v)?; - }; - write!(w, "[{}:", c.get_ref().len())?; - w.write(c.get_ref())?; - write!(w, "]") - } - } -} - -pub fn dict(d: Vec<(String, T)>) -> T { - T::Record( - d.into_iter() - .map(|(k,v)| (k, Box::new(v))) - // to ignore duplicate entries after the first - .rev() - .collect::>()) -} - -pub fn text(s: String) -> T { - T::Text(s) -} - -mod parse { - use super::{T, Tag}; - - use std::str::FromStr; - use std::ops::Neg; - use std::collections::HashMap; - - use nom::{IResult}; - use nom::bytes::complete::{tag, take}; - use nom::branch::{alt}; - use nom::character::complete::{digit1, char}; - use nom::sequence::{tuple}; - use nom::combinator::{map, map_res, flat_map, opt}; - use nom::error::{context, ErrorKind, ParseError}; - - fn unit_t(s: &[u8]) -> IResult<&[u8], ()> { - let (s, _) = context("unit", tag("u,"))(s)?; - Ok((s, ())) - } - - fn usize_t(s: &[u8]) -> IResult<&[u8], usize> { - context( - "usize", - map_res( - map_res(digit1, |n| std::str::from_utf8(n)), - |s| s.parse::()) - )(s) - } - - - fn uint_t<'a, I: FromStr + 'a>(t: &'static str) -> impl Fn(&'a [u8]) -> IResult<&'a [u8], I> { - move |s: &'a [u8]| { - let (s, (_, _, int, _)) = tuple(( - tag(t.as_bytes()), - char(':'), - map_res( - map_res(digit1, |n: &[u8]| std::str::from_utf8(n)), - |s| s.parse::() - ), - char(',') - ))(s)?; - Ok((s, int)) - } - } - - fn int_t<'a, I: FromStr + Neg>(t: &'static str) -> impl Fn(&'a [u8]) -> IResult<&[u8], I> { - context( - t, - move |s: &'a [u8]| { - let (s, (_, _, neg, int, _)) = tuple(( - tag(t.as_bytes()), - char(':'), - opt(char('-')), - map_res( - map_res(digit1, |n: &[u8]| std::str::from_utf8(n)), - |s| s.parse::() - ), - char(',') - ))(s)?; - let res = match neg { - Some(_) => 
-int, - None => int, - }; - Ok((s, res)) - } - ) - } - - fn tag_t(s: &[u8]) -> IResult<&[u8], Tag> { - let (s, (_, len, _)) = tuple(( - char('<'), - usize_t, - char(':'), - ))(s)?; - let (s, (tag, _, recurse)) = tuple(( - take(len), - char('|'), - // recurses into the main parser - t_t - ))(s)?; - Ok((s, Tag { - tag: std::str::from_utf8(tag) - .map_err(|_| nom::Err::Failure((s, ErrorKind::Char))) - .map(|s| s.to_string())?, - val: Box::new(recurse) - })) - } - - /// parse text scalar (`t5:hello,`) - fn text(s: &[u8]) -> IResult<&[u8], T> { - let (s, (_, len, _)) = tuple(( - char('t'), - usize_t, - char(':') - ))(s)?; - let (s, (res, _)) = tuple(( - take(len), - char(',') - ))(s)?; - Ok((s, T::Text( - std::str::from_utf8(res) - .map_err(|_| nom::Err::Failure((s, ErrorKind::Char))) - .map(|s| s.to_string())?, - ))) - } - - fn list_t(s: &[u8]) -> IResult<&[u8], Vec> { - let (s, (_, _, _, vec, _)) = tuple(( - char('['), - usize_t, - char(':'), - nom::multi::many0(t_t), - char(']') - ))(s)?; - Ok((s, vec)) - } - - fn record_t(s: &[u8]) -> IResult<&[u8], HashMap>> { - let (s, (_, _, _, map, _)) = tuple(( - char('{'), - usize_t, - char(':'), - nom::multi::fold_many1( - tag_t, - HashMap::new(), - |mut acc: HashMap<_, _>, Tag { tag, val }| { - // ignore duplicated tag names that appear later - if !acc.contains_key(&tag) { - acc.insert(tag, val); - } - acc - } - ), - char('}') - ))(s)?; - Ok((s, map)) - } - - fn t_t(s: &[u8]) -> IResult<&[u8], T> { - alt(( - text, - map(unit_t, |_| T::Unit), - map(tag_t, |t| T::Sum(t)), - map(list_t, |l| T::List(Box::new(l))), - map(record_t, |p| T::Record(p)), - - // 8, 64 and 128 bit - map(uint_t("n3"), |u| T::N3(u)), - map(uint_t("n6"), |u| T::N6(u)), - map(uint_t("n7"), |u| T::N7(u)), - map(int_t("i3"), |u| T::I3(u)), - map(int_t("i6"), |u| T::I6(u)), - map(int_t("i7"), |u| T::I7(u)), - - // less common - map(uint_t("n1"), |u| T::N3(u)), - map(uint_t("n2"), |u| T::N3(u)), - map(uint_t("n4"), |u| T::N6(u)), - map(uint_t("n5"), |u| 
T::N6(u)), - map(int_t("i1"), |u| T::I3(u)), - map(int_t("i2"), |u| T::I3(u)), - map(int_t("i4"), |u| T::I6(u)), - map(int_t("i5"), |u| T::I6(u)), - // TODO: 8, 9 not supported - ))(s) - } - - #[cfg(test)] - mod tests { - use super::*; - - #[test] - fn test_parse_unit_t() { - assert_eq!( - unit_t("u,".as_bytes()), - Ok(("".as_bytes(), ())) - ); - } - - #[test] - fn test_parse_usize_t() { - assert_eq!( - usize_t("32foo".as_bytes()), - Ok(("foo".as_bytes(), 32)) - ); - } - - #[test] - fn test_parse_int_t() { - assert_eq!( - uint_t::("n3")("n3:42,abc".as_bytes()), - Ok(("abc".as_bytes(), 42)) - ); - assert_eq!( - uint_t::("n3")("n3:1024,abc".as_bytes()), - Err(nom::Err::Error(("1024,abc".as_bytes(), nom::error::ErrorKind::MapRes))) - ); - assert_eq!( - int_t::("i6")("i6:-23,abc".as_bytes()), - Ok(("abc".as_bytes(), -23)) - ); - assert_eq!( - int_t::("i3")("i3:0,:abc".as_bytes()), - Ok((":abc".as_bytes(), 0)) - ); - assert_eq!( - uint_t::("n7")("n7:09,".as_bytes()), - Ok(("".as_bytes(), 9)) - ); - // assert_eq!( - // length("c"), - // Err(nom::Err::Error(("c", nom::error::ErrorKind::Digit))) - // ); - // assert_eq!( - // length(":"), - // Err(nom::Err::Error((":", nom::error::ErrorKind::Digit))) - // ); - } - - #[test] - fn test_parse_text() { - assert_eq!( - text("t5:hello,".as_bytes()), - Ok(("".as_bytes(), T::Text("hello".to_owned()))) - ); - assert_eq!( - text("t4:fo,".as_bytes()), - // TODO: way better parse error messages - Err(nom::Err::Error(("fo,".as_bytes(), nom::error::ErrorKind::Eof))) - ); - assert_eq!( - text("t9:今日は,".as_bytes()), - Ok(("".as_bytes(), T::Text("今日は".to_owned()))) - ); - } - - #[test] - fn test_list() { - assert_eq!( - list_t("[0:]".as_bytes()), - Ok(("".as_bytes(), vec![])) - ); - assert_eq!( - list_t("[6:u,u,u,]".as_bytes()), - Ok(("".as_bytes(), vec![ - T::Unit, - T::Unit, - T::Unit, - ])) - ); - assert_eq!( - list_t("[15:u,[7:t3:foo,]u,]".as_bytes()), - Ok(("".as_bytes(), vec![ - T::Unit, - 
T::List(Box::new(vec![T::Text("foo".to_owned())])), - T::Unit, - ])) - ); - } - - #[test] - fn test_record() { - assert_eq!( - record_t("{21:<1:a|u,<1:b|u,<1:c|u,}".as_bytes()), - Ok(("".as_bytes(), vec![ - ("a".to_owned(), Box::new(T::Unit)), - ("b".to_owned(), Box::new(T::Unit)), - ("c".to_owned(), Box::new(T::Unit)), - ].into_iter().collect::>>())) - ); - // duplicated keys are ignored (first is taken) - assert_eq!( - record_t("{25:<1:a|u,<1:b|u,<1:a|i1:-1,}".as_bytes()), - Ok(("".as_bytes(), vec![ - ("a".to_owned(), Box::new(T::Unit)), - ("b".to_owned(), Box::new(T::Unit)), - ].into_iter().collect::>())) - ); - } - - #[test] - fn test_parse() { - assert_eq!( - t_t("n3:255,".as_bytes()), - Ok(("".as_bytes(), T::N3(255))) - ); - assert_eq!( - t_t("t6:halloo,".as_bytes()), - Ok(("".as_bytes(), T::Text("halloo".to_owned()))) - ); - assert_eq!( - t_t("<3:foo|t6:halloo,".as_bytes()), - Ok(("".as_bytes(), T::Sum (Tag { - tag: "foo".to_owned(), - val: Box::new(T::Text("halloo".to_owned())) - }))) - ); - // { a: Unit - // , foo: List } - assert_eq!( - t_t("{49:<1:a|u,<3:foo|[30:<1:A|u,<1:A|u,<1:B|[7:i3:127,]]}".as_bytes()), - Ok(("".as_bytes(), T::Record(vec![ - ("a".to_owned(), Box::new(T::Unit)), - ("foo".to_owned(), Box::new(T::List(Box::new(vec![ - T::Sum(Tag { tag: "A".to_owned(), val: Box::new(T::Unit) }), - T::Sum(Tag { tag: "A".to_owned(), val: Box::new(T::Unit) }), - T::Sum(Tag { tag: "B".to_owned(), val: Box::new(T::List(Box::new(vec![T::I3(127)]))) }), - ])))) - ].into_iter().collect::>>()))) - ); - } - - } -} diff --git a/pkgs/profpatsch/encode/spec.md b/pkgs/profpatsch/encode/spec.md deleted file mode 100644 index cd38588c..00000000 --- a/pkgs/profpatsch/encode/spec.md +++ /dev/null @@ -1,92 +0,0 @@ -# encode 0.1-unreleased - -[bencode][] and [netstring][]-inspired pipe format that should be trivial to parse (100 lines of code or less), mostly human-decipherable for easy debugging, and support nested record and sum types. 
- - -## scalars - -Scalars have the format `[type prefix][size]:[value],`. - -where size is a natural number without leading zeroes. - -### unit - -The unit (`u`) has only one value. - -* The unit is: `u,` - -### numbers - -Naturals (`n`) and Integers (`i`), with a maximum size in bits. - -Bit sizes are specified in 2^n increments, 1 to 9 (`n1`..`n9`, `i1`..`n9`). - -* Natural `1234` that fits in 32 bits (2^5): `n5:1234,` -* Integer `-42` that fits in 8 bits (2^3): `i3:-42,` -* Integer `23` that fits in 64 bits (2^6): `i6:23,` -* Integer `-1` that fits in 512 bits (2^9): `i9:-1,` -* Natural `0` that fits in 1 bit (2^1): `n1:0,` - -An implementation can define the biggest numbers it supports, and has to throw an error for anything bigger. It has to support everything smaller, so for example if you support up to i6/n6, you have to support 1–6 as well. An implementation could support up to the current architecture’s wordsize for example. - -Floats are not supported, you can implement fixed-size decimals or ratios using integers. - -### text - -Text (`t`) that *must* be encoded as UTF-8, starting with its length in bytes: - -* The string `hello world` (11 bytes): `t11:hello world,` -* The string `今日は` (9 bytes): `t9:今日は,` -* The string `:,` (2 bytes): `t2::,,` -* The empty sting `` (0 bytes): `t0:,` - -Binary data is not supported, it hinders human readability. Try to use structured data, or use a different format. - -## tagged values - -### tags - -A tag (`<`) gives a value a name. The tag is UTF-8 encoded, starting with its length in bytes and proceeding with the value. - -* The tag `foo` (3 bytes) tagging the text `hello` (5 bytes): `<3:foo|t5:hello,` -* The tag `` (0 bytes) tagging the 8-bit integer 0: `<0:|i3:0,` - -### records (products/records), also maps - -A record (`{`) is a concatenation of tags (`<`). It needs to be closed with `}`. -If tag names repeat the later ones should be ignored. Ordering does not matter. 
- -Similar to text, records start with the length of their *whole encoded content*, in bytes. This makes it possible to treat their contents as opaque bytestrings. - -* There is no empty record. (TODO: make the empty record the unit type, remove `u,`?) -* A record with one empty field, `foo`: `{9:<3:foo|u,}` -* A record with two fields, `foo` and `x`: `{21:<3:foo|u,<1:x|t3:baz,}` -* The same record: `{21:<1:x|t3:baz,<3:foo|u,}` -* The same record (later occurences of fields are ignored): `{28:<1:x|t3:baz,<3:foo|u,<1:x|u,}` - -### sums (tagged unions) - -Simply a tagged value. The tag marker `<` indicates it is a sum if it appears outside of a record. - -## lists - -A list (`[`) imposes an ordering on a sequence of values. It needs to be closed with `]`. Values in it are simply concatenated. - -Similar to records, lists start with the length of their whole encoded content. - -* The empty list: `[0:]` -* The list with one element, the string `foo`: `[7:t3:foo,]` -* The list with text `foo` followed by i3 `-42`: `[14:t3:foo,i3:-42,]` -* The list with `Some` and `None` tags: `[33:<4:Some|t3:foo,<4None|u,<4None|u,]` - -## motivation - -TODO - -## guarantees - -TODO: do I want unique representation (bijection like bencode?) 
This would put more restrictions on the generator, like sorting records in lexicographic order, but would make it possible to compare without decoding - - -[bencode]: https://en.wikipedia.org/wiki/Bencode -[netstring]: https://en.wikipedia.org/wiki/Netstring diff --git a/pkgs/profpatsch/netencode/default.nix b/pkgs/profpatsch/netencode/default.nix new file mode 100644 index 00000000..31c6fb8d --- /dev/null +++ b/pkgs/profpatsch/netencode/default.nix @@ -0,0 +1,42 @@ +{ pkgs, writeRustSimpleLib }: + +let + version-check = pkgs.buildRustCrate { + pname = "version-check"; + version = "0.9.2"; + crateName = "version-check"; + sha256 = "1vwvc1mzwv8ana9jv8z933p2xzgj1533qwwl5zr8mi89azyhq21v"; + }; + memchr = pkgs.buildRustCrate { + pname = "memchr"; + version = "2.3.3"; + crateName = "memchr"; + sha256 = "1ivxvlswglk6wd46gadkbbsknr94gwryk6y21v64ja7x4icrpihw"; + }; + nom = pkgs.buildRustCrate { + pname = "nom"; + version = "5.1.1"; + crateName = "nom"; + sha256 = "1gb4r6mjwd645jqh02nhn60i7qkw8cgy3xq1r4clnmvz3cmkv1l0"; + dependencies = [ memchr ]; + buildDependencies = [ version-check ]; + features = [ "std" "alloc" ]; + }; + + netencode-rs-common = tests: writeRustSimpleLib "encode" { + dependencies = [ nom ]; + buildTests = tests; + release = false; + verbose = true; + } ./netencode.rs ; + + netencode-rs-tests = netencode-rs-common true; + + netencode-rs = netencode-rs-common false; + +in { + inherit + netencode-rs + netencode-rs-tests + ; +} diff --git a/pkgs/profpatsch/netencode/netencode.rs b/pkgs/profpatsch/netencode/netencode.rs new file mode 100644 index 00000000..b83aca7b --- /dev/null +++ b/pkgs/profpatsch/netencode/netencode.rs @@ -0,0 +1,399 @@ +extern crate nom; + +use std::collections::HashMap; +use std::io::Write; + +#[derive(Debug, PartialEq, Eq, Clone)] +pub enum T { + // Unit + Unit, + // Naturals + N3(u8), + N6(u64), + N7(u128), + // Integers + I3(i8), + I6(i64), + I7(i128), + // Text + // TODO: make into &str + Text(String), + // Tags + Sum(Tag), + // 
TODO: make into &str + Record(HashMap>), + List(Box>), +} + +#[derive(Debug, PartialEq, Eq, Clone)] +pub struct Tag { + // TODO: make into &str + pub tag: String, + pub val: Box +} + +fn encode_tag(w: &mut W, tag: String, val: T) -> std::io::Result<()> { + write!(w, "<{}:{}|", tag.len(), tag)?; + encode(w, val)?; + Ok(()) +} + +pub fn encode(w: &mut W, t: T) -> std::io::Result<()> { + match t { + T::Unit => write!(w, "u,"), + T::N3(n) => write!(w, "n3:{},", n), + T::N6(n) => write!(w, "n6:{},", n), + T::N7(n) => write!(w, "n7:{},", n), + T::I3(i) => write!(w, "i3:{},", i), + T::I6(i) => write!(w, "i6:{},", i), + T::I7(i) => write!(w, "i7:{},", i), + T::Text(s) => write!(w, "t{}:{},", s.len(), s), + T::Sum(Tag{tag, val}) => encode_tag(w, tag, *val), + T::Record(m) => { + let mut c = std::io::Cursor::new(vec![]); + for (k, v) in m { + encode_tag(&mut c, k, *v)?; + } + write!(w, "{{{}:", c.get_ref().len())?; + w.write(c.get_ref())?; + write!(w, "}}") + }, + T::List(l) => { + let mut c = std::io::Cursor::new(vec![]); + for v in *l { + encode(&mut c, v)?; + }; + write!(w, "[{}:", c.get_ref().len())?; + w.write(c.get_ref())?; + write!(w, "]") + } + } +} + +pub fn dict(d: Vec<(String, T)>) -> T { + T::Record( + d.into_iter() + .map(|(k,v)| (k, Box::new(v))) + // to ignore duplicate entries after the first + .rev() + .collect::>()) +} + +pub fn text(s: String) -> T { + T::Text(s) +} + +mod parse { + use super::{T, Tag}; + + use std::str::FromStr; + use std::ops::Neg; + use std::collections::HashMap; + + use nom::{IResult}; + use nom::bytes::complete::{tag, take}; + use nom::branch::{alt}; + use nom::character::complete::{digit1, char}; + use nom::sequence::{tuple}; + use nom::combinator::{map, map_res, flat_map, opt}; + use nom::error::{context, ErrorKind, ParseError}; + + fn unit_t(s: &[u8]) -> IResult<&[u8], ()> { + let (s, _) = context("unit", tag("u,"))(s)?; + Ok((s, ())) + } + + fn usize_t(s: &[u8]) -> IResult<&[u8], usize> { + context( + "usize", + map_res( + 
map_res(digit1, |n| std::str::from_utf8(n)), + |s| s.parse::()) + )(s) + } + + + fn uint_t<'a, I: FromStr + 'a>(t: &'static str) -> impl Fn(&'a [u8]) -> IResult<&'a [u8], I> { + move |s: &'a [u8]| { + let (s, (_, _, int, _)) = tuple(( + tag(t.as_bytes()), + char(':'), + map_res( + map_res(digit1, |n: &[u8]| std::str::from_utf8(n)), + |s| s.parse::() + ), + char(',') + ))(s)?; + Ok((s, int)) + } + } + + fn int_t<'a, I: FromStr + Neg>(t: &'static str) -> impl Fn(&'a [u8]) -> IResult<&[u8], I> { + context( + t, + move |s: &'a [u8]| { + let (s, (_, _, neg, int, _)) = tuple(( + tag(t.as_bytes()), + char(':'), + opt(char('-')), + map_res( + map_res(digit1, |n: &[u8]| std::str::from_utf8(n)), + |s| s.parse::() + ), + char(',') + ))(s)?; + let res = match neg { + Some(_) => -int, + None => int, + }; + Ok((s, res)) + } + ) + } + + fn tag_t(s: &[u8]) -> IResult<&[u8], Tag> { + let (s, (_, len, _)) = tuple(( + char('<'), + usize_t, + char(':'), + ))(s)?; + let (s, (tag, _, recurse)) = tuple(( + take(len), + char('|'), + // recurses into the main parser + t_t + ))(s)?; + Ok((s, Tag { + tag: std::str::from_utf8(tag) + .map_err(|_| nom::Err::Failure((s, ErrorKind::Char))) + .map(|s| s.to_string())?, + val: Box::new(recurse) + })) + } + + /// parse text scalar (`t5:hello,`) + fn text(s: &[u8]) -> IResult<&[u8], T> { + let (s, (_, len, _)) = tuple(( + char('t'), + usize_t, + char(':') + ))(s)?; + let (s, (res, _)) = tuple(( + take(len), + char(',') + ))(s)?; + Ok((s, T::Text( + std::str::from_utf8(res) + .map_err(|_| nom::Err::Failure((s, ErrorKind::Char))) + .map(|s| s.to_string())?, + ))) + } + + fn list_t(s: &[u8]) -> IResult<&[u8], Vec> { + let (s, (_, _, _, vec, _)) = tuple(( + char('['), + usize_t, + char(':'), + nom::multi::many0(t_t), + char(']') + ))(s)?; + Ok((s, vec)) + } + + fn record_t(s: &[u8]) -> IResult<&[u8], HashMap>> { + let (s, (_, _, _, map, _)) = tuple(( + char('{'), + usize_t, + char(':'), + nom::multi::fold_many1( + tag_t, + HashMap::new(), + |mut acc: 
HashMap<_, _>, Tag { tag, val }| { + // ignore duplicated tag names that appear later + if !acc.contains_key(&tag) { + acc.insert(tag, val); + } + acc + } + ), + char('}') + ))(s)?; + Ok((s, map)) + } + + fn t_t(s: &[u8]) -> IResult<&[u8], T> { + alt(( + text, + map(unit_t, |_| T::Unit), + map(tag_t, |t| T::Sum(t)), + map(list_t, |l| T::List(Box::new(l))), + map(record_t, |p| T::Record(p)), + + // 8, 64 and 128 bit + map(uint_t("n3"), |u| T::N3(u)), + map(uint_t("n6"), |u| T::N6(u)), + map(uint_t("n7"), |u| T::N7(u)), + map(int_t("i3"), |u| T::I3(u)), + map(int_t("i6"), |u| T::I6(u)), + map(int_t("i7"), |u| T::I7(u)), + + // less common + map(uint_t("n1"), |u| T::N3(u)), + map(uint_t("n2"), |u| T::N3(u)), + map(uint_t("n4"), |u| T::N6(u)), + map(uint_t("n5"), |u| T::N6(u)), + map(int_t("i1"), |u| T::I3(u)), + map(int_t("i2"), |u| T::I3(u)), + map(int_t("i4"), |u| T::I6(u)), + map(int_t("i5"), |u| T::I6(u)), + // TODO: 8, 9 not supported + ))(s) + } + + #[cfg(test)] + mod tests { + use super::*; + + #[test] + fn test_parse_unit_t() { + assert_eq!( + unit_t("u,".as_bytes()), + Ok(("".as_bytes(), ())) + ); + } + + #[test] + fn test_parse_usize_t() { + assert_eq!( + usize_t("32foo".as_bytes()), + Ok(("foo".as_bytes(), 32)) + ); + } + + #[test] + fn test_parse_int_t() { + assert_eq!( + uint_t::("n3")("n3:42,abc".as_bytes()), + Ok(("abc".as_bytes(), 42)) + ); + assert_eq!( + uint_t::("n3")("n3:1024,abc".as_bytes()), + Err(nom::Err::Error(("1024,abc".as_bytes(), nom::error::ErrorKind::MapRes))) + ); + assert_eq!( + int_t::("i6")("i6:-23,abc".as_bytes()), + Ok(("abc".as_bytes(), -23)) + ); + assert_eq!( + int_t::("i3")("i3:0,:abc".as_bytes()), + Ok((":abc".as_bytes(), 0)) + ); + assert_eq!( + uint_t::("n7")("n7:09,".as_bytes()), + Ok(("".as_bytes(), 9)) + ); + // assert_eq!( + // length("c"), + // Err(nom::Err::Error(("c", nom::error::ErrorKind::Digit))) + // ); + // assert_eq!( + // length(":"), + // Err(nom::Err::Error((":", nom::error::ErrorKind::Digit))) + // ); + } + 
+ #[test] + fn test_parse_text() { + assert_eq!( + text("t5:hello,".as_bytes()), + Ok(("".as_bytes(), T::Text("hello".to_owned()))) + ); + assert_eq!( + text("t4:fo,".as_bytes()), + // TODO: way better parse error messages + Err(nom::Err::Error(("fo,".as_bytes(), nom::error::ErrorKind::Eof))) + ); + assert_eq!( + text("t9:今日は,".as_bytes()), + Ok(("".as_bytes(), T::Text("今日は".to_owned()))) + ); + } + + #[test] + fn test_list() { + assert_eq!( + list_t("[0:]".as_bytes()), + Ok(("".as_bytes(), vec![])) + ); + assert_eq!( + list_t("[6:u,u,u,]".as_bytes()), + Ok(("".as_bytes(), vec![ + T::Unit, + T::Unit, + T::Unit, + ])) + ); + assert_eq!( + list_t("[15:u,[7:t3:foo,]u,]".as_bytes()), + Ok(("".as_bytes(), vec![ + T::Unit, + T::List(Box::new(vec![T::Text("foo".to_owned())])), + T::Unit, + ])) + ); + } + + #[test] + fn test_record() { + assert_eq!( + record_t("{21:<1:a|u,<1:b|u,<1:c|u,}".as_bytes()), + Ok(("".as_bytes(), vec![ + ("a".to_owned(), Box::new(T::Unit)), + ("b".to_owned(), Box::new(T::Unit)), + ("c".to_owned(), Box::new(T::Unit)), + ].into_iter().collect::>>())) + ); + // duplicated keys are ignored (first is taken) + assert_eq!( + record_t("{25:<1:a|u,<1:b|u,<1:a|i1:-1,}".as_bytes()), + Ok(("".as_bytes(), vec![ + ("a".to_owned(), Box::new(T::Unit)), + ("b".to_owned(), Box::new(T::Unit)), + ].into_iter().collect::>())) + ); + } + + #[test] + fn test_parse() { + assert_eq!( + t_t("n3:255,".as_bytes()), + Ok(("".as_bytes(), T::N3(255))) + ); + assert_eq!( + t_t("t6:halloo,".as_bytes()), + Ok(("".as_bytes(), T::Text("halloo".to_owned()))) + ); + assert_eq!( + t_t("<3:foo|t6:halloo,".as_bytes()), + Ok(("".as_bytes(), T::Sum (Tag { + tag: "foo".to_owned(), + val: Box::new(T::Text("halloo".to_owned())) + }))) + ); + // { a: Unit + // , foo: List } + assert_eq!( + t_t("{49:<1:a|u,<3:foo|[30:<1:A|u,<1:A|u,<1:B|[7:i3:127,]]}".as_bytes()), + Ok(("".as_bytes(), T::Record(vec![ + ("a".to_owned(), Box::new(T::Unit)), + ("foo".to_owned(), Box::new(T::List(Box::new(vec![ + 
T::Sum(Tag { tag: "A".to_owned(), val: Box::new(T::Unit) }), + T::Sum(Tag { tag: "A".to_owned(), val: Box::new(T::Unit) }), + T::Sum(Tag { tag: "B".to_owned(), val: Box::new(T::List(Box::new(vec![T::I3(127)]))) }), + ])))) + ].into_iter().collect::>>()))) + ); + } + + } +} diff --git a/pkgs/profpatsch/netencode/spec.md b/pkgs/profpatsch/netencode/spec.md new file mode 100644 index 00000000..cd38588c --- /dev/null +++ b/pkgs/profpatsch/netencode/spec.md @@ -0,0 +1,92 @@ +# netencode 0.1-unreleased + +[bencode][] and [netstring][]-inspired pipe format that should be trivial to parse (100 lines of code or less), mostly human-decipherable for easy debugging, and support nested record and sum types. + + +## scalars + +Scalars have the format `[type prefix][size]:[value],`. + +where size is a natural number without leading zeroes. + +### unit + +The unit (`u`) has only one value. + +* The unit is: `u,` + +### numbers + +Naturals (`n`) and Integers (`i`), with a maximum size in bits. + +Bit sizes are specified in 2^n increments, 1 to 9 (`n1`..`n9`, `i1`..`i9`). + +* Natural `1234` that fits in 32 bits (2^5): `n5:1234,` +* Integer `-42` that fits in 8 bits (2^3): `i3:-42,` +* Integer `23` that fits in 64 bits (2^6): `i6:23,` +* Integer `-1` that fits in 512 bits (2^9): `i9:-1,` +* Natural `0` that fits in 2 bits (2^1): `n1:0,` + +An implementation can define the biggest numbers it supports, and has to throw an error for anything bigger. It has to support everything smaller, so for example if you support up to i6/n6, you have to support 1–6 as well. An implementation could support up to the current architecture’s wordsize for example. + +Floats are not supported, you can implement fixed-size decimals or ratios using integers. 
+ +### text + +Text (`t`) that *must* be encoded as UTF-8, starting with its length in bytes: + +* The string `hello world` (11 bytes): `t11:hello world,` +* The string `今日は` (9 bytes): `t9:今日は,` +* The string `:,` (2 bytes): `t2::,,` +* The empty string `` (0 bytes): `t0:,` + +Binary data is not supported, it hinders human readability. Try to use structured data, or use a different format. + +## tagged values + +### tags + +A tag (`<`) gives a value a name. The tag is UTF-8 encoded, starting with its length in bytes and followed by the value. + +* The tag `foo` (3 bytes) tagging the text `hello` (5 bytes): `<3:foo|t5:hello,` +* The tag `` (0 bytes) tagging the 8-bit integer 0: `<0:|i3:0,` + +### records (products/records), also maps + +A record (`{`) is a concatenation of tags (`<`). It needs to be closed with `}`. +If tag names repeat, the later ones should be ignored. Ordering does not matter. + +Similar to text, records start with the length of their *whole encoded content*, in bytes. This makes it possible to treat their contents as opaque bytestrings. + +* There is no empty record. (TODO: make the empty record the unit type, remove `u,`?) +* A record with one empty field, `foo`: `{9:<3:foo|u,}` +* A record with two fields, `foo` and `x`: `{21:<3:foo|u,<1:x|t3:baz,}` +* The same record: `{21:<1:x|t3:baz,<3:foo|u,}` +* The same record (later occurrences of fields are ignored): `{28:<1:x|t3:baz,<3:foo|u,<1:x|u,}` + +### sums (tagged unions) + +Simply a tagged value. The tag marker `<` indicates it is a sum if it appears outside of a record. + +## lists + +A list (`[`) imposes an ordering on a sequence of values. It needs to be closed with `]`. Values in it are simply concatenated. + +Similar to records, lists start with the length of their whole encoded content. 
+ +* The empty list: `[0:]` +* The list with one element, the string `foo`: `[7:t3:foo,]` +* The list with text `foo` followed by i3 `-42`: `[14:t3:foo,i3:-42,]` +* The list with `Some` and `None` tags: `[35:<4:Some|t3:foo,<4:None|u,<4:None|u,]` + +## motivation + +TODO + +## guarantees + +TODO: do I want a unique representation (a bijection, like bencode)? This would put more restrictions on the generator, like sorting records in lexicographic order, but would make it possible to compare values without decoding. + + +[bencode]: https://en.wikipedia.org/wiki/Bencode +[netstring]: https://en.wikipedia.org/wiki/Netstring -- cgit 1.4.1