pkgs/profpatsch/netencode: add binary type

author: Profpatsch <mail@profpatsch.de> 2020-06-26 23:29:59 +0200
committer: Profpatsch <mail@profpatsch.de> 2020-06-26 23:33:43 +0200
commit: b21008f5d7b62fabc3d6fddec29f39dd80a89d60 (patch)
tree: 2f4d94724464555d430d5432e5cd0c0e2fea1655 /pkgs/profpatsch/netencode
parent: 6314fe724c31c54bc53bf15ebcc9c2ce9769583c (diff)
2 files changed, 47 insertions, 3 deletions
diff --git a/pkgs/profpatsch/netencode/netencode.rs b/pkgs/profpatsch/netencode/netencode.rs
index a3744946..0f7e1aa2 100644
--- a/pkgs/profpatsch/netencode/netencode.rs
+++ b/pkgs/profpatsch/netencode/netencode.rs
@@ -20,6 +20,7 @@ pub enum T {
     // Text
     // TODO: make into &str
     Text(String),
+    Binary(Vec<u8>),
     // Tags
     // TODO: make into &str
     Sum(Tag<String, Box<T>>),
@@ -43,6 +44,7 @@ pub enum U<'a> {
     I7(i128),
     // Text
     Text(&'a [u8]),
+    Binary(&'a [u8]),
     // Tags
     Sum(Tag<&'a str, Box<U<'a>>>),
     Record(HashMap<&'a str, Box<U<'a>>>),
@@ -83,6 +85,11 @@ pub fn encode<W: Write>(w: &mut W, t: T) -> std::io::Result<()> {
       T::I6(i) => write!(w, "i6:{},", i),
       T::I7(i) => write!(w, "i7:{},", i),
       T::Text(s) => write!(w, "t{}:{},", s.len(), s),
+      T::Binary(s) => {
+          write!(w, "b{}:", s.len())?; // length prefix; `?` propagates I/O errors instead of dropping them
+          w.write_all(&s)?; // `write` may stop after a partial write; `write_all` emits every payload byte
+          write!(w, ",")
+      },
       T::Sum(Tag{tag, val}) => encode_tag(w, tag, *val),
       T::Record(m) => {
           let mut c = std::io::Cursor::new(vec![]);
@@ -248,6 +255,14 @@ pub mod parse {
         sized('t', ',')
     }
 
+    fn binary<'a>() -> impl Fn(&'a [u8]) -> IResult<&'a [u8], T> {
+        map(binary_g(), |payload| T::Binary(payload.to_vec())) // copy the borrowed payload into an owned value
+    }
+
+    fn binary_g() -> impl Fn(&[u8]) -> IResult<&[u8], &[u8]> {
+        sized('b', ',') // same `sized` helper the 't' parser above calls, here with tag 'b' (helper body not visible in this hunk)
+    }
+
     fn list_t(s: &[u8]) -> IResult<&[u8], Vec<T>> {
         map_parser(list_g(), nom::multi::many0(t_t))(s)
     }
@@ -444,6 +459,23 @@ pub mod parse {
         }
 
         #[test]
+        fn test_parse_binary() {
+            assert_eq!( // declared length matches the payload
+                binary()("b5:hello,".as_bytes()),
+                Ok(("".as_bytes(), T::Binary("hello".as_bytes().to_vec())))
+            );
+            assert_eq!( // declared length (4) exceeds the bytes that follow
+                binary()("b4:fo,".as_bytes()),
+                // TODO: way better parse error messages
+                Err(nom::Err::Error(("fo,".as_bytes(), nom::error::ErrorKind::Eof)))
+            );
+            assert_eq!( // length is counted in bytes: three 3-byte UTF-8 characters
+                binary()("b9:今日は,".as_bytes()),
+                Ok(("".as_bytes(), T::Binary("今日は".as_bytes().to_vec())))
+            );
+        }
+
+        #[test]
         fn test_list() {
             assert_eq!(
                 list_t("[0:]".as_bytes()),
diff --git a/pkgs/profpatsch/netencode/spec.md b/pkgs/profpatsch/netencode/spec.md
index e680034f..d1cc15c1 100644
--- a/pkgs/profpatsch/netencode/spec.md
+++ b/pkgs/profpatsch/netencode/spec.md
@@ -1,4 +1,4 @@
-# encode 0.1-unreleased
+# netencode 0.1-unreleased
 
 [bencode][] and [netstring][]-inspired pipe format that should be trivial to parse (100 lines of code or less), mostly human-decipherable for easy debugging, and support nested record and sum types.
 
@@ -42,7 +42,6 @@ TODO: should we add `f,` and `t,`?
 
 ### text
 
-
 Text (`t`) that *must* be encoded as UTF-8, starting with its length in bytes:
 
 * The string `hello world` (11 bytes): `t11:hello world,`
@@ -50,7 +49,20 @@ Text (`t`) that *must* be encoded as UTF-8, starting with its length in bytes:
 * The string `:,` (2 bytes): `t2::,,`
 * The empty sting `` (0 bytes): `t0:,`
 
-TODO: add `b` for binary content. Even filesystem paths are not utf-8 encodable sometimes, yet the distinction of text with an encoding is useful, so we should keep `t` as is.
+### binary
+:LOGBOOK:
+CLOCK: [2020-06-26 Fr 23:21]
+:END:
+
+Arbitrary binary strings (`b`) that can contain any data, each starting with its length in bytes.
+
+* The ASCII string `hello world` as binary data (11 bytes): `b11:hello world,`
+* The empty binary string (0 bytes): `b0:,`
+* The bytestring with `^D` (1 byte): `b1:<EOT>,` (where `<EOT>` stands for the single raw byte `0x04`)
+
+Since the binary strings are length-prefixed, they can contain `\0` and no escaping is required. Care has to be taken in languages with `\0`-terminated bytestrings.
+
+Use text (`t`) if you have utf-8 encoded data.
 
 ## tagged values
author	Profpatsch <mail@profpatsch.de>	2020-06-26 23:29:59 +0200
committer	Profpatsch <mail@profpatsch.de>	2020-06-26 23:33:43 +0200
commit	b21008f5d7b62fabc3d6fddec29f39dd80a89d60 (patch)
tree	2f4d94724464555d430d5432e5cd0c0e2fea1655 /pkgs/profpatsch/netencode
parent	6314fe724c31c54bc53bf15ebcc9c2ce9769583c (diff)