Implement unicode unescaping (#438)

krishnangovindraj · web-flow · commit 8e2524a46839 · 2026-04-09T14:43:43.000+02:00
## Product change and motivation
Escaped Unicode characters can now be used in TypeQL string literals. An escape must take the form `\uXXXX` (exactly 4 hex digits) or `\u{XX...X}` (1 to 6 hex digits between the braces).
diff --git a/rust/BUILD b/rust/BUILD
@@ -40,6 +40,7 @@ rust_test(
     deps = [
         "@crates//:syn",
         "@crates//:proc-macro2",
+        "@crates//:rand",
     ],
 )
 
diff --git a/rust/Cargo.toml b/rust/Cargo.toml
@@ -14,6 +14,11 @@ features = {}
 
 [dev-dependencies]
 
+	[dev-dependencies.rand]
+		features = ["alloc", "default", "getrandom", "libc", "rand_chacha", "small_rng", "std", "std_rng"]
+		version = "0.8.5"
+		default-features = false
+
 	[dev-dependencies.tokio]
 		features = ["bytes", "default", "fs", "full", "io-std", "io-util", "libc", "macros", "mio", "net", "parking_lot", "process", "rt", "rt-multi-thread", "signal", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"]
 		version = "1.47.1"
diff --git a/rust/value.rs b/rust/value.rs
@@ -343,62 +343,72 @@ impl fmt::Display for StructLiteral {
 
 impl StringLiteral {
+    /// Resolves the backslash escapes in this literal and returns the unescaped text.
+    ///
+    /// Handles the single-character escapes matched below (`BSP`, `TAB`, `LF_`, `FF_`, `CR_`,
+    /// `\"`, `\'`, `\\`) and the Unicode forms decoded by `decode_unicode_hex_escape`.
+    ///
+    /// # Errors
+    /// Every `Err(n)` produced by the closure is the byte length of the offending sequence;
+    /// `process_unescape` turns it into `TypeQLError::InvalidStringEscape`.
     pub fn unescape(&self) -> Result<String> {
-        self.process_unescape(|bytes, _buf, rest| match bytes[1] {
-            BSP => Ok(('\x08', 2)),
-            TAB => Ok(('\x09', 2)),
-            LF_ => Ok(('\x0a', 2)),
-            FF_ => Ok(('\x0c', 2)),
-            CR_ => Ok(('\x0d', 2)),
-            c @ (b'"' | b'\'' | b'\\') => Ok((c as char, 2)),
-            b'u' => todo!("Unicode escape handling"),
-            _ => Err(TypeQLError::InvalidStringEscape {
-                full_string: rest.to_owned(),
-                escape: format!(r"\{}", rest.chars().nth(1).unwrap()),
+        self.process_unescape(|bytes| {
+            // `bytes` begins at the backslash; a lone trailing backslash escapes nothing.
+            if bytes.len() < 2 {
+                return Err(1);
+            }
+            match bytes[1] {
+                BSP => Ok(('\x08', 2)),
+                TAB => Ok(('\x09', 2)),
+                LF_ => Ok(('\x0a', 2)),
+                FF_ => Ok(('\x0c', 2)),
+                CR_ => Ok(('\x0d', 2)),
+                c @ (b'"' | b'\'' | b'\\') => Ok((c as char, 2)),
+                // The decoder only sees what follows `\u`, so add those 2 bytes back to its counts.
+                b'u' => match decode_unicode_hex_escape(&bytes[2..]) {
+                    Ok((ch, consumed)) => Ok((ch, consumed + 2)),
+                    Err(consumed) => Err(consumed + 2),
+                },
+                _ => Err(2),
             }
-            .into()),
         })
     }
 
+    /// Unescapes this literal for use as a regex pattern: only `\"` is replaced (by `"`).
+    ///
+    /// Any other backslash is emitted unchanged and just that one byte is consumed, so the
+    /// characters after it are copied through as-is. The handler never returns `Err`.
     pub fn unescape_regex(&self) -> Result<String> {
-        self.process_unescape(|bytes, _, _| match bytes[1] {
-            c @ b'"' => Ok((c as char, 2)),
+        // `get(1)` rather than `[1]`: a backslash may be the last byte of the contents.
+        self.process_unescape(|bytes| match bytes.get(1) {
+            Some(b'"') => Ok(('"', 2)),
             _ => Ok(('\\', 1)),
         })
     }
 
     fn process_unescape<F>(&self, escape_handler: F) -> Result<String>
     where
-        F: Fn(&[u8], &mut String, &str) -> Result<(char, usize)>,
+        F: Fn(&[u8]) -> std::result::Result<(char, usize), usize>,
     {
         let bytes = self.value.as_bytes();
         assert_eq!(bytes[0], bytes[bytes.len() - 1]);
         assert!(matches!(bytes[0], b'\'' | b'"'));
 
         let escaped_string = &self.value[1..self.value.len() - 1];
-        let mut buf = String::with_capacity(escaped_string.len());
-        let mut rest = escaped_string;
-
+        let mut buf = Vec::with_capacity(escaped_string.len());
+        let mut rest = escaped_string.as_bytes();
         while !rest.is_empty() {
-            let (char, escaped_len) = if rest.as_bytes()[0] == b'\\' {
-                let bytes = rest.as_bytes();
-
-                if bytes.len() < 2 {
-                    return Err(TypeQLError::InvalidStringEscape {
-                        full_string: escaped_string.to_owned(),
-                        escape: String::from(r"\"),
+            if rest[0] == b'\\' {
+                match escape_handler(rest) {
+                    Ok((char, escaped_len)) => {
+                        let start = buf.len();
+                        buf.resize(buf.len() + char.len_utf8(), 0);
+                        char.encode_utf8(&mut buf[start..]);
+                        rest = &rest[escaped_len..];
+                    }
+                    Err(considered_byte_length) => {
+                        let offset = escaped_string.len() - rest.len();
+                        let mut end = std::cmp::min(offset + considered_byte_length, escaped_string.len());
+                        while !escaped_string.is_char_boundary(end) {
+                            end += 1;
+                        }
+                        return Err(TypeQLError::InvalidStringEscape {
+                            full_string: escaped_string.to_owned(),
+                            escape: escaped_string[offset..end].to_owned(),
+                        }
+                        .into());
                     }
-                    .into());
                 }
-
-                escape_handler(bytes, &mut buf, escaped_string)?
             } else {
-                let char = rest.chars().next().expect("string is non-empty");
-                (char, char.len_utf8())
-            };
-            buf.push(char);
-            rest = &rest[escaped_len..];
+                buf.push(rest[0]);
+                rest = &rest[1..];
+            }
         }
-        Ok(buf)
+        Ok(String::from_utf8(buf).expect("Expected valid utf8").to_owned())
     }
 }
 
@@ -407,3 +417,192 @@ const TAB: u8 = b't';
 const LF_: u8 = b'n';
 const FF_: u8 = b'f';
 const CR_: u8 = b'r';
+
+/// Decodes the part of a Unicode escape that follows `\u`.
+///
+/// `bytes` is everything after the `u`. Two forms are accepted:
+/// * `XXXX` - exactly four hex digits;
+/// * `{X..X}` - one to six hex digits in braces.
+///
+/// Returns `Ok((decoded_char, bytes_consumed))`, or `Err(bytes_considered)`: the byte length
+/// of the rejected prefix, which the caller uses to quote the bad sequence in its error.
+fn decode_unicode_hex_escape(bytes: &[u8]) -> std::result::Result<(char, usize), usize> {
+    match bytes {
+        [] => Err(0),
+        [b'{', ..] => {
+            // The longest valid braced form is `{` + 6 digits + `}`, so never look past 8 bytes.
+            let window = &bytes[..bytes.len().min(8)];
+            match window.iter().position(|&b| b == b'}') {
+                Some(close) => {
+                    unicode_char_from_hex(&bytes[1..close]).map(|c| (c, close + 1)).ok_or(close + 1)
+                }
+                None => Err(window.len()),
+            }
+        }
+        _ => match bytes.get(..4) {
+            Some(digits) => unicode_char_from_hex(digits).map(|c| (c, 4)).ok_or(4),
+            // Fewer than four bytes remain: report all of them.
+            None => Err(bytes.len()),
+        },
+    }
+}
+
+/// Interprets `bytes` as one to six ASCII hex digits and returns the `char` with that code
+/// point.
+///
+/// Returns `None` for an empty or over-long slice, for any byte that is not a hex digit, and
+/// for values that `char::from_u32` rejects.
+fn unicode_char_from_hex(bytes: &[u8]) -> Option<char> {
+    if !(1..=6).contains(&bytes.len()) {
+        return None;
+    }
+    // Accumulated by hand because `from_ascii_radix` is still experimental.
+    let code_point = bytes
+        .iter()
+        .try_fold(0u32, |acc, &digit| Some((acc << 4) | char::from(digit).to_digit(16)?))?;
+    char::from_u32(code_point)
+}
+
+#[cfg(test)]
+pub mod tests {
+    use crate::{
+        value::{StringLiteral, TypeQLError},
+        Result,
+    };
+
+    /// Parses `escaped` with `crate::parse_value` and returns the resulting string literal.
+    ///
+    /// Panics if parsing fails or the parsed value is not a string.
+    fn parse_to_string_literal(escaped: &str) -> StringLiteral {
+        match crate::parse_value(escaped).unwrap() {
+            crate::ValueLiteral::String(parsed) => parsed,
+            _ => panic!("Not parsed as string"),
+        }
+    }
+
+    #[test]
+    fn test_unescape_regex() {
+        // Only `\"` is unescaped; any other backslash is kept verbatim.
+        let parsed_cases = [(r#""a\"b\"c""#, r#"a"b"c"#), (r#""abc\123""#, r#"abc\123"#)];
+        for (escaped, expected) in parsed_cases {
+            let unescaped = parse_to_string_literal(escaped).unescape_regex().unwrap();
+            assert_eq!(unescaped.as_str(), expected);
+        }
+
+        // A trailing backslash cannot come out of the parser (the literal is incomplete), so
+        // the `StringLiteral` is built by hand to exercise that path.
+        let unterminated = r#""abc\""#;
+        assert!(crate::parse_value(unterminated).is_err());
+        let string_literal = StringLiteral { value: unterminated.to_owned() };
+        assert_eq!(string_literal.unescape_regex().unwrap().as_str(), r#"abc\"#);
+    }
+
+    // Asserts that `$escaped` (a quoted literal, parsed via `parse_to_string_literal`)
+    // unescapes successfully to `$expected`.
+    macro_rules! assert_unescapes_to {
+        ($escaped: expr, $expected: expr) => {
+            let unescaped = parse_to_string_literal($escaped).unescape().unwrap();
+            assert_eq!(unescaped, $expected);
+        };
+    }
+
+    // Asserts that unescaping `$escaped` fails, that the first reported error is
+    // `InvalidStringEscape`, and that its `escape` text equals `$expected_escape_sequence`.
+    macro_rules! assert_unescape_errors {
+        ($escaped: expr, $expected_escape_sequence: expr) => {
+            let error = parse_to_string_literal($escaped).unescape().unwrap_err();
+            let TypeQLError::InvalidStringEscape { escape, .. } = &error.errors()[0] else {
+                panic!("Wrong error type. Was {error:?}")
+            };
+            assert_eq!(escape, $expected_escape_sequence);
+        };
+    }
+
+    #[test]
+    fn test_unescape() {
+        // Succeeds
+        assert_unescapes_to!(r#""a\tb\tc""#, "a\tb\tc"); // tab
+        assert_unescapes_to!(r#""a\"b\"c""#, r#"a"b"c"#); // escaped double quote
+        assert_unescapes_to!(r#""a\'b\'c""#, r#"a'b'c"#); // escaped single quote
+        assert_unescapes_to!(r#""a\\b\\c""#, r#"a\b\c"#); // escaped backslash
+
+        //  - Unicode
+        assert_unescapes_to!(r#""abc \u0ca0\u005f\u0ca0""#, "abc ಠ_ಠ"); // 4 digits, lower case
+        assert_unescapes_to!(r#""abc \u0CA0\u005F\u0CA0""#, "abc ಠ_ಠ"); // 4 digits, upper case
+        assert_unescapes_to!(r#""abc \u0CA01234""#, "abc ಠ1234"); // consumes exactly 4 digits, the rest is literal
+        assert_unescapes_to!(r#""abc \u{0CA0}1234""#, "abc ಠ1234"); // braces with 4 digits
+        assert_unescapes_to!(r#""abc \u{130ED}\u{13153}1234""#, "abc 𓃭𓅓1234"); // braces with 5 digits
+
+        // Errors
+        assert_unescape_errors!(r#""ab\c""#, r"\c"); // Invalid escape
+
+        //  - Unicode
+        assert_unescape_errors!(r#""abc \u""#, r"\u"); // Not enough bytes
+        assert_unescape_errors!(r#""abc \u012""#, r"\u012"); // Not enough bytes
+        assert_unescape_errors!(r#""abc \uwu/ abc""#, r"\uwu/ "); // Invalid hex
+        assert_unescape_errors!(r#""abc \uΣ12Σ abc""#, r"\uΣ12"); // Invalid hex: the 4 reported bytes are 3 chars (Σ is 2 bytes)
+        assert_unescape_errors!(r#""abc \u123Σ abc""#, r"\u123Σ"); // Invalid hex: the 4-byte cut lands inside Σ and is widened to the char boundary
+        assert_unescape_errors!(r#""abc \u{""#, r"\u{"); // Not enough bytes, no closing brace
+        assert_unescape_errors!(r#""abc \u{123Σ} abc""#, r"\u{123Σ}"); // Invalid hex with braces
+        assert_unescape_errors!(r#""abc \u{1234567} abc""#, r"\u{1234567"); // Too many digits: no `}` within 8 bytes, report stops there
+        assert_unescape_errors!(r#""abc \u{213456} abc""#, r"\u{213456}"); // Above valid range
+
+        // Cases that fail at parsing
+        {
+            let escaped = r#""abc\""#;
+            assert!(crate::parse_value(escaped).is_err()); // Parsing fails as incomplete string literal
+            // Build the literal by hand to check how a trailing backslash is reported.
+            let string_literal = StringLiteral { value: escaped.to_owned() };
+            let error = string_literal.unescape().unwrap_err();
+            let TypeQLError::InvalidStringEscape { escape, .. } = &error.errors()[0] else {
+                panic!("Wrong error type. Was {error:?}")
+            };
+            assert_eq!(escape, r#"\"#);
+        }
+    }
+
+    #[ignore]
+    #[test]
+    fn time_unescape_ascii() {
+        // The mapper folds every random value into the range 32..=125.
+        time_unescape(generate_string(TIME_UNESCAPE_TEXT_LEN, |x| 32 + (x % 94)));
+    }
+
+    #[ignore]
+    #[test]
+    fn time_unescape_unicode() {
+        // Masking to 0x07ff keeps every value a valid `char`: `char::from_u32` accepts the
+        // whole range up to there.
+        time_unescape(generate_string(TIME_UNESCAPE_TEXT_LEN, |x| x & 0x07ff));
+    }
+
+    const TIME_UNESCAPE_TEXT_LEN: usize = 100000;
+
+    /// Unescapes `text` (a complete, quoted literal) 10000 times and prints the total
+    /// wall-clock time taken.
+    ///
+    /// Panics if `text` fails to unescape.
+    fn time_unescape(text: String) {
+        use std::time::Instant;
+        let iters = 10000;
+
+        let string_literal = StringLiteral { value: text };
+        let start = Instant::now();
+        for _ in 0..iters {
+            // `black_box` stops the optimizer from discarding the otherwise unused result.
+            std::hint::black_box(string_literal.unescape().unwrap());
+        }
+        let elapsed = start.elapsed();
+        println!(
+            "{iters} iters on string of length {} in {}s",
+            string_literal.value.len(),
+            elapsed.as_secs_f64()
+        )
+    }
+
+    /// Builds a random double-quoted literal slightly longer than `length` bytes.
+    ///
+    /// Each random `u32` is passed through `mapper`; results that are not valid code points
+    /// are skipped, and the characters that have a short escape (`\`, quotes, backspace, tab,
+    /// line feed, form feed, carriage return) are written in escaped form.
+    ///
+    /// Panics if the finished text is not within 10 bytes above `length`.
+    fn generate_string(length: usize, mapper: fn(u32) -> u32) -> String {
+        use rand::{thread_rng, RngCore};
+        let mut rng = thread_rng();
+        // 20% headroom over `length`; this also caps the number of random draws.
+        let capacity = (1.2 * length as f64).ceil() as usize;
+        let mut text = String::with_capacity(capacity);
+        text.push('"');
+
+        for _ in 0..capacity {
+            if text.len() > length {
+                break;
+            }
+            match char::from_u32(mapper(rng.next_u32())) {
+                Some('\\') => text.push_str(r"\\"),
+                Some('\'') => text.push_str(r"\'"),
+                Some('\"') => text.push_str(r#"\""#),
+                Some('\x08') => text.push_str(r"\b"),
+                Some('\x09') => text.push_str(r"\t"),
+                Some('\x0a') => text.push_str(r"\n"),
+                Some('\x0c') => text.push_str(r"\f"),
+                Some('\x0d') => text.push_str(r"\r"),
+                Some(ch) => text.push(ch),
+                None => (),
+            }
+        }
+        text.push('"');
+        assert!(text.len() > length && text.len() < length + 10);
+        text
+    }
+}

Original file line number	Diff line number	Diff line change
`@@ -40,6 +40,7 @@ rust_test(`
`40`	`40`	`deps = [`
`41`	`41`	`"@crates//:syn",`
`42`	`42`	`"@crates//:proc-macro2",`
	`43`	`+ "@crates//:rand",`
`43`	`44`	`],`
`44`	`45`	`)`
`45`	`46`