Skip to content

Commit 8e2524a

Browse files
Implement unicode unescaping (#438)
## Product change and motivation

Escaped Unicode characters can be used in TypeQL string literals. Each escape must take the form `\uXXXX` (exactly 4 hex digits) or `\u{XX...X}` (1 to 6 hex digits).
1 parent fb6219c commit 8e2524a

3 files changed

Lines changed: 239 additions & 34 deletions

File tree

rust/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ rust_test(
4040
deps = [
4141
"@crates//:syn",
4242
"@crates//:proc-macro2",
43+
"@crates//:rand",
4344
],
4445
)
4546

rust/Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,11 @@ features = {}
1414

1515
[dev-dependencies]
1616

17+
[dev-dependencies.rand]
18+
features = ["alloc", "default", "getrandom", "libc", "rand_chacha", "small_rng", "std", "std_rng"]
19+
version = "0.8.5"
20+
default-features = false
21+
1722
[dev-dependencies.tokio]
1823
features = ["bytes", "default", "fs", "full", "io-std", "io-util", "libc", "macros", "mio", "net", "parking_lot", "process", "rt", "rt-multi-thread", "signal", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"]
1924
version = "1.47.1"

rust/value.rs

Lines changed: 233 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -343,62 +343,72 @@ impl fmt::Display for StructLiteral {
343343

344344
impl StringLiteral {
345345
pub fn unescape(&self) -> Result<String> {
346-
self.process_unescape(|bytes, _buf, rest| match bytes[1] {
347-
BSP => Ok(('\x08', 2)),
348-
TAB => Ok(('\x09', 2)),
349-
LF_ => Ok(('\x0a', 2)),
350-
FF_ => Ok(('\x0c', 2)),
351-
CR_ => Ok(('\x0d', 2)),
352-
c @ (b'"' | b'\'' | b'\\') => Ok((c as char, 2)),
353-
b'u' => todo!("Unicode escape handling"),
354-
_ => Err(TypeQLError::InvalidStringEscape {
355-
full_string: rest.to_owned(),
356-
escape: format!(r"\{}", rest.chars().nth(1).unwrap()),
346+
self.process_unescape(|bytes| {
347+
if bytes.len() < 2 {
348+
return Err(1);
349+
}
350+
match bytes[1] {
351+
BSP => Ok(('\x08', 2)),
352+
TAB => Ok(('\x09', 2)),
353+
LF_ => Ok(('\x0a', 2)),
354+
FF_ => Ok(('\x0c', 2)),
355+
CR_ => Ok(('\x0d', 2)),
356+
c @ (b'"' | b'\'' | b'\\') => Ok((c as char, 2)),
357+
b'u' => match decode_unicode_hex_escape(&bytes[2..]) {
358+
Ok((ch, consumed)) => Ok((ch, consumed + 2)),
359+
Err(consumed) => Err(consumed + 2),
360+
},
361+
_ => Err(2),
357362
}
358-
.into()),
359363
})
360364
}
361365

362366
pub fn unescape_regex(&self) -> Result<String> {
363-
self.process_unescape(|bytes, _, _| match bytes[1] {
364-
c @ b'"' => Ok((c as char, 2)),
367+
self.process_unescape(|bytes| match bytes.get(1) {
368+
Some(b'"') => Ok(('"', 2)),
365369
_ => Ok(('\\', 1)),
366370
})
367371
}
368372

369373
fn process_unescape<F>(&self, escape_handler: F) -> Result<String>
370374
where
371-
F: Fn(&[u8], &mut String, &str) -> Result<(char, usize)>,
375+
F: Fn(&[u8]) -> std::result::Result<(char, usize), usize>,
372376
{
373377
let bytes = self.value.as_bytes();
374378
assert_eq!(bytes[0], bytes[bytes.len() - 1]);
375379
assert!(matches!(bytes[0], b'\'' | b'"'));
376380

377381
let escaped_string = &self.value[1..self.value.len() - 1];
378-
let mut buf = String::with_capacity(escaped_string.len());
379-
let mut rest = escaped_string;
380-
382+
let mut buf = Vec::with_capacity(escaped_string.len());
383+
let mut rest = escaped_string.as_bytes();
381384
while !rest.is_empty() {
382-
let (char, escaped_len) = if rest.as_bytes()[0] == b'\\' {
383-
let bytes = rest.as_bytes();
384-
385-
if bytes.len() < 2 {
386-
return Err(TypeQLError::InvalidStringEscape {
387-
full_string: escaped_string.to_owned(),
388-
escape: String::from(r"\"),
385+
if rest[0] == b'\\' {
386+
match escape_handler(rest) {
387+
Ok((char, escaped_len)) => {
388+
let start = buf.len();
389+
buf.resize(buf.len() + char.len_utf8(), 0);
390+
char.encode_utf8(&mut buf[start..]);
391+
rest = &rest[escaped_len..];
392+
}
393+
Err(considered_byte_length) => {
394+
let offset = escaped_string.len() - rest.len();
395+
let mut end = std::cmp::min(offset + considered_byte_length, escaped_string.len());
396+
while !escaped_string.is_char_boundary(end) {
397+
end += 1;
398+
}
399+
return Err(TypeQLError::InvalidStringEscape {
400+
full_string: escaped_string.to_owned(),
401+
escape: escaped_string[offset..end].to_owned(),
402+
}
403+
.into());
389404
}
390-
.into());
391405
}
392-
393-
escape_handler(bytes, &mut buf, escaped_string)?
394406
} else {
395-
let char = rest.chars().next().expect("string is non-empty");
396-
(char, char.len_utf8())
397-
};
398-
buf.push(char);
399-
rest = &rest[escaped_len..];
407+
buf.push(rest[0]);
408+
rest = &rest[1..];
409+
}
400410
}
401-
Ok(buf)
411+
Ok(String::from_utf8(buf).expect("Expected valid utf8").to_owned())
402412
}
403413
}
404414

@@ -407,3 +417,192 @@ const TAB: u8 = b't';
407417
const LF_: u8 = b'n';
408418
const FF_: u8 = b'f';
409419
const CR_: u8 = b'r';
420+
421+
/// Decodes the payload of a `\u` escape, i.e. the bytes that follow the `\u` prefix.
///
/// Two payload forms are accepted:
/// * `XXXX` — exactly four hex digits;
/// * `{X...}` — one to six hex digits enclosed in braces.
///
/// Only the escape itself is consumed; any bytes after it are ignored.
///
/// # Returns
/// `Ok((ch, consumed))`: the decoded character and the number of bytes of `bytes` that formed
/// the payload.
///
/// # Errors
/// `Err(considered)`: the number of bytes of `bytes` that were examined before the payload was
/// rejected (truncated input, missing `}` within the first 8 bytes, a non-hex digit, or a value
/// that is not a Unicode scalar value).
fn decode_unicode_hex_escape(bytes: &[u8]) -> std::result::Result<(char, usize), usize> {
    // Longest possible braced payload: `{` + 6 hex digits + `}`.
    const MAX_BRACED_LEN: usize = 8;
    // Length of the unbraced payload.
    const FIXED_LEN: usize = 4;

    match bytes.first() {
        None => Err(0),
        Some(b'{') => {
            // Never look further than a well-formed braced payload could extend.
            let window = &bytes[..bytes.len().min(MAX_BRACED_LEN)];
            match window.iter().position(|&b| b == b'}') {
                Some(close) => {
                    let consumed = close + 1;
                    unicode_char_from_hex(&bytes[1..close]).map(|ch| (ch, consumed)).ok_or(consumed)
                }
                None => Err(window.len()),
            }
        }
        Some(_) => match bytes.get(..FIXED_LEN) {
            Some(digits) => unicode_char_from_hex(digits).map(|ch| (ch, FIXED_LEN)).ok_or(FIXED_LEN),
            // Fewer than four bytes remain: everything that is left was considered.
            None => Err(bytes.len()),
        },
    }
}

/// Interprets `bytes` as 1 to 6 ASCII hex digits and converts the value to a `char`.
///
/// Returns `None` when `bytes` is empty or longer than 6, when any byte is not a hex digit, or
/// when the resulting number is not a Unicode scalar value (e.g. a surrogate or above `0x10FFFF`).
fn unicode_char_from_hex(bytes: &[u8]) -> Option<char> {
    if bytes.is_empty() || bytes.len() > 6 {
        return None;
    }
    // `from_ascii_radix` is still experimental, so the digits are folded by hand.
    // At most 6 digits (24 bits) are accumulated, so the shift cannot lose bits.
    let code_point = bytes.iter().try_fold(0u32, |acc, &b| Some((acc << 4) | char::from(b).to_digit(16)?))?;
    char::from_u32(code_point)
}
452+
453+
#[cfg(test)]
pub mod tests {
    use crate::value::{StringLiteral, TypeQLError};

    /// Parses `escaped` (a quoted string literal) and returns the parsed `StringLiteral`.
    ///
    /// Panics if parsing fails or if the parsed value is not a string literal.
    fn parse_to_string_literal(escaped: &str) -> StringLiteral {
        let crate::ValueLiteral::String(parsed) = crate::parse_value(escaped).unwrap() else {
            panic!("Not parsed as string");
        };
        parsed
    }

    #[test]
    fn test_unescape_regex() {
        {
            let escaped = r#""a\"b\"c""#;
            let unescaped = parse_to_string_literal(escaped).unescape_regex().unwrap();
            assert_eq!(unescaped.as_str(), r#"a"b"c"#);
        }
        {
            let escaped = r#""abc\123""#;
            let unescaped = parse_to_string_literal(escaped).unescape_regex().unwrap();
            assert_eq!(unescaped.as_str(), r#"abc\123"#);
        }
        // Cases that fail at parsing
        {
            let escaped = r#""abc\""#;
            assert!(crate::parse_value(escaped).is_err()); // Parsing fails as incomplete string literal
            let string_literal = StringLiteral { value: escaped.to_owned() };
            let unescaped = string_literal.unescape_regex().unwrap();
            assert_eq!(unescaped.as_str(), r#"abc\"#);
        }
    }

    /// Asserts that parsing `$escaped` and calling `unescape()` yields `$expected`.
    macro_rules! assert_unescapes_to {
        ($escaped: expr, $expected: expr) => {
            let unescaped = parse_to_string_literal($escaped).unescape().unwrap();
            assert_eq!(unescaped, $expected);
        };
    }

    /// Asserts that `unescape()` fails with `InvalidStringEscape` and that the reported
    /// `escape` text equals `$expected_escape_sequence`.
    macro_rules! assert_unescape_errors {
        ($escaped: expr, $expected_escape_sequence: expr) => {
            let error = parse_to_string_literal($escaped).unescape().unwrap_err();
            let TypeQLError::InvalidStringEscape { escape, .. } = &error.errors()[0] else {
                panic!("Wrong error type. Was {error:?}")
            };
            assert_eq!(escape, $expected_escape_sequence);
        };
    }

    #[test]
    fn test_unescape() {
        // Succeeds
        assert_unescapes_to!(r#""a\tb\tc""#, "a\tb\tc"); // tab
        assert_unescapes_to!(r#""a\"b\"c""#, r#"a"b"c"#); // double quote
        assert_unescapes_to!(r#""a\'b\'c""#, r#"a'b'c"#); // single quote
        assert_unescapes_to!(r#""a\\b\\c""#, r#"a\b\c"#); // backslash
        // - Unicode
        assert_unescapes_to!(r#""abc \u0ca0\u005f\u0ca0""#, "abc ಠ_ಠ"); // works
        assert_unescapes_to!(r#""abc \u0CA0\u005F\u0CA0""#, "abc ಠ_ಠ"); // caps
        assert_unescapes_to!(r#""abc \u0CA01234""#, "abc ಠ1234"); // consumes only 4
        assert_unescapes_to!(r#""abc \u{0CA0}1234""#, "abc ಠ1234"); // braces with only 4
        assert_unescapes_to!(r#""abc \u{130ED}\u{13153}1234""#, "abc 𓃭𓅓1234"); // braces with 5 digits

        // Errors
        assert_unescape_errors!(r#""ab\c""#, r"\c"); // Invalid escape

        // - Unicode
        assert_unescape_errors!(r#""abc \u""#, r"\u"); // Not enough bytes
        assert_unescape_errors!(r#""abc \u012""#, r"\u012"); // Not enough bytes
        assert_unescape_errors!(r#""abc \uwu/ abc""#, r"\uwu/ "); // Invalid hex
        assert_unescape_errors!(r#""abc \uΣ12Σ abc""#, r"\uΣ12"); // Invalid hex, 3 chars more than 4 bytes
        assert_unescape_errors!(r#""abc \u123Σ abc""#, r"\u123Σ"); // Invalid hex, 4 chars more than 4 bytes
        assert_unescape_errors!(r#""abc \u{""#, r"\u{"); // Not enough bytes
        assert_unescape_errors!(r#""abc \u{123Σ} abc""#, r"\u{123Σ}"); // Invalid hex with braces
        assert_unescape_errors!(r#""abc \u{1234567} abc""#, r"\u{1234567"); // Too many characters, stop at 8
        assert_unescape_errors!(r#""abc \u{213456} abc""#, r"\u{213456}"); // Above valid range

        // Cases that fail at parsing
        {
            let escaped = r#""abc\""#;
            assert!(crate::parse_value(escaped).is_err()); // Parsing fails as incomplete string literal
            let string_literal = StringLiteral { value: escaped.to_owned() };
            let error = string_literal.unescape().unwrap_err();
            let TypeQLError::InvalidStringEscape { escape, .. } = &error.errors()[0] else {
                panic!("Wrong error type. Was {error:?}")
            };
            assert_eq!(escape, r#"\"#);
        }
    }

    /// Manual benchmark over printable ASCII (code points 32..=125); run with `--ignored`.
    #[ignore]
    #[test]
    fn time_unescape_ascii() {
        let text = generate_string(TIME_UNESCAPE_TEXT_LEN, |x| 32 + (x % 94));
        time_unescape(text);
    }

    /// Manual benchmark over code points 0..=0x07ff; run with `--ignored`.
    #[ignore]
    #[test]
    fn time_unescape_unicode() {
        // Every value in 0..=0x07ff is a valid `char` (surrogates start at 0xD800), so the
        // mapper never produces a code point that `char::from_u32` rejects.
        let text = generate_string(TIME_UNESCAPE_TEXT_LEN, move |x| x & 0x07ff);
        time_unescape(text);
    }

    const TIME_UNESCAPE_TEXT_LEN: usize = 100000;

    /// Unescapes `text` repeatedly and prints the total wall-clock time taken.
    fn time_unescape(text: String) {
        use std::time::Instant;
        let iters = 10000;

        let string_literal = StringLiteral { value: text };
        let start = Instant::now();
        for _ in 0..iters {
            string_literal.unescape().unwrap();
        }
        let elapsed = start.elapsed();
        println!(
            "{iters} iters on string of length {} in {}s",
            string_literal.value.len(),
            elapsed.as_secs_f64()
        )
    }

    /// Builds a double-quoted string literal of slightly more than `length` bytes from random
    /// code points produced by `mapper`, escaping the characters that `unescape()` treats
    /// specially. Code points that are not valid `char`s are skipped.
    fn generate_string(length: usize, mapper: fn(u32) -> u32) -> String {
        use rand::{thread_rng, RngCore};
        let mut rng = thread_rng();
        // 20% headroom for escapes and multi-byte characters.
        let capacity = (1.2 * length as f64).ceil() as usize;
        let mut text = String::with_capacity(capacity);
        text.push('"');

        for _ in 0..capacity {
            if text.len() > length {
                break;
            }
            match char::from_u32(mapper(rng.next_u32())) {
                Some('\\') => text += r"\\",
                Some('\'') => text += r"\'",
                Some('\"') => text += r#"\""#,
                Some('\x08') => text += r"\b",
                Some('\x09') => text += r"\t",
                Some('\x0a') => text += r"\n",
                Some('\x0c') => text += r"\f",
                Some('\x0d') => text += r"\r",
                Some(ch) => text.push(ch),
                None => (),
            }
        }
        text.push('"');
        assert!(text.len() > length && text.len() < length + 10);
        text
    }
}

0 commit comments

Comments
 (0)