@@ -343,62 +343,72 @@ impl fmt::Display for StructLiteral {
343343
344344impl StringLiteral {
345345 pub fn unescape ( & self ) -> Result < String > {
346- self . process_unescape ( |bytes, _buf, rest| match bytes[ 1 ] {
347- BSP => Ok ( ( '\x08' , 2 ) ) ,
348- TAB => Ok ( ( '\x09' , 2 ) ) ,
349- LF_ => Ok ( ( '\x0a' , 2 ) ) ,
350- FF_ => Ok ( ( '\x0c' , 2 ) ) ,
351- CR_ => Ok ( ( '\x0d' , 2 ) ) ,
352- c @ ( b'"' | b'\'' | b'\\' ) => Ok ( ( c as char , 2 ) ) ,
353- b'u' => todo ! ( "Unicode escape handling" ) ,
354- _ => Err ( TypeQLError :: InvalidStringEscape {
355- full_string : rest. to_owned ( ) ,
356- escape : format ! ( r"\{}" , rest. chars( ) . nth( 1 ) . unwrap( ) ) ,
346+ self . process_unescape ( |bytes| {
347+ if bytes. len ( ) < 2 {
348+ return Err ( 1 ) ;
349+ }
350+ match bytes[ 1 ] {
351+ BSP => Ok ( ( '\x08' , 2 ) ) ,
352+ TAB => Ok ( ( '\x09' , 2 ) ) ,
353+ LF_ => Ok ( ( '\x0a' , 2 ) ) ,
354+ FF_ => Ok ( ( '\x0c' , 2 ) ) ,
355+ CR_ => Ok ( ( '\x0d' , 2 ) ) ,
356+ c @ ( b'"' | b'\'' | b'\\' ) => Ok ( ( c as char , 2 ) ) ,
357+ b'u' => match decode_unicode_hex_escape ( & bytes[ 2 ..] ) {
358+ Ok ( ( ch, consumed) ) => Ok ( ( ch, consumed + 2 ) ) ,
359+ Err ( consumed) => Err ( consumed + 2 ) ,
360+ } ,
361+ _ => Err ( 2 ) ,
357362 }
358- . into ( ) ) ,
359363 } )
360364 }
361365
362366 pub fn unescape_regex ( & self ) -> Result < String > {
363- self . process_unescape ( |bytes, _ , _ | match bytes[ 1 ] {
364- c @ b'"' => Ok ( ( c as char , 2 ) ) ,
367+ self . process_unescape ( |bytes| match bytes. get ( 1 ) {
368+ Some ( b'"' ) => Ok ( ( '"' , 2 ) ) ,
365369 _ => Ok ( ( '\\' , 1 ) ) ,
366370 } )
367371 }
368372
369373 fn process_unescape < F > ( & self , escape_handler : F ) -> Result < String >
370374 where
371- F : Fn ( & [ u8 ] , & mut String , & str ) -> Result < ( char , usize ) > ,
375+ F : Fn ( & [ u8 ] ) -> std :: result :: Result < ( char , usize ) , usize > ,
372376 {
373377 let bytes = self . value . as_bytes ( ) ;
374378 assert_eq ! ( bytes[ 0 ] , bytes[ bytes. len( ) - 1 ] ) ;
375379 assert ! ( matches!( bytes[ 0 ] , b'\'' | b'"' ) ) ;
376380
377381 let escaped_string = & self . value [ 1 ..self . value . len ( ) - 1 ] ;
378- let mut buf = String :: with_capacity ( escaped_string. len ( ) ) ;
379- let mut rest = escaped_string;
380-
382+ let mut buf = Vec :: with_capacity ( escaped_string. len ( ) ) ;
383+ let mut rest = escaped_string. as_bytes ( ) ;
381384 while !rest. is_empty ( ) {
382- let ( char, escaped_len) = if rest. as_bytes ( ) [ 0 ] == b'\\' {
383- let bytes = rest. as_bytes ( ) ;
384-
385- if bytes. len ( ) < 2 {
386- return Err ( TypeQLError :: InvalidStringEscape {
387- full_string : escaped_string. to_owned ( ) ,
388- escape : String :: from ( r"\" ) ,
385+ if rest[ 0 ] == b'\\' {
386+ match escape_handler ( rest) {
387+ Ok ( ( char, escaped_len) ) => {
388+ let start = buf. len ( ) ;
389+ buf. resize ( buf. len ( ) + char. len_utf8 ( ) , 0 ) ;
390+ char. encode_utf8 ( & mut buf[ start..] ) ;
391+ rest = & rest[ escaped_len..] ;
392+ }
393+ Err ( considered_byte_length) => {
394+ let offset = escaped_string. len ( ) - rest. len ( ) ;
395+ let mut end = std:: cmp:: min ( offset + considered_byte_length, escaped_string. len ( ) ) ;
396+ while !escaped_string. is_char_boundary ( end) {
397+ end += 1 ;
398+ }
399+ return Err ( TypeQLError :: InvalidStringEscape {
400+ full_string : escaped_string. to_owned ( ) ,
401+ escape : escaped_string[ offset..end] . to_owned ( ) ,
402+ }
403+ . into ( ) ) ;
389404 }
390- . into ( ) ) ;
391405 }
392-
393- escape_handler ( bytes, & mut buf, escaped_string) ?
394406 } else {
395- let char = rest. chars ( ) . next ( ) . expect ( "string is non-empty" ) ;
396- ( char, char. len_utf8 ( ) )
397- } ;
398- buf. push ( char) ;
399- rest = & rest[ escaped_len..] ;
407+ buf. push ( rest[ 0 ] ) ;
408+ rest = & rest[ 1 ..] ;
409+ }
400410 }
401- Ok ( buf)
411+ Ok ( String :: from_utf8 ( buf) . expect ( "Expected valid utf8" ) . to_owned ( ) )
402412 }
403413}
404414
@@ -407,3 +417,192 @@ const TAB: u8 = b't';
// Letter that follows the backslash in the line-feed escape; `unescape` maps it to '\x0a'.
407417const LF_ : u8 = b'n' ;
// Letter that follows the backslash in the form-feed escape; `unescape` maps it to '\x0c'.
408418const FF_ : u8 = b'f' ;
// Letter that follows the backslash in the carriage-return escape; `unescape` maps it to '\x0d'.
409419const CR_ : u8 = b'r' ;
420+
421+ #[ allow( arithmetic_overflow) ]
422+ fn decode_unicode_hex_escape ( bytes : & [ u8 ] ) -> std:: result:: Result < ( char , usize ) , usize > {
423+ if bytes. is_empty ( ) {
424+ Err ( 0 )
425+ } else if bytes[ 0 ] == b'{' {
426+ let safe_len = std:: cmp:: min ( bytes. len ( ) , 8 ) ;
427+ if let Some ( i) = bytes[ ..safe_len] . iter ( ) . position ( |b| * b == b'}' ) {
428+ unicode_char_from_hex ( & bytes[ 1 ..i] ) . map ( |c| ( c, i + 1 ) ) . ok_or ( i + 1 )
429+ } else {
430+ Err ( safe_len)
431+ }
432+ } else {
433+ if bytes. len ( ) >= 4 {
434+ unicode_char_from_hex ( & bytes[ 0 ..4 ] ) . map ( |c| ( c, 4 ) ) . ok_or ( 4 )
435+ } else {
436+ Err ( std:: cmp:: min ( bytes. len ( ) , 4 ) )
437+ }
438+ }
439+ }
440+
/// Interprets `bytes` as 1 to 6 ASCII hexadecimal digits and returns the `char` with that
/// code point.
///
/// Returns `None` when `bytes` is empty, longer than six bytes, contains a byte that is not a
/// hexadecimal digit, or spells a value that `char::from_u32` rejects.
fn unicode_char_from_hex(bytes: &[u8]) -> Option<char> {
    if !(1..=6).contains(&bytes.len()) {
        return None;
    }
    // from_ascii_radix is still experimental, so the digits are folded by hand.
    let code_point = bytes
        .iter()
        .try_fold(0u32, |acc, &byte| Some((acc << 4) | char::from(byte).to_digit(16)?))?;
    char::from_u32(code_point)
}
452+
#[cfg(test)]
pub mod tests {
    use crate::value::{StringLiteral, TypeQLError};

    /// Parses `escaped` (a quoted literal, quotes included) into a `StringLiteral`.
    ///
    /// Panics if the text does not parse or parses to a different kind of value.
    fn parse_to_string_literal(escaped: &str) -> StringLiteral {
        let crate::ValueLiteral::String(parsed) = crate::parse_value(escaped).unwrap() else {
            panic!("Not parsed as string");
        };
        parsed
    }

    #[test]
    fn test_unescape_regex() {
        {
            let escaped = r#""a\"b\"c""#;
            let unescaped = parse_to_string_literal(escaped).unescape_regex().unwrap();
            assert_eq!(unescaped.as_str(), r#"a"b"c"#);
        }
        {
            let escaped = r#""abc\123""#;
            let unescaped = parse_to_string_literal(escaped).unescape_regex().unwrap();
            assert_eq!(unescaped.as_str(), r#"abc\123"#);
        }
        // Cases that fail at parsing
        {
            let escaped = r#""abc\""#;
            assert!(crate::parse_value(escaped).is_err()); // Parsing fails as incomplete string literal
            let string_literal = StringLiteral { value: escaped.to_owned() };
            let unescaped = string_literal.unescape_regex().unwrap();
            assert_eq!(unescaped.as_str(), r#"abc\"#);
        }
    }

    /// Asserts that parsing and unescaping `$escaped` yields `$expected`.
    macro_rules! assert_unescapes_to {
        ($escaped:expr, $expected:expr) => {
            let unescaped = parse_to_string_literal($escaped).unescape().unwrap();
            assert_eq!(unescaped, $expected);
        };
    }

    /// Asserts that unescaping `$escaped` fails with `InvalidStringEscape` and that the error
    /// quotes exactly `$expected_escape_sequence`.
    macro_rules! assert_unescape_errors {
        ($escaped:expr, $expected_escape_sequence:expr) => {
            let error = parse_to_string_literal($escaped).unescape().unwrap_err();
            let TypeQLError::InvalidStringEscape { escape, .. } = &error.errors()[0] else {
                panic!("Wrong error type. Was {error:?}")
            };
            assert_eq!(escape, $expected_escape_sequence);
        };
    }

    #[test]
    fn test_unescape() {
        // Succeeds
        assert_unescapes_to!(r#""a\tb\tc""#, "a\tb\tc"); // works
        assert_unescapes_to!(r#""a\"b\"c""#, r#"a"b"c"#); // works
        assert_unescapes_to!(r#""a\'b\'c""#, r#"a'b'c"#); // works
        assert_unescapes_to!(r#""a\\b\\c""#, r#"a\b\c"#); // works
        // - Unicode
        assert_unescapes_to!(r#""abc \u0ca0\u005f\u0ca0""#, "abc ಠ_ಠ"); // works
        assert_unescapes_to!(r#""abc \u0CA0\u005F\u0CA0""#, "abc ಠ_ಠ"); // caps
        assert_unescapes_to!(r#""abc \u0CA01234""#, "abc ಠ1234"); // consumes only 4
        assert_unescapes_to!(r#""abc \u{0CA0}1234""#, "abc ಠ1234"); // braces with only 4
        assert_unescapes_to!(r#""abc \u{130ED}\u{13153}1234""#, "abc 𓃭𓅓1234"); // braces with 5

        // Errors
        assert_unescape_errors!(r#""ab\c""#, r"\c"); // Invalid escape

        // - Unicode
        assert_unescape_errors!(r#""abc \u""#, r"\u"); // Not enough bytes
        assert_unescape_errors!(r#""abc \u012""#, r"\u012"); // Not enough bytes
        assert_unescape_errors!(r#""abc \uwu/ abc""#, r"\uwu/ "); // Invalid hex
        assert_unescape_errors!(r#""abc \uΣ12Σ abc""#, r"\uΣ12"); // Invalid hex, 3 chars more than 4 bytes
        assert_unescape_errors!(r#""abc \u123Σ abc""#, r"\u123Σ"); // Invalid hex, 4 chars more than 4 bytes
        assert_unescape_errors!(r#""abc \u{""#, r"\u{"); // Not enough bytes
        assert_unescape_errors!(r#""abc \u{123Σ} abc""#, r"\u{123Σ}"); // Invalid hex with braces
        assert_unescape_errors!(r#""abc \u{1234567} abc""#, r"\u{1234567"); // Too many characters, stop at 8
        assert_unescape_errors!(r#""abc \u{213456} abc""#, r"\u{213456}"); // Above valid range

        // Cases that fail at parsing
        {
            let escaped = r#""abc\""#;
            assert!(crate::parse_value(escaped).is_err()); // Parsing fails as incomplete string literal
            let string_literal = StringLiteral { value: escaped.to_owned() };
            let error = string_literal.unescape().unwrap_err();
            let TypeQLError::InvalidStringEscape { escape, .. } = &error.errors()[0] else {
                panic!("Wrong error type. Was {error:?}")
            };
            assert_eq!(escape, r#"\"#);
        }
    }

    // Manual benchmark over printable ASCII (code points 32..=125); run with `--ignored`.
    #[ignore]
    #[test]
    fn time_unescape_ascii() {
        let text = generate_string(TIME_UNESCAPE_TEXT_LEN, |x| 32 + (x % 94));
        time_unescape(text);
    }

    // Manual benchmark over code points below 0x0800 (one- and two-byte UTF-8); run with
    // `--ignored`.
    #[ignore]
    #[test]
    fn time_unescape_unicode() {
        let text = generate_string(TIME_UNESCAPE_TEXT_LEN, move |x| x & 0x07ff);
        time_unescape(text);
    }

    const TIME_UNESCAPE_TEXT_LEN: usize = 100000;

    /// Unescapes `text` repeatedly and prints the total wall-clock time.
    ///
    /// `text` must be a complete quoted literal whose escapes are all valid.
    fn time_unescape(text: String) {
        use std::time::Instant;
        let iters = 10000;

        let string_literal = StringLiteral { value: text };
        let start = Instant::now();
        for _ in 0..iters {
            string_literal.unescape().unwrap();
        }
        let elapsed = start.elapsed();
        println!(
            "{iters} iters on string of length {} in {}s",
            string_literal.value.len(),
            elapsed.as_secs_f64()
        )
    }

    /// Builds a random double-quoted literal slightly longer than `length` bytes.
    ///
    /// `mapper` turns each random `u32` into a code point; values that are not valid `char`s
    /// are skipped, and characters that need escaping are written in their escaped form so the
    /// result always unescapes cleanly.
    fn generate_string(length: usize, mapper: fn(u32) -> u32) -> String {
        use rand::{thread_rng, RngCore};
        let mut rng = thread_rng();
        // Upper bound on the number of characters drawn; the loop stops earlier once the text
        // is long enough.
        let capacity = (1.2 * length as f64).ceil() as usize;
        let mut text = String::with_capacity(capacity);
        text.push('"');

        for _ in 0..capacity {
            if text.len() > length {
                break;
            }
            match char::from_u32(mapper(rng.next_u32())) {
                Some('\\') => text += r"\\",
                Some('\'') => text += r"\'",
                Some('"') => text += r#"\""#,
                Some('\x08') => text += r"\b",
                Some('\x09') => text += r"\t",
                Some('\x0a') => text += r"\n",
                Some('\x0c') => text += r"\f",
                Some('\x0d') => text += r"\r",
                Some(ch) => text.push(ch),
                None => (),
            }
        }
        text.push('"');
        assert!(text.len() > length && text.len() < length + 10);
        text
    }
}
0 commit comments