Skip to content

Commit 79a7446

Browse files
committed
Use exact submatch sizing in regex replace_all
1 parent 76d8732 commit 79a7446

2 files changed

Lines changed: 67 additions & 67 deletions

File tree

vibes/builtins.go

Lines changed: 53 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"strconv"
1212
"strings"
1313
"time"
14+
"unicode/utf8"
1415
)
1516

1617
const (
@@ -495,83 +496,68 @@ func builtinRegexReplaceInternal(args []Value, kwargs map[string]Value, block Va
495496
}
496497

497498
func regexReplaceAllWithLimit(re *regexp.Regexp, text string, replacement string, method string) (string, error) {
498-
literalBytesPerMatch, refCountPerMatch := regexReplacementTemplateStats(replacement)
499-
matchCount := int64(0)
500-
totalMatchBytes := int64(0)
501-
referenceMatchBytes := int64(0)
502-
503-
// Use regexp's native replacement scanner so anchoring/boundary semantics
504-
// match the final replace_all behavior while estimating worst-case output.
505-
re.ReplaceAllStringFunc(text, func(match string) string {
506-
matchCount++
507-
matchBytes := int64(len(match))
508-
totalMatchBytes += matchBytes
509-
referenceMatchBytes += matchBytes * int64(refCountPerMatch)
510-
return match
511-
})
512-
513-
upperBound := int64(len(text)) - totalMatchBytes + matchCount*int64(literalBytesPerMatch) + referenceMatchBytes
514-
if upperBound > int64(maxRegexInputBytes) {
515-
return "", fmt.Errorf("%s output exceeds limit %d bytes", method, maxRegexInputBytes)
499+
out := make([]byte, 0, len(text))
500+
lastAppended := 0
501+
searchStart := 0
502+
for searchStart <= len(text) {
503+
loc, found := nextRegexReplaceAllSubmatchIndex(re, text, searchStart)
504+
if !found {
505+
break
506+
}
507+
508+
segmentLen := loc[0] - lastAppended
509+
if len(out) > maxRegexInputBytes-segmentLen {
510+
return "", fmt.Errorf("%s output exceeds limit %d bytes", method, maxRegexInputBytes)
511+
}
512+
out = append(out, text[lastAppended:loc[0]]...)
513+
out = re.ExpandString(out, replacement, text, loc)
514+
if len(out) > maxRegexInputBytes {
515+
return "", fmt.Errorf("%s output exceeds limit %d bytes", method, maxRegexInputBytes)
516+
}
517+
lastAppended = loc[1]
518+
519+
if loc[1] > searchStart {
520+
searchStart = loc[1]
521+
continue
522+
}
523+
if searchStart >= len(text) {
524+
break
525+
}
526+
_, size := utf8.DecodeRuneInString(text[searchStart:])
527+
if size == 0 {
528+
size = 1
529+
}
530+
searchStart += size
516531
}
517532

518-
out := re.ReplaceAllString(text, replacement)
519-
if len(out) > maxRegexInputBytes {
533+
tailLen := len(text) - lastAppended
534+
if len(out) > maxRegexInputBytes-tailLen {
520535
return "", fmt.Errorf("%s output exceeds limit %d bytes", method, maxRegexInputBytes)
521536
}
522-
return out, nil
537+
out = append(out, text[lastAppended:]...)
538+
return string(out), nil
523539
}
524540

525-
func regexReplacementTemplateStats(replacement string) (literalBytes int, referenceCount int) {
526-
for i := 0; i < len(replacement); {
527-
if replacement[i] != '$' {
528-
literalBytes++
529-
i++
530-
continue
531-
}
532-
if i+1 >= len(replacement) {
533-
literalBytes++
534-
i++
535-
continue
536-
}
537-
if replacement[i+1] == '$' {
538-
literalBytes++
539-
i += 2
541+
// nextRegexReplaceAllSubmatchIndex returns the submatch index slice, expressed
// as offsets into text, of the first match that begins at or after start. The
// boolean result is false when no such match is found.
//
// The search runs on text[start-1:] rather than text[start:] (when start > 0)
// so the pattern still sees the byte immediately before start; constructs that
// look at the preceding character, such as \b, therefore behave as they would
// on the full text. At most two matches are requested from that window: one
// that may begin on the extra leading byte and is discarded, and the candidate
// that follows it.
//
// NOTE(review): a discarded window match that begins before start but extends
// past it consumes those bytes, so a match that begins exactly at start can be
// missed (e.g. `xa|ab|b` on "xab" with start == 2). Confirm whether callers
// can hit this.
func nextRegexReplaceAllSubmatchIndex(re *regexp.Regexp, text string, start int) ([]int, bool) {
	offset := start
	if offset > 0 {
		offset--
	}

	for _, rel := range re.FindAllStringSubmatchIndex(text[offset:], 2) {
		if rel[0]+offset < start {
			// Match begins on the context byte before start; not a candidate.
			continue
		}

		// Translate window-relative offsets to absolute ones, keeping -1 for
		// groups that did not participate in the match.
		shifted := make([]int, len(rel))
		for i := range rel {
			if rel[i] < 0 {
				shifted[i] = -1
			} else {
				shifted[i] = rel[i] + offset
			}
		}
		return shifted, true
	}
	return nil, false
}

vibes/runtime_test.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2097,6 +2097,20 @@ func TestJSONAndRegexSizeGuards(t *testing.T) {
20972097
if err == nil || !strings.Contains(err.Error(), "Regex.replace_all output exceeds limit") {
20982098
t.Fatalf("expected Regex.replace_all output guard error, got %v", err)
20992099
}
2100+
2101+
largeRun := strings.Repeat("a", maxRegexInputBytes-1024)
2102+
replaced, err := script.Call(
2103+
context.Background(),
2104+
"regex_replace_all_guard",
2105+
[]Value{NewString(largeRun), NewString("(a)[a]*"), NewString("$1$1")},
2106+
CallOptions{},
2107+
)
2108+
if err != nil {
2109+
t.Fatalf("expected large capture replacement to succeed, got %v", err)
2110+
}
2111+
if replaced.Kind() != KindString || replaced.String() != "aa" {
2112+
t.Fatalf("expected capture replacement to produce \"aa\", got %v", replaced)
2113+
}
21002114
}
21012115

21022116
func TestLocaleSensitiveOperationsDeterministic(t *testing.T) {

0 commit comments

Comments
 (0)