Skip to content

Commit 41d9d85

Browse files
authored
regex.pcre: add non-greedy quantifiers like *?, +?, ?? (fix #26579) (#26582)
1 parent d002b57 commit 41d9d85

3 files changed

Lines changed: 220 additions & 42 deletions

File tree

‎vlib/regex/pcre/README.md‎

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ and iterative searching.
1616
| **Wildcard** | `.` | Matches any character (excluding `\n` unless `(?s)` flag is used). |
1717
| **Alternation** | `|` | Matches the left OR right expression (e.g., `cat|dog`). |
1818
| **Quantifiers** | `*` | Matches 0 or more times. |
19+
| **Non-greedy quantifiers** | `*?`, `+?`, `??`, `{m,n}?` | Lazy variants: match as few repetitions as possible. |
1920
| | `+` | Matches 1 or more times. |
2021
| | `?` | Matches 0 or 1 time. |
2122
| | `{m}` | Matches exactly `m` times. |
@@ -244,6 +245,26 @@ println(year) // Output: 2025
244245

245246
## Advanced Usage
246247

248+
### Non-greedy Matching
249+
By default, quantifiers like `*` and `+` are **greedy**, meaning they match
250+
as much text as possible. Adding a `?` makes them **non-greedy** (or lazy),
251+
matching the shortest possible string.
252+
253+
**Example:**
254+
```v
255+
import regex.pcre
256+
257+
text := '<div>content</div>'
258+
259+
// Greedy: Matches everything from the first '<' to the last '>'
260+
r_greedy := pcre.compile(r'<.*>')!
261+
println(r_greedy.find(text)?.text) // Output: <div>content</div>
262+
263+
// Non-greedy: Matches only until the first '>'
264+
r_lazy := pcre.compile(r'<.*?>')!
265+
println(r_lazy.find(text)?.text) // Output: <div>
266+
```
267+
247268
### VM Stability (No Stack Overflow)
248269
Because this engine uses a VM with a heap-allocated stack, it can handle patterns that typically
249270
crash recursive engines due to stack overflow.

‎vlib/regex/pcre/regex.v‎

Lines changed: 85 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
regex2 0.9.2 beta (VM Edition)
2+
regex2 0.9.3 beta (VM Edition)
33
44
Copyright (c) 2026 Dario Deledda. All rights reserved.
55
Use of this source code is governed by an MIT license
@@ -12,6 +12,7 @@ Features:
1212
- UTF8 support
1313
- Literal characters, '.', '*', '{m,n}'
1414
- Short quantifiers: '?', '+'
15+
- Non-greedy quantifiers: '*?', '+?', '??'
1516
- Nested groups: '()'
1617
- Named groups: '(?P<name>...)'
1718
- Non-capturing groups: '(?:...)'
@@ -112,31 +113,14 @@ pub:
112113
groups []string
113114
}
114115

115-
// get retrieves the captured text by index.
116-
// Index 0 returns the whole match, 1+ returns capture groups.
117-
pub fn (m Match) get(idx int) ?string {
118-
if idx == 0 {
119-
return m.text
120-
}
121-
if idx > 0 && idx <= m.groups.len {
122-
return m.groups[idx - 1]
123-
}
124-
return none
125-
}
126-
127-
// get_all returns the whole match followed by all capture groups.
128-
pub fn (m Match) get_all() []string {
129-
mut res := [m.text]
130-
res << m.groups
131-
return res
132-
}
133-
134116
// --- AST Nodes ---
135117

136118
// Quantifier represents a repetition range.
137119
struct Quantifier {
138-
min int
139-
max int // -1 for infinity
120+
mut:
121+
min int
122+
max int // -1 for infinity
123+
greedy bool // true = greedy, false = lazy
140124
}
141125

142126
// Flags holds the current state of regex options.
@@ -260,8 +244,9 @@ pub fn compile(pattern string) !Regex {
260244
nodes: nodes
261245
group_capture_index: -1
262246
quant: Quantifier{
263-
min: 1
264-
max: 1
247+
min: 1
248+
max: 1
249+
greedy: true
265250
}
266251
}
267252

@@ -280,13 +265,6 @@ pub fn compile(pattern string) !Regex {
280265
}
281266
}
282267

283-
// new_regex is an alias for compile, for compatibility with older PCRE wrappers.
284-
// Note: The second argument (flags) is currently ignored as flags should be
285-
// embedded in the pattern (e.g., '(?i)pattern').
286-
pub fn new_regex(pattern string, _ int) !Regex {
287-
return compile(pattern)
288-
}
289-
290268
// Compiler manages the state of the bytecode generation process.
291269
struct Compiler {
292270
mut:
@@ -314,23 +292,46 @@ fn (mut c Compiler) emit_node(node Node) {
314292
c.emit_single_node_logic(node)
315293

316294
c.emit(Inst{ typ: .jmp, target_x: split_idx })
317-
c.prog[split_idx].target_x = start_pc
318-
c.prog[split_idx].target_y = c.prog.len
295+
296+
// In this VM, target_x is the first path taken (stack.pop order depends on implementation,
297+
// but standard here is target_x executed immediately, target_y pushed to stack).
298+
// Greedy: Prefer matching loop (start_pc) over exit.
299+
// Lazy: Prefer exit over matching loop.
300+
if node.quant.greedy {
301+
c.prog[split_idx].target_x = start_pc // Loop
302+
c.prog[split_idx].target_y = c.prog.len // Exit
303+
} else {
304+
c.prog[split_idx].target_x = c.prog.len // Exit
305+
c.prog[split_idx].target_y = start_pc // Loop
306+
}
319307
} else if node.quant.max > node.quant.min {
320308
// Finite range
321309
rem := node.quant.max - node.quant.min
322310
mut splits := []int{}
323311

324312
for _ in 0 .. rem {
325313
idx := c.emit(Inst{ typ: .split })
326-
c.prog[idx].target_x = c.prog.len
314+
match_pc := c.prog.len
315+
316+
// If greedy, we prefer to match the node (continue execution)
317+
// If lazy, we prefer to skip the match (jump to end)
318+
if node.quant.greedy {
319+
c.prog[idx].target_x = match_pc // Match node
320+
} else {
321+
c.prog[idx].target_y = match_pc // Match node (fallback)
322+
}
323+
327324
c.emit_single_node_logic(node)
328325
splits << idx
329326
}
330327

331328
end_pc := c.prog.len
332329
for idx in splits {
333-
c.prog[idx].target_y = end_pc
330+
if node.quant.greedy {
331+
c.prog[idx].target_y = end_pc // Skip match (fallback)
332+
} else {
333+
c.prog[idx].target_x = end_pc // Skip match (primary)
334+
}
334335
}
335336
}
336337
}
@@ -467,7 +468,7 @@ fn parse_nodes(pattern string, pos_start int, terminator rune, group_counter_sta
467468
Node{
468469
typ: .alternation
469470
alternatives: alternatives
470-
quant: Quantifier{1, 1}
471+
quant: Quantifier{1, 1, true}
471472
},
472473
], pos, group_counter
473474
}
@@ -714,20 +715,20 @@ fn parse_nodes(pattern string, pos_start int, terminator rune, group_counter_sta
714715
}
715716

716717
if parsed_nodes.len > 0 {
717-
mut q := Quantifier{1, 1}
718+
mut q := Quantifier{1, 1, true}
718719
if pos < pattern.len {
719720
peek, pl := read_rune(pattern, pos)
720721
match peek {
721722
`*` {
722-
q = Quantifier{0, -1}
723+
q = Quantifier{0, -1, true}
723724
pos += pl
724725
}
725726
`+` {
726-
q = Quantifier{1, -1}
727+
q = Quantifier{1, -1, true}
727728
pos += pl
728729
}
729730
`?` {
730-
q = Quantifier{0, 1}
731+
q = Quantifier{0, 1, true}
731732
pos += pl
732733
}
733734
`{` {
@@ -751,11 +752,21 @@ fn parse_nodes(pattern string, pos_start int, terminator rune, group_counter_sta
751752
-1
752753
}
753754
}
754-
q = Quantifier{min, max}
755+
q = Quantifier{min, max, true}
755756
pos = end_q + 1
756757
}
757758
else {}
758759
}
760+
761+
// Check for Non-Greedy modifier '?'
762+
// e.g. *?, +?, ??, {m,n}?
763+
if pos < pattern.len {
764+
peek_lazy, pl_lazy := read_rune(pattern, pos)
765+
if peek_lazy == `?` {
766+
q.greedy = false
767+
pos += pl_lazy
768+
}
769+
}
759770
}
760771
parsed_nodes[parsed_nodes.len - 1].quant = q
761772
}
@@ -772,7 +783,7 @@ fn parse_nodes(pattern string, pos_start int, terminator rune, group_counter_sta
772783
Node{
773784
typ: .alternation
774785
alternatives: alternatives
775-
quant: Quantifier{1, 1}
786+
quant: Quantifier{1, 1, true}
776787
},
777788
], pos, group_counter
778789
}
@@ -1185,7 +1196,39 @@ pub fn (r Regex) find_from(text string, start_index int) ?Match {
11851196
return none
11861197
}
11871198

1199+
/******************************************************************************
1200+
*
1201+
* C PCRE compatibility layer
1202+
*
1203+
******************************************************************************/
1204+
1205+
// new_regex is an alias for compile, for compatibility with older PCRE wrappers.
1206+
// Note: The second argument (flags) is currently ignored as flags should be
1207+
// embedded in the pattern (e.g., '(?i)pattern').
1208+
pub fn new_regex(pattern string, _ int) !Regex {
1209+
return compile(pattern)
1210+
}
1211+
11881212
// match_str is an alias for find_from, for compatibility with older PCRE wrappers.
11891213
pub fn (r Regex) match_str(text string, start_index int, _ int) ?Match {
11901214
return r.find_from(text, start_index)
11911215
}
1216+
1217+
// get retrieves the captured text by index.
1218+
// Index 0 returns the whole match, 1+ returns capture groups.
1219+
pub fn (m Match) get(idx int) ?string {
1220+
if idx == 0 {
1221+
return m.text
1222+
}
1223+
if idx > 0 && idx <= m.groups.len {
1224+
return m.groups[idx - 1]
1225+
}
1226+
return none
1227+
}
1228+
1229+
// get_all returns the whole match followed by all capture groups.
1230+
pub fn (m Match) get_all() []string {
1231+
mut res := [m.text]
1232+
res << m.groups
1233+
return res
1234+
}

‎vlib/regex/pcre/regex_test.v‎

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -520,3 +520,117 @@ fn tst_compile_error(pattern string) {
520520
println('Error: Did not get a compilation error!')
521521
assert false
522522
}
523+
524+
fn test_non_greedy_quantifiers() {
525+
println('\n--- Testing Non-Greedy Quantifiers (*?, +?, ??, {m,n}?) ---')
526+
527+
// 1. Lazy Star (*?)
528+
// Should stop at the first closing '>' (minimal match)
529+
tst_find(r'<.*?>', '<div>content</div>', '<div>')
530+
// Contrast with greedy (default) which consumes until the last '>'
531+
tst_find(r'<.*>', '<div>content</div>', '<div>content</div>')
532+
533+
// 2. Lazy Plus (+?)
534+
// Should match minimal characters (1 'a') to satisfy the constraint
535+
tst_find(r'a+?', 'aaaaa', 'a')
536+
// Forced expansion: Must match all 'a's to finally match 'b' (backtracking test)
537+
tst_find(r'a+?b', 'aaab', 'aaab')
538+
539+
// 3. Lazy Question Mark (??)
540+
// Should match empty string (prefers 0 occurrences over 1)
541+
tst_find(r'a??', 'a', '')
542+
// Contextual: 'u' is lazy (prefers skip), matches 'color' immediately
543+
tst_find(r'colou??r', 'color', 'color')
544+
// Contextual: 'u' is lazy, tries skip, fails to match 'r', backtracks to match 'u'
545+
tst_find(r'colou??r', 'colour', 'colour')
546+
547+
// 4. Lazy Range ({m,n}?)
548+
// Should match minimum required (2 digits)
549+
tst_find(r'\d{2,5}?', '123456789', '12')
550+
// Contrast with greedy which matches maximum (5 digits)
551+
tst_find(r'\d{2,5}', '123456789', '12345')
552+
553+
// 5. Complex/Real-world Case (User report)
554+
// Escaped characters + lazy capture group
555+
// Should match only '$t(common.hello)', not the span to the second ')'
556+
tst_find(r'\$t\((.*?)\)', r'$t(common.hello) dear $t(common.name)', r'$t(common.hello)')
557+
558+
// --- Negative / Edge Cases ---
559+
560+
// When the text contains several possible terminators, a lazy quantifier must stop
561+
// at the first one: find returns the shortest match from the leftmost start position.
562+
tst_find(r'x.*?y', 'x123y456y', 'x123y') // Stops at first y
563+
564+
// Anchor interaction: ^.*?b
565+
// Matches from start, .*? expands lazily until it hits 'b'
566+
tst_find(r'^.*?b', '123b', '123b')
567+
568+
// Ensure lazy doesn't cause failure when a greedy match would succeed (correct backtracking)
569+
// Pattern wants to match "a" lazily, but must consume "a" to satisfy the final "a"
570+
tst_find(r'a?a', 'a', 'a')
571+
tst_find(r'a??a', 'a', 'a')
572+
}
573+
574+
fn test_compatibility_layer() {
575+
// Test new_regex (alias for compile)
576+
// Passing '0' as the second argument to simulate the ignored C-flag argument
577+
pattern := r'(\w+)\s+(\d+)'
578+
re := pcre.new_regex(pattern, 0) or {
579+
assert false, 'new_regex failed to compile: ${err}'
580+
return
581+
}
582+
583+
text := 'item 42 ignored item 99'
584+
585+
// Test match_str (alias for find_from)
586+
// We start searching from index 0. The third argument '0' is the ignored option flag.
587+
// This should match "item 42"
588+
m1 := re.match_str(text, 0, 0) or {
589+
assert false, 'match_str failed to find match'
590+
return
591+
}
592+
593+
// Test get()
594+
// Index 0 should be the full text of the match
595+
full_match := m1.get(0) or { '' }
596+
assert full_match == 'item 42'
597+
598+
// Index 1 should be the first capture group (\w+)
599+
group_1 := m1.get(1) or { '' }
600+
assert group_1 == 'item'
601+
602+
// Index 2 should be the second capture group (\d+)
603+
group_2 := m1.get(2) or { '' }
604+
assert group_2 == '42'
605+
606+
// Index 3 should be none (out of bounds)
607+
if _ := m1.get(3) {
608+
assert false, 'get(3) should return none for 2 groups'
609+
}
610+
611+
// Test get_all()
612+
// Should return ['item 42', 'item', '42']
613+
all_captures := m1.get_all()
614+
assert all_captures.len == 3
615+
assert all_captures[0] == 'item 42'
616+
assert all_captures[1] == 'item'
617+
assert all_captures[2] == '42'
618+
619+
// Test match_str with a specific start index
620+
// Start searching after "item 42" (length is 7)
621+
// This should match "item 99"
622+
m2 := re.match_str(text, 7, 0) or {
623+
assert false, 'match_str failed to find second match from offset'
624+
return
625+
}
626+
627+
assert m2.get(0) or { '' } == 'item 99'
628+
assert m2.get(2) or { '' } == '99'
629+
630+
// Test match_str failure case
631+
// Start searching at the very end of string
632+
no_match := re.match_str(text, text.len, 0)
633+
if _ := no_match {
634+
assert false, 'match_str should return none when no match is found'
635+
}
636+
}

0 commit comments

Comments
 (0)