11/*
2- regex2 0.9.2 beta (VM Edition)
2+ regex2 0.9.3 beta (VM Edition)
33
44Copyright (c) 2026 Dario Deledda. All rights reserved.
55Use of this source code is governed by an MIT license
@@ -12,6 +12,7 @@ Features:
1212 - UTF8 support
1313 - Literal characters, '.', '*', '{m,n}'
1414 - Short quantifiers: '?', '+'
15+ - Non-greedy quantifiers: '*?', '+?', '??'
1516 - Nested groups: '()'
1617 - Named groups: '(?P<name>...)'
1718 - Non-capturing groups: '(?:...)'
@@ -112,31 +113,14 @@ pub:
112113 groups []string
113114}
114115
115- // get retrieves the captured text by index.
116- // Index 0 returns the whole match, 1+ returns capture groups.
117- pub fn (m Match) get (idx int ) ? string {
118- if idx == 0 {
119- return m.text
120- }
121- if idx > 0 && idx < = m.groups.len {
122- return m.groups[idx - 1 ]
123- }
124- return none
125- }
126-
127- // get_all returns the whole match followed by all capture groups.
128- pub fn (m Match) get_all () []string {
129- mut res := [m.text]
130- res << m.groups
131- return res
132- }
133-
134116// --- AST Nodes ---
135117
136118// Quantifier represents a repetition range applied to a node: the node is
// repeated between `min` and `max` times, and `greedy` selects which branch
// the compiled split instruction tries first.
137119struct Quantifier {
138- min int
139- max int // -1 for infinity
120+ mut :
121+ min int // lower bound of the repetition range
122+ max int // upper bound of the repetition range; -1 means no upper bound (infinity)
123+ greedy bool // true = greedy (try to repeat first), false = lazy (try to exit first)
140124}
141125
142126// Flags holds the current state of regex options.
@@ -260,8 +244,9 @@ pub fn compile(pattern string) !Regex {
260244 nodes: nodes
261245 group_capture_index: - 1
262246 quant: Quantifier{
263- min: 1
264- max: 1
247+ min: 1
248+ max: 1
249+ greedy: true
265250 }
266251 }
267252
@@ -280,13 +265,6 @@ pub fn compile(pattern string) !Regex {
280265 }
281266}
282267
283- // new_regex is an alias for compile, for compatibility with older PCRE wrappers.
284- // Note: The second argument (flags) is currently ignored as flags should be
285- // embedded in the pattern (e.g., '(?i)pattern').
286- pub fn new_regex (pattern string , _ int ) ! Regex {
287- return compile (pattern)
288- }
289-
290268// Compiler manages the state of the bytecode generation process.
291269struct Compiler {
292270mut :
@@ -314,23 +292,46 @@ fn (mut c Compiler) emit_node(node Node) {
314292 c.emit_single_node_logic (node)
315293
316294 c.emit (Inst{ typ: .jmp, target_x: split_idx })
317- c.prog[split_idx].target_x = start_pc
318- c.prog[split_idx].target_y = c.prog.len
295+
296+ // In this VM, target_x is the first path taken (stack.pop order depends on implementation,
297+ // but standard here is target_x executed immediately, target_y pushed to stack).
298+ // Greedy: Prefer matching loop (start_pc) over exit.
299+ // Lazy: Prefer exit over matching loop.
300+ if node.quant.greedy {
301+ c.prog[split_idx].target_x = start_pc // Loop
302+ c.prog[split_idx].target_y = c.prog.len // Exit
303+ } else {
304+ c.prog[split_idx].target_x = c.prog.len // Exit
305+ c.prog[split_idx].target_y = start_pc // Loop
306+ }
319307 } else if node.quant.max > node.quant.min {
320308 // Finite range
321309 rem := node.quant.max - node.quant.min
322310 mut splits := []int {}
323311
324312 for _ in 0 .. rem {
325313 idx := c.emit (Inst{ typ: .split })
326- c.prog[idx].target_x = c.prog.len
314+ match_pc := c.prog.len
315+
316+ // If greedy, we prefer to match the node (continue execution)
317+ // If lazy, we prefer to skip the match (jump to end)
318+ if node.quant.greedy {
319+ c.prog[idx].target_x = match_pc // Match node
320+ } else {
321+ c.prog[idx].target_y = match_pc // Match node (fallback)
322+ }
323+
327324 c.emit_single_node_logic (node)
328325 splits << idx
329326 }
330327
331328 end_pc := c.prog.len
332329 for idx in splits {
333- c.prog[idx].target_y = end_pc
330+ if node.quant.greedy {
331+ c.prog[idx].target_y = end_pc // Skip match (fallback)
332+ } else {
333+ c.prog[idx].target_x = end_pc // Skip match (primary)
334+ }
334335 }
335336 }
336337}
@@ -467,7 +468,7 @@ fn parse_nodes(pattern string, pos_start int, terminator rune, group_counter_sta
467468 Node{
468469 typ: .alternation
469470 alternatives: alternatives
470- quant: Quantifier{1 , 1 }
471+ quant: Quantifier{1 , 1 , true }
471472 },
472473 ], pos, group_counter
473474 }
@@ -714,20 +715,20 @@ fn parse_nodes(pattern string, pos_start int, terminator rune, group_counter_sta
714715 }
715716
716717 if parsed_nodes.len > 0 {
717- mut q := Quantifier{1 , 1 }
718+ mut q := Quantifier{1 , 1 , true }
718719 if pos < pattern.len {
719720 peek , pl := read_rune (pattern, pos)
720721 match peek {
721722 `*` {
722- q = Quantifier{0 , - 1 }
723+ q = Quantifier{0 , - 1 , true }
723724 pos + = pl
724725 }
725726 `+` {
726- q = Quantifier{1 , - 1 }
727+ q = Quantifier{1 , - 1 , true }
727728 pos + = pl
728729 }
729730 `?` {
730- q = Quantifier{0 , 1 }
731+ q = Quantifier{0 , 1 , true }
731732 pos + = pl
732733 }
733734 `{` {
@@ -751,11 +752,21 @@ fn parse_nodes(pattern string, pos_start int, terminator rune, group_counter_sta
751752 - 1
752753 }
753754 }
754- q = Quantifier{min, max}
755+ q = Quantifier{min, max, true }
755756 pos = end_q + 1
756757 }
757758 else {}
758759 }
760+
761+ // Check for Non-Greedy modifier '?'
762+ // e.g. *?, +?, ??, {m,n}?
763+ if pos < pattern.len {
764+ peek_lazy , pl_lazy := read_rune (pattern, pos)
765+ if peek_lazy == `?` {
766+ q.greedy = false
767+ pos + = pl_lazy
768+ }
769+ }
759770 }
760771 parsed_nodes[parsed_nodes.len - 1 ].quant = q
761772 }
@@ -772,7 +783,7 @@ fn parse_nodes(pattern string, pos_start int, terminator rune, group_counter_sta
772783 Node{
773784 typ: .alternation
774785 alternatives: alternatives
775- quant: Quantifier{1 , 1 }
786+ quant: Quantifier{1 , 1 , true }
776787 },
777788 ], pos, group_counter
778789 }
@@ -1185,7 +1196,39 @@ pub fn (r Regex) find_from(text string, start_index int) ?Match {
11851196 return none
11861197}
11871198
1199+ /* *****************************************************************************
1200+ *
1201+ * C PCRE compatibility layer
1202+ *
1203+ ******************************************************************************/
1204+
// new_regex compiles `pattern` by delegating to `compile`. It exists for
// compatibility with older PCRE wrappers.
// Note: the second argument (flags) is accepted but currently ignored, because
// flags should be embedded in the pattern itself (e.g., '(?i)pattern').
pub fn new_regex(pattern string, _ int) !Regex {
	// Propagate any compile error unchanged to the caller.
	compiled := compile(pattern)!
	return compiled
}
1211+
// match_str searches `text` starting at `start_index` by delegating to
// `find_from`. It exists for compatibility with older PCRE wrappers; the
// third argument is accepted but ignored.
pub fn (r Regex) match_str(text string, start_index int, _ int) ?Match {
	// Propagate `none` unchanged when find_from finds nothing.
	found := r.find_from(text, start_index)?
	return found
}
1216+
// get retrieves the captured text by index.
// Index 0 yields the whole match, indices 1..groups.len yield the matching
// capture group, and any other index yields `none`.
pub fn (m Match) get(idx int) ?string {
	// Reject everything outside 0..=groups.len up front.
	if idx < 0 || idx > m.groups.len {
		return none
	}
	if idx == 0 {
		return m.text
	}
	// Capture groups are stored zero-based, while callers count from 1.
	return m.groups[idx - 1]
}
1228+
// get_all returns a new array holding the whole match at position 0,
// followed by every capture group in order.
pub fn (m Match) get_all() []string {
	mut all := []string{cap: m.groups.len + 1}
	all << m.text
	all << m.groups
	return all
}
0 commit comments