|
| 1 | +# regex.pcre Module Documentation |
| 2 | + |
| 3 | +The `regex.pcre` module provides a **Virtual Machine (VM)**-based regular expression engine with |
| 4 | +UTF-8 support. |
| 5 | +Unlike recursive engines, this implementation uses an explicit heap stack, |
| 6 | +making it safe for complex patterns and long strings without risking stack overflows. |
| 7 | + |
| 8 | +It supports compilation of patterns, searching, full matching, replacement with backreferences, |
| 9 | +named groups, and iterative searching. |
| 10 | + |
| 11 | +## Supported Syntax |
| 12 | + |
| 13 | +| Feature | Syntax | Description | |
| 14 | +| :--- | :--- | :--- | |
| 15 | +| **Literals** | `abc` | Matches exact characters. | |
| 16 | +| **Wildcard** | `.` | Matches any character (excluding `\n` unless the `(?s)` flag is used). |
| 17 | +| **Alternation** | `\|` | Matches the left OR right expression (e.g., `cat\|dog`). |
| 18 | +| **Quantifiers** | `*` | Matches 0 or more times. | |
| 19 | +| | `+` | Matches 1 or more times. | |
| 20 | +| | `?` | Matches 0 or 1 time. | |
| 21 | +| | `{m}` | Matches exactly `m` times. | |
| 22 | +| | `{m,n}` | Matches between `m` and `n` times. | |
| 23 | +| **Groups** | `(...)` | Capturing group. | |
| 24 | +| | `(?:...)` | Non-capturing group. | |
| 25 | +| | `(?P<name>...)` | Named capturing group. | |
| 26 | +| **Anchors** | `^` | Matches start of string (or line start with `(?m)`). | |
| 27 | +| | `$` | Matches end of string (or line end with `(?m)`). | |
| 28 | +| | `\b` | Matches a word boundary (start/end of word). | |
| 29 | +| | `\B` | Matches a non-word boundary. | |
| 30 | +| **Classes** | `[abc]` | Matches any character in the set. | |
| 31 | +| | `[^abc]` | Matches any character NOT in the set. | |
| 32 | +| | `[a-z]` | Matches a range of characters. | |
| 33 | +| | `\w`, `\W` | Word / Non-word character (`[a-zA-Z0-9_]`). | |
| 34 | +| | `\d`, `\D` | Digit / Non-digit. | |
| 35 | +| | `\s`, `\S` | Whitespace / Non-whitespace. | |
| 36 | +| | `\a` | Lowercase character (`[a-z]`). | |
| 37 | +| | `\A` | Uppercase character (`[A-Z]`). | |
| 38 | +| **Escapes** | `\xHH` | Matches 1-byte hex value. | |
| 39 | +| | `\XHHHH` | Matches 2-byte hex value. | |
| 40 | +| **Flags** | `(?i)` | Case-insensitive matching. | |
| 41 | +| | `(?m)` | Multiline mode (`^` and `$` match start/end of lines). | |
| 42 | +| | `(?s)` | Dot-all mode (`.` matches `\n`). | |
| 43 | + |
| 44 | +## Structs |
| 45 | + |
| 46 | +### Regex |
| 47 | +The compiled regular expression object containing the VM bytecode. |
| 48 | +```v ignore |
| 49 | +pub struct Regex { |
| 50 | +pub: |
| 51 | + pattern string |
| 52 | + total_groups int |
| 53 | + // Internal VM bytecode... |
| 54 | +} |
| 55 | +``` |
| 56 | + |
| 57 | +### Match |
| 58 | +Represents the result of a successful search. |
| 59 | +```v ignore |
| 60 | +pub struct Match { |
| 61 | +pub: |
| 62 | + text string // The full substring that matched |
| 63 | + start int // Start index in the source text |
| 64 | + end int // End index in the source text |
| 65 | + groups []string // List of captured groups |
| 66 | +} |
| 67 | +``` |
| 68 | + |
| 69 | +--- |
| 70 | + |
| 71 | +## Core Functions |
| 72 | + |
| 73 | +### `compile` |
| 74 | + |
| 75 | +Compiles a regular expression pattern string into a `Regex` object. Returns an error if the syntax |
| 76 | +is invalid (e.g., unclosed groups). |
| 77 | + |
| 78 | +```v ignore |
| 79 | +fn compile(pattern string) !Regex |
| 80 | +``` |
| 81 | + |
| 82 | +**Example:** |
| 83 | +```v ignore |
| 84 | +import regex.pcre |
| 85 | +
|
| 86 | +fn main() { |
| 87 | + // Compile a pattern to match a word followed by digits |
| 88 | + // The 'or { ... }' block after pcre.compile handles a compilation error |
| 89 | + r := pcre.compile(r'\w+\d+') or { panic(err) } |
| 90 | +} |
| 91 | +``` |
| 92 | + |
| 93 | +--- |
| 94 | + |
| 95 | +### `find` |
| 96 | + |
| 97 | +Scans the text for the **first** occurrence of the pattern. Returns a `Match` object if found, |
| 98 | +or `none` if not. |
| 99 | + |
| 100 | +```v ignore |
| 101 | +fn (r Regex) find(text string) ?Match |
| 102 | +``` |
| 103 | + |
| 104 | +**Example:** |
| 105 | +```v ignore |
| 106 | +r := pcre.compile(r'(\d+)')! |
| 107 | +text := "item 123, item 456" |
| 108 | +
|
| 109 | +if m := r.find(text) { |
| 110 | + println('Found: ${m.text}') // Output: Found: 123 |
| 111 | + println('Index: ${m.start}') // Output: Index: 5 |
| 112 | + println('Group 1: ${m.groups[0]}') // Output: Group 1: 123 |
| 113 | +} |
| 114 | +``` |
| 115 | + |
| 116 | +> **Note:** This function stops immediately after finding the leftmost match. |
| 117 | +
|
| 118 | +--- |
| 119 | + |
| 120 | +### `find_all` |
| 121 | + |
| 122 | +Returns a list of **all non-overlapping** matches in the string. This is useful for extracting |
| 123 | +multiple tokens. |
| 124 | + |
| 125 | +```v ignore |
| 126 | +fn (r Regex) find_all(text string) []Match |
| 127 | +``` |
| 128 | + |
| 129 | +**Example:** |
| 130 | +```v ignore |
| 131 | +r := pcre.compile(r'\d+')! |
| 132 | +text := "10, 20, 30" |
| 133 | +
|
| 134 | +matches := r.find_all(text) |
| 135 | +for m in matches { |
| 136 | + println(m.text) |
| 137 | +} |
| 138 | +// Output: |
| 139 | +// 10 |
| 140 | +// 20 |
| 141 | +// 30 |
| 142 | +``` |
| 143 | + |
| 144 | +> **Note:** If a pattern matches an empty string (e.g., `a*` on `"b"`), the engine automatically |
| 145 | +advances the cursor by 1 to prevent infinite loops. |
| 146 | + |
| 147 | +--- |
| 148 | + |
| 149 | +### `find_from` |
| 150 | + |
| 151 | +Behaves like `find`, but starts scanning from a specific byte index. Useful for building lexers or |
| 152 | +parsing text iteratively. |
| 153 | + |
| 154 | +```v ignore |
| 155 | +fn (r Regex) find_from(text string, start_index int) ?Match |
| 156 | +``` |
| 157 | + |
| 158 | +**Example:** |
| 159 | +```v |
| 160 | +import regex.pcre |
| 161 | +
|
| 162 | +r := pcre.compile(r'test')! |
| 163 | +text := 'test test test' |
| 164 | +
|
| 165 | +// Skip the first 5 characters |
| 166 | +if m := r.find_from(text, 5) { |
| 167 | + println('Found at: ${m.start}') // Output: Found at: 5 |
| 168 | +} |
| 169 | +``` |
| 170 | + |
| 171 | +> **Note:** If `start_index` is out of bounds (< 0 or > len), it returns `none`. |
| 172 | +
|
| 173 | +--- |
| 174 | + |
| 175 | +### `fullmatch` |
| 176 | + |
| 177 | +Checks if the **entire** string matches the pattern from start to end. |
| 178 | + |
| 179 | +```v ignore |
| 180 | +fn (r Regex) fullmatch(text string) ?Match |
| 181 | +``` |
| 182 | + |
| 183 | +**Example:** |
| 184 | +```v ignore |
| 185 | +r := pcre.compile(r'\d{3}')! |
| 186 | +
|
| 187 | +println(r.fullmatch('123')) // Match |
| 188 | +println(r.fullmatch('1234')) // none (too long) |
| 189 | +println(r.fullmatch('a123')) // none (starts with char) |
| 190 | +``` |
| 191 | + |
| 192 | +--- |
| 193 | + |
| 194 | +### `replace` |
| 195 | + |
| 196 | +Finds the **first** occurrence of the pattern and replaces it with the replacement string. |
| 197 | + |
| 198 | +Supported backreferences: |
| 199 | +* `$1`, `$2`, etc. refer to captured groups. |
| 200 | +* `$0` is currently not supported. |
| 201 | + |
| 202 | +```v ignore |
| 203 | +fn (r Regex) replace(text string, repl string) string |
| 204 | +``` |
| 205 | + |
| 206 | +**Example:** |
| 207 | +```v |
| 208 | +import regex.pcre |
| 209 | +
|
| 210 | +r := pcre.compile(r'(\w+), (\w+)')! |
| 211 | +text := 'Doe, John' |
| 212 | +
|
| 213 | +// Swap groups |
| 214 | +result := r.replace(text, '$2 $1') |
| 215 | +println(result) // Output: "John Doe" |
| 216 | +``` |
| 217 | + |
| 218 | +> **Note:** This function currently replaces only the *first* match found. |
| 219 | +> To replace all occurrences, call `replace` repeatedly or rebuild the string |
| 220 | +> from the match ranges returned by `find_all`. |
| 221 | + |
| 222 | +--- |
| 223 | + |
| 224 | +### `group_by_name` |
| 225 | + |
| 226 | +Retrieves the captured text for a specific named group defined with `(?P<name>...)`. |
| 227 | + |
| 228 | +```v ignore |
| 229 | +fn (r Regex) group_by_name(m Match, name string) string |
| 230 | +``` |
| 231 | + |
| 232 | +**Example:** |
| 233 | +```v ignore |
| 234 | +import regex.pcre |
| 235 | +
|
| 236 | +r := pcre.compile(r'(?P<year>\d{4})-(?P<month>\d{2})')! |
| 237 | +m := r.find('Date: 2025-01') or {pcre.Match{}} |
| 238 | +
|
| 239 | +year := r.group_by_name(m, 'year') |
| 240 | +println(year) // Output: 2025 |
| 241 | +``` |
| 242 | + |
| 243 | +--- |
| 244 | + |
| 245 | +## Advanced Usage |
| 246 | + |
| 247 | +### VM Stability (No Stack Overflow) |
| 248 | +Because this engine uses a VM with a heap-allocated stack, it can handle patterns that typically |
| 249 | +crash recursive engines due to stack overflow. |
| 250 | + |
| 251 | +**Example:** |
| 252 | +```v |
| 253 | +import regex.pcre |
| 254 | +// A pattern that can trigger catastrophic backtracking or very deep |
| 255 | +// recursion in some recursive engines. |
| 256 | +
|
| 257 | +r := pcre.compile(r'(a+)+b')! |
| 258 | +text := 'a'.repeat(5000) // Very long string of 'a's |
| 259 | +
|
| 260 | +// This will safely return 'none' without crashing the program |
| 261 | +r.find(text) |
| 262 | +``` |
| 263 | + |
| 264 | +### Using Flags |
| 265 | +Flags can be embedded to change matching behavior locally. |
| 266 | + |
| 267 | +**Example:** |
| 268 | +```v |
| 269 | +import regex.pcre |
| 270 | +// (?i) Case insensitive |
| 271 | +
|
| 272 | +r := pcre.compile(r'(?i)apple')! |
| 273 | +println(r.find('APPLE')) // Matches |
| 274 | +
|
| 275 | +// (?m) Multiline: ^ matches start of line, $ matches end of line |
| 276 | +r_multi := pcre.compile(r'(?m)^Log:')! |
| 277 | +text := 'Error: 1\nLog: Something happened' |
| 278 | +println(r_multi.find(text)) // Matches 'Log:' on the second line |
| 279 | +``` |
0 commit comments