Class: RE2::Scanner
- Includes:
- Enumerable
- Defined in:
- ext/re2/re2.cc,
lib/re2/scanner.rb
Instance Method Summary
- #each ⇒ Object
-
#eof? ⇒ Boolean
Returns whether the Scanner has consumed all input or not.
- #initialize_copy(other) ⇒ Object
- #regexp ⇒ RE2::Regexp
-
#rewind ⇒ Object
Rewind the Scanner to the start of the string.
-
#scan ⇒ Array<String>, ...
Scan the given text incrementally for matches using `FindAndConsume`, returning an array of submatches on each subsequent call.
-
#string ⇒ String
Returns a frozen copy of the text supplied when incrementally matching with Regexp#scan.
Instance Method Details
#each ⇒ Object
16 17 18 19 20 21 22 23 24 |
# File 'lib/re2/scanner.rb', line 16
# Yields the result of each successive #scan call until #scan returns a
# falsy value. Without a block, returns an Enumerator over the same results.
def each
  return to_enum(:each) unless block_given?

  while (matches = scan)
    yield matches
  end
end
#eof? ⇒ Boolean
Returns whether the RE2::Scanner has consumed all input or not.
492 493 494 495 496 |
# File 'ext/re2/re2.cc', line 492
/*
 * Implements RE2::Scanner#eof?: reports the scanner's `eof` flag, converted
 * to a Ruby value with BOOL2RUBY.
 */
static VALUE re2_scanner_eof(const VALUE self) {
  const re2_scanner *scanner = unwrap_re2_scanner(self);

  return BOOL2RUBY(scanner->eof);
}
|
#initialize_copy(other) ⇒ Object
525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 |
# File 'ext/re2/re2.cc', line 525
/*
 * Implements RE2::Scanner#initialize_copy: makes `self` a copy of `other`.
 *
 * The regexp and text references, the capturing group count and the eof flag
 * are copied as-is. The remaining input is duplicated into a freshly
 * allocated StringPiece so the two scanners hold separate StringPiece
 * objects.
 *
 * Raises NoMemoryError if the StringPiece cannot be allocated; in that case
 * `self` is left unmodified.
 */
static VALUE re2_scanner_initialize_copy(VALUE self, VALUE other) {
  if (self == other) return self;

  re2_scanner *self_c;
  re2_scanner *other_c = unwrap_re2_scanner(other);

  TypedData_Get_Struct(self, re2_scanner, &re2_scanner_data_type, self_c);

  /* Build the replacement input before touching self: if the allocation
   * fails we raise while self is still in its previous, consistent state
   * instead of half-copied with a null input. */
  re2::StringPiece *copied_input = nullptr;
  if (other_c->input) {
    copied_input = new(std::nothrow) re2::StringPiece(*other_c->input);
    if (copied_input == nullptr) {
      rb_raise(rb_eNoMemError,
               "not enough memory to allocate StringPiece for input");
    }
  }

  RB_OBJ_WRITE(self, &self_c->regexp, other_c->regexp);
  RB_OBJ_WRITE(self, &self_c->text, other_c->text);
  self_c->number_of_capturing_groups = other_c->number_of_capturing_groups;
  self_c->eof = other_c->eof;

  /* Deleting a null pointer is a no-op, so no guard is needed. */
  delete self_c->input;
  self_c->input = copied_input;

  return self;
}
|
#regexp ⇒ RE2::Regexp
Returns the Regexp used in the RE2::Scanner.
884 885 886 887 888 |
# File 'ext/re2/re2.cc', line 884
/*
 * Implements RE2::Scanner#regexp: hands back the regexp VALUE stored on the
 * scanner.
 */
static VALUE re2_scanner_regexp(const VALUE self) {
  const re2_scanner *scanner = unwrap_re2_scanner(self);

  return scanner->regexp;
}
|
#rewind ⇒ Object
Rewind the RE2::Scanner to the start of the string.
509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 |
# File 'ext/re2/re2.cc', line 509
/*
 * Implements RE2::Scanner#rewind: resets the scanner so that the next scan
 * starts from the beginning of the stored text, and clears the eof flag.
 *
 * Raises NoMemoryError if the new StringPiece cannot be allocated; in that
 * case the scanner keeps its previous input and eof state.
 */
static VALUE re2_scanner_rewind(VALUE self) {
  re2_scanner *c = unwrap_re2_scanner(self);

  /* Allocate the replacement first: raising after the old input has been
   * deleted would leave the scanner with a null input and a stale eof flag. */
  re2::StringPiece *fresh_input = new(std::nothrow) re2::StringPiece(
      RSTRING_PTR(c->text), RSTRING_LEN(c->text));
  if (fresh_input == nullptr) {
    rb_raise(rb_eNoMemError,
             "not enough memory to allocate StringPiece for input");
  }

  delete c->input;
  c->input = fresh_input;
  c->eof = false;

  return self;
}
|
#scan ⇒ Array<String>, ...
Scan the given text incrementally for matches using
`FindAndConsume`, returning an array of submatches on each subsequent
call. Returns nil if no matches are found or an empty array for every
match if the pattern has no capturing groups.
Note that RE2 only supports UTF-8 and ISO-8859-1 encodings, so strings will
be returned in UTF-8 by default, or in ISO-8859-1 if the :utf8 option for the
Regexp is set to false (the behaviour for any other encoding is undefined).
575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 |
# File 'ext/re2/re2.cc', line 575
/*
 * Implements RE2::Scanner#scan: runs one FindAndConsumeN step over the
 * remaining input.
 *
 * Returns nil when the eof flag is already set or when the pattern does not
 * match. Otherwise returns an array with one entry per capturing group: nil
 * for a group whose match has no data pointer, or a string built with
 * encoded_str_new using the pattern's encoding option.
 */
static VALUE re2_scanner_scan(VALUE self) {
  re2_scanner *scanner = unwrap_re2_scanner(self);
  re2_pattern *wrapped = unwrap_re2_regexp(scanner->regexp);
  const auto group_count = scanner->number_of_capturing_groups;

  std::vector<RE2::Arg> argv(group_count);
  std::vector<RE2::Arg*> args(group_count);
  std::vector<re2::StringPiece> matches(group_count);

  if (scanner->eof) {
    return Qnil;
  }

  const re2::StringPiece::size_type size_before = scanner->input->size();

  /* Wire each Arg to its StringPiece slot, and each Arg* to its Arg. */
  for (int i = 0; i < group_count; ++i) {
    argv[i] = &matches[i];
    args[i] = &argv[i];
  }

  if (!RE2::FindAndConsumeN(scanner->input, *wrapped->pattern, args.data(),
                            group_count)) {
    return Qnil;
  }

  const re2::StringPiece::size_type size_after = scanner->input->size();
  const bool consumed_input = size_after < size_before;
  const auto encoding = wrapped->pattern->options().encoding();

  VALUE result = rb_ary_new2(group_count);
  for (int i = 0; i < group_count; ++i) {
    const re2::StringPiece &group = matches[i];
    VALUE entry = Qnil;

    if (group.data() != nullptr) {
      entry = encoded_str_new(group.data(), group.size(), encoding);
    }
    rb_ary_push(result, entry);
  }

  /* Record whether the match left nothing to scan. */
  scanner->eof = (size_after == 0);

  /* Nothing more to do if the match moved us forward, or if there is no
   * input left to step over. */
  if (consumed_input || size_after == 0) {
    return result;
  }

  /* The match didn't advance the input, so step over one whole character
   * ourselves to avoid splitting a multi-byte character.
   *
   * The lookup table approach is taken from RE2's own Python extension: the
   * high 4 bits of a UTF-8 lead byte determine the character's byte length.
   *
   * See https://github.com/google/re2/blob/972a15cedd008d846f1a39b2e88ce48d7f166cbd/python/_re2.cc#L46-L48
   */
  size_t step = 1;
  if (encoding == RE2::Options::EncodingUTF8) {
    const int lead_high_bits = ((*scanner->input)[0] & 0xFF) >> 4;

    step = "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4"[lead_high_bits];
    /* Never step past the end of the remaining input. */
    if (step > size_after) {
      step = size_after;
    }
  }
  scanner->input->remove_prefix(step);

  return result;
}
|
#string ⇒ String
Returns a frozen copy of the text supplied when incrementally matching with Regexp#scan.
If the text was already a frozen string, returns the original.
478 479 480 481 482 |
# File 'ext/re2/re2.cc', line 478
/*
 * Implements RE2::Scanner#string: hands back the text VALUE stored on the
 * scanner.
 */
static VALUE re2_scanner_string(const VALUE self) {
  const re2_scanner *scanner = unwrap_re2_scanner(self);

  return scanner->text;
}
|