Class: RE2::Scanner

Inherits:
Object show all
Includes:
Enumerable
Defined in:
ext/re2/re2.cc,
lib/re2/scanner.rb

Instance Method Summary collapse

Instance Method Details

#each ⇒ Object



16
17
18
19
20
21
22
23
24
# File 'lib/re2/scanner.rb', line 16

# Yields the result of each successive call to +scan+ until +scan+ returns a
# falsy value. When called without a block, returns an Enumerator for +each+
# instead.
def each
  return to_enum(:each) unless block_given?

  while (captures = scan)
    yield captures
  end
end

#eof? ⇒ Boolean

Returns whether the RE2::Scanner has consumed all input or not.

Examples:

c = RE2::Regexp.new('(\d+)').scan("foo")
c.eof? #=> true

Returns:

  • (Boolean)

    whether the RE2::Scanner has consumed all input or not



492
493
494
495
496
# File 'ext/re2/re2.cc', line 492

/* Reports the unwrapped scanner's `eof` flag as a Ruby boolean. */
static VALUE re2_scanner_eof(const VALUE self) {
  return BOOL2RUBY(unwrap_re2_scanner(self)->eof);
}

#initialize_copy(other) ⇒ Object



525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
# File 'ext/re2/re2.cc', line 525

/*
 * Copies the state of `other` into `self`: the regexp and text references,
 * the number of capturing groups, the eof flag and, when `other` has one, a
 * freshly allocated copy of its input StringPiece.
 *
 * Returns self (immediately, without copying, when self and other are the
 * same object). Raises NoMemoryError if the StringPiece copy cannot be
 * allocated.
 */
static VALUE re2_scanner_initialize_copy(VALUE self, VALUE other) {
  if (self == other) return self;

  re2_scanner *self_c;
  re2_scanner *other_c = unwrap_re2_scanner(other);

  TypedData_Get_Struct(self, re2_scanner, &re2_scanner_data_type, self_c);

  /* Allocate the copy before touching self so that a failed allocation
   * leaves self exactly as it was, instead of half-copied with a null input
   * pointer and an eof flag that may still say there is input to scan. */
  re2::StringPiece *copied_input = nullptr;
  if (other_c->input) {
    copied_input = new(std::nothrow) re2::StringPiece(*other_c->input);
    if (copied_input == nullptr) {
      rb_raise(rb_eNoMemError,
               "not enough memory to allocate StringPiece for input");
    }
  }

  /* Deleting a null pointer is a no-op, so no guard is needed here. */
  delete self_c->input;
  self_c->input = copied_input;

  RB_OBJ_WRITE(self, &self_c->regexp, other_c->regexp);
  RB_OBJ_WRITE(self, &self_c->text, other_c->text);
  self_c->number_of_capturing_groups = other_c->number_of_capturing_groups;
  self_c->eof = other_c->eof;

  return self;
}

#regexp ⇒ RE2::Regexp

Returns the Regexp used in the RE2::Scanner.

Examples:

c = RE2::Regexp.new('(\d+)').scan("bob 123")
c.regexp #=> #<RE2::Regexp /(\d+)/>

Returns:



884
885
886
887
888
# File 'ext/re2/re2.cc', line 884

/* Returns the `regexp` VALUE held by the unwrapped scanner struct. */
static VALUE re2_scanner_regexp(const VALUE self) {
  const re2_scanner *scanner = unwrap_re2_scanner(self);
  const VALUE pattern = scanner->regexp;

  return pattern;
}

#rewind ⇒ Object

Rewind the RE2::Scanner to the start of the string.

Examples:

s = RE2::Regexp.new('(\d+)').scan("1 2 3")
e = s.to_enum
e.next #=> ["1"]
e.next #=> ["2"]
s.rewind
e.next #=> ["1"]


509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
# File 'ext/re2/re2.cc', line 509

/*
 * Rewinds the scanner: replaces its input StringPiece with a new one built
 * from the pointer and length of the stored text, and clears the eof flag.
 *
 * Returns self. Raises NoMemoryError if the new StringPiece cannot be
 * allocated, in which case the scanner is left unchanged.
 */
static VALUE re2_scanner_rewind(VALUE self) {
  re2_scanner *c = unwrap_re2_scanner(self);

  /* Allocate the replacement before releasing the current input so that a
   * failed allocation cannot leave the scanner holding a null input pointer
   * while its eof flag still permits scanning. */
  re2::StringPiece *rewound = new(std::nothrow) re2::StringPiece(
      RSTRING_PTR(c->text), RSTRING_LEN(c->text));
  if (rewound == nullptr) {
    rb_raise(rb_eNoMemError,
             "not enough memory to allocate StringPiece for input");
  }

  delete c->input;
  c->input = rewound;
  c->eof = false;

  return self;
}

#scan ⇒ Array<String>, ...

Scan the given text incrementally for matches using `FindAndConsume`, returning an array of submatches on each subsequent call. Returns nil if no matches are found or an empty array for every match if the pattern has no capturing groups.

Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the Regexp is set to false (any other encoding's behaviour is undefined).

Examples:

s = RE2::Regexp.new('(\w+)').scan("Foo bar baz")
s.scan #=> ["Foo"]
s.scan #=> ["bar"]

Returns:

  • (Array<String>)

    if the pattern has capturing groups

  • ([])

    if the pattern does not have capturing groups

  • (nil)

    if no matches are found



575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
# File 'ext/re2/re2.cc', line 575

/*
 * Attempts one match against the scanner's remaining input using
 * RE2::FindAndConsumeN and advances the input past it.
 *
 * Returns a Ruby array with one entry per capturing group (nil for a group
 * whose StringPiece has a null data pointer, otherwise a string built by
 * encoded_str_new with the pattern's encoding option), or nil when the eof
 * flag is already set or no match is found.
 */
static VALUE re2_scanner_scan(VALUE self) {
  re2_scanner *c = unwrap_re2_scanner(self);
  re2_pattern *p = unwrap_re2_regexp(c->regexp);

  /* One slot per capturing group: `matches` receives the captured text,
   * `argv` wraps each StringPiece in an RE2::Arg and `args` holds the
   * pointers to those Args that FindAndConsumeN is given. */
  std::vector<RE2::Arg> argv(c->number_of_capturing_groups);
  std::vector<RE2::Arg*> args(c->number_of_capturing_groups);
  std::vector<re2::StringPiece> matches(c->number_of_capturing_groups);

  if (c->eof) {
    return Qnil;
  }

  /* Remember the input length so we can tell afterwards whether the match
   * consumed anything. */
  re2::StringPiece::size_type original_input_size = c->input->size();

  for (int i = 0; i < c->number_of_capturing_groups; ++i) {
    argv[i] = &matches[i];
    args[i] = &argv[i];
  }

  if (RE2::FindAndConsumeN(c->input, *p->pattern, args.data(),
        c->number_of_capturing_groups)) {
    re2::StringPiece::size_type new_input_size = c->input->size();
    bool input_advanced = new_input_size < original_input_size;

    VALUE result = rb_ary_new2(c->number_of_capturing_groups);

    for (int i = 0; i < c->number_of_capturing_groups; ++i) {
      /* A null data pointer is reported to Ruby as nil rather than as a
       * string. */
      if (matches[i].data() == nullptr) {
        rb_ary_push(result, Qnil);
      } else {
        rb_ary_push(result, encoded_str_new(matches[i].data(),
              matches[i].size(),
              p->pattern->options().encoding()));
      }
    }

    /* Check whether we've exhausted the input yet. */
    c->eof = new_input_size == 0;

    /* If the match didn't advance the input, we need to do this ourselves,
     * advancing by a whole character to avoid splitting multi-byte characters.
     *
     * The lookup table approach is taken from RE2's own Python extension: the
     * high 4 bits of a UTF-8 lead byte determine the character's byte length.
     *
     * See https://github.com/google/re2/blob/972a15cedd008d846f1a39b2e88ce48d7f166cbd/python/_re2.cc#L46-L48
     */
    if (!input_advanced && new_input_size > 0) {
      /* Non-UTF-8 patterns always advance by a single byte. */
      size_t char_size = 1;

      if (p->pattern->options().encoding() == RE2::Options::EncodingUTF8) {
        char_size = "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4"
            [((*c->input)[0] & 0xFF) >> 4];

        /* Never step past the end of the input, even if the lead byte
         * announces more bytes than actually remain. */
        if (char_size > new_input_size) {
          char_size = new_input_size;
        }
      }

      c->input->remove_prefix(char_size);
    }

    return result;
  } else {
    return Qnil;
  }
}

#string ⇒ String

Returns a frozen copy of the text supplied when incrementally matching with Regexp#scan.

If the text was already a frozen string, returns the original.

Examples:

c = RE2::Regexp.new('(\d+)').scan("foo")
c.string #=> "foo"

Returns:



478
479
480
481
482
# File 'ext/re2/re2.cc', line 478

/* Returns the `text` VALUE held by the unwrapped scanner struct. */
static VALUE re2_scanner_string(const VALUE self) {
  return unwrap_re2_scanner(self)->text;
}