Class: RE2::Scanner

Inherits:

Object

Object
RE2::Scanner

show all

Includes:: Enumerable

Defined in:: ext/re2/re2.cc,
lib/re2/scanner.rb

Instance Method Summary collapse

#each ⇒ Object
#eof? ⇒ Boolean
Returns whether the Scanner has consumed all input or not.
#initialize_copy(other) ⇒ Object
#regexp ⇒ RE2::Regexp
Returns the Regexp used in the Scanner.
#rewind ⇒ Object
Rewind the Scanner to the start of the string.
#scan ⇒ Array<String>, ...
Scan the given text incrementally for matches using `FindAndConsume`, returning an array of submatches on each subsequent call.
#string ⇒ String
Returns a frozen copy of the text supplied when incrementally matching with Regexp#scan.

Instance Method Details

#each ⇒ `Object`

# File 'lib/re2/scanner.rb', line 16

# Iterates over successive matches by calling #scan until it returns a falsy
# value, yielding each result to the block.
#
# @yieldparam matches [Object] whatever #scan returned for the current match
# @return [Enumerator] when called without a block
# @return [nil] when called with a block, once #scan stops returning matches
def each
  return to_enum(:each) unless block_given?

  while (matches = scan)
    yield matches
  end
end

#eof? ⇒ `Boolean`

Returns whether the RE2::Scanner has consumed all input or not.

Examples:

c = RE2::Regexp.new('(\d+)').scan("foo")
c.eof? #=> true

Returns:

(Boolean) —
whether the RE2::Scanner has consumed all input or not

# File 'ext/re2/re2.cc', line 492

/*
 * Converts the scanner's `eof` flag into a Ruby boolean.
 */
static VALUE re2_scanner_eof(const VALUE self) {
  const re2_scanner *scanner = unwrap_re2_scanner(self);

  return BOOL2RUBY(scanner->eof);
}

#initialize_copy(other) ⇒ `Object`

# File 'ext/re2/re2.cc', line 525

/*
 * Copies the state of `other` into `self`: the regexp and text references,
 * the capturing group count, the eof flag and (when present) a freshly
 * allocated copy of the StringPiece tracking the remaining input.
 *
 * Raises NoMemoryError if the StringPiece copy cannot be allocated.
 */
static VALUE re2_scanner_initialize_copy(VALUE self, VALUE other) {
  if (self == other) {
    return self;
  }

  re2_scanner *source = unwrap_re2_scanner(other);
  re2_scanner *target;

  /* `self` is fetched directly rather than through unwrap_re2_scanner. */
  TypedData_Get_Struct(self, re2_scanner, &re2_scanner_data_type, target);

  /* Drop any input the target already owns; deleting a null pointer is a
   * no-op, so no explicit check is needed. */
  delete target->input;
  target->input = nullptr;

  /* RB_OBJ_WRITE is used for the two VALUE fields. */
  RB_OBJ_WRITE(self, &target->regexp, source->regexp);
  RB_OBJ_WRITE(self, &target->text, source->text);
  target->number_of_capturing_groups = source->number_of_capturing_groups;
  target->eof = source->eof;

  /* With no source input the target simply keeps its null input. */
  if (source->input != nullptr) {
    target->input = new(std::nothrow) re2::StringPiece(*source->input);

    if (target->input == nullptr) {
      rb_raise(rb_eNoMemError,
               "not enough memory to allocate StringPiece for input");
    }
  }

  return self;
}

#regexp ⇒ `RE2::Regexp`

Returns the Regexp used in the RE2::Scanner.

Examples:

c = RE2::Regexp.new('(\d+)').scan("bob 123")
c.regexp #=> #<RE2::Regexp /(\d+)/>

Returns:

(RE2::Regexp) —
the regular expression used in the RE2::Scanner

# File 'ext/re2/re2.cc', line 884

/*
 * Returns the regexp VALUE stored on the scanner.
 */
static VALUE re2_scanner_regexp(const VALUE self) {
  return unwrap_re2_scanner(self)->regexp;
}

#rewind ⇒ `Object`

Rewind the RE2::Scanner to the start of the string.

Examples:

s = RE2::Regexp.new('(\d+)').scan("1 2 3")
e = s.to_enum
e.next #=> ["1"]
e.next #=> ["2"]
s.rewind
e.next #=> ["1"]

# File 'ext/re2/re2.cc', line 509

/*
 * Resets the scanner so that its input once again covers the whole of the
 * stored text, and clears the eof flag.
 *
 * Returns self. Raises NoMemoryError if the replacement StringPiece cannot be
 * allocated; in that case the scanner keeps its previous input and eof state.
 */
static VALUE re2_scanner_rewind(VALUE self) {
  re2_scanner *c = unwrap_re2_scanner(self);

  /* Allocate the replacement before releasing the current input so that a
   * failed allocation cannot leave the scanner holding a null input. */
  re2::StringPiece *rewound = new(std::nothrow) re2::StringPiece(
      RSTRING_PTR(c->text), RSTRING_LEN(c->text));
  if (rewound == nullptr) {
    rb_raise(rb_eNoMemError,
             "not enough memory to allocate StringPiece for input");
  }

  delete c->input;
  c->input = rewound;
  c->eof = false;

  return self;
}

#scan ⇒ `Array<String>`, ...

Scan the given text incrementally for matches using `FindAndConsume`, returning an array of submatches on each subsequent call. Returns nil if no matches are found or an empty array for every match if the pattern has no capturing groups.

Note that RE2 only supports the UTF-8 and ISO-8859-1 encodings, so strings will be returned in UTF-8 by default, or in ISO-8859-1 if the :utf8 option for the Regexp is set to false (the behaviour for any other encoding is undefined).

Examples:

s = RE2::Regexp.new('(\w+)').scan("Foo bar baz")
s.scan #=> ["Foo"]
s.scan #=> ["bar"]

Returns:

(Array<String>) —
if the pattern has capturing groups
([]) —
if the pattern does not have capturing groups
(nil) —
if no matches are found

# File 'ext/re2/re2.cc', line 575

/*
 * Looks for the next match in the scanner's remaining input using
 * RE2::FindAndConsumeN.
 *
 * Returns a Ruby array with one entry per capturing group (Qnil for a group
 * whose StringPiece has a null data pointer), or Qnil when the scanner is
 * already at EOF or no match is found.
 */
static VALUE re2_scanner_scan(VALUE self) {
  re2_scanner *c = unwrap_re2_scanner(self);
  re2_pattern *p = unwrap_re2_regexp(c->regexp);

  /* Nothing left to scan: return before allocating any argument storage. */
  if (c->eof) {
    return Qnil;
  }

  std::vector<RE2::Arg> argv(c->number_of_capturing_groups);
  std::vector<RE2::Arg*> args(c->number_of_capturing_groups);
  std::vector<re2::StringPiece> matches(c->number_of_capturing_groups);

  re2::StringPiece::size_type original_input_size = c->input->size();

  /* Wire every capturing group to its own StringPiece so the submatches can
   * be read back after the call. */
  for (int i = 0; i < c->number_of_capturing_groups; ++i) {
    argv[i] = &matches[i];
    args[i] = &argv[i];
  }

  if (!RE2::FindAndConsumeN(c->input, *p->pattern, args.data(),
        c->number_of_capturing_groups)) {
    return Qnil;
  }

  re2::StringPiece::size_type new_input_size = c->input->size();
  bool input_advanced = new_input_size < original_input_size;

  VALUE result = rb_ary_new2(c->number_of_capturing_groups);

  for (int i = 0; i < c->number_of_capturing_groups; ++i) {
    if (matches[i].data() == nullptr) {
      /* A null data pointer (as opposed to an empty but non-null match) is
       * reported as nil. */
      rb_ary_push(result, Qnil);
    } else {
      rb_ary_push(result, encoded_str_new(matches[i].data(),
            matches[i].size(),
            p->pattern->options().encoding()));
    }
  }

  /* Check whether we've exhausted the input yet. */
  c->eof = new_input_size == 0;

  /* If the match didn't advance the input, we need to do this ourselves,
   * advancing by a whole character to avoid splitting multi-byte characters.
   *
   * The lookup table approach is taken from RE2's own Python extension: the
   * high 4 bits of a UTF-8 lead byte determine the character's byte length.
   *
   * See https://github.com/google/re2/blob/972a15cedd008d846f1a39b2e88ce48d7f166cbd/python/_re2.cc#L46-L48
   */
  if (!input_advanced && new_input_size > 0) {
    size_t char_size = 1;

    if (p->pattern->options().encoding() == RE2::Options::EncodingUTF8) {
      char_size = "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4"
          [((*c->input)[0] & 0xFF) >> 4];

      /* Never step past the end of a truncated multi-byte sequence. */
      if (char_size > new_input_size) {
        char_size = new_input_size;
      }
    }

    c->input->remove_prefix(char_size);
  }

  return result;
}

#string ⇒ `String`

Returns a frozen copy of the text supplied when incrementally matching with Regexp#scan.

If the text was already a frozen string, returns the original.

Examples:

c = RE2::Regexp.new('(\d+)').scan("foo")
c.string #=> "foo"

Returns:

(String) —
a frozen string with the text passed to Regexp#scan

# File 'ext/re2/re2.cc', line 478

/*
 * Returns the text VALUE stored on the scanner.
 */
static VALUE re2_scanner_string(const VALUE self) {
  const re2_scanner *scanner = unwrap_re2_scanner(self);

  return scanner->text;
}

Class: RE2::Scanner

Instance Method Summary collapse

Instance Method Details

#each ⇒ Object

#eof? ⇒ Boolean

Examples:

#initialize_copy(other) ⇒ Object

#regexp ⇒ RE2::Regexp

Examples:

#rewind ⇒ Object

Examples:

#scan ⇒ Array<String>, ...

Examples:

#string ⇒ String

Examples:

#each ⇒ `Object`

#eof? ⇒ `Boolean`

#initialize_copy(other) ⇒ `Object`

#regexp ⇒ `RE2::Regexp`

#rewind ⇒ `Object`

#scan ⇒ `Array<String>`, ...

#string ⇒ `String`