[go: nahoru, domu]

Skip to content

Commit

Permalink
Named group support.
Browse files Browse the repository at this point in the history
This changes the parser to annotate the returned Regexp with information
about named capture groups. During compilation, this information is
passed along to the RE2 instance and thence to the Matcher.

The parser will throw PatternSyntaxException if duplicate group names
are specified.
  • Loading branch information
sjamesr committed Apr 2, 2019
1 parent fc83bf7 commit d0ec5a7
Show file tree
Hide file tree
Showing 6 changed files with 95 additions and 0 deletions.
49 changes: 49 additions & 0 deletions java/com/google/re2j/Matcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

package com.google.re2j;

import java.util.Map;

/**
* A stateful iterator that interprets a regex {@code Pattern} on a specific input. Its interface
* mimics the JDK 1.4.2 {@code java.util.regex.Matcher}.
Expand Down Expand Up @@ -37,6 +39,8 @@ public final class Matcher {
// The group indexes, in [start, end) pairs. Zeroth pair is overall match.
private final int[] groups;

private final Map<String, Integer> namedGroups;

// The number of submatches (groups) in the pattern.
private final int groupCount;

Expand Down Expand Up @@ -66,6 +70,7 @@ private Matcher(Pattern pattern) {
RE2 re2 = pattern.re2();
groupCount = re2.numberOfCapturingGroups();
groups = new int[2 + 2 * groupCount];
namedGroups = re2.namedGroups;
}

/** Creates a new {@code Matcher} with the given pattern and input. */
Expand Down Expand Up @@ -137,6 +142,21 @@ public int start(int group) {
return groups[2 * group];
}

/**
 * Looks up a capture group by name and reports where it began in the most recent match.
 *
 * @param group the group name
 * @return the start offset of the named group, or {@code -1} if the group was not matched
 * @throws IllegalArgumentException if no group with that name exists
 */
public int start(String group) {
  final Integer index = namedGroups.get(group);
  if (index != null) {
    // Delegate to the index-based overload once the name is resolved.
    return start(index);
  }
  throw new IllegalArgumentException("group '" + group + "' not found");
}

/**
* Returns the end position of a subgroup of the most recent match.
*
Expand All @@ -149,6 +169,21 @@ public int end(int group) {
return groups[2 * group + 1];
}

/**
 * Looks up a capture group by name and reports where it ended in the most recent match.
 *
 * @param group the group name
 * @return the end offset of the named group, or {@code -1} if the group was not matched
 * @throws IllegalArgumentException if no group with that name exists
 */
public int end(String group) {
  final Integer index = namedGroups.get(group);
  if (index != null) {
    // Delegate to the index-based overload once the name is resolved.
    return end(index);
  }
  throw new IllegalArgumentException("group '" + group + "' not found");
}

/**
* Returns the most recent match.
*
Expand All @@ -174,6 +209,20 @@ public String group(int group) {
return substring(start, end);
}

/**
 * Looks up a capture group by name and returns the text it captured in the most recent match.
 *
 * @param group the group name
 * @return the text captured by the named group, or {@code null} if the group was not matched
 * @throws IllegalArgumentException if no group with that name exists
 */
public String group(String group) {
  final Integer index = namedGroups.get(group);
  if (index != null) {
    // Delegate to the index-based overload once the name is resolved.
    return group(index);
  }
  throw new IllegalArgumentException("group '" + group + "' not found");
}

/**
* Returns the number of subgroups in this pattern.
*
Expand Down
8 changes: 8 additions & 0 deletions java/com/google/re2j/Parser.java
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
* A parser of regular expression patterns.
Expand All @@ -38,6 +40,7 @@ class Parser {
private static final String ERR_MISSING_REPEAT_ARGUMENT =
"missing argument to repetition operator";
private static final String ERR_TRAILING_BACKSLASH = "trailing backslash at end of expression";
private static final String ERR_DUPLICATE_NAMED_CAPTURE = "duplicate capture group name";

// Hack to expose ArrayList.removeRange().
private static class Stack extends ArrayList<Regexp> {
Expand All @@ -56,6 +59,7 @@ public void removeRange(int fromIndex, int toIndex) {
private final Stack stack = new Stack();
private Regexp free;
private int numCap = 0; // number of capturing groups seen
private Map<String, Integer> namedGroups = new HashMap<String, Integer>();

Parser(String wholeRegexp, int flags) {
this.wholeRegexp = wholeRegexp;
Expand Down Expand Up @@ -972,6 +976,7 @@ private Regexp parseInternal() throws PatternSyntaxException {
if (n != 1) {
throw new PatternSyntaxException(ERR_MISSING_PAREN, wholeRegexp);
}
stack.get(0).namedGroups = namedGroups;
return stack.get(0);
}

Expand Down Expand Up @@ -1062,6 +1067,9 @@ private void parsePerlFlags(StringIterator t) throws PatternSyntaxException {
// Like ordinary capture, but named.
Regexp re = op(Regexp.Op.LEFT_PAREN);
re.cap = ++numCap;
if (namedGroups.put(name, numCap) != null) {
throw new PatternSyntaxException(ERR_DUPLICATE_NAMED_CAPTURE, name);
}
re.name = name;
return;
}
Expand Down
3 changes: 3 additions & 0 deletions java/com/google/re2j/RE2.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Queue;

/**
Expand Down Expand Up @@ -116,6 +117,7 @@ class RE2 {
// Accesses must be serialized using |this| monitor.
// @GuardedBy("this")
private final Queue<Machine> machine = new ArrayDeque<Machine>();
public Map<String, Integer> namedGroups;

// This is visible for testing.
RE2(String expr) {
Expand Down Expand Up @@ -195,6 +197,7 @@ static RE2 compileImpl(String expr, int mode, boolean longest) throws PatternSyn
if (!re2.prefix.isEmpty()) {
re2.prefixRune = re2.prefix.codePointAt(0);
}
re2.namedGroups = re.namedGroups;
return re2;
}

Expand Down
3 changes: 3 additions & 0 deletions java/com/google/re2j/Regexp.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
package com.google.re2j;

import java.util.Arrays;
import java.util.Map;

/**
* Regular expression abstract syntax tree. Produced by parser, used by compiler. NB, this
Expand Down Expand Up @@ -56,6 +57,7 @@ boolean isPseudo() {
int min, max; // min, max for REPEAT
int cap; // capturing index, for CAPTURE
String name; // capturing name, for CAPTURE
Map<String, Integer> namedGroups; // map of group name -> capturing index
// Do update copy ctor when adding new fields!

Regexp(Op op) {
Expand All @@ -72,6 +74,7 @@ boolean isPseudo() {
this.max = that.max;
this.cap = that.cap;
this.name = that.name;
this.namedGroups = that.namedGroups;
}

void reinit() {
Expand Down
30 changes: 30 additions & 0 deletions javatests/com/google/re2j/MatcherTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -431,4 +431,34 @@ public void testMutableCharSequence() {
b.replace(b.indexOf("ban"), start + 3, "b");
assertTrue(m.find(b.indexOf("ban")));
}

@Test
public void testNamedGroups() {
  // Three nested named groups, followed by one optional group that matches ("bag")
  // and one optional group that does not participate in the match ("nomatch").
  String regex =
      "(?P<baz>f(?P<foo>b*a(?P<another>r+)){0,10})" + "(?P<bag>bag)?(?P<nomatch>zzz)?";
  Matcher matcher = Pattern.compile(regex).matcher("fbbarrrrrbag");
  assertTrue(matcher.matches());

  // Outermost group.
  assertEquals("fbbarrrrr", matcher.group("baz"));
  assertEquals(0, matcher.start("baz"));
  assertEquals(9, matcher.end("baz"));

  // Middle group, nested inside "baz".
  assertEquals("bbarrrrr", matcher.group("foo"));
  assertEquals(1, matcher.start("foo"));
  assertEquals(9, matcher.end("foo"));

  // Innermost group, nested inside "foo".
  assertEquals("rrrrr", matcher.group("another"));
  assertEquals(4, matcher.start("another"));

  // Optional group that participates in the match.
  assertEquals("bag", matcher.group("bag"));
  assertEquals(9, matcher.start("bag"));
  assertEquals(12, matcher.end("bag"));

  // Optional group that exists in the pattern but did not match anything.
  assertEquals(null, matcher.group("nomatch"));
  assertEquals(-1, matcher.start("nomatch"));
  assertEquals(-1, matcher.end("nomatch"));

  // A name that is not declared in the pattern must be rejected.
  try {
    matcher.group("nonexistent");
    fail("Should have thrown IllegalArgumentException");
  } catch (IllegalArgumentException expected) {
    // Expected
  }
}
}
2 changes: 2 additions & 0 deletions javatests/com/google/re2j/ParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -528,6 +528,8 @@ private static String runesToString(int[] runes) {
"(?i)[a-Z]",
"a{100000}",
"a{100000,}",
// Group names may not be repeated
"(?P<foo>bar)(?P<foo>baz)",
};

private static final String[] ONLY_PERL = {
Expand Down

0 comments on commit d0ec5a7

Please sign in to comment.