From 84027b880b074ade59d1b39f97bc7c0c4ca6a569 Mon Sep 17 00:00:00 2001 From: Colin E Date: Mon, 8 Feb 2021 10:35:08 +0000 Subject: [PATCH] fix: capture groups with quantifiers are not repeated fixes: 31 --- assembly/__spec_tests__/generated.spec.ts | 129 ++++++++++++++++++---- assembly/__tests__/capture-group.spec.ts | 6 + assembly/nfa/nfa.ts | 8 +- assembly/parser/node.ts | 9 +- assembly/parser/walker.ts | 7 +- assembly/regexp.ts | 35 +++++- spec/test-generator.js | 13 --- ts/index.ts | 9 +- 8 files changed, 170 insertions(+), 46 deletions(-) diff --git a/assembly/__spec_tests__/generated.spec.ts b/assembly/__spec_tests__/generated.spec.ts index 537c422..95c6422 100644 --- a/assembly/__spec_tests__/generated.spec.ts +++ b/assembly/__spec_tests__/generated.spec.ts @@ -373,17 +373,41 @@ it("line: 51 - matches ^(b+?|a){1,2}?c against 'bc'", () => { expect(match.matches[0]).toBe("bc".substring(0, 2)); expect(match.matches[1]).toBe("bc".substring(0, 1)); }); -xit("line: 52 - issues with repeated capture groups", () => {}); -xit("line: 53 - issues with repeated capture groups", () => {}); -xit("line: 54 - issues with repeated capture groups", () => {}); -xit("line: 55 - issues with repeated capture groups", () => {}); +it("line: 52 - matches ^(b+?|a){1,2}?c against 'bbc'", () => { + const match = exec("^(b+?|a){1,2}?c", "bbc", "s"); + expect(match.matches[0]).toBe("bbc".substring(0, 3)); + expect(match.matches[1]).toBe("bbc".substring(1, 2)); +}); +it("line: 53 - matches ^(b+?|a){1,2}?c against 'bbbc'", () => { + const match = exec("^(b+?|a){1,2}?c", "bbbc", "s"); + expect(match.matches[0]).toBe("bbbc".substring(0, 4)); + expect(match.matches[1]).toBe("bbbc".substring(1, 3)); +}); +it("line: 54 - matches ^(b+?|a){1,2}?c against 'bac'", () => { + const match = exec("^(b+?|a){1,2}?c", "bac", "s"); + expect(match.matches[0]).toBe("bac".substring(0, 3)); + expect(match.matches[1]).toBe("bac".substring(1, 2)); +}); +it("line: 55 - matches ^(b+?|a){1,2}?c against 'bbac'", () 
=> { + const match = exec("^(b+?|a){1,2}?c", "bbac", "s"); + expect(match.matches[0]).toBe("bbac".substring(0, 4)); + expect(match.matches[1]).toBe("bbac".substring(2, 3)); +}); it("line: 56 - matches ^(b+?|a){1,2}?c against 'aac'", () => { const match = exec("^(b+?|a){1,2}?c", "aac", "s"); expect(match.matches[0]).toBe("aac".substring(0, 3)); expect(match.matches[1]).toBe("aac".substring(1, 2)); }); -xit("line: 57 - issues with repeated capture groups", () => {}); -xit("line: 58 - issues with repeated capture groups", () => {}); +it("line: 57 - matches ^(b+?|a){1,2}?c against 'abbbbbbbbbbbc'", () => { + const match = exec("^(b+?|a){1,2}?c", "abbbbbbbbbbbc", "s"); + expect(match.matches[0]).toBe("abbbbbbbbbbbc".substring(0, 13)); + expect(match.matches[1]).toBe("abbbbbbbbbbbc".substring(1, 12)); +}); +it("line: 58 - matches ^(b+?|a){1,2}?c against 'bbbbbbbbbbbac'", () => { + const match = exec("^(b+?|a){1,2}?c", "bbbbbbbbbbbac", "s"); + expect(match.matches[0]).toBe("bbbbbbbbbbbac".substring(0, 13)); + expect(match.matches[1]).toBe("bbbbbbbbbbbac".substring(11, 12)); +}); it("line: 59 - matches ^(b+?|a){1,2}?c against 'aaac'", () => { expectNotMatch("^(b+?|a){1,2}?c", ["aaac"]); }); @@ -400,12 +424,36 @@ it("line: 62 - matches ^(b+|a){1,2}c against 'bbc'", () => { expect(match.matches[0]).toBe("bbc".substring(0, 3)); expect(match.matches[1]).toBe("bbc".substring(0, 2)); }); -xit("line: 63 - issues with repeated capture groups", () => {}); -xit("line: 64 - issues with repeated capture groups", () => {}); -xit("line: 65 - issues with repeated capture groups", () => {}); -xit("line: 66 - issues with repeated capture groups", () => {}); -xit("line: 67 - issues with repeated capture groups", () => {}); -xit("line: 68 - issues with repeated capture groups", () => {}); +it("line: 63 - matches ^(b+|a){1,2}c against 'bbbc'", () => { + const match = exec("^(b+|a){1,2}c", "bbbc", "s"); + expect(match.matches[0]).toBe("bbbc".substring(0, 4)); + 
expect(match.matches[1]).toBe("bbbc".substring(0, 3)); +}); +it("line: 64 - matches ^(b+|a){1,2}c against 'bac'", () => { + const match = exec("^(b+|a){1,2}c", "bac", "s"); + expect(match.matches[0]).toBe("bac".substring(0, 3)); + expect(match.matches[1]).toBe("bac".substring(1, 2)); +}); +it("line: 65 - matches ^(b+|a){1,2}c against 'bbac'", () => { + const match = exec("^(b+|a){1,2}c", "bbac", "s"); + expect(match.matches[0]).toBe("bbac".substring(0, 4)); + expect(match.matches[1]).toBe("bbac".substring(2, 3)); +}); +it("line: 66 - matches ^(b+|a){1,2}c against 'aac'", () => { + const match = exec("^(b+|a){1,2}c", "aac", "s"); + expect(match.matches[0]).toBe("aac".substring(0, 3)); + expect(match.matches[1]).toBe("aac".substring(1, 2)); +}); +it("line: 67 - matches ^(b+|a){1,2}c against 'abbbbbbbbbbbc'", () => { + const match = exec("^(b+|a){1,2}c", "abbbbbbbbbbbc", "s"); + expect(match.matches[0]).toBe("abbbbbbbbbbbc".substring(0, 13)); + expect(match.matches[1]).toBe("abbbbbbbbbbbc".substring(1, 12)); +}); +it("line: 68 - matches ^(b+|a){1,2}c against 'bbbbbbbbbbbac'", () => { + const match = exec("^(b+|a){1,2}c", "bbbbbbbbbbbac", "s"); + expect(match.matches[0]).toBe("bbbbbbbbbbbac".substring(0, 13)); + expect(match.matches[1]).toBe("bbbbbbbbbbbac".substring(11, 12)); +}); it("line: 69 - matches ^(b+|a){1,2}c against 'aaac'", () => { expectNotMatch("^(b+|a){1,2}c", ["aaac"]); }); @@ -417,8 +465,16 @@ it("line: 71 - matches ^(b+|a){1,2}?bc against 'bbc'", () => { expect(match.matches[0]).toBe("bbc".substring(0, 3)); expect(match.matches[1]).toBe("bbc".substring(0, 1)); }); -xit("line: 72 - issues with repeated capture groups", () => {}); -xit("line: 73 - issues with repeated capture groups", () => {}); +it("line: 72 - matches ^(b*|ba){1,2}?bc against 'babc'", () => { + const match = exec("^(b*|ba){1,2}?bc", "babc", "s"); + expect(match.matches[0]).toBe("babc".substring(0, 4)); + expect(match.matches[1]).toBe("babc".substring(0, 2)); +}); +it("line: 73 - matches 
^(b*|ba){1,2}?bc against 'bbabc'", () => { + const match = exec("^(b*|ba){1,2}?bc", "bbabc", "s"); + expect(match.matches[0]).toBe("bbabc".substring(0, 5)); + expect(match.matches[1]).toBe("bbabc".substring(1, 3)); +}); it("line: 74 - matches ^(b*|ba){1,2}?bc against 'bababc'", () => { const match = exec("^(b*|ba){1,2}?bc", "bababc", "s"); expect(match.matches[0]).toBe("bababc".substring(0, 6)); @@ -435,7 +491,11 @@ it("line: 77 - matches ^(ba|b*){1,2}?bc against 'babc'", () => { expect(match.matches[0]).toBe("babc".substring(0, 4)); expect(match.matches[1]).toBe("babc".substring(0, 2)); }); -xit("line: 78 - issues with repeated capture groups", () => {}); +it("line: 78 - matches ^(ba|b*){1,2}?bc against 'bbabc'", () => { + const match = exec("^(ba|b*){1,2}?bc", "bbabc", "s"); + expect(match.matches[0]).toBe("bbabc".substring(0, 5)); + expect(match.matches[1]).toBe("bbabc".substring(1, 3)); +}); it("line: 79 - matches ^(ba|b*){1,2}?bc against 'bababc'", () => { const match = exec("^(ba|b*){1,2}?bc", "bababc", "s"); expect(match.matches[0]).toBe("bababc".substring(0, 6)); @@ -1199,8 +1259,32 @@ it("line: 261 - matches ^From +([^ ]+) +[a-zA-Z][a-zA-Z][a-zA-Z] +[a-zA-Z][a-zA- "From abcd Mon Sep 01 12:33:02 1997".substring(5, 9) ); }); -xit("line: 262 - issues with repeated capture groups", () => {}); -xit("line: 263 - issues with repeated capture groups", () => {}); +it("line: 262 - matches ^From\\s+\\S+\\s+([a-zA-Z]{3}\\s+){2}\\d{1,2}\\s+\\d\\d:\\d\\d against 'From abcd Mon Sep 01 12:33:02 1997'", () => { + const match = exec( + "^From\\s+\\S+\\s+([a-zA-Z]{3}\\s+){2}\\d{1,2}\\s+\\d\\d:\\d\\d", + "From abcd Mon Sep 01 12:33:02 1997", + "s" + ); + expect(match.matches[0]).toBe( + "From abcd Mon Sep 01 12:33:02 1997".substring(0, 27) + ); + expect(match.matches[1]).toBe( + "From abcd Mon Sep 01 12:33:02 1997".substring(15, 19) + ); +}); +it("line: 263 - matches ^From\\s+\\S+\\s+([a-zA-Z]{3}\\s+){2}\\d{1,2}\\s+\\d\\d:\\d\\d against 'From abcd Mon Sep 1 12:33:02 1997'", 
() => { + const match = exec( + "^From\\s+\\S+\\s+([a-zA-Z]{3}\\s+){2}\\d{1,2}\\s+\\d\\d:\\d\\d", + "From abcd Mon Sep 1 12:33:02 1997", + "s" + ); + expect(match.matches[0]).toBe( + "From abcd Mon Sep 1 12:33:02 1997".substring(0, 27) + ); + expect(match.matches[1]).toBe( + "From abcd Mon Sep 1 12:33:02 1997".substring(15, 20) + ); +}); it("line: 264 - matches ^From\\s+\\S+\\s+([a-zA-Z]{3}\\s+){2}\\d{1,2}\\s+\\d\\d:\\d\\d against 'From abcd Sep 01 12:33:02 1997'", () => { expectNotMatch( "^From\\s+\\S+\\s+([a-zA-Z]{3}\\s+){2}\\d{1,2}\\s+\\d\\d:\\d\\d", @@ -2089,8 +2173,15 @@ it("line: 1390 - matches ^[abc]{12} against 'abcabcabcabc'", () => { const match = exec("^[abc]{12}", "abcabcabcabc", "s"); expect(match.matches[0]).toBe("abcabcabcabc".substring(0, 12)); }); -xit("line: 1391 - issues with repeated capture groups", () => {}); -xit("line: 1392 - issues with repeated capture groups", () => {}); +it("line: 1391 - matches ^[a-c]{12} against 'abcabcabcabc'", () => { + const match = exec("^[a-c]{12}", "abcabcabcabc", "s"); + expect(match.matches[0]).toBe("abcabcabcabc".substring(0, 12)); +}); +it("line: 1392 - matches ^(a|b|c){12} against 'abcabcabcabc '", () => { + const match = exec("^(a|b|c){12}", "abcabcabcabc ", "s"); + expect(match.matches[0]).toBe("abcabcabcabc ".substring(0, 12)); + expect(match.matches[1]).toBe("abcabcabcabc ".substring(11, 12)); +}); it("line: 1393 - matches ^[abcdefghijklmnopqrstuvwxy0123456789] against 'n'", () => { const match = exec("^[abcdefghijklmnopqrstuvwxy0123456789]", "n", "s"); expect(match.matches[0]).toBe("n".substring(0, 1)); diff --git a/assembly/__tests__/capture-group.spec.ts b/assembly/__tests__/capture-group.spec.ts index ee4fe4a..ad11ccf 100644 --- a/assembly/__tests__/capture-group.spec.ts +++ b/assembly/__tests__/capture-group.spec.ts @@ -39,3 +39,9 @@ it("repeated capture groups should return the last match", () => { expect(match.matches[0]).toBe("ac"); expect(match.matches[1]).toBe("c"); }); + +it("range repitition 
capture groups should return the last match", () => { + const match = exec("([a-c]){2}", "ac"); + expect(match.matches[0]).toBe("ac"); + expect(match.matches[1]).toBe("c"); +}); diff --git a/assembly/nfa/nfa.ts b/assembly/nfa/nfa.ts index dda5912..ee12e83 100644 --- a/assembly/nfa/nfa.ts +++ b/assembly/nfa/nfa.ts @@ -40,7 +40,7 @@ export class GroupStartMarkerState extends State { // captures from the path through the NFA that reaches the end are flagged flagged: bool = false; - constructor(next: State) { + constructor(next: State, public id: i32) { super(); this.transitions.push(next); } @@ -162,10 +162,10 @@ function oneOrMore(nfa: Automata, greedy: bool): Automata { return new Automata(start, end); } -function group(nfa: Automata): Automata { +function group(nfa: Automata, id: i32): Automata { // groups are implemented by wrapping the automata with // a pair of markers that record matches - const startMarker = new GroupStartMarkerState(nfa.start); + const startMarker = new GroupStartMarkerState(nfa.start, id); const end = new State(); const endMarker = new GroupEndMarkerState(end, startMarker); nfa.end.transitions.push(endMarker); @@ -236,7 +236,7 @@ class AutomataFactor { ); case NodeType.Group: { const node = expression as GroupNode; - return group(this.automataForNode(node.expression)); + return group(this.automataForNode(node.expression), node.id); } case NodeType.Assertion: return Automata.fromEpsilon(); diff --git a/assembly/parser/node.ts b/assembly/parser/node.ts index a6af920..ec8524b 100644 --- a/assembly/parser/node.ts +++ b/assembly/parser/node.ts @@ -206,9 +206,14 @@ export class AlternationNode extends Node { } } +let _id = 0; + export class GroupNode extends Node { - constructor(public expression: Node) { + constructor(public expression: Node, public id: i32 = -1) { super(NodeType.Group); + if (id == -1) { + this.id = _id++; + } } children(): Node[] { @@ -216,7 +221,7 @@ export class GroupNode extends Node { } clone(): Node { - return new 
GroupNode(this.expression.clone()); + return new GroupNode(this.expression.clone(), this.id); } replace(node: Node, replacement: Node): void { diff --git a/assembly/parser/walker.ts b/assembly/parser/walker.ts index 42a2242..88ae4ed 100644 --- a/assembly/parser/walker.ts +++ b/assembly/parser/walker.ts @@ -75,8 +75,11 @@ export function expandRepetitions(visitor: NodeVisitor): void { // create multiple clones const clones = new Array(from); // a{4} => aaaa - for (let i = 0; i < from; i++) { - clones[i] = expression.clone(); + if (from > 0) { + clones[0] = expression; + for (let i = 1; i < from; i++) { + clones[i] = expression.clone(); + } } if (rangeRepNode.to == -1) { diff --git a/assembly/regexp.ts b/assembly/regexp.ts index 6ba2f61..e180bc8 100644 --- a/assembly/regexp.ts +++ b/assembly/regexp.ts @@ -83,6 +83,30 @@ export class Flags { } } +// capture groups are implemented as GroupStart / GroupEnd states that record (capture) +// the value of the current state of the string being matched. +// Repeated capture groups, via range repetitions (e.g. {2,3}) share the same 'id'. The +// returned regex should only return the value of the final repetition. +function filterCaptures(groupMarkers: GroupStartMarkerState[]): string[] { + if (!groupMarkers.length) { + return []; + } + const values = [first(groupMarkers).capture]; + let currrentId = first(groupMarkers).id; + for (let i = 0; i < groupMarkers.length; i++) { + const gm = groupMarkers[i]; + if (gm.id != currrentId) { + currrentId = gm.id; + values.push(gm.capture); + } else { + if (gm.flagged) { + values[values.length - 1] = gm.capture; + } + } + } + return values; +} + export class RegExp { lastIndex: i32 = 0; private flags: Flags; @@ -143,15 +167,20 @@ export class RegExp { this.nfa.start, str.substr(matchIndex) ); + // we have found a match if (matchStr != null) { + // remove any non-flagged captures + groupMarkers.forEach((gm) => { + gm.capture = gm.flagged ? 
gm.capture : ""; + }); + const match = new Match( - [matchStr!].concat( - groupMarkers.map((m) => (m.flagged ? m.capture : "")) - ), + [matchStr!].concat(filterCaptures(groupMarkers)), matchIndex, str ); + // return this match (checking end of input condition) const matchEndIndex = match.index + match.matches[0].length; if (!this.endOfInput || (this.endOfInput && matchEndIndex == len)) { diff --git a/spec/test-generator.js b/spec/test-generator.js index aaee983..3bcd45a 100644 --- a/spec/test-generator.js +++ b/spec/test-generator.js @@ -16,19 +16,6 @@ const knownIssues = { ...range(487, 494), ...range(1077, 1082), ], - "issues with repeated capture groups": [ - 262, - 263, - ...range(63, 68), - 1391, - 1392, - ...range(52, 55), - 57, - 58, - 72, - 73, - 78, - ], "lazy quantifiers should still yield the longest overall regex match": [ ...range(141, 143), 1288, diff --git a/ts/index.ts b/ts/index.ts index 55517e6..56f58b6 100644 --- a/ts/index.ts +++ b/ts/index.ts @@ -5,7 +5,10 @@ globalAny.log = console.log; import { RegExp } from "../assembly/regexp"; -const regexObj = new RegExp("ba{0}b"); -const match = regexObj.exec("bb"); +const regexObj = new RegExp("^(a){1,3}"); +const match = regexObj.exec("abc"); +console.log(JSON.stringify(match, null, 2)); -console.log(match); +const regexObj2 = new RegExp("(a|b)c|a(b|c)"); +const match2 = regexObj2.exec("ab"); +console.log(JSON.stringify(match2, null, 2));