From 5216524729fddd6f1b2a6874c939f507eca2b58f Mon Sep 17 00:00:00 2001 From: Colin E Date: Wed, 24 Feb 2021 07:21:00 +0000 Subject: [PATCH 1/3] chore: character class <=> character set fixes #9 --- README.md | 2 +- assembly/__tests__/character-classes.spec.ts | 73 ++++++++++---------- assembly/__tests__/character-sets.spec.ts | 73 ++++++++++---------- assembly/nfa/matcher.ts | 36 +++++----- assembly/nfa/nfa.ts | 10 +-- assembly/parser/node.ts | 14 ++-- assembly/parser/parser.ts | 16 ++--- 7 files changed, 112 insertions(+), 112 deletions(-) diff --git a/README.md b/README.md index 2ee6143..477c678 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ The next phase of development will focussed on more extensive testing and perfor Based on the classfication within the [MDN cheatsheet](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Cheatsheet) -**Character classes** +**Character sets** - [x] . - [x] \d diff --git a/assembly/__tests__/character-classes.spec.ts b/assembly/__tests__/character-classes.spec.ts index 553ac7f..aeb7cf1 100644 --- a/assembly/__tests__/character-classes.spec.ts +++ b/assembly/__tests__/character-classes.spec.ts @@ -1,55 +1,54 @@ -import { RegExp } from ".."; -import { expectMatch, expectNotMatch, exec } from "./utils"; +import { expectMatch, expectNotMatch } from "./utils"; -it("dot", () => { - expectMatch(".", [" ", "B", "|", "9"]); - expectNotMatch(".", ["", "\n"]); +it("throws an error if no closing bracket is found", () => { + // expect(() => new RegExp("[abce")).toThrow(); }); -it("digit", () => { - expectMatch("\\d", ["0", "9"]); - expectNotMatch("\\d", ["", "b"]); +it("matches discrete characters", () => { + expectMatch("[abce]", ["a", "b", "c", "e"]); + expectNotMatch("[abce]", ["", "f", "h"]); }); -it("non-digit", () => { - expectNotMatch("\\D", ["0", "9", ""]); - expectMatch("\\D", ["b", "|"]); +it("matches character ranges", () => { + expectMatch("[a-c]", ["a", "b", "c"]); + expectNotMatch("[a-c]", ["d", "e", ""]); + expectMatch("[K-M]", ["K", "L", "M"]); + expectNotMatch("[K-M]", ["9", "J"]); + expectMatch("[0-9]", ["0", "9"]); + expectNotMatch("[0-9]", ["a", "A"]); }); -it("word", () => { - expectMatch("\\w", ["A", "a", "Z", "z", "0", "9", "_"]); - expectNotMatch("\\w", ["", "$"]); +it("matches multiple ranges", () => { + expectMatch("[a-ce-f]", ["a", "b", "c", "e", "f"]); + expectNotMatch("[a-ce-f]", ["d"]); }); -it("not word", () => { - expectNotMatch("\\W", ["A", "a", "Z", "z", "0", "9", "_", ""]); - expectMatch("\\W", ["&", "$"]); +it("supports closing brackets", () => { + expectMatch("[]a]", ["]", "a"]); }); -it("whitespace", () => { - expectMatch("\\s", ["\f", "\n", "\r", "\t", "\v"]); - expectNotMatch("\\s", ["", "a", "0"]); +it("supports negated sets", () => { + expectNotMatch("[^a-c]", ["a", "b", "c"]); + expectMatch("[^a-c]", ["d", "e"]); + expectNotMatch("[^a-ce-f]", ["a", "b", "c", "e", "f"]); + expectMatch("[^a-ce-f]", ["d"]); }); -it("not whitespace", () => { - expectNotMatch("\\S", ["", "\f", "\n", "\r", "\t", "\v"]); - expectMatch("\\S", ["a", "0"]); +it("treats - as a literal", () => { + expectMatch("[-abc]", ["-", "a", "b", "c"]); + expectMatch("[abc-]", ["-", "a", "b", "c"]); }); -it("tab, cr, lf, vt, ff", () => { - expectMatch("\\t", ["\t"]); - expectMatch("\\r", ["\r"]); - expectMatch("\\n", ["\n"]); - expectMatch("\\v", ["\v"]); - expectMatch("\\f", ["\f"]); - expectNotMatch("\\t", ["a", " ", ""]); +it("treats - as a literal in negated sets", () => { + expectNotMatch("[^-abc]", ["-", "a", "b", "c"]); + expectMatch("[^-abc]", ["1", "A"]); }); -it("escaped dot", () => { - expectMatch("\\.", ["."]); - expectNotMatch("\\.", ["", "a"]); -}); - -it("unrecognised character classes are treated as characters", () => { - expectMatch("\\g\\m", ["gm"]); +it("supports case insensitive matching", () => { + // simple ranges + expectMatch("[a-c]", ["A", "C", "a", "c"], "i"); + expectNotMatch("[a-c]", ["D", "d"], "i"); + // complex + expectMatch("[W-c]", ["W", "w", "C", "c"], "i"); + expectNotMatch("[W-c]", ["V", "v", "D", "d"], "i"); }); diff --git a/assembly/__tests__/character-sets.spec.ts b/assembly/__tests__/character-sets.spec.ts index aeb7cf1..553ac7f 100644 --- a/assembly/__tests__/character-sets.spec.ts +++ b/assembly/__tests__/character-sets.spec.ts @@ -1,54 +1,55 @@ -import { expectMatch, expectNotMatch } from "./utils"; +import { RegExp } from ".."; +import { expectMatch, expectNotMatch, exec } from "./utils"; -it("throws an error if no closing bracket is found", () => { - // expect(() => new RegExp("[abce")).toThrow(); +it("dot", () => { + expectMatch(".", [" ", "B", "|", "9"]); + expectNotMatch(".", ["", "\n"]); }); -it("matches discrete characters", () => { - expectMatch("[abce]", ["a", "b", "c", "e"]); - expectNotMatch("[abce]", ["", "f", "h"]); +it("digit", () => { + expectMatch("\\d", ["0", "9"]); + expectNotMatch("\\d", ["", "b"]); }); -it("matches character ranges", () => { - expectMatch("[a-c]", ["a", "b", "c"]); - expectNotMatch("[a-c]", ["d", "e", ""]); - expectMatch("[K-M]", ["K", "L", "M"]); - expectNotMatch("[K-M]", ["9", "J"]); - expectMatch("[0-9]", ["0", "9"]); - expectNotMatch("[0-9]", ["a", "A"]); +it("non-digit", () => { + expectNotMatch("\\D", ["0", "9", ""]); + expectMatch("\\D", ["b", "|"]); }); -it("matches multiple ranges", () => { - expectMatch("[a-ce-f]", ["a", "b", "c", "e", "f"]); - expectNotMatch("[a-ce-f]", ["d"]); +it("word", () => { + expectMatch("\\w", ["A", "a", "Z", "z", "0", "9", "_"]); + expectNotMatch("\\w", ["", "$"]); }); -it("supports closing brackets", () => { - expectMatch("[]a]", ["]", "a"]); +it("not word", () => { + expectNotMatch("\\W", ["A", "a", "Z", "z", "0", "9", "_", ""]); + expectMatch("\\W", ["&", "$"]); }); -it("supports negated sets", () => { - expectNotMatch("[^a-c]", ["a", "b", "c"]); - expectMatch("[^a-c]", ["d", "e"]); - expectNotMatch("[^a-ce-f]", ["a", "b", "c", "e", "f"]); - expectMatch("[^a-ce-f]", ["d"]); +it("whitespace", () => { + expectMatch("\\s", ["\f", "\n", "\r", "\t", "\v"]); + expectNotMatch("\\s", ["", "a", "0"]); }); -it("treats - as a literal", () => { - expectMatch("[-abc]", ["-", "a", "b", "c"]); - expectMatch("[abc-]", ["-", "a", "b", "c"]); +it("not whitespace", () => { + expectNotMatch("\\S", ["", "\f", "\n", "\r", "\t", "\v"]); + expectMatch("\\S", ["a", "0"]); }); -it("treats - as a literal in negated sets", () => { - expectNotMatch("[^-abc]", ["-", "a", "b", "c"]); - expectMatch("[^-abc]", ["1", "A"]); +it("tab, cr, lf, vt, ff", () => { + expectMatch("\\t", ["\t"]); + expectMatch("\\r", ["\r"]); + expectMatch("\\n", ["\n"]); + expectMatch("\\v", ["\v"]); + expectMatch("\\f", ["\f"]); + expectNotMatch("\\t", ["a", " ", ""]); }); -it("supports case insensitive matching", () => { - // simple ranges - expectMatch("[a-c]", ["A", "C", "a", "c"], "i"); - expectNotMatch("[a-c]", ["D", "d"], "i"); - // complex - expectMatch("[W-c]", ["W", "w", "C", "c"], "i"); - expectNotMatch("[W-c]", ["V", "v", "D", "d"], "i"); +it("escaped dot", () => { + expectMatch("\\.", ["."]); + expectNotMatch("\\.", ["", "a"]); +}); + +it("unrecognised character classes are treated as characters", () => { + expectMatch("\\g\\m", ["gm"]); }); diff --git a/assembly/nfa/matcher.ts b/assembly/nfa/matcher.ts index 5758c8e..64b080b 100644 --- a/assembly/nfa/matcher.ts +++ b/assembly/nfa/matcher.ts @@ -2,8 +2,8 @@ import { isDigit, isAlpha, isWhitespace, Char } from "../char"; import { CharacterNode, - CharacterSetNode, CharacterClassNode, + CharacterSetNode, CharacterRangeNode, NodeType, } from "../parser/node"; @@ -13,8 +13,8 @@ import { Range } from "../util"; const enum MatcherType { Character, CharacterRange, - CharacterClass, CharacterSet, + CharacterClass, } let _flags: Flags; @@ -27,10 +27,10 @@ export class Matcher { } static fromCharacterClassNode( - node: CharacterClassNode, + node: CharacterSetNode, flags: Flags - ): CharacterClassMatcher { - return new CharacterClassMatcher(node.charClass, flags.dotAll); + ): CharacterSetMatcher { + return new CharacterSetMatcher(node.charClass, flags.dotAll); } static fromCharacterRangeNode( @@ -44,9 +44,9 @@ export class Matcher { } static fromCharacterSetNode( - node: CharacterSetNode, + node: CharacterClassNode, flags: Flags - ): CharacterSetMatcher { + ): CharacterClassMatcher { _flags = flags; const matchers = node.expressions.map((exp) => { switch (exp.type) { @@ -57,16 +57,16 @@ export class Matcher { ); case NodeType.Character: return Matcher.fromCharacterNode(exp as CharacterNode, _flags); - case NodeType.CharacterClass: + case NodeType.CharacterSet: return Matcher.fromCharacterClassNode( - exp as CharacterClassNode, + exp as CharacterSetNode, _flags ); default: throw new Error("unsupported node type within character set"); } }); - return new CharacterSetMatcher(matchers, node.negated); + return new CharacterClassMatcher(matchers, node.negated); } static fromCharacterNode( @@ -126,9 +126,9 @@ export class CharacterRangeMatcher extends Matcher { } } -export class CharacterClassMatcher extends Matcher { +export class CharacterSetMatcher extends Matcher { constructor(public charClass: Char, private dotAll: bool) { - super(MatcherType.CharacterClass); + super(MatcherType.CharacterSet); } matches(code: u32): bool { @@ -171,9 +171,9 @@ export class CharacterClassMatcher extends Matcher { } } -export class CharacterSetMatcher extends Matcher { +export class CharacterClassMatcher extends Matcher { constructor(public matchers: Matcher[], public negated: bool) { - super(MatcherType.CharacterSet); + super(MatcherType.CharacterClass); } matches(code: u32): bool { @@ -189,13 +189,13 @@ export class CharacterSetMatcher extends Matcher { match = (matcher as CharacterRangeMatcher).matches(code); break; - case MatcherType.CharacterClass: - match = (matcher as CharacterClassMatcher).matches(code); - break; - case MatcherType.CharacterSet: match = (matcher as CharacterSetMatcher).matches(code); break; + + case MatcherType.CharacterClass: + match = (matcher as CharacterClassMatcher).matches(code); + break; } if (match) break; } diff --git a/assembly/nfa/nfa.ts b/assembly/nfa/nfa.ts index b0ff21e..9eba149 100644 --- a/assembly/nfa/nfa.ts +++ b/assembly/nfa/nfa.ts @@ -5,8 +5,8 @@ import { ConcatenationNode, RepetitionNode, AlternationNode, - CharacterSetNode, CharacterClassNode, + CharacterSetNode, GroupNode, NodeType, } from "../parser/node"; @@ -222,17 +222,17 @@ class AutomataFactor { this.automataForNode(node.right) ); } - case NodeType.CharacterSet: + case NodeType.CharacterClass: return Automata.fromMatcher( Matcher.fromCharacterSetNode( - expression as CharacterSetNode, + expression as CharacterClassNode, this.flags ) ); - case NodeType.CharacterClass: + case NodeType.CharacterSet: return Automata.fromMatcher( Matcher.fromCharacterClassNode( - expression as CharacterClassNode, + expression as CharacterSetNode, this.flags ) ); diff --git a/assembly/parser/node.ts b/assembly/parser/node.ts index ec8524b..56a40cf 100644 --- a/assembly/parser/node.ts +++ b/assembly/parser/node.ts @@ -7,8 +7,8 @@ export const enum NodeType { Alternation, Concatenation, Character, - CharacterSet, CharacterClass, + CharacterSet, CharacterRange, Repetition, RangeRepetition, @@ -72,13 +72,13 @@ export class ConcatenationNode extends Node { } } -export class CharacterSetNode extends Node { +export class CharacterClassNode extends Node { constructor(public expressions: Node[], public negated: bool) { - super(NodeType.CharacterSet); + super(NodeType.CharacterClass); } clone(): Node { - return new CharacterSetNode( + return new CharacterClassNode( this.expressions.slice(0).map((s) => s.clone()), this.negated ); @@ -126,13 +126,13 @@ export class AssertionNode extends Node { } } -export class CharacterClassNode extends Node { +export class CharacterSetNode extends Node { constructor(public charClass: Char) { - super(NodeType.CharacterClass); + super(NodeType.CharacterSet); } clone(): Node { - return new CharacterClassNode(this.charClass); + return new CharacterSetNode(this.charClass); } } diff --git a/assembly/parser/parser.ts b/assembly/parser/parser.ts index 98dcfdc..6bee00a 100644 --- a/assembly/parser/parser.ts +++ b/assembly/parser/parser.ts @@ -5,13 +5,13 @@ import { RangeRepetitionNode, GroupNode, AssertionNode, - CharacterClassNode, + CharacterSetNode, CharacterNode, Node, AlternationNode, ConcatenationNode, RepetitionNode, - CharacterSetNode, + CharacterClassNode, CharacterRangeNode, } from "./node"; @@ -138,7 +138,7 @@ export class Parser { } else if (token == Char.u) { return this.parseCharacterCode(Char.u); } else if (isCharacterClass(token)) { - return new CharacterClassNode(this.eatToken()); + return new CharacterSetNode(this.eatToken()); } else { return new CharacterNode(this.eatToken()); } @@ -150,7 +150,7 @@ export class Parser { if (token == Char.Dot) { this.eatToken(Char.Dot); - return new CharacterClassNode(Char.Dot); + return new CharacterSetNode(Char.Dot); } return new CharacterNode(this.eatToken()); @@ -243,7 +243,7 @@ export class Parser { nodes.push(new RepetitionNode(expression, quantifier, this.isGreedy())); // @ts-ignore } else if (token == Char.LeftSquareBracket) { - nodes.push(this.parseCharacterSet()); + nodes.push(this.parseCharacterClass()); } else { nodes.push(this.parseCharacter()); } @@ -259,7 +259,7 @@ export class Parser { return new CharacterRangeNode(from, to); } - private parseCharacterSet(): CharacterSetNode { + private parseCharacterClass(): CharacterClassNode { this.eatToken(Char.LeftSquareBracket); const negated = this.iterator.current == Char.Caret; @@ -288,7 +288,7 @@ export class Parser { nodes.push(new CharacterNode(this.eatToken())); } else { // otherwise this is a character class - nodes.push(new CharacterClassNode(this.eatToken())); + nodes.push(new CharacterSetNode(this.eatToken())); } } else { nodes.push(new CharacterNode(this.eatToken())); @@ -300,6 +300,6 @@ export class Parser { } } this.eatToken(Char.RightSquareBracket); - return new CharacterSetNode(nodes, negated); + return new CharacterClassNode(nodes, negated); } } From 8a039f46b57230da1e0c20061360dc4814176bb5 Mon Sep 17 00:00:00 2001 From: Colin E Date: Wed, 24 Feb 2021 07:28:52 +0000 Subject: [PATCH 2/3] test: further triage --- assembly/__spec_tests__/generated.spec.ts | 88 +++++++++++------------ spec/test-generator.js | 19 ++++- 2 files changed, 61 insertions(+), 46 deletions(-) diff --git a/assembly/__spec_tests__/generated.spec.ts b/assembly/__spec_tests__/generated.spec.ts index f535a5c..7799883 100644 --- a/assembly/__spec_tests__/generated.spec.ts +++ b/assembly/__spec_tests__/generated.spec.ts @@ -1003,9 +1003,9 @@ it("line: 185 - matches ^\\*\\.[a-z]([a-z\\-\\d]*[a-z\\d]+)?(\\.[a-z]([a-z\\-\\d ["*.c-a.0-c"] ); }); -xit("line: 186 - non capturing groups not supported", () => {}); -xit("line: 187 - non capturing groups not supported", () => {}); -xit("line: 188 - non capturing groups not supported", () => {}); +xit("line: 186 - lookaheads not supported", () => {}); +xit("line: 187 - lookaheads not supported", () => {}); +xit("line: 188 - lookaheads not supported", () => {}); it("line: 189 - matches ^[\\da-f](\\.[\\da-f])*$ against 'a.b.c.d'", () => { const match = exec("^[\\da-f](\\.[\\da-f])*$", "a.b.c.d", "mis"); expect(match.matches[0]).toBe("a.b.c.d".substring(0, 7)); @@ -1042,12 +1042,12 @@ it("line: 196 - matches ^$ against ''", () => { const match = exec("^$", "", "ms"); expect(match.matches[0]).toBe("".substring(0, 0)); }); -xit("line: 197 - non capturing groups not supported", () => {}); -xit("line: 198 - non capturing groups not supported", () => {}); -xit("line: 199 - non capturing groups not supported", () => {}); -xit("line: 200 - non capturing groups not supported", () => {}); -xit("line: 201 - non capturing groups not supported", () => {}); -xit("line: 202 - non capturing groups not supported", () => {}); +xit("line: 197 - JS regex does not support comments", () => {}); +xit("line: 198 - JS regex does not support comments", () => {}); +xit("line: 199 - JS regex does not support comments", () => {}); +xit("line: 200 - JS regex does not support comments", () => {}); +xit("line: 201 - JS regex does not support comments", () => {}); +xit("line: 202 - JS regex does not support comments", () => {}); xit("line: 203 - test appears to be incorrect?", () => {}); xit("line: 204 - test appears to be incorrect?", () => {}); it("line: 205 - matches ^ a\\ b[c ]d $ against 'abcd'", () => { @@ -1243,10 +1243,10 @@ it("line: 250 - matches ^[aeiou\\d]{4,5}? against '123456'", () => { xit("line: 251 - back references are not supported", () => {}); xit("line: 252 - back references are not supported", () => {}); xit("line: 253 - back references are not supported", () => {}); -xit("line: 254 - non capturing groups not supported", () => {}); -xit("line: 255 - non capturing groups not supported", () => {}); -xit("line: 256 - non capturing groups not supported", () => {}); -xit("line: 257 - non capturing groups not supported", () => {}); +xit("line: 254 - JS regex does not support comments", () => {}); +xit("line: 255 - JS regex does not support comments", () => {}); +xit("line: 256 - JS regex does not support comments", () => {}); +xit("line: 257 - JS regex does not support comments", () => {}); xit("line: 258 - back references are not supported", () => {}); xit("line: 259 - back references are not supported", () => {}); xit("line: 260 - back references are not supported", () => {}); @@ -1303,23 +1303,23 @@ it("line: 266 - matches ^12.34 against '12\r34'", () => { const match = exec("^12.34", "12\r34", "ms"); expect(match.matches[0]).toBe("12\r34".substring(0, 5)); }); -xit("line: 267 - non capturing groups not supported", () => {}); -xit("line: 268 - non capturing groups not supported", () => {}); +xit("line: 267 - lookaheads not supported", () => {}); +xit("line: 268 - lookaheads not supported", () => {}); xit("line: 269 - non capturing groups not supported", () => {}); xit("line: 270 - non capturing groups not supported", () => {}); xit("line: 271 - non capturing groups not supported", () => {}); xit("line: 272 - non capturing groups not supported", () => {}); -xit("line: 273 - non capturing groups not supported", () => {}); -xit("line: 274 - non capturing groups not supported", () => {}); +xit("line: 273 - lookaheads not supported", () => {}); +xit("line: 274 - lookaheads not supported", () => {}); xit("line: 281 - test regex contains syntax not supported in JS", () => {}); xit("line: 282 - back references are not supported", () => {}); xit("line: 283 - back references are not supported", () => {}); xit("line: 284 - back references are not supported", () => {}); xit("line: 285 - back references are not supported", () => {}); -xit("line: 286 - non capturing groups not supported", () => {}); -xit("line: 287 - non capturing groups not supported", () => {}); -xit("line: 288 - non capturing groups not supported", () => {}); -xit("line: 289 - non capturing groups not supported", () => {}); +xit("line: 286 - lookaheads not supported", () => {}); +xit("line: 287 - lookaheads not supported", () => {}); +xit("line: 288 - lookaheads not supported", () => {}); +xit("line: 289 - lookaheads not supported", () => {}); xit("line: 290 - the test behaviour differs between PCRE and JS", () => {}); it("line: 291 - matches ^[ab]{1,3}?(ab*|b) against 'aabbbbb'", () => { const match = exec("^[ab]{1,3}?(ab*|b)", "aabbbbb", "ms"); @@ -1676,7 +1676,7 @@ xit("line: 1217 - back references are not supported", () => {}); xit("line: 1218 - back references are not supported", () => {}); xit("line: 1219 - back references are not supported", () => {}); xit("line: 1220 - back references are not supported", () => {}); -xit("line: 1221 - non capturing groups not supported", () => {}); +xit("line: 1221 - back references are not supported", () => {}); it("line: 1223 - matches ab\\gdef against 'abgdef'", () => { const match = exec("ab\\gdef", "abgdef", "ms"); expect(match.matches[0]).toBe("abgdef".substring(0, 6)); @@ -1694,7 +1694,7 @@ xit("line: 1227 - back references are not supported", () => {}); xit("line: 1228 - back references are not supported", () => {}); xit("line: 1229 - back references are not supported", () => {}); xit("line: 1230 - back references are not supported", () => {}); -xit("line: 1231 - non capturing groups not supported", () => {}); +xit("line: 1231 - JS regex does not support mode modifiers", () => {}); xit("line: 1232 - word boundary class not supported yet!", () => {}); xit("line: 1233 - word boundary class not supported yet!", () => {}); xit("line: 1234 - word boundary class not supported yet!", () => {}); @@ -1839,12 +1839,12 @@ it("line: 1273 - matches (\\.\\d\\d[1-9]?)\\d+ against '1.235 '", () => { expect(match.matches[0]).toBe("1.235 ".substring(1, 5)); expect(match.matches[1]).toBe("1.235 ".substring(1, 4)); }); -xit("line: 1274 - non capturing groups not supported", () => {}); -xit("line: 1275 - non capturing groups not supported", () => {}); -xit("line: 1276 - non capturing groups not supported", () => {}); -xit("line: 1277 - non capturing groups not supported", () => {}); -xit("line: 1278 - non capturing groups not supported", () => {}); -xit("line: 1279 - non capturing groups not supported", () => {}); +xit("line: 1274 - lookaheads not supported", () => {}); +xit("line: 1275 - lookaheads not supported", () => {}); +xit("line: 1276 - lookaheads not supported", () => {}); +xit("line: 1277 - lookaheads not supported", () => {}); +xit("line: 1278 - the test behaviour differs between PCRE and JS", () => {}); +xit("line: 1279 - JS regex does not support comments", () => {}); xit("line: 1280 - word boundary class not supported yet!", () => {}); it("line: 1281 - matches foo(.*)bar against 'The food is under the bar in the barn.'", () => { const match = exec( @@ -1910,9 +1910,9 @@ it("line: 1290 - matches (.*\\D)(\\d+)$ against 'I have 2 numbers: 53147'", () = expect(match.matches[1]).toBe("I have 2 numbers: 53147".substring(0, 18)); expect(match.matches[2]).toBe("I have 2 numbers: 53147".substring(18, 23)); }); -xit("line: 1291 - non capturing groups not supported", () => {}); -xit("line: 1292 - non capturing groups not supported", () => {}); -xit("line: 1293 - non capturing groups not supported", () => {}); +xit("line: 1291 - lookaheads not supported", () => {}); +xit("line: 1292 - lookaheads not supported", () => {}); +xit("line: 1293 - lookaheads not supported", () => {}); it("line: 1294 - matches ^[W-]46] against 'W46]789 '", () => { const match = exec("^[W-]46]", "W46]789 ", "ms"); expect(match.matches[0]).toBe("W46]789 ".substring(0, 4)); @@ -2211,12 +2211,12 @@ it("line: 1372 - matches (.*X|^B) against 'abcde\nBar '", () => { expect(match.matches[0]).toBe("abcde\nBar ".substring(6, 7)); expect(match.matches[1]).toBe("abcde\nBar ".substring(6, 7)); }); -xit("line: 1373 - non capturing groups not supported", () => {}); -xit("line: 1374 - non capturing groups not supported", () => {}); -xit("line: 1375 - non capturing groups not supported", () => {}); -xit("line: 1376 - non capturing groups not supported", () => {}); -xit("line: 1377 - non capturing groups not supported", () => {}); -xit("line: 1378 - non capturing groups not supported", () => {}); +xit("line: 1373 - JS regex does not support mode modifiers", () => {}); +xit("line: 1374 - JS regex does not support mode modifiers", () => {}); +xit("line: 1375 - JS regex does not support mode modifiers", () => {}); +xit("line: 1376 - JS regex does not support mode modifiers", () => {}); +xit("line: 1377 - JS regex does not support mode modifiers", () => {}); +xit("line: 1378 - JS regex does not support mode modifiers", () => {}); it("line: 1379 - matches ^.*B against 'abc\nB'", () => { const match = exec("^.*B", "abc\nB", "ms"); expect(match.matches[0]).toBe("abc\nB".substring(0, 5)); @@ -2225,12 +2225,12 @@ it("line: 1380 - matches ^.*B against 'abc\nB'", () => { const match = exec("^.*B", "abc\nB", "m"); expect(match.matches[0]).toBe("abc\nB".substring(4, 5)); }); -xit("line: 1381 - non capturing groups not supported", () => {}); -xit("line: 1382 - non capturing groups not supported", () => {}); -xit("line: 1383 - non capturing groups not supported", () => {}); -xit("line: 1384 - non capturing groups not supported", () => {}); -xit("line: 1385 - non capturing groups not supported", () => {}); -xit("line: 1386 - non capturing groups not supported", () => {}); +xit("line: 1381 - JS regex does not support mode modifiers", () => {}); +xit("line: 1382 - JS regex does not support mode modifiers", () => {}); +xit("line: 1383 - JS regex does not support mode modifiers", () => {}); +xit("line: 1384 - JS regex does not support mode modifiers", () => {}); +xit("line: 1385 - JS regex does not support mode modifiers", () => {}); +xit("line: 1386 - JS regex does not support mode modifiers", () => {}); it("line: 1387 - matches ^[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9] against '123456654321'", () => { const match = exec( "^[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]", diff --git a/spec/test-generator.js b/spec/test-generator.js index 2ebd861..fca176a 100644 --- a/spec/test-generator.js +++ b/spec/test-generator.js @@ -47,7 +47,7 @@ const knownIssues = { 1164, ], "test regex contains syntax not supported in JS": [82, 1158, 281], - "the test behaviour differs between PCRE and JS": [290], + "the test behaviour differs between PCRE and JS": [290, 1278], }; const hasKnownIssue = (index) => { @@ -108,11 +108,26 @@ lines.forEach((line, index) => { return; } - if (["(?"].some((f) => regex.includes(f))) { + if (["(?:"].some((f) => regex.includes(f))) { testCase += `xit("line: ${index} - non capturing groups not supported", () => {});`; return; } + if (["(?!", "(?="].some((f) => regex.includes(f))) { + testCase += `xit("line: ${index} - lookaheads not supported", () => {});`; + return; + } + + if (["(?m", "(?s", "(?ms"].some((f) => regex.includes(f))) { + testCase += `xit("line: ${index} - JS regex does not support mode modifiers", () => {});`; + return; + } + + if (["(?#"].some((f) => regex.includes(f))) { + testCase += `xit("line: ${index} - JS regex does not support comments", () => {});`; + return; + } + if (regex.match(/\\\\\d{1}/)) { testCase += `xit("line: ${index} - back references are not supported", () => {});`; return; From 6fec3efb1a1975afa4aab2a08818e324ca28c52a Mon Sep 17 00:00:00 2001 From: Colin E Date: Wed, 24 Feb 2021 09:46:25 +0000 Subject: [PATCH 3/3] feat: implemented non-capturing groups --- README.md | 2 +- assembly/__spec_tests__/generated.spec.ts | 53 ++++++++++++++++++----- assembly/__tests__/capture-group.spec.ts | 6 +++ assembly/char.ts | 1 + assembly/nfa/nfa.ts | 22 ++++++---- assembly/parser/node.ts | 8 +++- assembly/parser/parser.ts | 15 ++++++- assembly/regexp.ts | 11 +++-- spec/test-generator.js | 6 +-- ts/index.ts | 13 ++---- 10 files changed, 97 insertions(+), 40 deletions(-) diff --git a/README.md b/README.md index 477c678..c9179e4 100644 --- a/README.md +++ b/README.md @@ -77,7 +77,7 @@ Based on the classfication within the [MDN cheatsheet](https://developer.mozilla - [x] (x) capturing group - [ ] \n back reference - [ ] (?x) named capturing group -- [ ] (?:x) Non-capturing group +- [x] (?:x) Non-capturing group **Quantifiers** diff --git a/assembly/__spec_tests__/generated.spec.ts b/assembly/__spec_tests__/generated.spec.ts index 7799883..3946808 100644 --- a/assembly/__spec_tests__/generated.spec.ts +++ b/assembly/__spec_tests__/generated.spec.ts @@ -1076,7 +1076,22 @@ it("line: 207 - matches ^(a(b(c)))(d(e(f)))(h(i(j)))(k(l(m)))$ against 'abcdefhi expect(match.matches[11]).toBe("abcdefhijklm".substring(10, 12)); expect(match.matches[12]).toBe("abcdefhijklm".substring(11, 12)); }); -xit("line: 208 - non capturing groups not supported", () => {}); +it("line: 208 - matches ^(?:a(b(c)))(?:d(e(f)))(?:h(i(j)))(?:k(l(m)))$ against 'abcdefhijklm'", () => { + const match = exec( + "^(?:a(b(c)))(?:d(e(f)))(?:h(i(j)))(?:k(l(m)))$", + "abcdefhijklm", + "ms" + ); + expect(match.matches[0]).toBe("abcdefhijklm".substring(0, 12)); + expect(match.matches[1]).toBe("abcdefhijklm".substring(1, 3)); + expect(match.matches[2]).toBe("abcdefhijklm".substring(2, 3)); + expect(match.matches[3]).toBe("abcdefhijklm".substring(4, 6)); + expect(match.matches[4]).toBe("abcdefhijklm".substring(5, 6)); + expect(match.matches[5]).toBe("abcdefhijklm".substring(7, 9)); + expect(match.matches[6]).toBe("abcdefhijklm".substring(8, 9)); + expect(match.matches[7]).toBe("abcdefhijklm".substring(10, 12)); + expect(match.matches[8]).toBe("abcdefhijklm".substring(11, 12)); +}); xit("line: 209 - back references are not supported", () => {}); it("line: 210 - matches ^[.^$|()*+?{,}]+ against '.^$(*+)|{?,?}'", () => { const match = exec("^[.^$|()*+?{,}]+", ".^$(*+)|{?,?}", "ms"); @@ -1305,10 +1320,10 @@ it("line: 266 - matches ^12.34 against '12\r34'", () => { }); xit("line: 267 - lookaheads not supported", () => {}); xit("line: 268 - lookaheads not supported", () => {}); -xit("line: 269 - non capturing groups not supported", () => {}); -xit("line: 270 - non capturing groups not supported", () => {}); -xit("line: 271 - non capturing groups not supported", () => {}); -xit("line: 272 - non capturing groups not supported", () => {}); +xit("line: 269 - lookaheads not supported", () => {}); +xit("line: 270 - lookaheads not supported", () => {}); +xit("line: 271 - lookaheads not supported", () => {}); +xit("line: 272 - lookaheads not supported", () => {}); xit("line: 273 - lookaheads not supported", () => {}); xit("line: 274 - lookaheads not supported", () => {}); xit("line: 281 - test regex contains syntax not supported in JS", () => {}); @@ -1564,8 +1579,14 @@ it("line: 1162 - matches \\Aabc\\Z against 'qqq\nabc\nzzz'", () => { }); xit("line: 1163 - JS does not support the A Z syntax for start and end of string", () => {}); xit("line: 1164 - JS does not support the A Z syntax for start and end of string", () => {}); -xit("line: 1165 - non capturing groups not supported", () => {}); -xit("line: 1166 - non capturing groups not supported", () => {}); +it("line: 1165 - matches (?:b)|(?::+) against 'b::c'", () => { + const match = exec("(?:b)|(?::+)", "b::c", "ms"); + expect(match.matches[0]).toBe("b::c".substring(0, 1)); +}); +it("line: 1166 - matches (?:b)|(?::+) against 'c::b'", () => { + const match = exec("(?:b)|(?::+)", "c::b", "ms"); + expect(match.matches[0]).toBe("c::b".substring(1, 3)); +}); it("line: 1167 - matches [-az]+ against 'az-'", () => { const match = exec("[-az]+", "az-", "ms"); expect(match.matches[0]).toBe("az-".substring(0, 3)); @@ -1954,9 +1975,21 @@ it("line: 1311 - matches \\d\\d\\/\\d\\d\\/\\d\\d\\d\\d against '01/01/2000'", ( const match = exec("\\d\\d\\/\\d\\d\\/\\d\\d\\d\\d", "01/01/2000", "ms"); expect(match.matches[0]).toBe("01/01/2000".substring(0, 10)); }); -xit("line: 1312 - non capturing groups not supported", () => {}); -xit("line: 1313 - non capturing groups not supported", () => {}); -xit("line: 1314 - non capturing groups not supported", () => {}); +it("line: 1312 - matches word (?:[a-zA-Z0-9]+ ){0,10}otherword against 'word cat dog elephant mussel cow horse canary baboon snake shark otherword'", () => { + const match = exec( + "word (?:[a-zA-Z0-9]+ ){0,10}otherword", + "word cat dog elephant mussel cow horse canary baboon snake shark otherword", + "ms" + ); + expect(match.matches[0]).toBe( + "word cat dog elephant mussel cow horse canary baboon snake shark otherword".substring( + 0, + 74 + ) + ); +}); +xit("line: 1313 - peformance issue", () => {}); +xit("line: 1314 - peformance issue", () => {}); it("line: 1315 - matches ^(a){0,0} against 'bcd'", () => { const match = exec("^(a){0,0}", "bcd", "ms"); expect(match.matches[0]).toBe("bcd".substring(0, 0)); diff --git a/assembly/__tests__/capture-group.spec.ts b/assembly/__tests__/capture-group.spec.ts index ad11ccf..658d501 100644 --- a/assembly/__tests__/capture-group.spec.ts +++ b/assembly/__tests__/capture-group.spec.ts @@ -45,3 +45,9 @@ it("range repitition capture groups should return the last match", () => { expect(match.matches[0]).toBe("ac"); expect(match.matches[1]).toBe("c"); }); + +it("non-capturing groups should not capture", () => { + const match = exec("(?:foo)bar(baz)", "foobarbaz"); + expect(match.matches[0]).toBe("foobarbaz"); + expect(match.matches[1]).toBe("baz"); +}); diff --git a/assembly/char.ts b/assembly/char.ts index 4a78d49..6028514 100644 --- a/assembly/char.ts +++ b/assembly/char.ts @@ -16,6 +16,7 @@ export const enum Char { Dot = 0x2e, // "." Zero = 0x30, Nine = 0x39, + Colon = 0x3a, Question = 0x3f, // "?" A = 0x41, D = 0x44, diff --git a/assembly/nfa/nfa.ts b/assembly/nfa/nfa.ts index 9eba149..9953a81 100644 --- a/assembly/nfa/nfa.ts +++ b/assembly/nfa/nfa.ts @@ -42,7 +42,7 @@ export class GroupStartMarkerState extends State { // captures from the path through the NFA that reaches the end are flagged flagged: bool = false; - constructor(next: State, public groupId: i32) { + constructor(next: State, public capturing: bool, public groupId: i32) { super(); this.transitions.push(next); } @@ -60,10 +60,12 @@ export class GroupEndMarkerState extends State { } matches(input: string, position: u32): MatchResult { - this.startMarker.capture = input.substring( - this.startMarker.location, - position - ); + if (this.startMarker.capturing) { + this.startMarker.capture = input.substring( + this.startMarker.location, + position + ); + } return MatchResult.Ignore; } } @@ -164,10 +166,10 @@ function oneOrMore(nfa: Automata, greedy: bool): Automata { return new Automata(start, end); } -function group(nfa: Automata, id: i32): Automata { +function group(nfa: Automata, capturing: bool, id: i32): Automata { // groups are implemented by wrapping the automata with // a pair of markers that record matches - const startMarker = new GroupStartMarkerState(nfa.start, id); + const startMarker = new GroupStartMarkerState(nfa.start, capturing, id); const end = new State(); const endMarker = new GroupEndMarkerState(end, startMarker); nfa.end.transitions.push(endMarker); @@ -238,7 +240,11 @@ class AutomataFactor { ); case NodeType.Group: { const node = expression as GroupNode; - return group(this.automataForNode(node.expression), node.id); + return group( + this.automataForNode(node.expression), + node.capturing, + node.id + ); } case NodeType.Assertion: return Automata.fromEpsilon(); diff --git a/assembly/parser/node.ts b/assembly/parser/node.ts index 56a40cf..393b519 100644 --- a/assembly/parser/node.ts +++ b/assembly/parser/node.ts @@ -209,7 +209,11 @@ export class AlternationNode extends Node { let _id = 0; export class GroupNode extends Node { - constructor(public expression: Node, public id: i32 = -1) { + constructor( + public expression: Node, + public capturing: bool, + public id: i32 = -1 + ) { super(NodeType.Group); if (id == -1) { this.id = _id++; @@ -221,7 +225,7 @@ export class GroupNode extends Node { } clone(): Node { - return new GroupNode(this.expression.clone(), this.id); + return new GroupNode(this.expression.clone(), this.capturing, this.id); } replace(node: Node, replacement: Node): void { diff --git a/assembly/parser/parser.ts b/assembly/parser/parser.ts index 6bee00a..dad0242 100644 --- a/assembly/parser/parser.ts +++ b/assembly/parser/parser.ts @@ -204,6 +204,18 @@ export class Parser { return true; } + private isCapturing(): bool { + if ( + this.iterator.current == Char.Question && + this.iterator.lookahead(1) == Char.Colon + ) { + this.eatToken(Char.Question); + this.eatToken(Char.Colon); + return false; + } + return true; + } + // parses a sequence of chars private parseSequence(): Node { let nodes = new Array(); @@ -218,7 +230,8 @@ export class Parser { // @ts-ignore } else if (token == Char.LeftParenthesis) { this.eatToken(Char.LeftParenthesis); - nodes.push(new GroupNode(this.parseSequence())); + const capturing = this.isCapturing(); + nodes.push(new GroupNode(this.parseSequence(), capturing)); this.eatToken(Char.RightParenthesis); // @ts-ignore } else if (token == Char.LeftCurlyBrace) { diff --git a/assembly/regexp.ts b/assembly/regexp.ts index 8004736..8620036 100644 --- a/assembly/regexp.ts +++ b/assembly/regexp.ts @@ -89,9 +89,9 @@ export class Flags { // capture groups are implemented as GroupStart / GroupEnd states that record (capture) // the value of the current state of the string being matched. -// Repeated capture groups, via rage repetitions (e.g. {2,3}) share the same 'id'. The +// Repeated capture groups, via range repetitions (e.g. {2,3}) share the same 'id'. The // returned regex should only return the value of the final repetition. -function filterCaptures(groupMarkers: GroupStartMarkerState[]): string[] { +function lastCapturesForGroup(groupMarkers: GroupStartMarkerState[]): string[] { if (!groupMarkers.length) { return []; } @@ -139,7 +139,10 @@ export class RegExp { gm = new Array(); nfaWalker(this.nfa.start, (state) => { if (state instanceof GroupStartMarkerState) { - gm.push(state as GroupStartMarkerState); + const startMarker = state as GroupStartMarkerState; + if (startMarker.capturing) { + gm.push(state as GroupStartMarkerState); + } } }); this.groupMarkers = gm; @@ -181,7 +184,7 @@ export class RegExp { }); const match = new Match( - [matchStr!].concat(filterCaptures(groupMarkers)), + [matchStr!].concat(lastCapturesForGroup(groupMarkers)), matchIndex, str ); diff --git a/spec/test-generator.js b/spec/test-generator.js index fca176a..649ec8b 100644 --- a/spec/test-generator.js +++ b/spec/test-generator.js @@ -22,6 +22,7 @@ const knownIssues = { ...range(141, 143), 1288, ], + "peformance issue": [1313, 1314], /* -------- issues with the tests ------------ */ "test appears to be incorrect?": [203, 204], @@ -108,11 +109,6 @@ lines.forEach((line, index) => { return; } - if (["(?:"].some((f) => regex.includes(f))) { - testCase += `xit("line: ${index} - non capturing groups not supported", () => {});`; - return; - } - if (["(?!", "(?="].some((f) => regex.includes(f))) { testCase += `xit("line: ${index} - lookaheads not supported", () => {});`; return; diff --git a/ts/index.ts b/ts/index.ts index c83b84b..14d10dd 100644 --- a/ts/index.ts +++ b/ts/index.ts @@ -5,13 +5,8 @@ globalAny.log = console.log; import { RegExp } from "../assembly/regexp"; -const regexObj = new RegExp("abc$", "m"); -let match = regexObj.exec("abc\n"); +const regexObj = new RegExp("word (?:[a-zA-Z0-9]+ ){0,300}otherword", ""); +let match = regexObj.exec( + "word cat dog elephant mussel cow horse canary baboon snake shark the quick brown fox and the lazy dog and several other words getting close to thirty by now I hope" +); console.log(JSON.stringify(match, null, 2)); -// match = regexObj.exec("f1\nbar\nbaz\nf2"); -// console.log(JSON.stringify(match, null, 2)); - -// const regex = new RegExp("^f\\d{1}$", "gm"); - -// let match = regex.exec("f1\nbar\nbaz\nf2"); -// expect(match!.matches[0]).toBe("f1");