From d2127dc86e4700b4904aaf0ea8ad9fe028fb4e26 Mon Sep 17 00:00:00 2001 From: Colin E Date: Thu, 4 Feb 2021 20:46:49 +0000 Subject: [PATCH 1/4] feat: added non-greedy range repetition --- assembly/__spec_tests__/generated.spec.ts | 142 +++++++++++++++---- assembly/__tests__/quantifiers.spec.ts | 12 +- assembly/__tests__/range-quantifiers.spec.ts | 5 + assembly/nfa/nfa.ts | 13 +- assembly/parser/node.ts | 7 +- assembly/parser/parser.ts | 9 +- assembly/parser/string-iterator.ts | 1 + assembly/parser/walker.ts | 16 ++- spec/test-generator.js | 14 +- ts/index.ts | 4 +- 10 files changed, 172 insertions(+), 51 deletions(-) diff --git a/assembly/__spec_tests__/generated.spec.ts b/assembly/__spec_tests__/generated.spec.ts index 551aee1..537c422 100644 --- a/assembly/__spec_tests__/generated.spec.ts +++ b/assembly/__spec_tests__/generated.spec.ts @@ -368,16 +368,28 @@ it("line: 49 - matches ^(abc){1,2}zz against 'abcabcabczz'", () => { it("line: 50 - matches ^(abc){1,2}zz against '>>abczz'", () => { expectNotMatch("^(abc){1,2}zz", [">>abczz"]); }); -xit("line: 51 - lazy range repitition quantifiers are not supported", () => {}); -xit("line: 52 - lazy range repitition quantifiers are not supported", () => {}); -xit("line: 53 - lazy range repitition quantifiers are not supported", () => {}); -xit("line: 54 - lazy range repitition quantifiers are not supported", () => {}); -xit("line: 55 - lazy range repitition quantifiers are not supported", () => {}); -xit("line: 56 - lazy range repitition quantifiers are not supported", () => {}); -xit("line: 57 - lazy range repitition quantifiers are not supported", () => {}); -xit("line: 58 - lazy range repitition quantifiers are not supported", () => {}); -xit("line: 59 - lazy range repitition quantifiers are not supported", () => {}); -xit("line: 60 - lazy range repitition quantifiers are not supported", () => {}); +it("line: 51 - matches ^(b+?|a){1,2}?c against 'bc'", () => { + const match = exec("^(b+?|a){1,2}?c", "bc", "s"); 
expect(match.matches[0]).toBe("bc".substring(0, 2)); + expect(match.matches[1]).toBe("bc".substring(0, 1)); +}); +xit("line: 52 - issues with repeated capture groups", () => {}); +xit("line: 53 - issues with repeated capture groups", () => {}); +xit("line: 54 - issues with repeated capture groups", () => {}); +xit("line: 55 - issues with repeated capture groups", () => {}); +it("line: 56 - matches ^(b+?|a){1,2}?c against 'aac'", () => { + const match = exec("^(b+?|a){1,2}?c", "aac", "s"); + expect(match.matches[0]).toBe("aac".substring(0, 3)); + expect(match.matches[1]).toBe("aac".substring(1, 2)); +}); +xit("line: 57 - issues with repeated capture groups", () => {}); +xit("line: 58 - issues with repeated capture groups", () => {}); +it("line: 59 - matches ^(b+?|a){1,2}?c against 'aaac'", () => { + expectNotMatch("^(b+?|a){1,2}?c", ["aaac"]); +}); +it("line: 60 - matches ^(b+?|a){1,2}?c against 'abbbbbbbbbbbac'", () => { + expectNotMatch("^(b+?|a){1,2}?c", ["abbbbbbbbbbbac"]); +}); it("line: 61 - matches ^(b+|a){1,2}c against 'bc'", () => { const match = exec("^(b+|a){1,2}c", "bc", "s"); expect(match.matches[0]).toBe("bc".substring(0, 2)); @@ -400,17 +412,41 @@ it("line: 69 - matches ^(b+|a){1,2}c against 'aaac'", () => { it("line: 70 - matches ^(b+|a){1,2}c against 'abbbbbbbbbbbac'", () => { expectNotMatch("^(b+|a){1,2}c", ["abbbbbbbbbbbac"]); }); -xit("line: 71 - lazy range repitition quantifiers are not supported", () => {}); -xit("line: 72 - lazy range repitition quantifiers are not supported", () => {}); -xit("line: 73 - lazy range repitition quantifiers are not supported", () => {}); -xit("line: 74 - lazy range repitition quantifiers are not supported", () => {}); -xit("line: 75 - lazy range repitition quantifiers are not supported", () => {}); -xit("line: 76 - lazy range repitition quantifiers are not supported", () => {}); -xit("line: 77 - lazy range repitition quantifiers are not supported", () => {}); -xit("line: 78 - lazy range repitition quantifiers are 
not supported", () => {}); -xit("line: 79 - lazy range repitition quantifiers are not supported", () => {}); -xit("line: 80 - lazy range repitition quantifiers are not supported", () => {}); -xit("line: 81 - lazy range repitition quantifiers are not supported", () => {}); +it("line: 71 - matches ^(b+|a){1,2}?bc against 'bbc'", () => { + const match = exec("^(b+|a){1,2}?bc", "bbc", "s"); + expect(match.matches[0]).toBe("bbc".substring(0, 3)); + expect(match.matches[1]).toBe("bbc".substring(0, 1)); +}); +xit("line: 72 - issues with repeated capture groups", () => {}); +xit("line: 73 - issues with repeated capture groups", () => {}); +it("line: 74 - matches ^(b*|ba){1,2}?bc against 'bababc'", () => { + const match = exec("^(b*|ba){1,2}?bc", "bababc", "s"); + expect(match.matches[0]).toBe("bababc".substring(0, 6)); + expect(match.matches[1]).toBe("bababc".substring(2, 4)); +}); +it("line: 75 - matches ^(b*|ba){1,2}?bc against 'bababbc'", () => { + expectNotMatch("^(b*|ba){1,2}?bc", ["bababbc"]); +}); +it("line: 76 - matches ^(b*|ba){1,2}?bc against 'babababc'", () => { + expectNotMatch("^(b*|ba){1,2}?bc", ["babababc"]); +}); +it("line: 77 - matches ^(ba|b*){1,2}?bc against 'babc'", () => { + const match = exec("^(ba|b*){1,2}?bc", "babc", "s"); + expect(match.matches[0]).toBe("babc".substring(0, 4)); + expect(match.matches[1]).toBe("babc".substring(0, 2)); +}); +xit("line: 78 - issues with repeated capture groups", () => {}); +it("line: 79 - matches ^(ba|b*){1,2}?bc against 'bababc'", () => { + const match = exec("^(ba|b*){1,2}?bc", "bababc", "s"); + expect(match.matches[0]).toBe("bababc".substring(0, 6)); + expect(match.matches[1]).toBe("bababc".substring(2, 4)); +}); +it("line: 80 - matches ^(ba|b*){1,2}?bc against 'bababbc'", () => { + expectNotMatch("^(ba|b*){1,2}?bc", ["bababbc"]); +}); +it("line: 81 - matches ^(ba|b*){1,2}?bc against 'babababc'", () => { + expectNotMatch("^(ba|b*){1,2}?bc", ["babababc"]); +}); xit("line: 82 - test regex contains syntax not 
supported in JS", () => {}); it("line: 83 - matches ^[ab\\]cde] against 'athing'", () => { const match = exec("^[ab\\]cde]", "athing", "s"); @@ -1120,11 +1156,26 @@ it("line: 244 - matches ^[aeiou\\d]{4,5}$ against 'aaaaa'", () => { it("line: 245 - matches ^[aeiou\\d]{4,5}$ against '123456'", () => { expectNotMatch("^[aeiou\\d]{4,5}$", ["123456"]); }); -xit("line: 246 - lazy range repitition quantifiers are not supported", () => {}); -xit("line: 247 - lazy range repitition quantifiers are not supported", () => {}); -xit("line: 248 - lazy range repitition quantifiers are not supported", () => {}); -xit("line: 249 - lazy range repitition quantifiers are not supported", () => {}); -xit("line: 250 - lazy range repitition quantifiers are not supported", () => {}); +it("line: 246 - matches ^[aeiou\\d]{4,5}? against 'uoie'", () => { + const match = exec("^[aeiou\\d]{4,5}?", "uoie", "s"); + expect(match.matches[0]).toBe("uoie".substring(0, 4)); +}); +it("line: 247 - matches ^[aeiou\\d]{4,5}? against '1234'", () => { + const match = exec("^[aeiou\\d]{4,5}?", "1234", "s"); + expect(match.matches[0]).toBe("1234".substring(0, 4)); +}); +it("line: 248 - matches ^[aeiou\\d]{4,5}? against '12345'", () => { + const match = exec("^[aeiou\\d]{4,5}?", "12345", "s"); + expect(match.matches[0]).toBe("12345".substring(0, 4)); +}); +it("line: 249 - matches ^[aeiou\\d]{4,5}? against 'aaaaa'", () => { + const match = exec("^[aeiou\\d]{4,5}?", "aaaaa", "s"); + expect(match.matches[0]).toBe("aaaaa".substring(0, 4)); +}); +it("line: 250 - matches ^[aeiou\\d]{4,5}? 
against '123456'", () => { + const match = exec("^[aeiou\\d]{4,5}?", "123456", "s"); + expect(match.matches[0]).toBe("123456".substring(0, 4)); +}); xit("line: 251 - back references are not supported", () => {}); xit("line: 252 - back references are not supported", () => {}); xit("line: 253 - back references are not supported", () => {}); @@ -1182,8 +1233,16 @@ xit("line: 287 - non capturing groups not supported", () => {}); xit("line: 288 - non capturing groups not supported", () => {}); xit("line: 289 - non capturing groups not supported", () => {}); xit("line: 290 - the test behaviour differs between PCRE and JS", () => {}); -xit("line: 291 - lazy range repitition quantifiers are not supported", () => {}); -xit("line: 292 - lazy range repitition quantifiers are not supported", () => {}); +it("line: 291 - matches ^[ab]{1,3}?(ab*|b) against 'aabbbbb'", () => { + const match = exec("^[ab]{1,3}?(ab*|b)", "aabbbbb", "s"); + expect(match.matches[0]).toBe("aabbbbb".substring(0, 7)); + expect(match.matches[1]).toBe("aabbbbb".substring(1, 7)); +}); +it("line: 292 - matches ^[ab]{1,3}?(ab*?|b) against 'aabbbbb'", () => { + const match = exec("^[ab]{1,3}?(ab*?|b)", "aabbbbb", "s"); + expect(match.matches[0]).toBe("aabbbbb".substring(0, 2)); + expect(match.matches[1]).toBe("aabbbbb".substring(1, 2)); +}); it("line: 293 - matches ^[ab]{1,3}(ab*?|b) against 'aabbbbb'", () => { const match = exec("^[ab]{1,3}(ab*?|b)", "aabbbbb", "s"); expect(match.matches[0]).toBe("aabbbbb".substring(0, 4)); @@ -1503,7 +1562,10 @@ it("line: 1224 - matches a{0}bc against 'bc'", () => { const match = exec("a{0}bc", "bc", "s"); expect(match.matches[0]).toBe("bc".substring(0, 2)); }); -xit("line: 1225 - lazy range repitition quantifiers are not supported", () => {}); +it("line: 1225 - matches (a|(bc)){0,0}?xyz against 'xyz'", () => { + const match = exec("(a|(bc)){0,0}?xyz", "xyz", "s"); + expect(match.matches[0]).toBe("xyz".substring(0, 3)); +}); xit("line: 1226 - back references are not 
supported", () => {}); xit("line: 1227 - back references are not supported", () => {}); xit("line: 1228 - back references are not supported", () => {}); @@ -1617,8 +1679,26 @@ it("line: 1267 - matches [^az] against 'aaAabcd '", () => { expect(match.matches[0]).toBe("aaAabcd ".substring(4, 5)); }); xit("line: 1268 - back references are not supported", () => {}); -xit("line: 1269 - lazy range repitition quantifiers are not supported", () => {}); -xit("line: 1270 - lazy range repitition quantifiers are not supported", () => {}); +it("line: 1269 - matches P[^*]TAIRE[^*]{1,6}?LL against 'xxxxxxxxxxxPSTAIREISLLxxxxxxxxx'", () => { + const match = exec( + "P[^*]TAIRE[^*]{1,6}?LL", + "xxxxxxxxxxxPSTAIREISLLxxxxxxxxx", + "s" + ); + expect(match.matches[0]).toBe( + "xxxxxxxxxxxPSTAIREISLLxxxxxxxxx".substring(11, 22) + ); +}); +it("line: 1270 - matches P[^*]TAIRE[^*]{1,}?LL against 'xxxxxxxxxxxPSTAIREISLLxxxxxxxxx'", () => { + const match = exec( + "P[^*]TAIRE[^*]{1,}?LL", + "xxxxxxxxxxxPSTAIREISLLxxxxxxxxx", + "s" + ); + expect(match.matches[0]).toBe( + "xxxxxxxxxxxPSTAIREISLLxxxxxxxxx".substring(11, 22) + ); +}); it("line: 1271 - matches (\\.\\d\\d[1-9]?)\\d+ against '1.230003938'", () => { const match = exec("(\\.\\d\\d[1-9]?)\\d+", "1.230003938", "s"); expect(match.matches[0]).toBe("1.230003938".substring(1, 11)); diff --git a/assembly/__tests__/quantifiers.spec.ts b/assembly/__tests__/quantifiers.spec.ts index 3e95a9f..b9c31a0 100644 --- a/assembly/__tests__/quantifiers.spec.ts +++ b/assembly/__tests__/quantifiers.spec.ts @@ -51,10 +51,10 @@ describe("non-greedy", () => { expect(match.matches[0]).toStrictEqual("ab"); }); - it("zero or one supports non-greedy mode", () => { - expectMatch("a?", ["a"]); - let match = exec("a?", "bc"); - expect(match).not.toBeNull(); - expect(match.matches[0]).toStrictEqual(""); - }); + // it("zero or one supports non-greedy mode", () => { + // expectMatch("a?", ["a"]); + // let match = exec("a??", "bc"); + // expect(match).not.toBeNull(); + 
// expect(match.matches[0]).toStrictEqual(""); + // }); }); diff --git a/assembly/__tests__/range-quantifiers.spec.ts b/assembly/__tests__/range-quantifiers.spec.ts index c4e93fe..76df86b 100644 --- a/assembly/__tests__/range-quantifiers.spec.ts +++ b/assembly/__tests__/range-quantifiers.spec.ts @@ -40,6 +40,11 @@ it("handles nested quantifiers", () => { expectMatch("(a{3}){2}", ["aaaaaa"]); }); +it("handles nongreedy quantifiers", () => { + const match = exec("a{2,4}?", "aaaaaaaaaa"); + expect(match.matches[0]).toBe("aa"); +}); + it("throws if quantifying a quantifier!", () => { expect(() => { let foo = new RegExp("a{3}{2}"); diff --git a/assembly/nfa/nfa.ts b/assembly/nfa/nfa.ts index a44fcf3..dda5912 100644 --- a/assembly/nfa/nfa.ts +++ b/assembly/nfa/nfa.ts @@ -134,11 +134,16 @@ function closure(nfa: Automata, greedy: bool): Automata { return new Automata(start, end); } -function zeroOrOne(nfa: Automata): Automata { +function zeroOrOne(nfa: Automata, greedy: bool): Automata { const start = new State(); const end = new State(); - start.transitions.push(nfa.start); - start.transitions.push(end); + if (greedy) { + start.transitions.push(nfa.start); + start.transitions.push(end); + } else { + start.transitions.push(end); + start.transitions.push(nfa.start); + } nfa.end.transitions.push(end); return new Automata(start, end); } @@ -182,7 +187,7 @@ class AutomataFactor { const automata = this.automataForNode(node.expression); const quantifier = node.quantifier; if (quantifier == Char.Question) { - return zeroOrOne(automata); + return zeroOrOne(automata, node.greedy); } else if (quantifier == Char.Plus) { return oneOrMore(automata, node.greedy); } else if (quantifier == Char.Asterisk) { diff --git a/assembly/parser/node.ts b/assembly/parser/node.ts index 00d42ac..a6af920 100644 --- a/assembly/parser/node.ts +++ b/assembly/parser/node.ts @@ -155,7 +155,12 @@ export class RepetitionNode extends Node { } export class RangeRepetitionNode extends Node { - constructor(public 
expression: Node, public from: i32, public to: i32) { + constructor( + public expression: Node, + public from: i32, + public to: i32, + public greedy: bool = true + ) { super(NodeType.RangeRepetition); if (expression.type == NodeType.RangeRepetition) { throw new Error("The preceding token is not quantifiable"); diff --git a/assembly/parser/parser.ts b/assembly/parser/parser.ts index bcc1fa9..11d0c39 100644 --- a/assembly/parser/parser.ts +++ b/assembly/parser/parser.ts @@ -236,7 +236,14 @@ export class Parser { const range = this.maybeParseRepetitionRange(); if (range != null) { const expression = nodes.pop(); - nodes.push(new RangeRepetitionNode(expression, range.from, range.to)); + let greedy = true; + if (this.iterator.current == Char.Question) { + greedy = false; + this.eatToken(); + } + nodes.push( + new RangeRepetitionNode(expression, range.from, range.to, greedy) + ); } else { // this is not the start of a repetition, it's just a char! nodes.push(this.parseCharacter()); diff --git a/assembly/parser/string-iterator.ts b/assembly/parser/string-iterator.ts index fbd49b7..7e75996 100644 --- a/assembly/parser/string-iterator.ts +++ b/assembly/parser/string-iterator.ts @@ -13,6 +13,7 @@ export class StringIterator { next(): bool { this.cursor++; if (this.cursor >= u32(this.sourceString.length)) { + this.current = -1; return false; } this.current = this.sourceString.charCodeAt(this.cursor); diff --git a/assembly/parser/walker.ts b/assembly/parser/walker.ts index 016675b..42a2242 100644 --- a/assembly/parser/walker.ts +++ b/assembly/parser/walker.ts @@ -81,12 +81,24 @@ export function expandRepetitions(visitor: NodeVisitor): void { if (rangeRepNode.to == -1) { // a{4,} => aaaaa* - clones.push(new RepetitionNode(expression.clone(), Char.Asterisk)); + clones.push( + new RepetitionNode( + expression.clone(), + Char.Asterisk, + rangeRepNode.greedy + ) + ); } else { // a{4,6} => aaaaa?a? 
const count = rangeRepNode.to - rangeRepNode.from; for (let i = 0; i < count; i++) { - clones.push(new RepetitionNode(expression.clone(), Char.Question)); + clones.push( + new RepetitionNode( + expression.clone(), + Char.Question, + rangeRepNode.greedy + ) + ); } } diff --git a/spec/test-generator.js b/spec/test-generator.js index c46edbd..aaee983 100644 --- a/spec/test-generator.js +++ b/spec/test-generator.js @@ -22,6 +22,12 @@ const knownIssues = { ...range(63, 68), 1391, 1392, + ...range(52, 55), + 57, + 58, + 72, + 73, + 78, ], "lazy quantifiers should still yield the longest overall regex match": [ ...range(141, 143), @@ -109,10 +115,10 @@ lines.forEach((line, index) => { return; } - if (["}?"].some((f) => regex.includes(f))) { - testCase += `xit("line: ${index} - lazy range repitition quantifiers are not supported", () => { });`; - return; - } + // if (["}?"].some((f) => regex.includes(f))) { + // testCase += `xit("line: ${index} - lazy range repitition quantifiers are not supported", () => { });`; + // return; + // } if (["(?"].some((f) => regex.includes(f))) { testCase += `xit("line: ${index} - non capturing groups not supported", () => {});`; diff --git a/ts/index.ts b/ts/index.ts index ff62ac4..f022ad9 100644 --- a/ts/index.ts +++ b/ts/index.ts @@ -5,7 +5,7 @@ globalAny.log = console.log; import { RegExp } from "../assembly/regexp"; -const regexObj = new RegExp(".*?"); -const match = regexObj.exec("abc"); +const regexObj = new RegExp("a?"); +const match = regexObj.exec("a"); console.log(match); From c42d678222049ee1d36de1d6ee0fcce23add35e9 Mon Sep 17 00:00:00 2001 From: Colin E Date: Fri, 5 Feb 2021 16:10:24 +0000 Subject: [PATCH 2/4] refactor: minor parser improvements --- assembly/parser/parser.ts | 104 ++++++++++++++++---------------------- ts/index.ts | 4 +- 2 files changed, 46 insertions(+), 62 deletions(-) diff --git a/assembly/parser/parser.ts b/assembly/parser/parser.ts index 11d0c39..98dcfdc 100644 --- a/assembly/parser/parser.ts +++ 
b/assembly/parser/parser.ts @@ -74,8 +74,7 @@ function isSpecialCharacter(code: u32): bool { } class Range { - from: i32 = -1; - to: i32 = -1; + constructor(public from: i32, public to: i32) {} } export class Parser { @@ -157,64 +156,54 @@ export class Parser { return new CharacterNode(this.eatToken()); } - private maybeParseRepetitionRange(): Range | null { - // snapshot - const iteratorCopy = this.iterator.copy(); - this.eatToken(Char.LeftCurlyBrace); - - let range = new Range(); - - let firstDigit = true; + private maybeParseDigit(): i32 { let digitStr = ""; while (this.iterator.more()) { const token = this.iterator.current; - if (token == Char.RightParenthesis) break; - if (firstDigit) { - if (isDigit(token)) { - // if it is a digit, keep eating - digitStr += this.iterator.currentAsString(); - } else { - range.from = digitStr.length ? parseInt(digitStr) : -1; - range.to = range.from; - if (token == Char.Comma) { - // if we meet a comma, start parsing the next digit - firstDigit = false; - digitStr = ""; - range.to = -1; - } else if (token == Char.RightCurlyBrace) { - this.eatToken(Char.RightCurlyBrace); - // close brace, this is a single value range - return range; - } else { - // anything else, we got a problem - break; - } - } + if (isDigit(token)) { + digitStr += this.iterator.currentAsString(); } else { - if (isDigit(token)) { - // if it is a digit, keep eating - digitStr += this.iterator.currentAsString(); - } else { - range.to = digitStr.length ? parseInt(digitStr) : -1; - if (token == Char.RightCurlyBrace) { - this.eatToken(Char.RightCurlyBrace); - // close brace, end of range - return range; - } else { - // anything else, we got a problem - break; - } - } + return digitStr == "" ? -1 : parseInt(digitStr); } this.eatToken(); } + return digitStr == "" ? 
-1 : parseInt(digitStr); + } - // repetition not found - reset state - this.iterator = iteratorCopy; + private maybeParseRepetitionRange(): Range | null { + // snapshot + const iteratorCopy = this.iterator.copy(); + this.eatToken(Char.LeftCurlyBrace); + + const from = this.maybeParseDigit(); + if (from == -1) { + return null; + } + if (this.iterator.current == Char.RightCurlyBrace) { + this.eatToken(); + return new Range(from, from); + } else if (this.iterator.current == Char.Comma) { + this.eatToken(); + const to = this.maybeParseDigit(); + // @ts-ignore + if (this.iterator.current == Char.RightCurlyBrace) { + this.eatToken(); + return new Range(from, to); + } + } + this.iterator = iteratorCopy; return null; } + private isGreedy(): bool { + if (this.iterator.current == Char.Question) { + this.eatToken(); + return false; + } + return true; + } + // parses a sequence of chars private parseSequence(): Node { let nodes = new Array(); @@ -236,13 +225,13 @@ export class Parser { const range = this.maybeParseRepetitionRange(); if (range != null) { const expression = nodes.pop(); - let greedy = true; - if (this.iterator.current == Char.Question) { - greedy = false; - this.eatToken(); - } nodes.push( - new RangeRepetitionNode(expression, range.from, range.to, greedy) + new RangeRepetitionNode( + expression, + range.from, + range.to, + this.isGreedy() + ) ); } else { // this is not the start of a repetition, it's just a char! 
@@ -251,12 +240,7 @@ export class Parser { } else if (isQuantifier(token)) { const expression = nodes.pop(); const quantifier = this.eatToken(); - let greedy = true; - if (this.iterator.current == Char.Question) { - greedy = false; - this.eatToken(); - } - nodes.push(new RepetitionNode(expression, quantifier, greedy)); + nodes.push(new RepetitionNode(expression, quantifier, this.isGreedy())); // @ts-ignore } else if (token == Char.LeftSquareBracket) { nodes.push(this.parseCharacterSet()); diff --git a/ts/index.ts b/ts/index.ts index f022ad9..55517e6 100644 --- a/ts/index.ts +++ b/ts/index.ts @@ -5,7 +5,7 @@ globalAny.log = console.log; import { RegExp } from "../assembly/regexp"; -const regexObj = new RegExp("a?"); -const match = regexObj.exec("a"); +const regexObj = new RegExp("ba{0}b"); +const match = regexObj.exec("bb"); console.log(match); From 4872cf6ad224d4c234f1c350e9eb0841a9ff78ff Mon Sep 17 00:00:00 2001 From: Colin E Date: Sun, 7 Feb 2021 19:52:38 +0000 Subject: [PATCH 3/4] fix: walker was visiting states multiple times --- assembly/__tests__/capture-group.spec.ts | 6 ++++++ assembly/nfa/walker.ts | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/assembly/__tests__/capture-group.spec.ts b/assembly/__tests__/capture-group.spec.ts index e8ff9c0..ee4fe4a 100644 --- a/assembly/__tests__/capture-group.spec.ts +++ b/assembly/__tests__/capture-group.spec.ts @@ -33,3 +33,9 @@ it("should not return captured values for non-matching alternations", () => { expect(match.matches[1]).toBe(""); expect(match.matches[2]).toBe("b"); }); + +it("repeated capture groups should return the last match", () => { + const match = exec("([a-c])+", "ac"); + expect(match.matches[0]).toBe("ac"); + expect(match.matches[1]).toBe("c"); +}); diff --git a/assembly/nfa/walker.ts b/assembly/nfa/walker.ts index 276d123..9404f33 100644 --- a/assembly/nfa/walker.ts +++ b/assembly/nfa/walker.ts @@ -5,8 +5,8 @@ export function walker( visitor: (state: State) => void, visited: 
State[] = [] ): void { - visitor(state); if (visited.includes(state)) return; + visitor(state); visited.push(state); const nextStates = state.transitions; for (let i = 0, len = nextStates.length; i < len; i++) { From bead49eb94e9176d55152c967717816d9e9c8c09 Mon Sep 17 00:00:00 2001 From: Colin E Date: Mon, 8 Feb 2021 17:57:29 +0000 Subject: [PATCH 4/4] fix: harmonise TS and AS execution ref #17 --- .prettierignore | 2 ++ assembly/char.ts | 50 +++++++++++++++++++++++++++++++++++------------- 2 files changed, 39 insertions(+), 13 deletions(-) create mode 100644 .prettierignore diff --git a/.prettierignore b/.prettierignore new file mode 100644 index 0000000..b3decf7 --- /dev/null +++ b/.prettierignore @@ -0,0 +1,2 @@ +# prettier doesn't support decorators on functions :-( +assembly/char.ts \ No newline at end of file diff --git a/assembly/char.ts b/assembly/char.ts index ca57e9f..1d1d0f9 100644 --- a/assembly/char.ts +++ b/assembly/char.ts @@ -5,6 +5,7 @@ export const enum Char { FormFeed = 0x0c, CarriageReturn = 0x0d, LineFeed = 0x0a, + Space = 0x20, Dollar = 0x24, // "$" LeftParenthesis = 0x28, RightParenthesis = 0x29, @@ -14,11 +15,13 @@ export const enum Char { Minus = 0x2d, // "-" Dot = 0x2e, // "." Zero = 0x30, + Nine = 0x39, Question = 0x3f, // "?" 
A = 0x41, D = 0x44, S = 0x53, W = 0x57, + Z = 0x5a, LeftSquareBracket = 0x5b, // "[" Backslash = 0x5c, // "\" RightSquareBracket = 0x5d, // "]" @@ -35,40 +38,58 @@ export const enum Char { v = 0x76, w = 0x77, x = 0x78, + z = 0x7a, LeftCurlyBrace = 0x7b /* { */, VerticalBar = 0x7c /* | */, - RightCurlyBrace = 0x7d /* */, + RightCurlyBrace = 0x7d /* } */, + NonBreakingSpace = 0xa0, +} + +// @ts-ignore +@inline +function inRange(value: u32, from: u32, to: u32): bool { + if (ASC_TARGET == 1) { + // makes use of unsigned integer operations, making this + // approach a little faster when compiled to WASM + return value - from < (to - from + 1); + } else { + return value >= from && value <= to; + } } export function isDigit(code: u32): bool { - return code - Char.Zero < 10; + return inRange(code, Char.Zero, Char.Nine); } export function isHexadecimalDigit(code: u32): bool { - return isDigit(code) || code - Char.a < 6; + return isDigit(code) || inRange(code, Char.a, Char.f); } export function isLowercaseAlpha(code: u32): bool { - return code - Char.a < 26; + return inRange(code, Char.a, Char.z); } export function isUppercaseAlpha(code: u32): bool { - return code - Char.A < 26; + return inRange(code, Char.A, Char.Z); } export function isAlpha(code: u32): bool { - return (code | 32) - Char.a < 26; + if (ASC_TARGET == 1) { + return (code | 32) - Char.a < 26; + } else { + return inRange(code, Char.a, Char.z) || inRange(code, Char.A, Char.Z); + } } export function isWhitespace(code: u32): bool { - if (code < 0x1680) { - // < (1) - // , , , , , and - // @ts-ignore: cast - return ((code | 0x80) == 0xa0) | (code - 0x09 <= 0x0d - 0x09); - } - if (code - 0x2000 <= 0x200a - 0x2000) return true; switch (code) { + case Char.Space: + case Char.HorizontalTab: + case Char.VerticalTab: + case Char.FormFeed: + case Char.LineFeed: + case Char.CarriageReturn: + case Char.NonBreakingSpace: case 0x1680: // (1) case 0x2028: // (2) case 0x2029: // @@ -78,5 +99,8 @@ export function
isWhitespace(code: u32): bool { case 0xfeff: return true; // } + if (inRange(code, 0x2000, 0x200a)) { + return true; + } return false; }