From bd20642ad15894a1b0725bb12a54dfffd76c7e88 Mon Sep 17 00:00:00 2001 From: Drew Keller Date: Thu, 24 Sep 2020 22:00:40 -0500 Subject: [PATCH 1/5] Enhance existing test assertions Centralize list of supported characters. --- src/constants/index.ts | 17 ++++++++++++++ src/expanders/character-class-pattern.ts | 12 ++-------- src/pattern.spec.ts | 30 ++++++++++++++++++------ 3 files changed, 42 insertions(+), 17 deletions(-) create mode 100644 src/constants/index.ts diff --git a/src/constants/index.ts b/src/constants/index.ts new file mode 100644 index 0000000..13d5d10 --- /dev/null +++ b/src/constants/index.ts @@ -0,0 +1,17 @@ +const whitespace = ` \t\r\n`.split(''); + +const digits = '0123456789'.split(''); + +const basicLowercase = 'abcdefghijklmnopqrstuvwxyz'.split(''); +const basicUppercase = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'.split(''); +const basicSpecial = '~`!@#$%^&*()-_=+<,>.?/[]{}|\\:;"\''.split(''); + +const all = ([] as string[]).concat( + whitespace, + digits, + basicLowercase, + basicUppercase, + basicSpecial +); + +export const Chars = { all }; diff --git a/src/expanders/character-class-pattern.ts b/src/expanders/character-class-pattern.ts index 1b8888a..0502770 100644 --- a/src/expanders/character-class-pattern.ts +++ b/src/expanders/character-class-pattern.ts @@ -1,4 +1,5 @@ import { CharacterClass } from 'regexp-tree/ast'; +import { Chars } from '../constants'; import Expander from '../Expander'; import Expansion from '../Expansion'; import sortRandom from '../sorts/fisher-yates-random'; @@ -18,16 +19,7 @@ function getReferencedCodePoints( return [expression.codePoint]; } -const allCharOptions = - ' \t\r\n' + - 'abcdefghijklmnopqrstuvwxyz' + - 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + - '0123456789' + - '~`!@#$%^&*()-_=+<,>.?/[]{}|\\:;"\''; - -const allCodePointOptions = allCharOptions - .split('') - .map(char => char.charCodeAt(0)); +const allCodePointOptions = Chars.all.map(char => char.charCodeAt(0)); /** * Expand an expression which represents a single character from a diff --git a/src/pattern.spec.ts b/src/pattern.spec.ts index 7efac14..8fe1cb1 100644 --- a/src/pattern.spec.ts +++ b/src/pattern.spec.ts @@ -1,4 +1,5 @@ import { when } from 'jest-when'; +import { Chars } from './constants'; import Expansion from './Expansion'; import { fill } from './helpers/utils'; import * as patternLib from './pattern'; @@ -363,7 +364,7 @@ describe('expand', () => { } const result = expandAll('[^abc]'); - expect(result.length).toBeGreaterThan(1); + expect(result).toHaveLength(95); result.forEach(testExpansion); }); @@ -374,7 +375,7 @@ describe('expand', () => { } const result = expandAll('[^246]'); - expect(result.length).toBeGreaterThan(1); + expect(result).toHaveLength(95); result.forEach(testExpansion); }); @@ -385,7 +386,7 @@ describe('expand', () => { } const result = expandAll('[^a-p]'); - expect(result.length).toBeGreaterThan(1); + expect(result).toHaveLength(82); result.forEach(testExpansion); }); @@ -396,7 +397,7 @@ describe('expand', () => { } const result = expandAll('[^0-8]'); - expect(result.length).toBeGreaterThan(1); + expect(result).toHaveLength(89); result.forEach(testExpansion); }); @@ -407,7 +408,7 @@ describe('expand', () => { } const result = expandAll('[^aeiou0-5A-T]'); - expect(result.length).toBeGreaterThan(1); + expect(result).toHaveLength(67); result.forEach(testExpansion); }); @@ -438,10 +439,14 @@ describe('expand', () => { it.each([/./, /\w/, /\W/, /\d/, /\D/, /\s/, /\S/])( 'expands the single character class %p', (charClass: RegExp) => { + function testExpansion(expansion: string) { + expect(expansion).toHaveLength(1); + expect(expansion).toMatch(charClass); + } + const result = expandAll(charClass); expect(result.length).toBeGreaterThan(1); - expect(result[0]).toHaveLength(1); - expect(result[0]).toMatch(charClass); + result.forEach(testExpansion); } ); @@ -562,6 +567,17 @@ describe('expand', () => { } ); + it.each([/(.|\r)/s, /[\s\S]/])( + 'includes all supported characters in %p', + regex => { + const result = expandAll(regex); + + Chars.all.forEach(char => { + expect(result).toContain(char); + }); + } + ); + it('expands repeated character class', () => { const allTwoDigitNumbers = fill(0, 99).map(num => num.toString().padStart(2, '0') From 5987222d0e05a3a3491e08956bd52eaa971a4a9d Mon Sep 17 00:00:00 2001 From: Drew Keller Date: Thu, 24 Sep 2020 22:18:21 -0500 Subject: [PATCH 2/5] Support extended ASCII characters Include Windows-1252 extended ASCII characters in character classes and sets as listed at https://www.ascii-code.com/#extendedASCIIDescription. --- src/constants/index.ts | 18 ++- src/pattern.spec.ts | 151 +++++++++++++++--- .../meta-to-char-class-transform.ts | 24 ++- 3 files changed, 169 insertions(+), 24 deletions(-) diff --git a/src/constants/index.ts b/src/constants/index.ts index 13d5d10..9f7de09 100644 --- a/src/constants/index.ts +++ b/src/constants/index.ts @@ -1,4 +1,5 @@ -const whitespace = ` \t\r\n`.split(''); +const nbsp = String.fromCharCode(160); +const whitespace = ` \t\r\n${nbsp}`.split(''); const digits = '0123456789'.split(''); @@ -6,12 +7,25 @@ const basicLowercase = 'abcdefghijklmnopqrstuvwxyz'.split(''); const basicUppercase = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'.split(''); const basicSpecial = '~`!@#$%^&*()-_=+<,>.?/[]{}|\\:;"\''.split(''); +const extendedLowercase = 'àáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ'.split(''); +const extendedUppercase = 'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞß'.split(''); +const shy = String.fromCharCode(173); +const extendedSpecial = `¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿×÷${shy}`.split(''); + +// Special Windows-1252 display characters in the extended ASCII range +// https://www.ascii-code.com/#extendedASCIIDescription +const windows1252Special = '€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ'.split(''); + const all = ([] as string[]).concat( whitespace, digits, basicLowercase, basicUppercase, - basicSpecial + basicSpecial, + extendedLowercase, + extendedUppercase, + extendedSpecial, + windows1252Special ); export const Chars = { all }; diff --git a/src/pattern.spec.ts b/src/pattern.spec.ts index 8fe1cb1..4abb517 100644 --- a/src/pattern.spec.ts +++ b/src/pattern.spec.ts @@ -150,13 +150,13 @@ describe('expand', () => { ); it('reproduces static patterns', () => { - const result = expandAll('abc'); - expect(result).toEqual(['abc']); + const result = expandAll('abcáï®'); + expect(result).toEqual(['abcáï®']); }); it('reproduces static alternation patterns', () => { - const result = expandAll('abc|xyz'); - expect(result).toEqual(['abc', 'xyz']); + const result = expandAll('abc†|xyz‡'); + expect(result).toEqual(['abc†', 'xyz‡']); }); it('expands single-character groups', () => { @@ -165,8 +165,8 @@ describe('expand', () => { }); it('expands multi-character groups', () => { - const result = expandAll('foo(bar)'); - expect(result).toEqual(['foobar']); + const result = expandAll('foo(bar½)'); + expect(result).toEqual(['foobar½']); }); it('expands single-character alternation groups', () => { @@ -175,8 +175,8 @@ describe('expand', () => { }); it('expands multi-character alternation groups', () => { - const result = expandAll('b(ar|az)'); - expect(result).toEqual(['bar', 'baz']); + const result = expandAll('b(ar†|az‡)'); + expect(result).toEqual(['bar†', 'baz‡']); }); it('expands nested alternation groups', () => { @@ -250,6 +250,14 @@ describe('expand', () => { } ); + it.each([/ab…+/, /ab…+?/])( + 'expands repeating extended ASCII character %p', + (repeat: RegExp) => { + const result = expandN(repeat, 5); + expect(result).toEqual(['ab…', 'ab……', 'ab………', 'ab…………', 'ab……………']); + } + ); + it('expands alphabetic single-character set', () => { const result = expandAll('[aeiou]'); expect(result).toEqual(['a', 'e', 'i', 'o', 'u']); @@ -260,6 +268,11 @@ describe('expand', () => { expect(result).toEqual(['2', '3', '4', '7', '8', '9']); }); + it('expands extended ASCII single-character set', () => { + const result = expandAll('[ÁÉÍÓÚÝ]'); + expect(result).toEqual(['Á', 'É', 'Í', 'Ó', 'Ú', 'Ý']); + }); + it('expands alphabetic range character set', () => { const result = expandAll('[a-f]'); expect(result).toEqual(['a', 'b', 'c', 'd', 'e', 'f']); @@ -270,6 +283,11 @@ describe('expand', () => { expect(result).toEqual(['0', '1', '2', '3', '4', '5']); }); + it('expands extended ASCII range character set', () => { + const result = expandAll('[À-Æ]'); + expect(result).toEqual(['À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ']); + }); + it('expands permutations of sibling character sets', () => { const result = expandAll('[ab]c[de]f'); expect(result).toEqual(['acdf', 'acef', 'bcdf', 'bcef']); @@ -364,7 +382,7 @@ describe('expand', () => { } const result = expandAll('[^abc]'); - expect(result).toHaveLength(95); + expect(result).toHaveLength(218); result.forEach(testExpansion); }); @@ -375,7 +393,18 @@ describe('expand', () => { } const result = expandAll('[^246]'); - expect(result).toHaveLength(95); + expect(result).toHaveLength(218); + result.forEach(testExpansion); + }); + + it('expands negated extended ASCII character set', () => { + function testExpansion(expansion: string) { + expect(expansion).toHaveLength(1); + expect(expansion).toMatch(/[^ÁÉÍÓÚÝ]/); + } + + const result = expandAll('[^ÁÉÍÓÚÝ]'); + expect(result).toHaveLength(215); result.forEach(testExpansion); }); @@ -386,7 +415,7 @@ describe('expand', () => { } const result = expandAll('[^a-p]'); - expect(result).toHaveLength(82); + expect(result).toHaveLength(205); result.forEach(testExpansion); }); @@ -397,18 +426,29 @@ describe('expand', () => { } const result = expandAll('[^0-8]'); - expect(result).toHaveLength(89); + expect(result).toHaveLength(212); + result.forEach(testExpansion); + }); + + it('expands negated extended ASCII range character set', () => { + function testExpansion(expansion: string) { + expect(expansion).toHaveLength(1); + expect(expansion).toMatch(/[^À-Æ]/); + } + + const result = expandAll('[^À-Æ]'); + expect(result).toHaveLength(214); result.forEach(testExpansion); }); it('expands negated character set with multiple ranges', () => { function testExpansion(expansion: string) { expect(expansion).toHaveLength(1); - expect(expansion).toMatch(/[^aeiou0-5A-T]/); + expect(expansion).toMatch(/[^aeiou0-5A-Tð-ö]/); } - const result = expandAll('[^aeiou0-5A-T]'); - expect(result).toHaveLength(67); + const result = expandAll('[^aeiou0-5A-Tð-ö]'); + expect(result).toHaveLength(183); result.forEach(testExpansion); }); @@ -495,8 +535,9 @@ describe('expand', () => { it.each([/\w\w\w/, /\w\d\s/, /\W\w\w/, /\W\D\S/, /\s\w\S/, /\d\W\D/])( 'expands the multiple character class %p', (charClassSet: RegExp) => { - const result = expandAll(charClassSet); - expect(result.length).toBeGreaterThan(1); + // Too many possible combinations - limit to 1,000 + const result = expandN(charClassSet, 1000); + expect(result).toHaveLength(1000); expect(result[0]).toHaveLength(3); expect(result[0]).toMatch(charClassSet); } @@ -733,7 +774,7 @@ describe('expand', () => { }); it('is performant', () => { - const trial = () => expand(/([ab]|(c|[d-e]){2,3})(\w?) \1/); + const trial = () => expand(/([ab]|(c|[ù-ü]){2,3})(\w?) \1/); const averageTime = measureAverageTime(trial, 5); expect(averageTime).toBeLessThanOrEqual(10); }); @@ -766,7 +807,7 @@ describe('expand', () => { [/a{0,5}/, ['', 'a', 'aa', 'aaa', 'aaaa', 'aaaaa']], [/[ab]{3}/, ['aaa', 'aab', 'aba', 'abb', 'baa', 'bab', 'bba', 'bbb']], [/(a|b|c|d|e|f|g)/, ['a', 'b', 'c', 'd', 'e', 'f', 'g']], - [/aAa/i, ['aaa', 'aaA', 'aAa', 'Aaa', 'aAA', 'AaA', 'AAa', 'AAA']], + [/aAä/i, ['aaä', 'aaÄ', 'aAä', 'Aaä', 'aAÄ', 'AaÄ', 'AAä', 'AAÄ']], [/[A-I]/, ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I']], ])( 'sorts patterns without losing accuracy: %p', @@ -820,6 +861,14 @@ describe('expand', () => { } ); + it.each([/àÑ/, /\340\321/, /\xE0\xD1/, /\u00E0\u00D1/])( + 'expands extended ASCII exact casing when the case-insensitive flag is omitted: %p', + (input: RegExp) => { + const result = expandAll(input); + expect(result).toEqual(['àÑ']); + } + ); + it.each([/aB/i, /\141\102/i, /\x61\x42/i, /\u0061\u0042/i])( 'expands casing variants when the case-insensitive flag is included: %p', (input: RegExp) => { @@ -828,6 +877,14 @@ describe('expand', () => { } ); + it.each([/àÑ/i, /\340\321/i, /\xE0\xD1/i, /\u00E0\u00D1/i])( + 'expands extended ASCII casing variants when the case-insensitive flag is included: %p', + (input: RegExp) => { + const result = expandAll(input); + expect(result).toEqual(['àñ', 'àÑ', 'Àñ', 'ÀÑ']); + } + ); + it.each([/4%/i, /\64\45/i, /\x34\x25/i, /\u0034\u0025/i])( 'does not expand uncased characters when the case-insensitive flag is included: %p', (input: RegExp) => { @@ -836,6 +893,14 @@ describe('expand', () => { } ); + it.each([/©×/i, /\251\327/i, /\xA9\xD7/i, /\u00A9\u00D7/i])( + 'does not expand uncased extended ASCII characters when the case-insensitive flag is included: %p', + (input: RegExp) => { + const result = expandAll(input); + expect(result).toEqual(['©×']); + } + ); + it.each([/[aB]/, /[\141\102]/, /[\x61\x42]/, /[\u0061\u0042]/])( 'expands exact casing in static set when the case-insensitive flag is omitted: %p', (input: RegExp) => { @@ -844,6 +909,14 @@ describe('expand', () => { } ); + it.each([/[àÑ]/, /[\340\321]/, /[\xE0\xD1]/, /[\u00E0\u00D1]/])( + 'expands extended ASCII exact casing in static set when the case-insensitive flag is omitted: %p', + (input: RegExp) => { + const result = expandAll(input); + expect(result).toEqual(['à', 'Ñ']); + } + ); + it.each([/[aB]/i, /[\141\102]/i, /[\x61\x42]/i, /[\u0061\u0042]/i])( 'expands casing variants in static set when the case-insensitive flag is included: %p', (input: RegExp) => { @@ -852,6 +925,14 @@ describe('expand', () => { } ); + it.each([/[àÑ]/i, /[\340\321]/i, /[\xE0\xD1]/i, /[\u00E0\u00D1]/i])( + 'expands extended ASCII casing variants in static set when the case-insensitive flag is included: %p', + (input: RegExp) => { + const result = expandAll(input); + expect(result).toEqual(['à', 'À', 'ñ', 'Ñ']); + } + ); + it.each([/[4%]/i, /[\64\45]/i, /[\x34\x25]/i, /[\u0034\u0025]/i])( 'does not expand uncased characters in static set when the case-insensitive flag is included: %p', (input: RegExp) => { @@ -860,6 +941,14 @@ describe('expand', () => { } ); + it.each([/[©×]/i, /[\251\327]/i, /[\xA9\xD7]/i, /[\u00A9\u00D7]/i])( + 'does not expand uncased extended ASCII characters in static set when the case-insensitive flag is included: %p', + (input: RegExp) => { + const result = expandAll(input); + expect(result).toEqual(['©', '×']); + } + ); + it.each([/[a-d]/, /[\141-\144]/, /[\x61-\x64]/, /[\u0061-\u0064]/])( 'expands exact casing in range set when the case-insensitive flag is omitted: %p', (input: RegExp) => { @@ -868,6 +957,14 @@ describe('expand', () => { } ); + it.each([/[Ì-Ï]/, /[\314-\317]/, /[\xCC-\xCF]/, /[\u00CC-\u00CF]/])( + 'expands extended ASCII exact casing in range set when the case-insensitive flag is omitted: %p', + (input: RegExp) => { + const result = expandAll(input); + expect(result).toEqual(['Ì', 'Í', 'Î', 'Ï']); + } + ); + it.each([/[a-d]/i, /[\141-\144]/i, /[\x61-\x64]/i, /[\u0061-\u0064]/i])( 'expands casing variants in range set when the case-insensitive flag is included: %p', (input: RegExp) => { @@ -876,6 +973,14 @@ describe('expand', () => { } ); + it.each([/[Ì-Ï]/i, /[\314-\317]/i, /[\xCC-\xCF]/i, /[\u00CC-\u00CF]/i])( + 'expands extended ASCII casing variants in range set when the case-insensitive flag is included: %p', + (input: RegExp) => { + const result = expandAll(input); + expect(result).toEqual(['ì', 'Ì', 'í', 'Í', 'î', 'Î', 'ï', 'Ï']); + } + ); + it.each([/[1-4]/i, /[\61-\64]/i, /[\x31-\x34]/i, /[\u0031-\u0034]/i])( 'does not expand uncased characters in range set when the case-insensitive flag is included: %p', (input: RegExp) => { @@ -884,6 +989,14 @@ describe('expand', () => { } ); + it.each([/[¼-¿]/i, /[\274-\277]/i, /[\xBC-\xBF]/i, /[\u00BC-\u00BF]/i])( + 'does not expand extended ASCII uncased characters in range set when the case-insensitive flag is included: %p', + (input: RegExp) => { + const result = expandAll(input); + expect(result).toEqual(['¼', '½', '¾', '¿']); + } + ); + it.each(['.', /./])( 'does not expand the dot character to a newline when the dotall flag is omitted %#', (input: string | RegExp) => { diff --git a/src/transforms/meta-to-char-class-transform.ts b/src/transforms/meta-to-char-class-transform.ts index db04b5e..a271015 100644 --- a/src/transforms/meta-to-char-class-transform.ts +++ b/src/transforms/meta-to-char-class-transform.ts @@ -62,7 +62,10 @@ const replacer: NodeReplacer = { const optionsAlpha = [createClassRange('a', 'z'), createClassRange('A', 'Z')]; const optionsDigit = createClassRange('0', '9'); const optionUnderscore = createEscapedSimpleChar('_'); -const optionsWhitespaceNoBreak = createSimpleChars(' \t'); +const optionsWhitespaceNoBreak = [ + ...createSimpleChars(' \t'), + createSimpleChar(String.fromCharCode(160)), //   +]; const optionsWhitespace = [ ...optionsWhitespaceNoBreak, ...createSimpleChars('\r\n'), @@ -74,6 +77,13 @@ const optionsOther = [ createEscapedSimpleChar('\\'), ]; const optionsNewLine = createSimpleChar('\n'); +const optionsExtendedAscii = [ + ...createSimpleChars('àáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ'), + ...createSimpleChars('ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞß'), + ...createSimpleChars('¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿'), + ...createSimpleChars('€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ×÷'), + createSimpleChar(String.fromCharCode(173)), // ­ +]; function getMetaCharExpressions( metaChar: SpecialChar, @@ -89,13 +99,14 @@ function getMetaCharExpressions( ...optionsWhitespaceNoBreak, ...optionsOther, optionUnderscore, + ...optionsExtendedAscii, ...dotAllOptions, ]; } case '\\w': return [...optionsAlpha, optionsDigit, optionUnderscore]; case '\\W': - return [...optionsWhitespace, ...optionsOther]; + return [...optionsWhitespace, ...optionsOther, ...optionsExtendedAscii]; case '\\d': return [optionsDigit]; case '\\D': @@ -104,11 +115,18 @@ function getMetaCharExpressions( ...optionsWhitespace, ...optionsOther, optionUnderscore, + ...optionsExtendedAscii, ]; case '\\s': return optionsWhitespace; case '\\S': - return [...optionsAlpha, optionsDigit, ...optionsOther, optionUnderscore]; + return [ + ...optionsAlpha, + optionsDigit, + ...optionsOther, + optionUnderscore, + ...optionsExtendedAscii, + ]; default: return []; } From c3509ddfed9a7fd362dc29bfd4abd87e8680def5 Mon Sep 17 00:00:00 2001 From: Drew Keller Date: Fri, 25 Sep 2020 21:24:44 -0500 Subject: [PATCH 3/5] Reuse Char constants in meta transformer --- src/constants/index.ts | 39 +++++++---- .../meta-to-char-class-transform.ts | 70 ++++++++----------- src/transforms/utils.ts | 14 +--- 3 files changed, 55 insertions(+), 68 deletions(-) diff --git a/src/constants/index.ts b/src/constants/index.ts index 9f7de09..89eccff 100644 --- a/src/constants/index.ts +++ b/src/constants/index.ts @@ -3,9 +3,12 @@ const whitespace = ` \t\r\n${nbsp}`.split(''); const digits = '0123456789'.split(''); +const underscore = '_'; const basicLowercase = 'abcdefghijklmnopqrstuvwxyz'.split(''); const basicUppercase = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'.split(''); -const basicSpecial = '~`!@#$%^&*()-_=+<,>.?/[]{}|\\:;"\''.split(''); +const basicAlpha = [...basicLowercase, ...basicUppercase, underscore]; + +const basicSpecial = '~`!@#$%^&*()-=+<,>.?/[]{}|\\:;"\''.split(''); const extendedLowercase = 'àáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ'.split(''); const extendedUppercase = 'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞß'.split(''); @@ -16,16 +19,26 @@ const extendedSpecial = `¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼ // https://www.ascii-code.com/#extendedASCIIDescription const windows1252Special = '€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ'.split(''); -const all = ([] as string[]).concat( - whitespace, - digits, - basicLowercase, - basicUppercase, - basicSpecial, - extendedLowercase, - extendedUppercase, - extendedSpecial, - windows1252Special -); +const extended = [ + ...extendedLowercase, + ...extendedUppercase, + ...extendedSpecial, + ...windows1252Special, +]; -export const Chars = { all }; +const all = [ + ...whitespace, + ...digits, + ...basicAlpha, + ...basicSpecial, + ...extended, +]; + +export const Chars = { + all, + basicAlpha, + basicSpecial, + digits, + extended, + whitespace, +}; diff --git a/src/transforms/meta-to-char-class-transform.ts b/src/transforms/meta-to-char-class-transform.ts index a271015..e6b425a 100644 --- a/src/transforms/meta-to-char-class-transform.ts +++ b/src/transforms/meta-to-char-class-transform.ts @@ -7,13 +7,9 @@ import { CharacterClass, SpecialChar, } from 'regexp-tree/ast'; +import { Chars } from '../constants'; import * as Guards from '../types/regexp-tree-guards'; -import { - createClassRange, - createEscapedSimpleChar, - createSimpleChar, - createSimpleChars, -} from './utils'; +import { createEscapedSimpleChar, createSimpleChar } from './utils'; type Replace = ( parentNode: AsExpression, @@ -59,73 +55,63 @@ const replacer: NodeReplacer = { }, }; -const optionsAlpha = [createClassRange('a', 'z'), createClassRange('A', 'Z')]; -const optionsDigit = createClassRange('0', '9'); -const optionUnderscore = createEscapedSimpleChar('_'); -const optionsWhitespaceNoBreak = [ - ...createSimpleChars(' \t'), - createSimpleChar(String.fromCharCode(160)), //   -]; -const optionsWhitespace = [ - ...optionsWhitespaceNoBreak, - ...createSimpleChars('\r\n'), -]; +const optionsAlpha = Chars.basicAlpha.map(createSimpleChar); +const optionsDigit = Chars.digits.map(createSimpleChar); + +const optionsWhitespace = Chars.whitespace.map(createSimpleChar); + +const needEscape = [']', '-', '\\']; +const noEscape = Chars.basicSpecial.filter(c => !needEscape.includes(c)); const optionsOther = [ - ...createSimpleChars('~`!@#$%^&*()=+<,>.?/[{}|:;"\''), - createEscapedSimpleChar(']'), - createEscapedSimpleChar('-'), - createEscapedSimpleChar('\\'), -]; -const optionsNewLine = createSimpleChar('\n'); -const optionsExtendedAscii = [ - ...createSimpleChars('àáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ'), - ...createSimpleChars('ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞß'), - ...createSimpleChars('¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿'), - ...createSimpleChars('€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ×÷'), - createSimpleChar(String.fromCharCode(173)), // ­ + ...noEscape.map(createSimpleChar), + ...needEscape.map(createEscapedSimpleChar), ]; +const optionsExtended = Chars.extended.map(createSimpleChar); + function getMetaCharExpressions( metaChar: SpecialChar, regExpFlags: string ): CharacterClass['expressions'] { switch (metaChar.value) { case '.': { - const dotAllOptions = regExpFlags.includes('s') ? [optionsNewLine] : []; + const optionsNewLine = createSimpleChar('\n'); + const optionsDotAll = regExpFlags.includes('s') ? [optionsNewLine] : []; + const whitespaceNoBreaks = Chars.whitespace.filter( + c => !'\r\n'.includes(c) + ); + const optionsWhitespaceNoBreak = whitespaceNoBreaks.map(createSimpleChar); return [ ...optionsAlpha, - optionsDigit, + ...optionsDigit, ...optionsWhitespaceNoBreak, ...optionsOther, - optionUnderscore, - ...optionsExtendedAscii, - ...dotAllOptions, + ...optionsExtended, + ...optionsDotAll, ]; } case '\\w': - return [...optionsAlpha, optionsDigit, optionUnderscore]; + return [...optionsAlpha, ...optionsDigit]; case '\\W': - return [...optionsWhitespace, ...optionsOther, ...optionsExtendedAscii]; + return [...optionsWhitespace, ...optionsOther, ...optionsExtended]; case '\\d': - return [optionsDigit]; + return optionsDigit; case '\\D': return [ ...optionsAlpha, ...optionsWhitespace, ...optionsOther, - optionUnderscore, - ...optionsExtendedAscii, + ...optionsExtended, ]; case '\\s': return optionsWhitespace; case '\\S': return [ ...optionsAlpha, - optionsDigit, + ...optionsDigit, ...optionsOther, - optionUnderscore, - ...optionsExtendedAscii, + ...optionsExtended, ]; default: return []; diff --git a/src/transforms/utils.ts b/src/transforms/utils.ts index 61e91fa..fb76dca 100644 --- a/src/transforms/utils.ts +++ b/src/transforms/utils.ts @@ -1,4 +1,4 @@ -import { ClassRange, SimpleChar } from 'regexp-tree/ast'; +import { SimpleChar } from 'regexp-tree/ast'; export function createSimpleChar(value: string): SimpleChar { if (value.length !== 1) throw new Error('value must be a char'); @@ -18,15 +18,3 @@ export function createEscapedSimpleChar(value: string): SimpleChar { escaped: true, }; } - -export function createSimpleChars(values: string): SimpleChar[] { - return values.split('').map(createSimpleChar); -} - -export function createClassRange(from: string, to: string): ClassRange { - return { - from: createSimpleChar(from), - to: createSimpleChar(to), - type: 'ClassRange', - }; -} From fd8c8be78000f2eed35a92743842c51cf62852aa Mon Sep 17 00:00:00 2001 From: Drew Keller Date: Sat, 26 Sep 2020 00:19:15 -0500 Subject: [PATCH 4/5] Enhance test for all supported characters --- src/pattern.spec.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/pattern.spec.ts b/src/pattern.spec.ts index 4abb517..4c7ecd3 100644 --- a/src/pattern.spec.ts +++ b/src/pattern.spec.ts @@ -608,11 +608,12 @@ describe('expand', () => { } ); - it.each([/(.|\r)/s, /[\s\S]/])( + it.each([/(.|\r)/s, /[\s\S]/, /[\w\W]/, /[\d\D]/])( 'includes all supported characters in %p', regex => { const result = expandAll(regex); + expect(result).toHaveLength(Chars.all.length); Chars.all.forEach(char => { expect(result).toContain(char); }); From 60bc0959462c99e7ecc0d823701495cca2bca390 Mon Sep 17 00:00:00 2001 From: Drew Keller Date: Sat, 26 Sep 2020 00:58:29 -0500 Subject: [PATCH 5/5] Bump major version to 2.0.0 --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 16b1213..f58e1e7 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "regex-to-strings", - "version": "1.1.0", + "version": "2.0.0", "repository": { "type": "git", "url": "git+https://github.com/wimpyprogrammer/regex-to-strings.git"