# Patch summary (text before the first "diff --git" header is not part of the patch):
#   * package.json: version 1.1.0 -> 2.0.0.
#   * src/constants/index.ts (new): exports `Chars`, the character lists
#     (whitespace, digits, basicAlpha, basicSpecial, extended, all).
#   * src/expanders/character-class-pattern.ts and
#     src/transforms/meta-to-char-class-transform.ts: build their character
#     options from `Chars` instead of local string literals / class ranges.
#   * src/transforms/utils.ts: removes createSimpleChars and createClassRange.
#   * src/pattern.spec.ts: adds cases for characters above the basic ASCII range
#     and pins exact result counts for negated sets.
#
# NOTE(review): this file was restored from a whitespace-collapsed copy. Line
#   breaks and hunk line counts were re-derived from the hunk headers, but the
#   leading indentation (tabs assumed) and the exact position of blank context
#   lines are reconstructed -- confirm against the repository, or apply with
#   `git apply --ignore-whitespace`.
# NOTE(review): in getMetaCharExpressions, '.' with the `s` flag adds back only
#   '\n' although '\r' is filtered out as well; the new spec reaches the full
#   set through /(.|\r)/s. Confirm whether '\r' should be part of the dotAll
#   options.
#
diff --git a/package.json b/package.json
index 16b1213..f58e1e7 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
 	"name": "regex-to-strings",
-	"version": "1.1.0",
+	"version": "2.0.0",
 	"repository": {
 		"type": "git",
 		"url": "git+https://github.com/wimpyprogrammer/regex-to-strings.git"
diff --git a/src/constants/index.ts b/src/constants/index.ts
new file mode 100644
index 0000000..89eccff
--- /dev/null
+++ b/src/constants/index.ts
@@ -0,0 +1,44 @@
+const nbsp = String.fromCharCode(160);
+const whitespace = ` \t\r\n${nbsp}`.split('');
+
+const digits = '0123456789'.split('');
+
+const underscore = '_';
+const basicLowercase = 'abcdefghijklmnopqrstuvwxyz'.split('');
+const basicUppercase = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'.split('');
+const basicAlpha = [...basicLowercase, ...basicUppercase, underscore];
+
+const basicSpecial = '~`!@#$%^&*()-=+<,>.?/[]{}|\\:;"\''.split('');
+
+const extendedLowercase = 'àáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ'.split('');
+const extendedUppercase = 'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞß'.split('');
+const shy = String.fromCharCode(173);
+const extendedSpecial = `¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿×÷${shy}`.split('');
+
+// Special Windows-1252 display characters in the extended ASCII range
+// https://www.ascii-code.com/#extendedASCIIDescription
+const windows1252Special = '€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ'.split('');
+
+const extended = [
+	...extendedLowercase,
+	...extendedUppercase,
+	...extendedSpecial,
+	...windows1252Special,
+];
+
+const all = [
+	...whitespace,
+	...digits,
+	...basicAlpha,
+	...basicSpecial,
+	...extended,
+];
+
+export const Chars = {
+	all,
+	basicAlpha,
+	basicSpecial,
+	digits,
+	extended,
+	whitespace,
+};
diff --git a/src/expanders/character-class-pattern.ts b/src/expanders/character-class-pattern.ts
index 1b8888a..0502770 100644
--- a/src/expanders/character-class-pattern.ts
+++ b/src/expanders/character-class-pattern.ts
@@ -1,4 +1,5 @@
 import { CharacterClass } from 'regexp-tree/ast';
+import { Chars } from '../constants';
 import Expander from '../Expander';
 import Expansion from '../Expansion';
 import sortRandom from '../sorts/fisher-yates-random';
@@ -18,16 +19,7 @@ function getReferencedCodePoints(
 	return [expression.codePoint];
 }
 
-const allCharOptions =
-	' \t\r\n' +
-	'abcdefghijklmnopqrstuvwxyz' +
-	'ABCDEFGHIJKLMNOPQRSTUVWXYZ' +
-	'0123456789' +
-	'~`!@#$%^&*()-_=+<,>.?/[]{}|\\:;"\'';
-
-const allCodePointOptions = allCharOptions
-	.split('')
-	.map(char => char.charCodeAt(0));
+const allCodePointOptions = Chars.all.map(char => char.charCodeAt(0));
 
 /**
  * Expand an expression which represents a single character from a
diff --git a/src/pattern.spec.ts b/src/pattern.spec.ts
index 7efac14..4c7ecd3 100644
--- a/src/pattern.spec.ts
+++ b/src/pattern.spec.ts
@@ -1,4 +1,5 @@
 import { when } from 'jest-when';
+import { Chars } from './constants';
 import Expansion from './Expansion';
 import { fill } from './helpers/utils';
 import * as patternLib from './pattern';
@@ -149,13 +150,13 @@ describe('expand', () => {
 	);
 
 	it('reproduces static patterns', () => {
-		const result = expandAll('abc');
-		expect(result).toEqual(['abc']);
+		const result = expandAll('abcáï®');
+		expect(result).toEqual(['abcáï®']);
 	});
 
 	it('reproduces static alternation patterns', () => {
-		const result = expandAll('abc|xyz');
-		expect(result).toEqual(['abc', 'xyz']);
+		const result = expandAll('abc†|xyz‡');
+		expect(result).toEqual(['abc†', 'xyz‡']);
 	});
 
 	it('expands single-character groups', () => {
@@ -164,8 +165,8 @@ describe('expand', () => {
 	});
 
 	it('expands multi-character groups', () => {
-		const result = expandAll('foo(bar)');
-		expect(result).toEqual(['foobar']);
+		const result = expandAll('foo(bar½)');
+		expect(result).toEqual(['foobar½']);
 	});
 
 	it('expands single-character alternation groups', () => {
@@ -174,8 +175,8 @@ describe('expand', () => {
 	});
 
 	it('expands multi-character alternation groups', () => {
-		const result = expandAll('b(ar|az)');
-		expect(result).toEqual(['bar', 'baz']);
+		const result = expandAll('b(ar†|az‡)');
+		expect(result).toEqual(['bar†', 'baz‡']);
 	});
 
 	it('expands nested alternation groups', () => {
@@ -249,6 +250,14 @@ describe('expand', () => {
 		}
 	);
 
+	it.each([/ab…+/, /ab…+?/])(
+		'expands repeating extended ASCII character %p',
+		(repeat: RegExp) => {
+			const result = expandN(repeat, 5);
+			expect(result).toEqual(['ab…', 'ab……', 'ab………', 'ab…………', 'ab……………']);
+		}
+	);
+
 	it('expands alphabetic single-character set', () => {
 		const result = expandAll('[aeiou]');
 		expect(result).toEqual(['a', 'e', 'i', 'o', 'u']);
@@ -259,6 +268,11 @@ describe('expand', () => {
 		expect(result).toEqual(['2', '3', '4', '7', '8', '9']);
 	});
 
+	it('expands extended ASCII single-character set', () => {
+		const result = expandAll('[ÁÉÍÓÚÝ]');
+		expect(result).toEqual(['Á', 'É', 'Í', 'Ó', 'Ú', 'Ý']);
+	});
+
 	it('expands alphabetic range character set', () => {
 		const result = expandAll('[a-f]');
 		expect(result).toEqual(['a', 'b', 'c', 'd', 'e', 'f']);
@@ -269,6 +283,11 @@ describe('expand', () => {
 		expect(result).toEqual(['0', '1', '2', '3', '4', '5']);
 	});
 
+	it('expands extended ASCII range character set', () => {
+		const result = expandAll('[À-Æ]');
+		expect(result).toEqual(['À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ']);
+	});
+
 	it('expands permutations of sibling character sets', () => {
 		const result = expandAll('[ab]c[de]f');
 		expect(result).toEqual(['acdf', 'acef', 'bcdf', 'bcef']);
@@ -363,7 +382,7 @@ describe('expand', () => {
 		}
 
 		const result = expandAll('[^abc]');
-		expect(result.length).toBeGreaterThan(1);
+		expect(result).toHaveLength(218);
 		result.forEach(testExpansion);
 	});
 
@@ -374,7 +393,18 @@ describe('expand', () => {
 		}
 
 		const result = expandAll('[^246]');
-		expect(result.length).toBeGreaterThan(1);
+		expect(result).toHaveLength(218);
+		result.forEach(testExpansion);
+	});
+
+	it('expands negated extended ASCII character set', () => {
+		function testExpansion(expansion: string) {
+			expect(expansion).toHaveLength(1);
+			expect(expansion).toMatch(/[^ÁÉÍÓÚÝ]/);
+		}
+
+		const result = expandAll('[^ÁÉÍÓÚÝ]');
+		expect(result).toHaveLength(215);
 		result.forEach(testExpansion);
 	});
 
@@ -385,7 +415,7 @@ describe('expand', () => {
 		}
 
 		const result = expandAll('[^a-p]');
-		expect(result.length).toBeGreaterThan(1);
+		expect(result).toHaveLength(205);
 		result.forEach(testExpansion);
 	});
 
@@ -396,18 +426,29 @@ describe('expand', () => {
 		}
 
 		const result = expandAll('[^0-8]');
-		expect(result.length).toBeGreaterThan(1);
+		expect(result).toHaveLength(212);
+		result.forEach(testExpansion);
+	});
+
+	it('expands negated extended ASCII range character set', () => {
+		function testExpansion(expansion: string) {
+			expect(expansion).toHaveLength(1);
+			expect(expansion).toMatch(/[^À-Æ]/);
+		}
+
+		const result = expandAll('[^À-Æ]');
+		expect(result).toHaveLength(214);
 		result.forEach(testExpansion);
 	});
 
 	it('expands negated character set with multiple ranges', () => {
 		function testExpansion(expansion: string) {
 			expect(expansion).toHaveLength(1);
-			expect(expansion).toMatch(/[^aeiou0-5A-T]/);
+			expect(expansion).toMatch(/[^aeiou0-5A-Tð-ö]/);
 		}
 
-		const result = expandAll('[^aeiou0-5A-T]');
-		expect(result.length).toBeGreaterThan(1);
+		const result = expandAll('[^aeiou0-5A-Tð-ö]');
+		expect(result).toHaveLength(183);
 		result.forEach(testExpansion);
 	});
 
@@ -438,10 +479,14 @@ describe('expand', () => {
 	it.each([/./, /\w/, /\W/, /\d/, /\D/, /\s/, /\S/])(
 		'expands the single character class %p',
 		(charClass: RegExp) => {
+			function testExpansion(expansion: string) {
+				expect(expansion).toHaveLength(1);
+				expect(expansion).toMatch(charClass);
+			}
+
 			const result = expandAll(charClass);
 			expect(result.length).toBeGreaterThan(1);
-			expect(result[0]).toHaveLength(1);
-			expect(result[0]).toMatch(charClass);
+			result.forEach(testExpansion);
 		}
 	);
 
@@ -490,8 +535,9 @@ describe('expand', () => {
 	it.each([/\w\w\w/, /\w\d\s/, /\W\w\w/, /\W\D\S/, /\s\w\S/, /\d\W\D/])(
 		'expands the multiple character class %p',
 		(charClassSet: RegExp) => {
-			const result = expandAll(charClassSet);
-			expect(result.length).toBeGreaterThan(1);
+			// Too many possible combinations - limit to 1,000
+			const result = expandN(charClassSet, 1000);
+			expect(result).toHaveLength(1000);
 			expect(result[0]).toHaveLength(3);
 			expect(result[0]).toMatch(charClassSet);
 		}
@@ -562,6 +608,18 @@ describe('expand', () => {
 		}
 	);
 
+	it.each([/(.|\r)/s, /[\s\S]/, /[\w\W]/, /[\d\D]/])(
+		'includes all supported characters in %p',
+		regex => {
+			const result = expandAll(regex);
+
+			expect(result).toHaveLength(Chars.all.length);
+			Chars.all.forEach(char => {
+				expect(result).toContain(char);
+			});
+		}
+	);
+
 	it('expands repeated character class', () => {
 		const allTwoDigitNumbers = fill(0, 99).map(num =>
 			num.toString().padStart(2, '0')
@@ -717,7 +775,7 @@ describe('expand', () => {
 	});
 
 	it('is performant', () => {
-		const trial = () => expand(/([ab]|(c|[d-e]){2,3})(\w?) \1/);
+		const trial = () => expand(/([ab]|(c|[ù-ü]){2,3})(\w?) \1/);
 		const averageTime = measureAverageTime(trial, 5);
 		expect(averageTime).toBeLessThanOrEqual(10);
 	});
@@ -750,7 +808,7 @@ describe('expand', () => {
 		[/a{0,5}/, ['', 'a', 'aa', 'aaa', 'aaaa', 'aaaaa']],
 		[/[ab]{3}/, ['aaa', 'aab', 'aba', 'abb', 'baa', 'bab', 'bba', 'bbb']],
 		[/(a|b|c|d|e|f|g)/, ['a', 'b', 'c', 'd', 'e', 'f', 'g']],
-		[/aAa/i, ['aaa', 'aaA', 'aAa', 'Aaa', 'aAA', 'AaA', 'AAa', 'AAA']],
+		[/aAä/i, ['aaä', 'aaÄ', 'aAä', 'Aaä', 'aAÄ', 'AaÄ', 'AAä', 'AAÄ']],
 		[/[A-I]/, ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I']],
 	])(
 		'sorts patterns without losing accuracy: %p',
@@ -804,6 +862,14 @@ describe('expand', () => {
 		}
 	);
 
+	it.each([/àÑ/, /\340\321/, /\xE0\xD1/, /\u00E0\u00D1/])(
+		'expands extended ASCII exact casing when the case-insensitive flag is omitted: %p',
+		(input: RegExp) => {
+			const result = expandAll(input);
+			expect(result).toEqual(['àÑ']);
+		}
+	);
+
 	it.each([/aB/i, /\141\102/i, /\x61\x42/i, /\u0061\u0042/i])(
 		'expands casing variants when the case-insensitive flag is included: %p',
 		(input: RegExp) => {
@@ -812,6 +878,14 @@ describe('expand', () => {
 		}
 	);
 
+	it.each([/àÑ/i, /\340\321/i, /\xE0\xD1/i, /\u00E0\u00D1/i])(
+		'expands extended ASCII casing variants when the case-insensitive flag is included: %p',
+		(input: RegExp) => {
+			const result = expandAll(input);
+			expect(result).toEqual(['àñ', 'àÑ', 'Àñ', 'ÀÑ']);
+		}
+	);
+
 	it.each([/4%/i, /\64\45/i, /\x34\x25/i, /\u0034\u0025/i])(
 		'does not expand uncased characters when the case-insensitive flag is included: %p',
 		(input: RegExp) => {
@@ -820,6 +894,14 @@ describe('expand', () => {
 		}
 	);
 
+	it.each([/©×/i, /\251\327/i, /\xA9\xD7/i, /\u00A9\u00D7/i])(
+		'does not expand uncased extended ASCII characters when the case-insensitive flag is included: %p',
+		(input: RegExp) => {
+			const result = expandAll(input);
+			expect(result).toEqual(['©×']);
+		}
+	);
+
 	it.each([/[aB]/, /[\141\102]/, /[\x61\x42]/, /[\u0061\u0042]/])(
 		'expands exact casing in static set when the case-insensitive flag is omitted: %p',
 		(input: RegExp) => {
@@ -828,6 +910,14 @@ describe('expand', () => {
 		}
 	);
 
+	it.each([/[àÑ]/, /[\340\321]/, /[\xE0\xD1]/, /[\u00E0\u00D1]/])(
+		'expands extended ASCII exact casing in static set when the case-insensitive flag is omitted: %p',
+		(input: RegExp) => {
+			const result = expandAll(input);
+			expect(result).toEqual(['à', 'Ñ']);
+		}
+	);
+
 	it.each([/[aB]/i, /[\141\102]/i, /[\x61\x42]/i, /[\u0061\u0042]/i])(
 		'expands casing variants in static set when the case-insensitive flag is included: %p',
 		(input: RegExp) => {
@@ -836,6 +926,14 @@ describe('expand', () => {
 		}
 	);
 
+	it.each([/[àÑ]/i, /[\340\321]/i, /[\xE0\xD1]/i, /[\u00E0\u00D1]/i])(
+		'expands extended ASCII casing variants in static set when the case-insensitive flag is included: %p',
+		(input: RegExp) => {
+			const result = expandAll(input);
+			expect(result).toEqual(['à', 'À', 'ñ', 'Ñ']);
+		}
+	);
+
 	it.each([/[4%]/i, /[\64\45]/i, /[\x34\x25]/i, /[\u0034\u0025]/i])(
 		'does not expand uncased characters in static set when the case-insensitive flag is included: %p',
 		(input: RegExp) => {
@@ -844,6 +942,14 @@ describe('expand', () => {
 		}
 	);
 
+	it.each([/[©×]/i, /[\251\327]/i, /[\xA9\xD7]/i, /[\u00A9\u00D7]/i])(
+		'does not expand uncased extended ASCII characters in static set when the case-insensitive flag is included: %p',
+		(input: RegExp) => {
+			const result = expandAll(input);
+			expect(result).toEqual(['©', '×']);
+		}
+	);
+
 	it.each([/[a-d]/, /[\141-\144]/, /[\x61-\x64]/, /[\u0061-\u0064]/])(
 		'expands exact casing in range set when the case-insensitive flag is omitted: %p',
 		(input: RegExp) => {
@@ -852,6 +958,14 @@ describe('expand', () => {
 		}
 	);
 
+	it.each([/[Ì-Ï]/, /[\314-\317]/, /[\xCC-\xCF]/, /[\u00CC-\u00CF]/])(
+		'expands extended ASCII exact casing in range set when the case-insensitive flag is omitted: %p',
+		(input: RegExp) => {
+			const result = expandAll(input);
+			expect(result).toEqual(['Ì', 'Í', 'Î', 'Ï']);
+		}
+	);
+
 	it.each([/[a-d]/i, /[\141-\144]/i, /[\x61-\x64]/i, /[\u0061-\u0064]/i])(
 		'expands casing variants in range set when the case-insensitive flag is included: %p',
 		(input: RegExp) => {
@@ -860,6 +974,14 @@ describe('expand', () => {
 		}
 	);
 
+	it.each([/[Ì-Ï]/i, /[\314-\317]/i, /[\xCC-\xCF]/i, /[\u00CC-\u00CF]/i])(
+		'expands extended ASCII casing variants in range set when the case-insensitive flag is included: %p',
+		(input: RegExp) => {
+			const result = expandAll(input);
+			expect(result).toEqual(['ì', 'Ì', 'í', 'Í', 'î', 'Î', 'ï', 'Ï']);
+		}
+	);
+
 	it.each([/[1-4]/i, /[\61-\64]/i, /[\x31-\x34]/i, /[\u0031-\u0034]/i])(
 		'does not expand uncased characters in range set when the case-insensitive flag is included: %p',
 		(input: RegExp) => {
@@ -868,6 +990,14 @@ describe('expand', () => {
 		}
 	);
 
+	it.each([/[¼-¿]/i, /[\274-\277]/i, /[\xBC-\xBF]/i, /[\u00BC-\u00BF]/i])(
+		'does not expand extended ASCII uncased characters in range set when the case-insensitive flag is included: %p',
+		(input: RegExp) => {
+			const result = expandAll(input);
+			expect(result).toEqual(['¼', '½', '¾', '¿']);
+		}
+	);
+
 	it.each(['.', /./])(
 		'does not expand the dot character to a newline when the dotall flag is omitted %#',
 		(input: string | RegExp) => {
diff --git a/src/transforms/meta-to-char-class-transform.ts b/src/transforms/meta-to-char-class-transform.ts
index db04b5e..e6b425a 100644
--- a/src/transforms/meta-to-char-class-transform.ts
+++ b/src/transforms/meta-to-char-class-transform.ts
@@ -7,13 +7,9 @@ import {
 	CharacterClass,
 	SpecialChar,
 } from 'regexp-tree/ast';
+import { Chars } from '../constants';
 import * as Guards from '../types/regexp-tree-guards';
-import {
-	createClassRange,
-	createEscapedSimpleChar,
-	createSimpleChar,
-	createSimpleChars,
-} from './utils';
+import { createEscapedSimpleChar, createSimpleChar } from './utils';
 
 type Replace = (
 	parentNode: AsExpression,
@@ -59,21 +55,19 @@ const replacer: NodeReplacer = {
 	},
 };
 
-const optionsAlpha = [createClassRange('a', 'z'), createClassRange('A', 'Z')];
-const optionsDigit = createClassRange('0', '9');
-const optionUnderscore = createEscapedSimpleChar('_');
-const optionsWhitespaceNoBreak = createSimpleChars(' \t');
-const optionsWhitespace = [
-	...optionsWhitespaceNoBreak,
-	...createSimpleChars('\r\n'),
-];
+const optionsAlpha = Chars.basicAlpha.map(createSimpleChar);
+const optionsDigit = Chars.digits.map(createSimpleChar);
+
+const optionsWhitespace = Chars.whitespace.map(createSimpleChar);
+
+const needEscape = [']', '-', '\\'];
+const noEscape = Chars.basicSpecial.filter(c => !needEscape.includes(c));
 const optionsOther = [
-	...createSimpleChars('~`!@#$%^&*()=+<,>.?/[{}|:;"\''),
-	createEscapedSimpleChar(']'),
-	createEscapedSimpleChar('-'),
-	createEscapedSimpleChar('\\'),
+	...noEscape.map(createSimpleChar),
+	...needEscape.map(createEscapedSimpleChar),
 ];
-const optionsNewLine = createSimpleChar('\n');
+
+const optionsExtended = Chars.extended.map(createSimpleChar);
 
 function getMetaCharExpressions(
 	metaChar: SpecialChar,
@@ -81,34 +75,44 @@ function getMetaCharExpressions(
 ): CharacterClass['expressions'] {
 	switch (metaChar.value) {
 		case '.': {
-			const dotAllOptions = regExpFlags.includes('s') ? [optionsNewLine] : [];
+			const optionsNewLine = createSimpleChar('\n');
+			const optionsDotAll = regExpFlags.includes('s') ? [optionsNewLine] : [];
+			const whitespaceNoBreaks = Chars.whitespace.filter(
+				c => !'\r\n'.includes(c)
+			);
+			const optionsWhitespaceNoBreak = whitespaceNoBreaks.map(createSimpleChar);
 
 			return [
 				...optionsAlpha,
-				optionsDigit,
+				...optionsDigit,
 				...optionsWhitespaceNoBreak,
 				...optionsOther,
-				optionUnderscore,
-				...dotAllOptions,
+				...optionsExtended,
+				...optionsDotAll,
 			];
 		}
 		case '\\w':
-			return [...optionsAlpha, optionsDigit, optionUnderscore];
+			return [...optionsAlpha, ...optionsDigit];
 		case '\\W':
-			return [...optionsWhitespace, ...optionsOther];
+			return [...optionsWhitespace, ...optionsOther, ...optionsExtended];
 		case '\\d':
-			return [optionsDigit];
+			return optionsDigit;
 		case '\\D':
 			return [
 				...optionsAlpha,
 				...optionsWhitespace,
 				...optionsOther,
-				optionUnderscore,
+				...optionsExtended,
 			];
 		case '\\s':
 			return optionsWhitespace;
 		case '\\S':
-			return [...optionsAlpha, optionsDigit, ...optionsOther, optionUnderscore];
+			return [
+				...optionsAlpha,
+				...optionsDigit,
+				...optionsOther,
+				...optionsExtended,
+			];
 		default:
 			return [];
 	}
diff --git a/src/transforms/utils.ts b/src/transforms/utils.ts
index 61e91fa..fb76dca 100644
--- a/src/transforms/utils.ts
+++ b/src/transforms/utils.ts
@@ -1,4 +1,4 @@
-import { ClassRange, SimpleChar } from 'regexp-tree/ast';
+import { SimpleChar } from 'regexp-tree/ast';
 
 export function createSimpleChar(value: string): SimpleChar {
 	if (value.length !== 1) throw new Error('value must be a char');
@@ -18,15 +18,3 @@ export function createEscapedSimpleChar(value: string): SimpleChar {
 		escaped: true,
 	};
 }
-
-export function createSimpleChars(values: string): SimpleChar[] {
-	return values.split('').map(createSimpleChar);
-}
-
-export function createClassRange(from: string, to: string): ClassRange {
-	return {
-		from: createSimpleChar(from),
-		to: createSimpleChar(to),
-		type: 'ClassRange',
-	};
-}