Skip to content

Commit 19e186f

Browse files
committed
Update preg_match_all.js
Placed private functions in a closure to avoid polluting the global scope.
1 parent d2ea72b commit 19e186f

File tree

1 file changed

+190
-163
lines changed

1 file changed

+190
-163
lines changed

workbench/pcre/preg_match_all.js

Lines changed: 190 additions & 163 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
var preg_match_all = (function() {
  //  discuss at:
  // original by: Camille Hodoul (http://webdev-snippets.net)
  //        note: The pattern must be a RegExp object.
  //        note: [header lines 5-12 are collapsed in the diff view this file was taken from]
  //   returns 1: ['X-MyHeader','X-AZE','USERAGENT'],
  //   returns 1: ['MyValue','adqdsdfff','Chrome123123 é']]

  // UNFINISHED this is a work in progress.
  // It's using Javascript's regex engine, which is different from PHP's PCRE.

  // All helpers below are declared in the closure, to avoid polluting the global scope.

  /**
   * Returns the flag letters of a RegExp (e.g. "gi").
   * Uses the `flags` accessor when the engine provides it, otherwise reads
   * them from the textual form "/source/flags" (flags never contain "/").
   */
  function _flags_of(re) {
    if (typeof re.flags === 'string') {
      return re.flags;
    }
    var text = String(re);
    return text.slice(text.lastIndexOf('/') + 1);
  }

  /**
   * Returns the number of capturing groups in a pattern.
   *
   * The pattern is compiled by the same engine that will run it, so the count
   * always agrees with what exec() reports. Appending an empty alternative
   * guarantees a match on the empty string; the result array then has one
   * slot for the whole match plus one slot per capturing group.
   */
  function _count_groups(source, flags) {
    // g / y / d only affect how a search is driven, not how the pattern parses.
    var probe = new RegExp(source + '|', flags.replace(/[gyd]/g, ''));
    return probe.exec('').length - 1;
  }

  /**
   * Builds a private, global copy of the caller's pattern.
   * A copy is used so the caller's lastIndex is never touched; "g" is forced
   * because the function has to walk over every match. When the engine
   * supports it, "d" (hasIndices) is added so exact group offsets are known.
   */
  function _make_scanner(source, flags) {
    if (flags.indexOf('g') === -1) {
      flags += 'g';
    }
    if (flags.indexOf('d') === -1) {
      try {
        return new RegExp(source, flags + 'd');
      } catch (unsupported) {
        // Feature probe only: this engine has no "d" flag, fall through.
      }
    }
    return new RegExp(source, flags);
  }

  /**
   * Returns the index following `index`, stepping over a whole surrogate pair
   * when the pattern runs in a unicode mode. Used after an empty match so the
   * scan always makes progress.
   */
  function _next_index(str, index, unicode) {
    if (unicode && index + 1 < str.length) {
      var hi = str.charCodeAt(index);
      var lo = str.charCodeAt(index + 1);
      if (hi >= 0xD800 && hi <= 0xDBFF && lo >= 0xDC00 && lo <= 0xDFFF) {
        return index + 2;
      }
    }
    return index + 1;
  }

  /**
   * Returns the offset in `subject` of group `g` of match `m`, or -1 when the
   * group did not take part in the match.
   */
  function _group_offset(m, g, subject) {
    if (g === 0) {
      return m.index;
    }
    if (typeof m[g] === 'undefined') {
      return -1;
    }
    if (m.indices && m.indices[g]) {
      return m.indices[g][0];
    }
    // No hasIndices support: best effort, first occurrence from the match start on.
    return subject.indexOf(m[g], m.index);
  }

  /**
   * Finds every match of `pattern` in `s`.
   *
   * @param {RegExp} pattern  Pattern to search for (scanned globally whatever its flags).
   * @param {string} s        Subject string; null/undefined is treated as ''.
   * @param {string} [flag]   'PREG_PATTERN_ORDER' (default), 'PREG_SET_ORDER' or
   *                          'PREG_OFFSET_CAPTURE'. Any other value yields [].
   * @param {number} [offset] Index in `s` at which the search starts; values
   *                          that are not greater than 0 mean "from the start".
   * @returns {Array} PREG_PATTERN_ORDER: one array per group (index 0 = full
   *                  matches). PREG_SET_ORDER: one array per match.
   *                  PREG_OFFSET_CAPTURE: like PREG_PATTERN_ORDER but every
   *                  entry is [text, offset].
   * @throws {TypeError} when `pattern` is not a RegExp.
   */
  return function(pattern, s, flag, offset) {
    if (Object.prototype.toString.call(pattern) !== '[object RegExp]') {
      throw new TypeError('preg_match_all(): pattern must be a RegExp object');
    }

    var order = flag ? String(flag) : 'PREG_PATTERN_ORDER';
    // TODO support flag combination
    var subject = (s === null || typeof s === 'undefined') ? '' : String(s);
    var flags = _flags_of(pattern);
    var scanner = _make_scanner(pattern.source, flags);
    var byGroup = (order === 'PREG_PATTERN_ORDER' || order === 'PREG_OFFSET_CAPTURE');
    var matches = [];
    var m;
    var g;

    if (!byGroup && order !== 'PREG_SET_ORDER') {
      return matches;
    }

    // Group-ordered results always expose n+1 arrays (n = number of capturing
    // groups), even when nothing matches.
    if (byGroup) {
      var groupCount = _count_groups(pattern.source, flags);
      for (g = 0; g <= groupCount; g++) {
        matches[g] = [];
      }
    }

    // Start the scan at the requested offset instead of rewriting the pattern,
    // so reported matches and offsets stay relative to the whole subject.
    scanner.lastIndex = offset > 0 ? Math.floor(offset) : 0;

    while ((m = scanner.exec(subject)) !== null) {
      if (m[0] === '') {
        // An empty match leaves lastIndex in place; step forward to avoid looping forever.
        scanner.lastIndex = _next_index(subject, scanner.lastIndex, scanner.unicode || scanner.unicodeSets);
      }

      if (order === 'PREG_SET_ORDER') {
        // Copy to a plain array: drops the index/input/groups extras of the exec result.
        matches.push(Array.prototype.slice.call(m));
      } else if (order === 'PREG_PATTERN_ORDER') {
        for (g = 0; g < m.length; g++) {
          matches[g].push(m[g]);
        }
      } else {
        for (g = 0; g < m.length; g++) {
          matches[g].push([m[g], _group_offset(m, g, subject)]);
        }
      }
    }

    return matches;
  };
})();

0 commit comments

Comments
 (0)