Skip to content

Commit e9b4ad4

Browse files
[JsonPath] Handle special whitespaces in filters
1 parent c492fc0 commit e9b4ad4

File tree

6 files changed

+543
-414
lines changed

6 files changed

+543
-414
lines changed

src/Symfony/Component/JsonPath/JsonCrawler.php

Lines changed: 137 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,11 @@ private function evaluateBracket(string $expr, mixed $value): array
133133
return [];
134134
}
135135

136-
if ('*' === $expr) {
136+
if (str_contains($expr, ',') && (str_starts_with($trimmed = trim($expr), ',') || str_ends_with($trimmed, ','))) {
137+
throw new JsonCrawlerException($expr, 'Expression cannot have leading or trailing commas');
138+
}
139+
140+
if ('*' === $expr = JsonPathUtils::normalizeWhitespace($expr)) {
137141
return array_values($value);
138142
}
139143

@@ -168,8 +172,7 @@ private function evaluateBracket(string $expr, mixed $value): array
168172
return $result;
169173
}
170174

171-
// start, end and step
172-
if (preg_match('/^(-?\d*):(-?\d*)(?::(-?\d+))?$/', $expr, $matches)) {
175+
if (preg_match('/^(-?\d*+)\s*+:\s*+(-?\d*+)(?:\s*+:\s*+(-?\d++))?$/', $expr, $matches)) {
173176
if (!array_is_list($value)) {
174177
return [];
175178
}
@@ -217,14 +220,12 @@ private function evaluateBracket(string $expr, mixed $value): array
217220

218221
// filter expressions
219222
if (preg_match('/^\?(.*)$/', $expr, $matches)) {
220-
$filterExpr = $matches[1];
221-
222-
if (preg_match('/^(\w+)\s*\([^()]*\)\s*([<>=!]+.*)?$/', $filterExpr)) {
223+
if (preg_match('/^(\w+)\s*\([^()]*\)\s*([<>=!]+.*)?$/', $filterExpr = trim($matches[1]))) {
223224
$filterExpr = "($filterExpr)";
224225
}
225226

226227
if (!str_starts_with($filterExpr, '(')) {
227-
throw new JsonCrawlerException($expr, 'Invalid filter expression');
228+
$filterExpr = "($filterExpr)";
228229
}
229230

230231
// remove outer filter parentheses
@@ -235,30 +236,30 @@ private function evaluateBracket(string $expr, mixed $value): array
235236

236237
// comma-separated values, e.g. `['key1', 'key2', 123]` or `[0, 1, 'key']`
237238
if (str_contains($expr, ',')) {
238-
$parts = $this->parseCommaSeparatedValues($expr);
239+
$parts = JsonPathUtils::parseCommaSeparatedValues($expr);
239240

240241
$result = [];
241-
$keysIndices = array_keys($value);
242-
$isList = array_is_list($value);
243242

244243
foreach ($parts as $part) {
245244
$part = trim($part);
246245

247-
if (preg_match('/^([\'"])(.*)\1$/', $part, $matches)) {
246+
if ('*' === $part) {
247+
$result = array_merge($result, array_values($value));
248+
} elseif (preg_match('/^(-?\d*+)\s*+:\s*+(-?\d*+)(?:\s*+:\s*+(-?\d++))?$/', $part, $matches)) {
249+
// slice notation
250+
$sliceResult = $this->evaluateBracket($part, $value);
251+
$result = array_merge($result, $sliceResult);
252+
} elseif (preg_match('/^([\'"])(.*)\1$/', $part, $matches)) {
248253
$key = JsonPathUtils::unescapeString($matches[2], $matches[1]);
249254

250-
if ($isList) {
255+
if (array_is_list($value)) {
256+
// for arrays, find ALL objects that contain this key
251257
foreach ($value as $item) {
252258
if (\is_array($item) && \array_key_exists($key, $item)) {
253259
$result[] = $item;
254-
break;
255260
}
256261
}
257-
258-
continue; // no results here
259-
}
260-
261-
if (\array_key_exists($key, $value)) {
262+
} elseif (\array_key_exists($key, $value)) { // for objects, get the value for this key
262263
$result[] = $value[$key];
263264
}
264265
} elseif (preg_match('/^-?\d+$/', $part)) {
@@ -268,14 +269,14 @@ private function evaluateBracket(string $expr, mixed $value): array
268269
$index = \count($value) + $index;
269270
}
270271

271-
if ($isList && \array_key_exists($index, $value)) {
272+
if (array_is_list($value) && \array_key_exists($index, $value)) {
272273
$result[] = $value[$index];
273-
continue;
274-
}
275-
276-
// numeric index on a hashmap
277-
if (isset($keysIndices[$index]) && isset($value[$keysIndices[$index]])) {
278-
$result[] = $value[$keysIndices[$index]];
274+
} else {
275+
// numeric index on a hashmap
276+
$keysIndices = array_keys($value);
277+
if (isset($keysIndices[$index]) && isset($value[$keysIndices[$index]])) {
278+
$result[] = $value[$keysIndices[$index]];
279+
}
279280
}
280281
}
281282
}
@@ -310,7 +311,31 @@ private function evaluateFilter(string $expr, mixed $value): array
310311

311312
private function evaluateFilterExpression(string $expr, mixed $context): bool
312313
{
313-
$expr = trim($expr);
314+
$expr = JsonPathUtils::normalizeWhitespace($expr);
315+
316+
// remove outer parentheses if they wrap the entire expression
317+
if (str_starts_with($expr, '(') && str_ends_with($expr, ')')) {
318+
$depth = 0;
319+
$isWrapped = true;
320+
for ($i = 0; $i < \strlen($expr); ++$i) {
321+
if ('(' === $expr[$i]) {
322+
++$depth;
323+
} elseif (')' === $expr[$i]) {
324+
--$depth;
325+
if (0 === $depth && $i < \strlen($expr) - 1) {
326+
$isWrapped = false;
327+
break;
328+
}
329+
}
330+
}
331+
if ($isWrapped) {
332+
$expr = trim(substr($expr, 1, -1));
333+
}
334+
}
335+
336+
if (str_starts_with($expr, '!')) {
337+
return !$this->evaluateFilterExpression(trim(substr($expr, 1)), $context);
338+
}
314339

315340
if (str_contains($expr, '&&')) {
316341
$parts = array_map('trim', explode('&&', $expr));
@@ -353,8 +378,8 @@ private function evaluateFilterExpression(string $expr, mixed $context): bool
353378
}
354379

355380
// function calls
356-
if (preg_match('/^(\w+)\((.*)\)$/', $expr, $matches)) {
357-
$functionName = $matches[1];
381+
if (preg_match('/^(\w++)\s*+\((.*)\)$/', $expr, $matches)) {
382+
$functionName = trim($matches[1]);
358383
if (!isset(self::RFC9535_FUNCTIONS[$functionName])) {
359384
throw new JsonCrawlerException($expr, \sprintf('invalid function "%s"', $functionName));
360385
}
@@ -369,8 +394,15 @@ private function evaluateFilterExpression(string $expr, mixed $context): bool
369394

370395
private function evaluateScalar(string $expr, mixed $context): mixed
371396
{
372-
if (is_numeric($expr)) {
373-
return str_contains($expr, '.') ? (float) $expr : (int) $expr;
397+
$expr = JsonPathUtils::normalizeWhitespace($expr);
398+
399+
if (JsonPathUtils::isJsonNumber($expr)) {
400+
return str_contains($expr, '.') || str_contains(strtolower($expr), 'e') ? (float) $expr : (int) $expr;
401+
}
402+
403+
// only validate tokens that look like standalone numbers
404+
if (preg_match('/^[\d+\-.eE]+$/', $expr) && preg_match('/\d/', $expr)) {
405+
throw new JsonCrawlerException($expr, \sprintf('Invalid number format "%s"', $expr));
374406
}
375407

376408
if ('@' === $expr) {
@@ -404,9 +436,8 @@ private function evaluateScalar(string $expr, mixed $context): mixed
404436
}
405437

406438
// function calls
407-
if (preg_match('/^(\w+)\((.*)\)$/', $expr, $matches)) {
408-
$functionName = $matches[1];
409-
if (!isset(self::RFC9535_FUNCTIONS[$functionName])) {
439+
if (preg_match('/^(\w++)\((.*)\)$/', $expr, $matches)) {
440+
if (!isset(self::RFC9535_FUNCTIONS[$functionName = trim($matches[1])])) {
410441
throw new JsonCrawlerException($expr, \sprintf('invalid function "%s"', $functionName));
411442
}
412443

@@ -416,31 +447,60 @@ private function evaluateScalar(string $expr, mixed $context): mixed
416447
return null;
417448
}
418449

419-
private function evaluateFunction(string $name, string $args, array $context): mixed
450+
private function evaluateFunction(string $name, string $args, mixed $context): mixed
420451
{
421-
$args = array_map(
422-
fn ($arg) => $this->evaluateScalar(trim($arg), $context),
423-
explode(',', $args)
424-
);
452+
$argList = [];
453+
$nodelistSizes = [];
454+
if ($args = trim($args)) {
455+
$args = JsonPathUtils::parseCommaSeparatedValues($args);
456+
foreach ($args as $arg) {
457+
$arg = trim($arg);
458+
if (str_starts_with($arg, '$')) { // special handling for absolute paths
459+
$results = $this->evaluate(new JsonPath($arg));
460+
$argList[] = $results[0] ?? null;
461+
$nodelistSizes[] = \count($results);
462+
} elseif (!str_starts_with($arg, '@')) { // special handling for @ to track nodelist size
463+
$argList[] = $this->evaluateScalar($arg, $context);
464+
$nodelistSizes[] = 1;
465+
} elseif ('@' === $arg) {
466+
$argList[] = $context;
467+
$nodelistSizes[] = 1;
468+
} elseif (!\is_array($context)) {
469+
$argList[] = null;
470+
$nodelistSizes[] = 0;
471+
} elseif (str_starts_with($pathPart = substr($arg, 1), '[')) {
472+
// handle bracket expressions like @['a','d']
473+
$results = $this->evaluateBracket(substr($pathPart, 1, -1), $context);
474+
$argList[] = $results;
475+
$nodelistSizes[] = \count($results);
476+
} else {
477+
// handle dot notation like @.a
478+
$results = $this->evaluateTokensOnDecodedData(JsonPathTokenizer::tokenize(new JsonPath('$'.$pathPart)), $context);
479+
$argList[] = $results[0] ?? null;
480+
$nodelistSizes[] = \count($results);
481+
}
482+
}
483+
}
425484

426-
$value = $args[0] ?? null;
485+
$value = $argList[0] ?? null;
486+
$nodelistSize = $nodelistSizes[0] ?? 0;
427487

428488
return match ($name) {
429489
'length' => match (true) {
430490
\is_string($value) => mb_strlen($value),
431491
\is_array($value) => \count($value),
432492
default => 0,
433493
},
434-
'count' => \is_array($value) ? \count($value) : 0,
494+
'count' => $nodelistSize,
435495
'match' => match (true) {
436-
\is_string($value) && \is_string($args[1] ?? null) => (bool) @preg_match(\sprintf('/^%s$/', $args[1]), $value),
496+
\is_string($value) && \is_string($argList[1] ?? null) => (bool) @preg_match(\sprintf('/^%s$/u', $this->transformJsonPathRegex($argList[1])), $value),
437497
default => false,
438498
},
439499
'search' => match (true) {
440-
\is_string($value) && \is_string($args[1] ?? null) => (bool) @preg_match("/$args[1]/", $value),
500+
\is_string($value) && \is_string($argList[1] ?? null) => (bool) @preg_match("/{$this->transformJsonPathRegex($argList[1])}/u", $value),
441501
default => false,
442502
},
443-
'value' => $value,
503+
'value' => 1 < $nodelistSize ? null : (1 === $nodelistSize ? (\is_array($value) ? ($value[0] ?? null) : $value) : $value),
444504
default => null,
445505
};
446506
}
@@ -474,43 +534,52 @@ private function compare(mixed $left, mixed $right, string $operator): bool
474534
};
475535
}
476536

477-
private function parseCommaSeparatedValues(string $expr): array
537+
/*
538+
* Transform JSONPath regex patterns to comply with RFC 9535. The main issue is
539+
* that '.' should not match \r or \n but should match Unicode line
540+
* separators U+2028 and U+2029.
541+
*/
542+
private function transformJsonPathRegex(string $pattern): string
478543
{
479-
$parts = [];
480-
$current = '';
481-
$inQuotes = false;
482-
$quoteChar = null;
544+
$result = '';
545+
$inCharClass = false;
546+
$escaped = false;
547+
$length = \strlen($pattern);
483548

484-
for ($i = 0; $i < \strlen($expr); ++$i) {
485-
$char = $expr[$i];
549+
for ($i = 0; $i < $length; ++$i) {
550+
$char = $pattern[$i];
486551

487-
if ('\\' === $char && $i + 1 < \strlen($expr)) {
488-
$current .= $char.$expr[++$i];
552+
if ($escaped) {
553+
$result .= $char;
554+
$escaped = false;
489555
continue;
490556
}
491557

492-
if ('"' === $char || "'" === $char) {
493-
if (!$inQuotes) {
494-
$inQuotes = true;
495-
$quoteChar = $char;
496-
} elseif ($char === $quoteChar) {
497-
$inQuotes = false;
498-
$quoteChar = null;
499-
}
500-
} elseif (!$inQuotes && ',' === $char) {
501-
$parts[] = trim($current);
502-
$current = '';
558+
if ('\\' === $char) {
559+
$result .= $char;
560+
$escaped = true;
561+
continue;
562+
}
503563

564+
if ('[' === $char && !$inCharClass) {
565+
$inCharClass = true;
566+
$result .= $char;
504567
continue;
505568
}
506569

507-
$current .= $char;
508-
}
570+
if (']' === $char && $inCharClass) {
571+
$inCharClass = false;
572+
$result .= $char;
573+
continue;
574+
}
509575

510-
if ('' !== $current) {
511-
$parts[] = trim($current);
576+
if ('.' === $char && !$inCharClass) {
577+
$result .= '(?:[^\r\n]|\x{2028}|\x{2029})';
578+
} else {
579+
$result .= $char;
580+
}
512581
}
513582

514-
return $parts;
583+
return $result;
515584
}
516585
}

0 commit comments

Comments
 (0)