Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit ccf15cf

Browse files
committedJan 14, 2013
Optimize strtr w/ 2nd arg array
Fixes bug #63893: poor efficiency of strtr() using array with keys of very different length. The implementation is basically all new, which carries some risk with it. The algorithm is described in "A Fast Algorithm For Multi-Pattern Searching" (1994) by Sun Wu and Udi Manber.
1 parent 1a96fe0 commit ccf15cf

File tree

1 file changed

+265
-86
lines changed

1 file changed

+265
-86
lines changed
 

‎ext/standard/string.c

Lines changed: 265 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,9 @@
2222

2323
/* Synced with php 3.0 revision 1.193 1999-06-16 [ssb] */
2424

25+
#define _GNU_SOURCE 1
2526
#include <stdio.h>
27+
#include <stdint.h>
2628
#include "php.h"
2729
#include "php_rand.h"
2830
#include "php_string.h"
@@ -57,6 +59,7 @@
5759
#include "php_globals.h"
5860
#include "basic_functions.h"
5961
#include "php_smart_str.h"
62+
#include <Zend/zend_exceptions.h>
6063
#ifdef ZTS
6164
#include "TSRM.h"
6265
#endif
@@ -2772,112 +2775,288 @@ PHPAPI char *php_strtr(char *str, int len, char *str_from, char *str_to, int trl
27722775
}
27732776
/* }}} */
27742777

2775-
/* {{{ php_strtr_array
2776-
*/
2777-
static void php_strtr_array(zval *return_value, char *str, int slen, HashTable *hash)
2778+
/* {{{ Definitions for php_strtr_array */
2779+
typedef size_t STRLEN; /* STRLEN should be unsigned */
2780+
typedef uint16_t HASH;
2781+
typedef struct {
2782+
HASH table_mask;
2783+
STRLEN entries[1];
2784+
} SHIFT_TAB;
2785+
typedef struct {
2786+
HASH table_mask;
2787+
int entries[1];
2788+
} HASH_TAB;
2789+
typedef struct {
2790+
const char *s;
2791+
STRLEN l;
2792+
} STR;
2793+
typedef struct _match_node MATCH_NODE;
2794+
struct _match_node {
2795+
STRLEN pos;
2796+
MATCH_NODE *next;
2797+
};
2798+
typedef struct _pat_and_repl {
2799+
STR pat;
2800+
STR repl;
2801+
} PATNREPL;
2802+
2803+
#define S(a) ((a)->s)
2804+
#define L(a) ((a)->l)
2805+
2806+
#define SHIFT_TAB_BITS 13
2807+
#define HASH_TAB_BITS 10 /* should be less than sizeof(HASH) * 8 */
2808+
#define SHIFT_TAB_SIZE (1U << SHIFT_TAB_BITS)
2809+
#define HASH_TAB_SIZE (1U << HASH_TAB_BITS)
2810+
2811+
typedef struct {
2812+
int B; /* size of suffixes */
2813+
int Bp; /* size of prefixes */
2814+
STRLEN m; /* minimum pattern length */
2815+
int patnum; /* number of patterns */
2816+
SHIFT_TAB *shift; /* table mapping hash to allowed shift */
2817+
HASH_TAB *hash; /* table mapping hash to int (pair of pointers) */
2818+
HASH *prefix; /* array of hashes of prefixes by pattern suffix hash order */
2819+
PATNREPL *patterns; /* array of prefixes by pattern suffix hash order */
2820+
} PPRES;
2821+
/* }}} */
2822+
2823+
/* {{{ php_strtr_hash */
2824+
static inline HASH php_strtr_hash(const char *str, int len)
27782825
{
2779-
zval **entry;
2780-
char *string_key;
2781-
uint string_key_len;
2782-
zval **trans;
2783-
zval ctmp;
2784-
ulong num_key;
2785-
int minlen = 128*1024;
2786-
int maxlen = 0, pos, len, found;
2787-
char *key;
2788-
HashPosition hpos;
2789-
smart_str result = {0};
2790-
HashTable tmp_hash;
2791-
2792-
zend_hash_init(&tmp_hash, zend_hash_num_elements(hash), NULL, NULL, 0);
2793-
zend_hash_internal_pointer_reset_ex(hash, &hpos);
2794-
while (zend_hash_get_current_data_ex(hash, (void **)&entry, &hpos) == SUCCESS) {
2795-
switch (zend_hash_get_current_key_ex(hash, &string_key, &string_key_len, &num_key, 0, &hpos)) {
2796-
case HASH_KEY_IS_STRING:
2797-
len = string_key_len-1;
2798-
if (len < 1) {
2799-
zend_hash_destroy(&tmp_hash);
2800-
RETURN_FALSE;
2801-
}
2802-
zend_hash_add(&tmp_hash, string_key, string_key_len, entry, sizeof(zval*), NULL);
2803-
if (len > maxlen) {
2804-
maxlen = len;
2805-
}
2806-
if (len < minlen) {
2807-
minlen = len;
2808-
}
2809-
break;
2826+
HASH res = 0;
2827+
int i;
2828+
for (i = 0; i < len; i++) {
2829+
res = (res << 5) + res + (unsigned char)str[i];
2830+
}
28102831

2811-
case HASH_KEY_IS_LONG:
2812-
Z_TYPE(ctmp) = IS_LONG;
2813-
Z_LVAL(ctmp) = num_key;
2832+
return res;
2833+
}
2834+
/* }}} */
2835+
/* {{{ php_strtr_populate_shift */
2836+
static inline void php_strtr_populate_shift(PATNREPL *patterns, int patnum, int B, STRLEN m, SHIFT_TAB *shift)
2837+
{
2838+
int i;
2839+
STRLEN j,
2840+
max_shift;
2841+
2842+
max_shift = m - B + 1;
2843+
for (i = 0; i < SHIFT_TAB_SIZE; i++) {
2844+
shift->entries[i] = max_shift;
2845+
}
2846+
for (i = 0; i < patnum; i++) {
2847+
for (j = 0; j < m - B + 1; j++) {
2848+
HASH h = php_strtr_hash(&S(&patterns[i].pat)[j], B) & shift->table_mask;
2849+
assert((long long) m - (long long) j - B >= 0);
2850+
shift->entries[h] = MIN(shift->entries[h], m - j - B);
2851+
}
2852+
}
2853+
}
2854+
/* }}} */
2855+
/* {{{ php_strtr_compare_hash_suffix */
2856+
static int php_strtr_compare_hash_suffix(const void *a, const void *b, void *ctx_g)
2857+
{
2858+
const PPRES *res = ctx_g;
2859+
const PATNREPL *pnr_a = a,
2860+
*pnr_b = b;
2861+
HASH hash_a = php_strtr_hash(&S(&pnr_a->pat)[res->m - res->B], res->B)
2862+
& res->hash->table_mask,
2863+
hash_b = php_strtr_hash(&S(&pnr_b->pat)[res->m - res->B], res->B)
2864+
& res->hash->table_mask;
2865+
/* TODO: don't recalculate the hashes all the time */
2866+
return hash_a - hash_b;
2867+
}
2868+
/* }}} */
28142869

2815-
convert_to_string(&ctmp);
2816-
len = Z_STRLEN(ctmp);
2817-
zend_hash_add(&tmp_hash, Z_STRVAL(ctmp), len+1, entry, sizeof(zval*), NULL);
2818-
zval_dtor(&ctmp);
2870+
/* {{{ PPRES *php_strtr_array_prepare(STR *text, PATNREPL *patterns, int patnum, int B, int Bp) */
2871+
static PPRES *php_strtr_array_prepare(STR *text, PATNREPL *patterns, int patnum, int B, int Bp)
2872+
{
2873+
int i;
2874+
PPRES *res = emalloc(sizeof *res);
28192875

2820-
if (len > maxlen) {
2821-
maxlen = len;
2822-
}
2823-
if (len < minlen) {
2824-
minlen = len;
2825-
}
2826-
break;
2876+
res->m = (STRLEN)-1;
2877+
for (i = 0; i < patnum; i++) {
2878+
if (L(&patterns[i].pat) < res->m) {
2879+
res->m = L(&patterns[i].pat);
28272880
}
2828-
zend_hash_move_forward_ex(hash, &hpos);
28292881
}
2882+
assert(res->m > 0);
2883+
res->B = B = MIN(B, res->m);
2884+
res->Bp = Bp = MIN(Bp, res->m);
28302885

2831-
key = emalloc(maxlen+1);
2832-
pos = 0;
2886+
res->shift = safe_emalloc(SHIFT_TAB_SIZE, sizeof(*res->shift->entries), sizeof(*res->shift));
2887+
res->shift->table_mask = SHIFT_TAB_SIZE - 1;
2888+
php_strtr_populate_shift(patterns, patnum, B, res->m, res->shift);
28332889

2834-
while (pos < slen) {
2835-
if ((pos + maxlen) > slen) {
2836-
maxlen = slen - pos;
2837-
}
2890+
res->hash = safe_emalloc(HASH_TAB_SIZE, sizeof(*res->hash->entries), sizeof(*res->shift));
2891+
res->hash->table_mask = HASH_TAB_SIZE - 1;
2892+
2893+
res->patterns = safe_emalloc(patnum, sizeof(*res->patterns), 0);
2894+
memcpy(res->patterns, patterns, sizeof(*patterns) * patnum);
2895+
qsort_r(res->patterns, patnum, sizeof(*res->patterns), php_strtr_compare_hash_suffix, res);
2896+
2897+
res->prefix = safe_emalloc(patnum, sizeof(*res->prefix), 0);
2898+
for (i = 0; i < patnum; i++) {
2899+
res->prefix[i] = php_strtr_hash(S(&res->patterns[i].pat), Bp);
2900+
}
28382901

2839-
found = 0;
2840-
memcpy(key, str+pos, maxlen);
2902+
/* Initialize the rest of ->hash */
2903+
for (i = 0; i < HASH_TAB_SIZE; i++) {
2904+
res->hash->entries[i] = -1;
2905+
}
2906+
{
2907+
HASH last_h = -1; /* assumes not all bits are used in res->hash */
2908+
/* res->patterns is already ordered by hash.
2909+
* Make res->hash->entries[h] de index of the first pattern in
2910+
* res->patterns that has hash h */
2911+
for (i = 0; i < patnum; i++) {
2912+
HASH h = php_strtr_hash(&S(&res->patterns[i].pat)[res->m - res->B], res->B)
2913+
& res->hash->table_mask;
2914+
if (h != last_h) {
2915+
res->hash->entries[h] = i;
2916+
last_h = h;
2917+
}
2918+
}
2919+
}
2920+
res->hash->entries[HASH_TAB_SIZE] = patnum;
2921+
for (i = HASH_TAB_SIZE - 1; i >= 0; i--) {
2922+
if (res->hash->entries[i] == -1) {
2923+
res->hash->entries[i] = res->hash->entries[i + 1];
2924+
}
2925+
}
28412926

2842-
for (len = maxlen; len >= minlen; len--) {
2843-
key[len] = 0;
2927+
res->patnum = patnum;
28442928

2845-
if (zend_hash_find(&tmp_hash, key, len+1, (void**)&trans) == SUCCESS) {
2846-
char *tval;
2847-
int tlen;
2848-
zval tmp;
2929+
return res;
2930+
}
2931+
/* }}} */
2932+
/* {{{ php_strtr_array_destroy_ppres(PPRES *d) */
2933+
static void php_strtr_array_destroy_ppres(PPRES *d)
2934+
{
2935+
efree(d->shift);
2936+
efree(d->hash);
2937+
efree(d->prefix);
2938+
efree(d->patterns);
2939+
efree(d);
2940+
}
2941+
/* }}} */
28492942

2850-
if (Z_TYPE_PP(trans) != IS_STRING) {
2851-
tmp = **trans;
2852-
zval_copy_ctor(&tmp);
2853-
convert_to_string(&tmp);
2854-
tval = Z_STRVAL(tmp);
2855-
tlen = Z_STRLEN(tmp);
2856-
} else {
2857-
tval = Z_STRVAL_PP(trans);
2858-
tlen = Z_STRLEN_PP(trans);
2859-
}
2943+
/* {{{ php_strtr_array_do_repl(STR *text, PPRES *d, zval *return_value) */
2944+
static void php_strtr_array_do_repl(STR *text, PPRES *d, zval *return_value)
2945+
{
2946+
STRLEN pos = 0,
2947+
lastpos = L(text) - d->m;
2948+
smart_str result = {0};
28602949

2861-
smart_str_appendl(&result, tval, tlen);
2862-
pos += len;
2863-
found = 1;
2950+
while (pos <= lastpos) {
2951+
HASH h = php_strtr_hash(&S(text)[pos + d->m - d->B], d->B) & d->shift->table_mask;
2952+
STRLEN shift = d->shift->entries[h];
28642953

2865-
if (Z_TYPE_PP(trans) != IS_STRING) {
2866-
zval_dtor(&tmp);
2867-
}
2868-
break;
2954+
if (shift > 0) {
2955+
smart_str_appendl(&result, &S(text)[pos], shift);
2956+
pos += shift;
2957+
} else {
2958+
HASH h2 = h & d->hash->table_mask,
2959+
prefix_h = php_strtr_hash(&S(text)[pos], d->Bp);
2960+
2961+
int offset_start = d->hash->entries[h2],
2962+
offset_end = d->hash->entries[h2 + 1], /* exclusive */
2963+
i = 0;
2964+
2965+
for (i = offset_start; i < offset_end; i++) {
2966+
PATNREPL *pnr;
2967+
if (d->prefix[i] != prefix_h)
2968+
continue;
2969+
2970+
pnr = &d->patterns[i];
2971+
if (L(&pnr->pat) > L(text) - pos ||
2972+
memcmp(S(&pnr->pat), &S(text)[pos], L(&pnr->pat)) != 0)
2973+
continue;
2974+
2975+
smart_str_appendl(&result, S(&pnr->repl), (int)L(&pnr->repl));
2976+
pos += L(&pnr->pat);
2977+
goto end_outer_loop;
28692978
}
2979+
2980+
smart_str_appendc(&result, S(text)[pos]);
2981+
pos++;
2982+
end_outer_loop: ;
28702983
}
2984+
}
2985+
2986+
if (pos < L(text)) {
2987+
smart_str_appendl(&result, &S(text)[pos], (int)(L(text) - pos));
2988+
}
2989+
2990+
if (result.c != NULL) {
2991+
smart_str_0(&result);
2992+
RETVAL_STRINGL(result.c, result.len, 0);
2993+
} else {
2994+
RETURN_EMPTY_STRING();
2995+
}
2996+
}
2997+
/* }}} */
2998+
2999+
/* {{{ php_strtr_array */
3000+
static void php_strtr_array(zval *return_value, char *str, int slen, HashTable *pats)
3001+
{
3002+
PPRES *data;
3003+
STR text;
3004+
PATNREPL *patterns;
3005+
HashPosition hpos;
3006+
zval **entry;
3007+
int num_pats = zend_hash_num_elements(pats),
3008+
i;
3009+
3010+
S(&text) = str;
3011+
L(&text) = slen;
3012+
patterns = safe_emalloc(num_pats, sizeof(*patterns), 0);
3013+
3014+
for (i = 0, zend_hash_internal_pointer_reset_ex(pats, &hpos);
3015+
zend_hash_get_current_data_ex(pats, (void **)&entry, &hpos) == SUCCESS;
3016+
i++, zend_hash_move_forward_ex(pats, &hpos)) {
3017+
char *string_key;
3018+
uint string_key_len;
3019+
ulong num_key;
3020+
int free_str = 0,
3021+
free_repl = 0;
3022+
zval *tzv;
3023+
3024+
switch (zend_hash_get_current_key_ex(pats, &string_key, &string_key_len, &num_key, 0, &hpos)) {
3025+
case HASH_KEY_IS_LONG:
3026+
string_key_len = 1 + zend_spprintf(&string_key, 0, "%ld", (long)num_key);
3027+
free_str = 1;
3028+
/* break missing intentionally */
3029+
3030+
case HASH_KEY_IS_STRING:
3031+
string_key_len--; /* exclude final '\0' */
3032+
if (string_key_len == 0) { /* empty string given as pattern */
3033+
efree(patterns);
3034+
RETURN_FALSE;
3035+
}
3036+
if (string_key_len > slen) { /* this pattern can never match */
3037+
continue;
3038+
}
3039+
3040+
if (Z_TYPE_PP(entry) != IS_STRING) {
3041+
tzv = *entry;
3042+
zval_addref_p(tzv);
3043+
SEPARATE_ZVAL(&tzv);
3044+
convert_to_string(tzv);
3045+
entry = &tzv;
3046+
free_repl = 1;
3047+
}
28713048

2872-
if (! found) {
2873-
smart_str_appendc(&result, str[pos++]);
3049+
S(&patterns[i].pat) = string_key;
3050+
L(&patterns[i].pat) = string_key_len;
3051+
S(&patterns[i].repl) = Z_STRVAL_PP(entry);
3052+
L(&patterns[i].repl) = Z_STRLEN_PP(entry);
28743053
}
28753054
}
28763055

2877-
efree(key);
2878-
zend_hash_destroy(&tmp_hash);
2879-
smart_str_0(&result);
2880-
RETVAL_STRINGL(result.c, result.len, 0);
3056+
data = php_strtr_array_prepare(&text, patterns, i, 2, 2);
3057+
efree(patterns);
3058+
php_strtr_array_do_repl(&text, data, return_value);
3059+
php_strtr_array_destroy_ppres(data);
28813060
}
28823061
/* }}} */
28833062

0 commit comments

Comments
 (0)
Failed to load comments.