Skip to content

Commit edabec2

Browse files
rthlesteve
authored and committed
[MRG+1] Add text vectorizers benchmarks (#9086)
1 parent c5c51c6 commit edabec2

File tree

1 file changed

+76
-0
lines changed

1 file changed

+76
-0
lines changed

benchmarks/bench_text_vectorizers.py

+76
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
"""
2+
3+
To run this benchmark, you will need:
4+
5+
* scikit-learn
6+
* pandas
7+
* memory_profiler
8+
* psutil (optional, but recommended)
9+
10+
"""
11+
12+
from __future__ import print_function
13+
14+
import timeit
15+
import itertools
16+
17+
import numpy as np
18+
import pandas as pd
19+
from memory_profiler import memory_usage
20+
21+
from sklearn.datasets import fetch_20newsgroups
22+
from sklearn.feature_extraction.text import (CountVectorizer, TfidfVectorizer,
23+
HashingVectorizer)
24+
25+
# Number of timed repetitions per benchmark configuration (passed to
# timeit.repeat as ``repeat`` and reported in the run-time summary).
n_repeat = 3
26+
27+
28+
def run_vectorizer(Vectorizer, X, **params):
    """Build a zero-argument callable that fits a fresh vectorizer on X.

    Nothing is executed when this function is called; the work happens each
    time the returned callable is invoked, which makes it suitable for
    passing to ``timeit.repeat`` and ``memory_usage``.

    Parameters
    ----------
    Vectorizer : callable
        Vectorizer class (or factory); called as ``Vectorizer(**params)``.
    X : object
        Data handed unchanged to ``fit_transform``.
    **params
        Keyword arguments forwarded to ``Vectorizer``.

    Returns
    -------
    callable
        Function of no arguments that instantiates the vectorizer, calls
        ``fit_transform(X)`` on it and returns ``None``.
    """
    def f():
        # A new instance per call so that every run starts from scratch.
        vect = Vectorizer(**params)
        # The transformed output is deliberately discarded.
        vect.fit_transform(X)
    return f
33+
34+
35+
# Load the training subset once; every configuration below vectorizes this
# same list of raw documents.
text = fetch_20newsgroups(subset='train').data

print("="*80 + '\n#' + " Text vectorizers benchmark" + '\n' + '='*80 + '\n')
print("Using a subset of the 20 newsgroups dataset ({} documents)."
      .format(len(text)))
print("This benchmark runs in ~20 min ...")

# One dict per (vectorizer, analyzer, ngram_range) combination.
res = []

for Vectorizer, (analyzer, ngram_range) in itertools.product(
        [CountVectorizer, TfidfVectorizer, HashingVectorizer],
        [('word', (1, 1)),
         ('word', (1, 2)),
         ('word', (1, 4)),
         ('char', (4, 4)),
         ('char_wb', (4, 4))
         ]):

    bench = {'vectorizer': Vectorizer.__name__}
    params = {'analyzer': analyzer, 'ngram_range': ngram_range}
    bench.update(params)

    # number=1: each timing sample is a single call of the benchmark
    # callable; n_repeat such samples are collected.
    dt = timeit.repeat(run_vectorizer(Vectorizer, text, **params),
                       number=1,
                       repeat=n_repeat)
    bench['time'] = "{:.2f} (+-{:.2f})".format(np.mean(dt), np.std(dt))

    # Memory is measured in a separate run, outside the timed ones; the
    # largest value returned by memory_usage is reported.
    mem_usage = memory_usage(run_vectorizer(Vectorizer, text, **params))

    bench['memory'] = "{:.1f}".format(np.max(mem_usage))

    res.append(bench)


# Index by configuration so that unstacking the last level ('vectorizer')
# yields one column per vectorizer class.
df = pd.DataFrame(res).set_index(['analyzer', 'ngram_range', 'vectorizer'])

print('\n========== Run time performance (sec) ===========\n')
print('Computing the mean and the standard deviation '
      'of the run time over {} runs...\n'.format(n_repeat))
print(df['time'].unstack(level=-1))

print('\n=============== Memory usage (MB) ===============\n')
print(df['memory'].unstack(level=-1))

0 commit comments

Comments
 (0)