def generate_data(case):
    """Generate regression/classification data."""
-    if case == 'regression':
+    if case == "regression":
        X, y = datasets.load_diabetes(return_X_y=True)
-    elif case == 'classification':
-        X, y = datasets.fetch_20newsgroups_vectorized(subset='all',
-                                                      return_X_y=True)
+    elif case == "classification":
+        X, y = datasets.fetch_20newsgroups_vectorized(subset="all", return_X_y=True)
    X, y = shuffle(X, y)
    offset = int(X.shape[0] * 0.8)
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]

-    data = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train,
-            'y_test': y_test}
+    data = {"X_train": X_train, "X_test": X_test, "y_train": y_train, "y_test": y_test}
    return data


-regression_data = generate_data('regression')
-classification_data = generate_data('classification')
+regression_data = generate_data("regression")
+classification_data = generate_data("classification")


##############################################################################
@@ -110,26 +108,33 @@ def benchmark_influence(conf):
    prediction_times = []
    prediction_powers = []
    complexities = []
-    for param_value in conf['changing_param_values']:
-        conf['tuned_params'][conf['changing_param']] = param_value
-        estimator = conf['estimator'](**conf['tuned_params'])
+    for param_value in conf["changing_param_values"]:
+        conf["tuned_params"][conf["changing_param"]] = param_value
+        estimator = conf["estimator"](**conf["tuned_params"])

        print("Benchmarking %s" % estimator)
-        estimator.fit(conf['data']['X_train'], conf['data']['y_train'])
-        conf['postfit_hook'](estimator)
-        complexity = conf['complexity_computer'](estimator)
+        estimator.fit(conf["data"]["X_train"], conf["data"]["y_train"])
+        conf["postfit_hook"](estimator)
+        complexity = conf["complexity_computer"](estimator)
        complexities.append(complexity)
        start_time = time.time()
-        for _ in range(conf['n_samples']):
-            y_pred = estimator.predict(conf['data']['X_test'])
-        elapsed_time = (time.time() - start_time) / float(conf['n_samples'])
+        for _ in range(conf["n_samples"]):
+            y_pred = estimator.predict(conf["data"]["X_test"])
+        elapsed_time = (time.time() - start_time) / float(conf["n_samples"])
        prediction_times.append(elapsed_time)
-        pred_score = conf['prediction_performance_computer'](
-            conf['data']['y_test'], y_pred)
+        pred_score = conf["prediction_performance_computer"](
+            conf["data"]["y_test"], y_pred
+        )
        prediction_powers.append(pred_score)
-        print("Complexity: %d | %s: %.4f | Pred. Time: %fs\n" % (
-            complexity, conf['prediction_performance_label'], pred_score,
-            elapsed_time))
+        print(
+            "Complexity: %d | %s: %.4f | Pred. Time: %fs\n"
+            % (
+                complexity,
+                conf["prediction_performance_label"],
+                pred_score,
+                elapsed_time,
+            )
+        )
    return prediction_powers, prediction_times, complexities

@@ -147,46 +152,58 @@ def benchmark_influence(conf):
# different data.
#

+
def _count_nonzero_coefficients(estimator):
    a = estimator.coef_.toarray()
    return np.count_nonzero(a)


configurations = [
-    {'estimator': SGDClassifier,
-     'tuned_params': {'penalty': 'elasticnet', 'alpha': 0.001, 'loss':
-                      'modified_huber', 'fit_intercept': True, 'tol': 1e-3},
-     'changing_param': 'l1_ratio',
-     'changing_param_values': [0.25, 0.5, 0.75, 0.9],
-     'complexity_label': 'non_zero coefficients',
-     'complexity_computer': _count_nonzero_coefficients,
-     'prediction_performance_computer': hamming_loss,
-     'prediction_performance_label': 'Hamming Loss (Misclassification Ratio)',
-     'postfit_hook': lambda x: x.sparsify(),
-     'data': classification_data,
-     'n_samples': 30},
-    {'estimator': NuSVR,
-     'tuned_params': {'C': 1e3, 'gamma': 2 ** -15},
-     'changing_param': 'nu',
-     'changing_param_values': [0.1, 0.25, 0.5, 0.75, 0.9],
-     'complexity_label': 'n_support_vectors',
-     'complexity_computer': lambda x: len(x.support_vectors_),
-     'data': regression_data,
-     'postfit_hook': lambda x: x,
-     'prediction_performance_computer': mean_squared_error,
-     'prediction_performance_label': 'MSE',
-     'n_samples': 30},
-    {'estimator': GradientBoostingRegressor,
-     'tuned_params': {'loss': 'squared_error'},
-     'changing_param': 'n_estimators',
-     'changing_param_values': [10, 50, 100, 200, 500],
-     'complexity_label': 'n_trees',
-     'complexity_computer': lambda x: x.n_estimators,
-     'data': regression_data,
-     'postfit_hook': lambda x: x,
-     'prediction_performance_computer': mean_squared_error,
-     'prediction_performance_label': 'MSE',
-     'n_samples': 30},
+    {
+        "estimator": SGDClassifier,
+        "tuned_params": {
+            "penalty": "elasticnet",
+            "alpha": 0.001,
+            "loss": "modified_huber",
+            "fit_intercept": True,
+            "tol": 1e-3,
+        },
+        "changing_param": "l1_ratio",
+        "changing_param_values": [0.25, 0.5, 0.75, 0.9],
+        "complexity_label": "non_zero coefficients",
+        "complexity_computer": _count_nonzero_coefficients,
+        "prediction_performance_computer": hamming_loss,
+        "prediction_performance_label": "Hamming Loss (Misclassification Ratio)",
+        "postfit_hook": lambda x: x.sparsify(),
+        "data": classification_data,
+        "n_samples": 30,
+    },
+    {
+        "estimator": NuSVR,
+        "tuned_params": {"C": 1e3, "gamma": 2 ** -15},
+        "changing_param": "nu",
+        "changing_param_values": [0.1, 0.25, 0.5, 0.75, 0.9],
+        "complexity_label": "n_support_vectors",
+        "complexity_computer": lambda x: len(x.support_vectors_),
+        "data": regression_data,
+        "postfit_hook": lambda x: x,
+        "prediction_performance_computer": mean_squared_error,
+        "prediction_performance_label": "MSE",
+        "n_samples": 30,
+    },
+    {
+        "estimator": GradientBoostingRegressor,
+        "tuned_params": {"loss": "squared_error"},
+        "changing_param": "n_estimators",
+        "changing_param_values": [10, 50, 100, 200, 500],
+        "complexity_label": "n_trees",
+        "complexity_computer": lambda x: x.n_estimators,
+        "data": regression_data,
+        "postfit_hook": lambda x: x,
+        "prediction_performance_computer": mean_squared_error,
+        "prediction_performance_label": "MSE",
+        "n_samples": 30,
+    },
]

@@ -209,6 +226,7 @@ def _count_nonzero_coefficients(estimator):
# ensemble is not as detrimental.
#

+
def plot_influence(conf, mse_values, prediction_times, complexities):
    """
    Plot influence of model complexity on both accuracy and latency.
@@ -219,38 +237,37 @@ def plot_influence(conf, mse_values, prediction_times, complexities):

    # first axes (prediction error)
    ax1 = fig.add_subplot(111)
-    line1 = ax1.plot(complexities, mse_values, c='tab:blue', ls='-')[0]
-    ax1.set_xlabel('Model Complexity (%s)' % conf['complexity_label'])
-    y1_label = conf['prediction_performance_label']
+    line1 = ax1.plot(complexities, mse_values, c="tab:blue", ls="-")[0]
+    ax1.set_xlabel("Model Complexity (%s)" % conf["complexity_label"])
+    y1_label = conf["prediction_performance_label"]
    ax1.set_ylabel(y1_label)

-    ax1.spines['left'].set_color(line1.get_color())
+    ax1.spines["left"].set_color(line1.get_color())
    ax1.yaxis.label.set_color(line1.get_color())
-    ax1.tick_params(axis='y', colors=line1.get_color())
+    ax1.tick_params(axis="y", colors=line1.get_color())

    # second axes (latency)
    ax2 = fig.add_subplot(111, sharex=ax1, frameon=False)
-    line2 = ax2.plot(complexities, prediction_times, c='tab:orange', ls='-')[0]
+    line2 = ax2.plot(complexities, prediction_times, c="tab:orange", ls="-")[0]
    ax2.yaxis.tick_right()
    ax2.yaxis.set_label_position("right")
    y2_label = "Time (s)"
    ax2.set_ylabel(y2_label)
-    ax1.spines['right'].set_color(line2.get_color())
+    ax1.spines["right"].set_color(line2.get_color())
    ax2.yaxis.label.set_color(line2.get_color())
-    ax2.tick_params(axis='y', colors=line2.get_color())
+    ax2.tick_params(axis="y", colors=line2.get_color())

-    plt.legend((line1, line2), ("prediction error", "latency"),
-               loc='upper right')
+    plt.legend((line1, line2), ("prediction error", "latency"), loc="upper right")

-    plt.title("Influence of varying '%s' on %s" % (conf['changing_param'],
-                                                   conf['estimator'].__name__))
+    plt.title(
+        "Influence of varying '%s' on %s"
+        % (conf["changing_param"], conf["estimator"].__name__)
+    )


for conf in configurations:
-    prediction_performances, prediction_times, complexities = \
-        benchmark_influence(conf)
-    plot_influence(conf, prediction_performances, prediction_times,
-                   complexities)
+    prediction_performances, prediction_times, complexities = benchmark_influence(conf)
+    plot_influence(conf, prediction_performances, prediction_times, complexities)
plt.show()
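Note: the changes in this commit (single to double quotes, magic trailing commas, wrapping of long calls at 88 characters) match the default output of the black code formatter; that attribution is an inference, since the commit message is not shown here. A minimal sketch of reproducing the quote normalization through black's Python API, assuming the black package is installed:

    # Hypothetical snippet; old_src is illustrative, not taken from this diff.
    import black

    old_src = "data = {'X_train': X_train, 'X_test': X_test}\n"
    new_src = black.format_str(old_src, mode=black.FileMode())
    print(new_src)  # data = {"X_train": X_train, "X_test": X_test}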