WIP: Modifying nmf.py to accept mini-batches #13386

Closed
wants to merge 19 commits
117 changes: 117 additions & 0 deletions benchmarks/bench_minibatch_nmf.py
@@ -0,0 +1,117 @@
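# Benchmark comparing the mini-batch NMF from this branch (nmf.py) against the
# reference implementation (nmf_original.py, presumably a copy of the pre-PR
# sklearn.decomposition.nmf) on the first paragraphs of English Wikipedia
# articles. Both models are scored by per-sample KL-divergence on a held-out
# test set, plotted against cumulative wall-clock training time.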

from time import time
import pandas as pd

from sklearn.decomposition.nmf import _beta_divergence
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.utils import gen_batches

from nmf import NMF
from nmf_original import NMFOriginal
from nmf_original import non_negative_factorization

import matplotlib.pyplot as plt

# Download the data file from one of:
# https://www.dropbox.com/s/n8ynmz6jxkynvyy/enwiki_1M_first_paragraphs.csv.zip?dl=0
# https://filesender.renater.fr/?s=download&token=88222d6d-5aee-c59b-4f34-c233b4d184e1
df = pd.read_csv('enwiki_1M_first_paragraphs.csv')
cats = df['0'].sample(frac=1, random_state=5).astype(str)
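# Hash word counts into a fixed 2**12-dimensional feature space;
# alternate_sign=False keeps the counts non-negative, as NMF requires.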
counter = HashingVectorizer(analyzer='word', ngram_range=(1, 1),
                            n_features=2**12, norm=None,
                            alternate_sign=False)
X = counter.fit_transform(cats)
n_components = 10
beta_loss = 'kullback-leibler'
n_train = 500000
n_test = 10000
batch_size = 10000
random_state = 12
n_batch = (n_train - 1) // batch_size + 1
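# Hold out the first n_test rows for evaluation; train on the next n_train rows.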
X_test = X[:n_test, :]
X = X[n_test:n_train + n_test, :]

max_iter_nmf = [1, 5, 10, 30, 50, 100]
n_iter_minibatch_nmf = 50
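# The mini-batch model makes n_iter_minibatch_nmf passes over the training set;
# the full-batch model below is refit from scratch for each value in max_iter_nmf.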


def get_optimal_w(X, H):
    # With the components H held fixed (update_H=False), solve only for W so
    # that the beta-divergence on held-out data is comparable across models.
    W, _, _ = non_negative_factorization(
        X=X, W=None, H=H,
        n_components=n_components,
        init='custom', update_H=False, solver='mu',
        beta_loss=beta_loss, tol=1e-4, max_iter=200, alpha=0.,
        l1_ratio=0., regularization=None, random_state=None,
        verbose=0, shuffle=False)
    return W


# Mini-batch model from this branch; max_iter=3 presumably caps the inner
# multiplicative updates performed on each mini-batch.
minibatch_nmf = NMF(
    n_components=n_components, beta_loss=beta_loss, batch_size=batch_size,
    solver='mu', random_state=random_state, max_iter=3)

fig, ax = plt.subplots()
plt.xscale('log')
fontsize = 16

total_time = 0
time_nmf = []
loss_nmf = []
for n_iter in range(n_iter_minibatch_nmf):
    for j, batch in enumerate(gen_batches(n=n_train,
                                          batch_size=batch_size)):
        t0 = time()
        minibatch_nmf.partial_fit(X[batch])
        total_time += time() - t0
        # Evaluate frequently during the first two passes, then once per pass:
        # refit W on the test set with the current components and record the
        # per-sample beta-divergence.
        if ((j % 11 == 9) and (n_iter <= 1)) or j == n_batch - 1:
            time_nmf.append(total_time)
            W = get_optimal_w(X_test, minibatch_nmf.components_)
            loss = _beta_divergence(X_test, W, minibatch_nmf.components_,
                                    minibatch_nmf.beta_loss) / n_test
            loss_nmf.append(loss)
            plt.plot(time_nmf, loss_nmf, 'b', marker='o',
                     label='Mini-batch NMF')
            plt.pause(.01)

print('Time MiniBatchNMF: %.1fs.' % total_time)
print('KL-div MiniBatchNMF: %.2f' % loss)
del W

# Full-batch baseline: refit the original NMF from scratch for each max_iter
# value, accumulating wall-clock time across the restarts.
total_time = 0
time_nmf = []
loss_nmf = []
for max_iter in max_iter_nmf:
    nmf = NMFOriginal(n_components=n_components, beta_loss=beta_loss,
                      solver='mu', max_iter=max_iter,
                      random_state=random_state, tol=0)
    t0 = time()
    nmf.fit(X)
    total_time += time() - t0
    time_nmf.append(total_time)
    print('Time NMF: %.1fs.' % total_time)
    W = get_optimal_w(X_test, nmf.components_)
    loss = _beta_divergence(X_test, W, nmf.components_,
                            nmf.beta_loss) / n_test
    loss_nmf.append(loss)
    print('KL-div NMF: %.2f' % loss)
    plt.plot(time_nmf, loss_nmf, 'r', marker='o', label='NMF')
    plt.pause(.01)
    del W
del W

# Each call to plt.plot added a new line artist; keep one handle per method.
handles, labels = ax.get_legend_handles_labels()
plt.legend(handles=(handles[-1], handles[0]),
           labels=(labels[-1], labels[0]), fontsize=fontsize)
plt.tick_params(axis='both', which='major', labelsize=fontsize-2)
plt.xlabel('Time (seconds)', fontsize=fontsize)
plt.ylabel(beta_loss, fontsize=fontsize)
title = 'Wikipedia articles (first paragraph)'
ax.set_title(title, fontsize=fontsize+4)

figname = 'benchmark_nmf_wikipedia_articles.png'
print('Saving: ' + figname)
plt.savefig(figname,
            transparent=False, bbox_inches='tight', pad_inches=0)
plt.show()