
Commit 89997e7

Pushing the docs to dev/ for branch: master, commit 228109cd5c12c0c3374e37f13b48d7382d15a5a7
1 parent 8258e8a commit 89997e7

1,170 files changed (+3647, -3647 lines)


dev/_downloads/1abc4484d4183963e2039c8c679497eb/plot_sgd_comparison.ipynb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 },
 "outputs": [],
 "source": [
-"# Author: Rob Zinkov <rob at zinkov dot com>\n# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn import datasets\n\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.linear_model import SGDClassifier, Perceptron\nfrom sklearn.linear_model import PassiveAggressiveClassifier\nfrom sklearn.linear_model import LogisticRegression\n\nheldout = [0.95, 0.90, 0.75, 0.50, 0.01]\nrounds = 20\nX, y = datasets.load_digits(return_X_y=True)\n\nclassifiers = [\n    (\"SGD\", SGDClassifier(max_iter=100)),\n    (\"ASGD\", SGDClassifier(average=True, max_iter=1000)),\n    (\"Perceptron\", Perceptron(tol=1e-3)),\n    (\"Passive-Aggressive I\", PassiveAggressiveClassifier(loss='hinge',\n                                                         C=1.0, tol=1e-4)),\n    (\"Passive-Aggressive II\", PassiveAggressiveClassifier(loss='squared_hinge',\n                                                          C=1.0, tol=1e-4)),\n    (\"SAG\", LogisticRegression(solver='sag', tol=1e-1, C=1.e4 / X.shape[0]))\n]\n\nxx = 1. - np.array(heldout)\n\nfor name, clf in classifiers:\n    print(\"training %s\" % name)\n    rng = np.random.RandomState(42)\n    yy = []\n    for i in heldout:\n        yy_ = []\n        for r in range(rounds):\n            X_train, X_test, y_train, y_test = \\\n                train_test_split(X, y, test_size=i, random_state=rng)\n            clf.fit(X_train, y_train)\n            y_pred = clf.predict(X_test)\n            yy_.append(1 - np.mean(y_pred == y_test))\n        yy.append(np.mean(yy_))\n    plt.plot(xx, yy, label=name)\n\nplt.legend(loc=\"upper right\")\nplt.xlabel(\"Proportion train\")\nplt.ylabel(\"Test Error Rate\")\nplt.show()"
+"# Author: Rob Zinkov <rob at zinkov dot com>\n# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn import datasets\n\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.linear_model import SGDClassifier, Perceptron\nfrom sklearn.linear_model import PassiveAggressiveClassifier\nfrom sklearn.linear_model import LogisticRegression\n\nheldout = [0.95, 0.90, 0.75, 0.50, 0.01]\nrounds = 20\nX, y = datasets.load_digits(return_X_y=True)\n\nclassifiers = [\n    (\"SGD\", SGDClassifier(max_iter=100)),\n    (\"ASGD\", SGDClassifier(average=True)),\n    (\"Perceptron\", Perceptron()),\n    (\"Passive-Aggressive I\", PassiveAggressiveClassifier(loss='hinge',\n                                                         C=1.0, tol=1e-4)),\n    (\"Passive-Aggressive II\", PassiveAggressiveClassifier(loss='squared_hinge',\n                                                          C=1.0, tol=1e-4)),\n    (\"SAG\", LogisticRegression(solver='sag', tol=1e-1, C=1.e4 / X.shape[0]))\n]\n\nxx = 1. - np.array(heldout)\n\nfor name, clf in classifiers:\n    print(\"training %s\" % name)\n    rng = np.random.RandomState(42)\n    yy = []\n    for i in heldout:\n        yy_ = []\n        for r in range(rounds):\n            X_train, X_test, y_train, y_test = \\\n                train_test_split(X, y, test_size=i, random_state=rng)\n            clf.fit(X_train, y_train)\n            y_pred = clf.predict(X_test)\n            yy_.append(1 - np.mean(y_pred == y_test))\n        yy.append(np.mean(yy_))\n    plt.plot(xx, yy, label=name)\n\nplt.legend(loc=\"upper right\")\nplt.xlabel(\"Proportion train\")\nplt.ylabel(\"Test Error Rate\")\nplt.show()"
 ]
 }
 ],

dev/_downloads/3650884f0a646ba96d2e47df0a6fb935/plot_sgd_comparison.py

Lines changed: 2 additions & 2 deletions
@@ -25,8 +25,8 @@
 
 classifiers = [
     ("SGD", SGDClassifier(max_iter=100)),
-    ("ASGD", SGDClassifier(average=True, max_iter=1000)),
-    ("Perceptron", Perceptron(tol=1e-3)),
+    ("ASGD", SGDClassifier(average=True)),
+    ("Perceptron", Perceptron()),
     ("Passive-Aggressive I", PassiveAggressiveClassifier(loss='hinge',
                                                          C=1.0, tol=1e-4)),
     ("Passive-Aggressive II", PassiveAggressiveClassifier(loss='squared_hinge',

dev/_downloads/388641d133587cc11aa26f2dbef4b950/plot_document_classification_20newsgroups.py

Lines changed: 1 addition & 1 deletion
@@ -247,7 +247,7 @@ def benchmark(clf):
 results = []
 for clf, name in (
         (RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"),
-        (Perceptron(max_iter=50, tol=1e-3), "Perceptron"),
+        (Perceptron(max_iter=50), "Perceptron"),
         (PassiveAggressiveClassifier(max_iter=50, tol=1e-3),
          "Passive-Aggressive"),
         (KNeighborsClassifier(n_neighbors=10), "kNN"),
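The Perceptron entry above now relies on the default tol. A minimal, self-contained sketch of the same benchmark loop, using the digits dataset in place of 20 newsgroups (illustrative only; the example's benchmark() helper and text pipeline are not reproduced here):

# Toy stand-in for the example's benchmark loop (digits instead of text data).
from sklearn.datasets import load_digits
from sklearn.linear_model import Perceptron, RidgeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

for clf, name in (
        (RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"),
        (Perceptron(max_iter=50), "Perceptron"),
):
    clf.fit(X_train, y_train)
    print(name, accuracy_score(y_test, clf.predict(X_test)))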

dev/_downloads/3b31bf37034a6ece04667cd422e5ff79/plot_document_classification_20newsgroups.ipynb

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

dev/_downloads/80692cf167e9ea27b27e5bd144159c82/plot_out_of_core_classification.py

Lines changed: 1 addition & 1 deletion
@@ -208,7 +208,7 @@ def progress(blocknum, bs, size):
 # Here are some classifiers that support the `partial_fit` method
 partial_fit_classifiers = {
     'SGD': SGDClassifier(max_iter=5),
-    'Perceptron': Perceptron(tol=1e-3),
+    'Perceptron': Perceptron(),
     'NB Multinomial': MultinomialNB(alpha=0.01),
     'Passive-Aggressive': PassiveAggressiveClassifier(tol=1e-3),
 }
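Each of these estimators supports incremental learning via partial_fit, which is why they can be trained on the Reuters stream one mini-batch at a time. A minimal sketch of that pattern with synthetic mini-batches standing in for the Reuters stream (batch sizes and labels here are illustrative):

# Incremental training with partial_fit on synthetic mini-batches.
import numpy as np
from sklearn.linear_model import Perceptron

clf = Perceptron()  # tol defaults to 1e-3 in recent scikit-learn releases
all_classes = np.array([0, 1])  # classes must be declared on the first call
rng = np.random.RandomState(0)

for _ in range(5):
    X_batch = rng.rand(100, 2 ** 8)         # one mini-batch of feature vectors
    y_batch = rng.randint(0, 2, size=100)   # binary labels, as in the example
    clf.partial_fit(X_batch, y_batch, classes=all_classes)

print("classes seen:", clf.classes_)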

dev/_downloads/b86db3a111b621a7beeaa9d099608e5b/plot_out_of_core_classification.ipynb

Lines changed: 1 addition & 1 deletion
@@ -62,7 +62,7 @@
 },
 "outputs": [],
 "source": [
-"vectorizer = HashingVectorizer(decode_error='ignore', n_features=2 ** 18,\n                               alternate_sign=False)\n\n\n# Iterator over parsed Reuters SGML files.\ndata_stream = stream_reuters_documents()\n\n# We learn a binary classification between the \"acq\" class and all the others.\n# \"acq\" was chosen as it is more or less evenly distributed in the Reuters\n# files. For other datasets, one should take care of creating a test set with\n# a realistic portion of positive instances.\nall_classes = np.array([0, 1])\npositive_class = 'acq'\n\n# Here are some classifiers that support the `partial_fit` method\npartial_fit_classifiers = {\n    'SGD': SGDClassifier(max_iter=5),\n    'Perceptron': Perceptron(tol=1e-3),\n    'NB Multinomial': MultinomialNB(alpha=0.01),\n    'Passive-Aggressive': PassiveAggressiveClassifier(tol=1e-3),\n}\n\n\ndef get_minibatch(doc_iter, size, pos_class=positive_class):\n    \"\"\"Extract a minibatch of examples, return a tuple X_text, y.\n\n    Note: size is before excluding invalid docs with no topics assigned.\n\n    \"\"\"\n    data = [('{title}\\n\\n{body}'.format(**doc), pos_class in doc['topics'])\n            for doc in itertools.islice(doc_iter, size)\n            if doc['topics']]\n    if not len(data):\n        return np.asarray([], dtype=int), np.asarray([], dtype=int)\n    X_text, y = zip(*data)\n    return X_text, np.asarray(y, dtype=int)\n\n\ndef iter_minibatches(doc_iter, minibatch_size):\n    \"\"\"Generator of minibatches.\"\"\"\n    X_text, y = get_minibatch(doc_iter, minibatch_size)\n    while len(X_text):\n        yield X_text, y\n        X_text, y = get_minibatch(doc_iter, minibatch_size)\n\n\n# test data statistics\ntest_stats = {'n_test': 0, 'n_test_pos': 0}\n\n# First we hold out a number of examples to estimate accuracy\nn_test_documents = 1000\ntick = time.time()\nX_test_text, y_test = get_minibatch(data_stream, 1000)\nparsing_time = time.time() - tick\ntick = time.time()\nX_test = vectorizer.transform(X_test_text)\nvectorizing_time = time.time() - tick\ntest_stats['n_test'] += len(y_test)\ntest_stats['n_test_pos'] += sum(y_test)\nprint(\"Test set is %d documents (%d positive)\" % (len(y_test), sum(y_test)))\n\n\ndef progress(cls_name, stats):\n    \"\"\"Report progress information, return a string.\"\"\"\n    duration = time.time() - stats['t0']\n    s = \"%20s classifier : \\t\" % cls_name\n    s += \"%(n_train)6d train docs (%(n_train_pos)6d positive) \" % stats\n    s += \"%(n_test)6d test docs (%(n_test_pos)6d positive) \" % test_stats\n    s += \"accuracy: %(accuracy).3f \" % stats\n    s += \"in %.2fs (%5d docs/s)\" % (duration, stats['n_train'] / duration)\n    return s\n\n\ncls_stats = {}\n\nfor cls_name in partial_fit_classifiers:\n    stats = {'n_train': 0, 'n_train_pos': 0,\n             'accuracy': 0.0, 'accuracy_history': [(0, 0)], 't0': time.time(),\n             'runtime_history': [(0, 0)], 'total_fit_time': 0.0}\n    cls_stats[cls_name] = stats\n\nget_minibatch(data_stream, n_test_documents)\n# Discard test set\n\n# We will feed the classifier with mini-batches of 1000 documents; this means\n# we have at most 1000 docs in memory at any time. The smaller the document\n# batch, the bigger the relative overhead of the partial fit methods.\nminibatch_size = 1000\n\n# Create the data_stream that parses Reuters SGML files and iterates on\n# documents as a stream.\nminibatch_iterators = iter_minibatches(data_stream, minibatch_size)\ntotal_vect_time = 0.0\n\n# Main loop : iterate on mini-batches of examples\nfor i, (X_train_text, y_train) in enumerate(minibatch_iterators):\n\n    tick = time.time()\n    X_train = vectorizer.transform(X_train_text)\n    total_vect_time += time.time() - tick\n\n    for cls_name, cls in partial_fit_classifiers.items():\n        tick = time.time()\n        # update estimator with examples in the current mini-batch\n        cls.partial_fit(X_train, y_train, classes=all_classes)\n\n        # accumulate test accuracy stats\n        cls_stats[cls_name]['total_fit_time'] += time.time() - tick\n        cls_stats[cls_name]['n_train'] += X_train.shape[0]\n        cls_stats[cls_name]['n_train_pos'] += sum(y_train)\n        tick = time.time()\n        cls_stats[cls_name]['accuracy'] = cls.score(X_test, y_test)\n        cls_stats[cls_name]['prediction_time'] = time.time() - tick\n        acc_history = (cls_stats[cls_name]['accuracy'],\n                       cls_stats[cls_name]['n_train'])\n        cls_stats[cls_name]['accuracy_history'].append(acc_history)\n        run_history = (cls_stats[cls_name]['accuracy'],\n                       total_vect_time + cls_stats[cls_name]['total_fit_time'])\n        cls_stats[cls_name]['runtime_history'].append(run_history)\n\n        if i % 3 == 0:\n            print(progress(cls_name, cls_stats[cls_name]))\n    if i % 3 == 0:\n        print('\\n')"
+"vectorizer = HashingVectorizer(decode_error='ignore', n_features=2 ** 18,\n                               alternate_sign=False)\n\n\n# Iterator over parsed Reuters SGML files.\ndata_stream = stream_reuters_documents()\n\n# We learn a binary classification between the \"acq\" class and all the others.\n# \"acq\" was chosen as it is more or less evenly distributed in the Reuters\n# files. For other datasets, one should take care of creating a test set with\n# a realistic portion of positive instances.\nall_classes = np.array([0, 1])\npositive_class = 'acq'\n\n# Here are some classifiers that support the `partial_fit` method\npartial_fit_classifiers = {\n    'SGD': SGDClassifier(max_iter=5),\n    'Perceptron': Perceptron(),\n    'NB Multinomial': MultinomialNB(alpha=0.01),\n    'Passive-Aggressive': PassiveAggressiveClassifier(tol=1e-3),\n}\n\n\ndef get_minibatch(doc_iter, size, pos_class=positive_class):\n    \"\"\"Extract a minibatch of examples, return a tuple X_text, y.\n\n    Note: size is before excluding invalid docs with no topics assigned.\n\n    \"\"\"\n    data = [('{title}\\n\\n{body}'.format(**doc), pos_class in doc['topics'])\n            for doc in itertools.islice(doc_iter, size)\n            if doc['topics']]\n    if not len(data):\n        return np.asarray([], dtype=int), np.asarray([], dtype=int)\n    X_text, y = zip(*data)\n    return X_text, np.asarray(y, dtype=int)\n\n\ndef iter_minibatches(doc_iter, minibatch_size):\n    \"\"\"Generator of minibatches.\"\"\"\n    X_text, y = get_minibatch(doc_iter, minibatch_size)\n    while len(X_text):\n        yield X_text, y\n        X_text, y = get_minibatch(doc_iter, minibatch_size)\n\n\n# test data statistics\ntest_stats = {'n_test': 0, 'n_test_pos': 0}\n\n# First we hold out a number of examples to estimate accuracy\nn_test_documents = 1000\ntick = time.time()\nX_test_text, y_test = get_minibatch(data_stream, 1000)\nparsing_time = time.time() - tick\ntick = time.time()\nX_test = vectorizer.transform(X_test_text)\nvectorizing_time = time.time() - tick\ntest_stats['n_test'] += len(y_test)\ntest_stats['n_test_pos'] += sum(y_test)\nprint(\"Test set is %d documents (%d positive)\" % (len(y_test), sum(y_test)))\n\n\ndef progress(cls_name, stats):\n    \"\"\"Report progress information, return a string.\"\"\"\n    duration = time.time() - stats['t0']\n    s = \"%20s classifier : \\t\" % cls_name\n    s += \"%(n_train)6d train docs (%(n_train_pos)6d positive) \" % stats\n    s += \"%(n_test)6d test docs (%(n_test_pos)6d positive) \" % test_stats\n    s += \"accuracy: %(accuracy).3f \" % stats\n    s += \"in %.2fs (%5d docs/s)\" % (duration, stats['n_train'] / duration)\n    return s\n\n\ncls_stats = {}\n\nfor cls_name in partial_fit_classifiers:\n    stats = {'n_train': 0, 'n_train_pos': 0,\n             'accuracy': 0.0, 'accuracy_history': [(0, 0)], 't0': time.time(),\n             'runtime_history': [(0, 0)], 'total_fit_time': 0.0}\n    cls_stats[cls_name] = stats\n\nget_minibatch(data_stream, n_test_documents)\n# Discard test set\n\n# We will feed the classifier with mini-batches of 1000 documents; this means\n# we have at most 1000 docs in memory at any time. The smaller the document\n# batch, the bigger the relative overhead of the partial fit methods.\nminibatch_size = 1000\n\n# Create the data_stream that parses Reuters SGML files and iterates on\n# documents as a stream.\nminibatch_iterators = iter_minibatches(data_stream, minibatch_size)\ntotal_vect_time = 0.0\n\n# Main loop : iterate on mini-batches of examples\nfor i, (X_train_text, y_train) in enumerate(minibatch_iterators):\n\n    tick = time.time()\n    X_train = vectorizer.transform(X_train_text)\n    total_vect_time += time.time() - tick\n\n    for cls_name, cls in partial_fit_classifiers.items():\n        tick = time.time()\n        # update estimator with examples in the current mini-batch\n        cls.partial_fit(X_train, y_train, classes=all_classes)\n\n        # accumulate test accuracy stats\n        cls_stats[cls_name]['total_fit_time'] += time.time() - tick\n        cls_stats[cls_name]['n_train'] += X_train.shape[0]\n        cls_stats[cls_name]['n_train_pos'] += sum(y_train)\n        tick = time.time()\n        cls_stats[cls_name]['accuracy'] = cls.score(X_test, y_test)\n        cls_stats[cls_name]['prediction_time'] = time.time() - tick\n        acc_history = (cls_stats[cls_name]['accuracy'],\n                       cls_stats[cls_name]['n_train'])\n        cls_stats[cls_name]['accuracy_history'].append(acc_history)\n        run_history = (cls_stats[cls_name]['accuracy'],\n                       total_vect_time + cls_stats[cls_name]['total_fit_time'])\n        cls_stats[cls_name]['runtime_history'].append(run_history)\n\n        if i % 3 == 0:\n            print(progress(cls_name, cls_stats[cls_name]))\n    if i % 3 == 0:\n        print('\\n')"
 ]
 },
 {

dev/_downloads/scikit-learn-docs.pdf

3.11 KB
Binary file not shown.

dev/_images/iris.png

Binary file not shown.

dev/_sources/auto_examples/applications/plot_face_recognition.rst.txt

Lines changed: 17 additions & 17 deletions
