-
-
Save orausch/acf62e3fbf0ea5176f768d3dd3340ae1 to your computer and use it in GitHub Desktop.
Semisupervised on Newsgroups
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
11314 documents | |
20 categories | |
Supervised on 100% of the data | |
X_train length: 8485 | |
Unlabeled samples in train: 0 | |
0.9197596323789324 | |
---------- | |
Supervised on 20% of the data | |
X_train length: 1681 | |
Unlabeled samples in train: 0 | |
0.7840226228349241 | |
---------- | |
Self-training on 20% of the data | |
X_train length: 8485 | |
Unlabeled samples in train: 6804 | |
End of iteration 1, added 2760 new labels. | |
End of iteration 2, added 695 new labels. | |
End of iteration 3, added 233 new labels. | |
End of iteration 4, added 91 new labels. | |
End of iteration 5, added 41 new labels. | |
End of iteration 6, added 20 new labels. | |
End of iteration 7, added 17 new labels. | |
End of iteration 8, added 9 new labels. | |
End of iteration 9, added 5 new labels. | |
End of iteration 10, added 3 new labels. | |
0.8331565924354895 | |
---------- | |
LabelSpreading on 20% of the data | |
X_train length: 8485 | |
Unlabeled samples in train: 6804 | |
0.6422764227642277 | |
---------- | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
import sklearn
from sklearn.base import TransformerMixin
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.semi_supervised import LabelSpreading
from sklearn.metrics import f1_score
# Load the training subset of 20 Newsgroups with every category
# (categories=None) and print a short summary of what was loaded.
data = fetch_20newsgroups(categories=None, subset='train')
print("%d documents" % (len(data.filenames),))
print("%d categories" % (len(data.target_names),))
print()
class DenseTransformer(TransformerMixin):
    """Pipeline step that converts a sparse feature matrix to a dense array.

    Used in front of LabelSpreading in this script, which is fed dense
    input rather than the sparse TF-IDF output.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; return ``self`` so the step can be chained."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense ``numpy.ndarray``.

        ``toarray()`` is used instead of ``todense()`` because the latter
        returns a ``numpy.matrix``, which recent scikit-learn releases
        refuse as estimator input. Input that is already dense (has no
        ``toarray`` method) is passed through ``np.asarray``.
        """
        if hasattr(X, "toarray"):
            return X.toarray()
        return np.asarray(X)
def _log_loss_name():
    """Return the SGDClassifier spelling of logistic loss for this sklearn.

    scikit-learn 1.1 renamed ``loss='log'`` to ``loss='log_loss'`` and the
    old spelling was removed in 1.3, so the name is chosen from the
    installed version to keep the script working on both sides.
    """
    major, minor = (int(part) for part in sklearn.__version__.split(".")[:2])
    return "log_loss" if (major, minor) >= (1, 1) else "log"


def _text_feature_steps():
    """Return fresh (unfitted) bag-of-words + TF-IDF steps for a pipeline.

    Every pipeline gets its own instances so fitting one never touches
    the vocabulary of another.
    """
    return [
        ('vect', CountVectorizer(ngram_range=(1, 2), min_df=5, max_df=0.8)),
        ('tfidf', TfidfTransformer()),
    ]


# Logistic loss gives the SGD classifier predict_proba, which
# SelfTrainingClassifier needs from its base estimator.
sdg_params = dict(alpha=1e-5, penalty='l2', loss=_log_loss_name())

# Plain supervised baseline.
pipeline = Pipeline(_text_feature_steps() + [
    ('clf', SGDClassifier(**sdg_params)),
])

# Self-training wrapper around the same SGD classifier.
st_pipeline = Pipeline(_text_feature_steps() + [
    ('clf', SelfTrainingClassifier(SGDClassifier(**sdg_params), verbose=True)),
])

# Label spreading; the features are densified first.
ls_pipeline = Pipeline(_text_feature_steps() + [
    ('todense', DenseTransformer()),
    ('clf', LabelSpreading()),
])
def print_eval_clf(clf, X_train, y_train, X_test, y_test):
    """Fit ``clf`` on the training split and print its micro-averaged F1.

    Before fitting, prints the size of the training set and how many of
    its labels equal -1 (reported as "unlabeled"). After scoring, prints
    a separator line followed by a blank line.
    """
    print("X_train length:", len(X_train))
    n_unlabeled = len([label for label in y_train if label == -1])
    print("Unlabeled samples in train:", n_unlabeled)

    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    micro_f1 = f1_score(y_test, predictions, average='micro')

    print(micro_f1)
    # Separator line plus one empty line, same as two separate prints.
    print("----------", end="\n\n")
def main(seed=0):
    """Compare supervised and semi-supervised training on 20 Newsgroups.

    Runs four experiments against the same held-out test split:
    supervised on all training labels, supervised on a random ~20% of
    them, and self-training / label spreading where the other ~80% of
    the training labels are replaced by -1.

    ``seed`` fixes both the train/test split and the choice of labeled
    samples so that repeated runs evaluate the same split and label mask.
    """
    rng = np.random.RandomState(seed)

    X, y = data.data, data.target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=seed)

    print("Supervised on 100% of the data")
    print_eval_clf(pipeline, X_train, y_train, X_test, y_test)

    # Boolean mask: True for the ~20% of training samples that keep a label.
    labeled = rng.rand(len(y_train)) < 0.2

    # Select the labeled subset directly; unlike unpacking zip(*pairs),
    # this does not raise when the mask happens to select nothing.
    X_20 = [doc for doc, keep in zip(X_train, labeled) if keep]
    y_20 = y_train[labeled]
    print("Supervised on 20% of the data")
    print_eval_clf(pipeline, X_20, y_20, X_test, y_test)

    # Work on a copy so the fully labeled y_train stays intact.
    y_semi = y_train.copy()
    y_semi[~labeled] = -1

    print("Self-training on 20% of the data")
    print_eval_clf(st_pipeline, X_train, y_semi, X_test, y_test)

    print("LabelSpreading on 20% of the data")
    print_eval_clf(ls_pipeline, X_train, y_semi, X_test, y_test)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment