Skip to content

Instantly share code, notes, and snippets.

@orausch
Created November 18, 2020 11:45
Show Gist options
  • Save orausch/acf62e3fbf0ea5176f768d3dd3340ae1 to your computer and use it in GitHub Desktop.
Save orausch/acf62e3fbf0ea5176f768d3dd3340ae1 to your computer and use it in GitHub Desktop.
Semisupervised on Newsgroups
11314 documents
20 categories
Supervised on 100% of the data
X_train length: 8485
Unlabeled samples in train: 0
0.9197596323789324
----------
Supervised on 20% of the data
X_train length: 1681
Unlabeled samples in train: 0
0.7840226228349241
----------
Self-training on 20% of the data
X_train length: 8485
Unlabeled samples in train: 6804
End of iteration 1, added 2760 new labels.
End of iteration 2, added 695 new labels.
End of iteration 3, added 233 new labels.
End of iteration 4, added 91 new labels.
End of iteration 5, added 41 new labels.
End of iteration 6, added 20 new labels.
End of iteration 7, added 17 new labels.
End of iteration 8, added 9 new labels.
End of iteration 9, added 5 new labels.
End of iteration 10, added 3 new labels.
0.8331565924354895
----------
LabelSpreading on 20% of the data
X_train length: 8485
Unlabeled samples in train: 6804
0.6422764227642277
----------
import numpy as np
from sklearn.base import TransformerMixin
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.semi_supervised import LabelSpreading
from sklearn.metrics import f1_score
# Load the full 20 newsgroups training split (all categories).
data = fetch_20newsgroups(subset='train', categories=None)

n_docs = len(data.filenames)
n_cats = len(data.target_names)
print("%d documents" % n_docs)
print("%d categories" % n_cats)
print()
class DenseTransformer(TransformerMixin):
    """Pipeline step that turns a sparse feature matrix into a dense one.

    Used ahead of LabelSpreading in ``ls_pipeline`` below — presumably
    because that estimator wants dense input (verify against the sklearn
    version in use).
    """

    def fit(self, X, y=None, **fit_params):
        """No-op: nothing is learned. Returns ``self`` for chaining."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` densified via its ``.todense()`` method."""
        return X.todense()
# Shared linear-classifier settings; loss='log' gives probability
# estimates (relied on by the self-training wrapper defined below).
sdg_params = dict(alpha=1e-5, penalty='l2', loss='log')

# Fully supervised baseline: bag-of-words counts -> tf-idf -> linear SGD.
pipeline = Pipeline(
    [
        ('vect', CountVectorizer(ngram_range=(1, 2), min_df=5, max_df=0.8)),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(**sdg_params)),
    ]
)
# Semi-supervised variant: identical features, but the SGD classifier is
# wrapped in SelfTrainingClassifier so it can exploit unlabeled (-1) rows.
st_pipeline = Pipeline(
    [
        ('vect', CountVectorizer(ngram_range=(1, 2), min_df=5, max_df=0.8)),
        ('tfidf', TfidfTransformer()),
        ('clf', SelfTrainingClassifier(SGDClassifier(**sdg_params), verbose=True)),
    ]
)
# LabelSpreading variant: the tf-idf matrix is densified first via the
# DenseTransformer adapter defined above.
ls_pipeline = Pipeline(
    [
        ('vect', CountVectorizer(ngram_range=(1, 2), min_df=5, max_df=0.8)),
        ('tfidf', TfidfTransformer()),
        ('todense', DenseTransformer()),
        ('clf', LabelSpreading()),
    ]
)
def print_eval_clf(clf, X_train, y_train, X_test, y_test):
    """Fit *clf* on the training data and print its micro-averaged F1.

    Also reports the training-set size and how many training labels carry
    the "unlabeled" marker (-1) used by the semi-supervised estimators.
    """
    print("X_train length:", len(X_train))
    n_unlabeled = sum(1 for label in y_train if label == -1)
    print("Unlabeled samples in train:", n_unlabeled)
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    print(f1_score(y_test, predictions, average='micro'))
    print("-" * 10)
    print()
if __name__ == "__main__":
    X, y = data.data, data.target

    # NOTE(review): neither this split nor the mask below is seeded, so
    # runs are not reproducible; pass random_state / seed np.random if
    # reproducibility matters.
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    print("Supervised on 100% of the data")
    print_eval_clf(pipeline, X_train, y_train, X_test, y_test)

    # Keep a random ~20% of the training labels.
    y_mask = np.random.rand(len(y_train)) < 0.2

    # Labeled-only subset for the 20% supervised baseline.
    X_20, y_20 = map(
        list,
        zip(*((x, y) for x, y, m in zip(X_train, y_train, y_mask) if m)),
    )
    print("Supervised on 20% of the data")
    print_eval_clf(pipeline, X_20, y_20, X_test, y_test)

    # Mark the remaining ~80% as unlabeled (-1) for the semi-supervised runs.
    y_train[~y_mask] = -1

    # Fixed typo in the printed heading ("Self-trainin" -> "Self-training").
    print("Self-training on 20% of the data")
    print_eval_clf(st_pipeline, X_train, y_train, X_test, y_test)

    print("LabelSpreading on 20% of the data")
    print_eval_clf(ls_pipeline, X_train, y_train, X_test, y_test)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment