From e15dcb4cbc79c99234fdde140369fbde16c3e6e7 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Fri, 10 Apr 2020 00:14:20 +0800 Subject: [PATCH 1/5] scorecard demo --- sqlflow_models/__init__.py | 1 + sqlflow_models/score_card.py | 115 +++++++++++++++++++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 sqlflow_models/score_card.py diff --git a/sqlflow_models/__init__.py b/sqlflow_models/__init__.py index 0cbe218..711e6e7 100755 --- a/sqlflow_models/__init__.py +++ b/sqlflow_models/__init__.py @@ -6,3 +6,4 @@ from .dnnclassifier_functional_api_example import dnnclassifier_functional_model from .lstm_based_time_series_beta0 import LSTMBasedTimeSeriesModel from .auto_estimator import AutoClassifier, AutoRegressor +from .score_card import MyScoreCard \ No newline at end of file diff --git a/sqlflow_models/score_card.py b/sqlflow_models/score_card.py new file mode 100644 index 0000000..f48895d --- /dev/null +++ b/sqlflow_models/score_card.py @@ -0,0 +1,115 @@ +#!/bin/env python + +import tensorflow as tf +from tensorflow import keras +from tensorflow.python.data import make_one_shot_iterator +from tensorflow.keras.losses import kld +from tensorflow.keras.optimizers import SGD +import numpy as np +import pandas as pd +import scipy.stats.stats as stats +import sklearn +from sklearn.linear_model import LogisticRegression + + +def optimizer(): + return SGD(lr=0.1, momentum=0.9) + +def loss(): + return None + +class MyScoreCard(keras.Model): + + def __init__(self, feature_columns=None): + super(MyScoreCard, self).__init__(name='ScoreCard') + + self._factor = 20/np.log(2) + self._offset = 600 - 20*np.log(20) / np.log(2) + + def call(self): + pass + + def _mono_bin(self, y, x, n=10): + # population frequency + r = 0 + bad_num = y.sum() + good_num = y.count() - y.sum() + d1 = pd.DataFrame({'x':x,'y':y,'bucket':pd.qcut(x,n, duplicates='drop')}) + d2 = d1.groupby('bucket',as_index=True) + d3 = pd.DataFrame(d2.x.min(),columns=['min_bin']) + + d3["min"] = 
d2.min().x + d3["max"] = d2.max().x + d3["badcostum"] = d2.sum().y + d3["goodcostum"] = d2.count().y - d2.sum().y + d3["total"] = d2.count().y + d3["bad_rate"] = d2.sum().y/d2.count().y + d3["woe"] = np.log(d3["badcostum"]/d3["goodcostum"] * good_num/ bad_num) + iv = ((d3["badcostum"]/bad_num - d3["goodcostum"]/good_num)*d3["woe"]) + d3["iv"] = iv + woe = list(d3["woe"].round(6)) + cut = list(d3["max"].round(6)) + cut.insert(0, float("-inf")) + cut[-1] = float("inf") + return d3, cut, woe, iv + + def _to_dataframe(self, dataset): + x_df = pd.DataFrame() + y_df = pd.DataFrame() + + for features, label in dataset: + dx = {} + dy = {} + for name, value in features.items(): + dx[name] = value.numpy()[0] + dy['label'] = label.numpy()[0][0] + x_df = x_df.append(dx, ignore_index=True) + y_df = y_df.append(dy, ignore_index=True) + return x_df, y_df + + def _replace_woe(self, x, cut, woe): + return pd.cut(x, cut, labels=pd.Categorical(woe)) + + def _calsumscore(self, woe_list, coe): + n = coe.shape[1] + serise = 0 + for i in range(n): + serise += coe[0][i] * np.array(woe_list.iloc[:, i]) + score = serise * self._factor + self._offset + return score + def _get_score(self, coe, woe): + scores = [] + for w in woe: + scores.append(round(coe * w * self._factor, 0)) + return scores + + def sqlflow_train_loop(self, x, epochs=1, verbose=0): + ite = make_one_shot_iterator(x) + ite.get_next() + + x_df, y_df = self._to_dataframe(x) + + x_train_dict = {} + woe_dict = {} + for col in x_df.columns: + if col in ['id', 'number_of_dependents']: + continue + fx1, cut1, x1_woe, iv1 = self._mono_bin(y_df['label'], x_df[col]) + woe_dict[col] = fx1 + x_replaced_woe = self._replace_woe(x_df[col], cut1, x1_woe) + x_train_dict[col] = x_replaced_woe + x_train = pd.DataFrame.from_dict(x_train_dict) + clf = LogisticRegression() + clf.fit(x_train, y_df['label']) + clf.predict_proba(x_train) + coe = clf.coef_ + scores = self._calsumscore(x_train, coe) + col_i = -1 + for i in range(len(x_df.columns)): + 
col_name = x_df.columns[i] + if col_name in ['id', 'number_of_dependents']: + continue + col_i += 1 + col_woe = woe_dict[col_name] + for j, w in enumerate(col_woe['woe']): + print(col_name, col_woe['woe'].index.to_list()[j], round(coe[0][col_i] * w * self._factor, 0)) From b9e43ea3f6df991716691485e41bdbb17582214e Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Fri, 10 Apr 2020 22:48:37 +0800 Subject: [PATCH 2/5] update --- sqlflow_models/score_card.py | 89 +++++++++++++++++++----------------- 1 file changed, 48 insertions(+), 41 deletions(-) diff --git a/sqlflow_models/score_card.py b/sqlflow_models/score_card.py index f48895d..000f8d4 100644 --- a/sqlflow_models/score_card.py +++ b/sqlflow_models/score_card.py @@ -10,7 +10,7 @@ import scipy.stats.stats as stats import sklearn from sklearn.linear_model import LogisticRegression - +import pickle def optimizer(): return SGD(lr=0.1, momentum=0.9) @@ -25,16 +25,18 @@ def __init__(self, feature_columns=None): self._factor = 20/np.log(2) self._offset = 600 - 20*np.log(20) / np.log(2) + self._lr = LogisticRegression() + self._is_first_predict_batch = False + self._bins = dict() def call(self): pass - def _mono_bin(self, y, x, n=10): + def _pr_bin(self, y, x, n=10): # population frequency - r = 0 bad_num = y.sum() good_num = y.count() - y.sum() - d1 = pd.DataFrame({'x':x,'y':y,'bucket':pd.qcut(x,n, duplicates='drop')}) + d1 = pd.DataFrame({'x': x,'y': y,'bucket': pd.qcut(x, n, duplicates='drop')}) d2 = d1.groupby('bucket',as_index=True) d3 = pd.DataFrame(d2.x.min(),columns=['min_bin']) @@ -56,32 +58,21 @@ def _mono_bin(self, y, x, n=10): def _to_dataframe(self, dataset): x_df = pd.DataFrame() y_df = pd.DataFrame() - + for features, label in dataset: dx = {} dy = {} for name, value in features.items(): dx[name] = value.numpy()[0] - dy['label'] = label.numpy()[0][0] x_df = x_df.append(dx, ignore_index=True) - y_df = y_df.append(dy, ignore_index=True) + if label is not None: + dy['label'] = label.numpy()[0][0] + y_df = 
y_df.append(dy, ignore_index=True) return x_df, y_df def _replace_woe(self, x, cut, woe): - return pd.cut(x, cut, labels=pd.Categorical(woe)) - - def _calsumscore(self, woe_list, coe): - n = coe.shape[1] - serise = 0 - for i in range(n): - serise += coe[0][i] * np.array(woe_list.iloc[:, i]) - score = serise * self._factor + self._offset - return score - def _get_score(self, coe, woe): - scores = [] - for w in woe: - scores.append(round(coe * w * self._factor, 0)) - return scores + #return pd.cut(x, cut, labels=pd.Categorical(woe)) + return pd.cut(x, cut, labels=woe) def sqlflow_train_loop(self, x, epochs=1, verbose=0): ite = make_one_shot_iterator(x) @@ -90,26 +81,42 @@ def sqlflow_train_loop(self, x, epochs=1, verbose=0): x_df, y_df = self._to_dataframe(x) x_train_dict = {} - woe_dict = {} for col in x_df.columns: - if col in ['id', 'number_of_dependents']: - continue - fx1, cut1, x1_woe, iv1 = self._mono_bin(y_df['label'], x_df[col]) - woe_dict[col] = fx1 - x_replaced_woe = self._replace_woe(x_df[col], cut1, x1_woe) + dfx, cut, woe, iv = self._pr_bin(y_df['label'], x_df[col]) + self._bins[col] = (dfx, cut, woe, iv) + + x_replaced_woe = self._replace_woe(x_df[col], cut, woe) x_train_dict[col] = x_replaced_woe + x_train = pd.DataFrame.from_dict(x_train_dict) - clf = LogisticRegression() - clf.fit(x_train, y_df['label']) - clf.predict_proba(x_train) - coe = clf.coef_ - scores = self._calsumscore(x_train, coe) - col_i = -1 - for i in range(len(x_df.columns)): - col_name = x_df.columns[i] - if col_name in ['id', 'number_of_dependents']: - continue - col_i += 1 - col_woe = woe_dict[col_name] - for j, w in enumerate(col_woe['woe']): - print(col_name, col_woe['woe'].index.to_list()[j], round(coe[0][col_i] * w * self._factor, 0)) + self._lr.fit(x_train, y_df['label']) + coe = self._lr.coef_ + + for i, col_name in enumerate(x_df.columns): + bin_cols = self._bins[col_name][0].index.to_list() + for j, w in enumerate(self._bins[col_name][2]): + print(col_name, bin_cols[j], 
round(coe[0][i] * w * self._factor, 0)) + + def save_weights(self, save="", save_format="h5"): + pickle.dump(self._lr, open(save, 'wb')) + pickle.dump(self._bins, open(save+"_bin", 'wb')) + + def load_weights(self, save): + self._lr = pickle.load(open(save, 'rb')) + self._bins = pickle.load(open(save+"_bin", 'rb')) + + def predict_on_batch(self, features): + if not self._is_first_predict_batch: + self._is_first_predict_batch = True + return None + x_df, _ = self._to_dataframe([(features, None)]) + x_train_dict = {} + for col in x_df.columns: + bin = self._bins[col] + x_train_dict[col] = self._replace_woe(x_df[col], bin[1], bin[2]) + r = self._lr.predict_proba(pd.DataFrame.from_dict(x_train_dict)) + return r + +def prepare_prediction_column(prediction): + """Return the class label of highest probability.""" + return prediction.argmax(axis=-1) From 53b61fbc96fe62167a17fc5f73f0486e19f053de Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Sun, 12 Apr 2020 19:10:16 +0800 Subject: [PATCH 3/5] calculate AUC --- sqlflow_models/score_card.py | 69 +++++++++++++++++++++++------------- 1 file changed, 44 insertions(+), 25 deletions(-) diff --git a/sqlflow_models/score_card.py b/sqlflow_models/score_card.py index 000f8d4..6082f53 100644 --- a/sqlflow_models/score_card.py +++ b/sqlflow_models/score_card.py @@ -10,14 +10,25 @@ import scipy.stats.stats as stats import sklearn from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import train_test_split +from sklearn.metrics import roc_auc_score, auc import pickle + def optimizer(): + # SGD is just a placeholder to avoid panic on SQLFLow traning return SGD(lr=0.1, momentum=0.9) + def loss(): return None + +def prepare_prediction_column(prediction): + """Return the class label of highest probability.""" + return prediction.argmax(axis=-1) + + class MyScoreCard(keras.Model): def __init__(self, feature_columns=None): @@ -25,15 +36,14 @@ def __init__(self, feature_columns=None): self._factor = 20/np.log(2) 
self._offset = 600 - 20*np.log(20) / np.log(2) - self._lr = LogisticRegression() self._is_first_predict_batch = False self._bins = dict() def call(self): pass - def _pr_bin(self, y, x, n=10): - # population frequency + def _pf_bin(self, y, x, n=10): + # population frequency bucket bad_num = y.sum() good_num = y.count() - y.sum() d1 = pd.DataFrame({'x': x,'y': y,'bucket': pd.qcut(x, n, duplicates='drop')}) @@ -68,30 +78,43 @@ def _to_dataframe(self, dataset): if label is not None: dy['label'] = label.numpy()[0][0] y_df = y_df.append(dy, ignore_index=True) - return x_df, y_df + + if y_df.empty: + return x_df, None + return x_df, y_df['label'] def _replace_woe(self, x, cut, woe): - #return pd.cut(x, cut, labels=pd.Categorical(woe)) - return pd.cut(x, cut, labels=woe) + return pd.cut(x, cut, labels=pd.Categorical(woe)) + + def _woe_encoder(self, x, y): + x_train_dict = {} + for col in x.columns: + dfx, cut, woe, iv = self._pf_bin(y, x[col]) + self._bins[col] = (dfx, cut, woe, iv) + # replace by woe encoder + x_train_dict[col] = self._replace_woe(x[col], cut, woe) - def sqlflow_train_loop(self, x, epochs=1, verbose=0): - ite = make_one_shot_iterator(x) + return pd.DataFrame.from_dict(x_train_dict) + + def sqlflow_train_loop(self, dataset, epochs=1, verbose=0): + ite = make_one_shot_iterator(dataset) ite.get_next() - x_df, y_df = self._to_dataframe(x) + x_df, y_df = self._to_dataframe(dataset) + x = self._woe_encoder(x_df, y_df) + self._lr = LogisticRegression() - x_train_dict = {} - for col in x_df.columns: - dfx, cut, woe, iv = self._pr_bin(y_df['label'], x_df[col]) - self._bins[col] = (dfx, cut, woe, iv) - - x_replaced_woe = self._replace_woe(x_df[col], cut, woe) - x_train_dict[col] = x_replaced_woe + x_train, x_test, y_train, y_test = train_test_split(x, y_df) + self._lr.fit(x_train, y_train) - x_train = pd.DataFrame.from_dict(x_train_dict) - self._lr.fit(x_train, y_df['label']) - coe = self._lr.coef_ + prob = self._lr.predict_proba(x_test)[:, 1] + auc_score = 
roc_auc_score(y_test, prob) + print("AUC: {}\n".format(auc_score)) + + # show scores + print("The scores for each bins:") + coe = self._lr.coef_ for i, col_name in enumerate(x_df.columns): bin_cols = self._bins[col_name][0].index.to_list() for j, w in enumerate(self._bins[col_name][2]): @@ -107,6 +130,7 @@ def load_weights(self, save): def predict_on_batch(self, features): if not self._is_first_predict_batch: + # SQLFLow would call this function once to warm up self._is_first_predict_batch = True return None x_df, _ = self._to_dataframe([(features, None)]) @@ -114,9 +138,4 @@ def predict_on_batch(self, features): for col in x_df.columns: bin = self._bins[col] x_train_dict[col] = self._replace_woe(x_df[col], bin[1], bin[2]) - r = self._lr.predict_proba(pd.DataFrame.from_dict(x_train_dict)) - return r - -def prepare_prediction_column(prediction): - """Return the class label of highest probability.""" - return prediction.argmax(axis=-1) + return self._lr.predict_proba(pd.DataFrame.from_dict(x_train_dict)) From 98d92a2494dde45bddca97abb184b039ff8b7326 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Mon, 20 Apr 2020 16:57:19 +0800 Subject: [PATCH 4/5] update model name --- sqlflow_models/__init__.py | 2 +- sqlflow_models/score_card.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sqlflow_models/__init__.py b/sqlflow_models/__init__.py index 711e6e7..bfd88a8 100755 --- a/sqlflow_models/__init__.py +++ b/sqlflow_models/__init__.py @@ -6,4 +6,4 @@ from .dnnclassifier_functional_api_example import dnnclassifier_functional_model from .lstm_based_time_series_beta0 import LSTMBasedTimeSeriesModel from .auto_estimator import AutoClassifier, AutoRegressor -from .score_card import MyScoreCard \ No newline at end of file +from .score_card import ScoreCard \ No newline at end of file diff --git a/sqlflow_models/score_card.py b/sqlflow_models/score_card.py index 6082f53..1392828 100644 --- a/sqlflow_models/score_card.py +++ b/sqlflow_models/score_card.py @@ 
-29,10 +29,10 @@ def prepare_prediction_column(prediction): return prediction.argmax(axis=-1) -class MyScoreCard(keras.Model): +class ScoreCard(keras.Model): def __init__(self, feature_columns=None): - super(MyScoreCard, self).__init__(name='ScoreCard') + super(ScoreCard, self).__init__(name='ScoreCard') self._factor = 20/np.log(2) self._offset = 600 - 20*np.log(20) / np.log(2) From 019ebb41e16ac79a70439d17a2627a3d1e70cf83 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Tue, 22 Sep 2020 20:17:48 +0800 Subject: [PATCH 5/5] update --- sqlflow_models/score_card.py | 82 ++++++++++-------------------------- 1 file changed, 22 insertions(+), 60 deletions(-) diff --git a/sqlflow_models/score_card.py b/sqlflow_models/score_card.py index 1392828..914de64 100644 --- a/sqlflow_models/score_card.py +++ b/sqlflow_models/score_card.py @@ -16,19 +16,13 @@ def optimizer(): - # SGD is just a placeholder to avoid panic on SQLFLow traning - return SGD(lr=0.1, momentum=0.9) + return None def loss(): return None -def prepare_prediction_column(prediction): - """Return the class label of highest probability.""" - return prediction.argmax(axis=-1) - - class ScoreCard(keras.Model): def __init__(self, feature_columns=None): @@ -36,12 +30,8 @@ def __init__(self, feature_columns=None): self._factor = 20/np.log(2) self._offset = 600 - 20*np.log(20) / np.log(2) - self._is_first_predict_batch = False self._bins = dict() - def call(self): - pass - def _pf_bin(self, y, x, n=10): # population frequency bucket bad_num = y.sum() @@ -56,8 +46,8 @@ def _pf_bin(self, y, x, n=10): d3["goodcostum"] = d2.count().y - d2.sum().y d3["total"] = d2.count().y d3["bad_rate"] = d2.sum().y/d2.count().y - d3["woe"] = np.log(d3["badcostum"]/d3["goodcostum"] * good_num/ bad_num) - iv = ((d3["badcostum"]/bad_num - d3["goodcostum"]/good_num)*d3["woe"]) + d3["woe"] = np.log(d3["badcostum"]/d3["goodcostum"]*good_num/bad_num) + iv = ((d3["badcostum"]/bad_num-d3["goodcostum"]/good_num)*d3["woe"]) d3["iv"] = iv woe = 
list(d3["woe"].round(6)) cut = list(d3["max"].round(6)) @@ -68,20 +58,16 @@ def _pf_bin(self, y, x, n=10): def _to_dataframe(self, dataset): x_df = pd.DataFrame() y_df = pd.DataFrame() - - for features, label in dataset: + for _, minibatch in enumerate(dataset): + data, label = minibatch dx = {} dy = {} - for name, value in features.items(): - dx[name] = value.numpy()[0] + for name, value in data.items(): + dx[name] = value.numpy()[0][0] x_df = x_df.append(dx, ignore_index=True) - if label is not None: - dy['label'] = label.numpy()[0][0] - y_df = y_df.append(dy, ignore_index=True) - - if y_df.empty: - return x_df, None - return x_df, y_df['label'] + dy['label'] = label.numpy()[0] + y_df = y_df.append(dy, ignore_index=True) + return x_df, y_df def _replace_woe(self, x, cut, woe): return pd.cut(x, cut, labels=pd.Categorical(woe)) @@ -91,51 +77,27 @@ def _woe_encoder(self, x, y): for col in x.columns: dfx, cut, woe, iv = self._pf_bin(y, x[col]) self._bins[col] = (dfx, cut, woe, iv) - # replace by woe encoder + # replacing by the WOE encode x_train_dict[col] = self._replace_woe(x[col], cut, woe) - return pd.DataFrame.from_dict(x_train_dict) def sqlflow_train_loop(self, dataset, epochs=1, verbose=0): - ite = make_one_shot_iterator(dataset) - ite.get_next() - x_df, y_df = self._to_dataframe(dataset) - x = self._woe_encoder(x_df, y_df) - self._lr = LogisticRegression() - - x_train, x_test, y_train, y_test = train_test_split(x, y_df) - self._lr.fit(x_train, y_train) + x = self._woe_encoder(x_df, y_df['label']) + x.to_csv("/tmp/train_woe.csv") + lr = LogisticRegression() - prob = self._lr.predict_proba(x_test)[:, 1] + x_train, x_test, y_train, y_test = train_test_split(x, y_df['label']) + lr.fit(x_train, y_train) + prob = lr.predict_proba(x_test)[:, 1] auc_score = roc_auc_score(y_test, prob) print("AUC: {}\n".format(auc_score)) - - # show scores - print("The scores for each bins:") - coe = self._lr.coef_ + # print the score card + print("THE SCORE CARD:") + coe = lr.coef_ 
for i, col_name in enumerate(x_df.columns): bin_cols = self._bins[col_name][0].index.to_list() for j, w in enumerate(self._bins[col_name][2]): - print(col_name, bin_cols[j], round(coe[0][i] * w * self._factor, 0)) - - def save_weights(self, save="", save_format="h5"): - pickle.dump(self._lr, open(save, 'wb')) - pickle.dump(self._bins, open(save+"_bin", 'wb')) - - def load_weights(self, save): - self._lr = pickle.load(open(save, 'rb')) - self._bins = pickle.load(open(save+"_bin", 'rb')) - - def predict_on_batch(self, features): - if not self._is_first_predict_batch: - # SQLFLow would call this function once to warm up - self._is_first_predict_batch = True - return None - x_df, _ = self._to_dataframe([(features, None)]) - x_train_dict = {} - for col in x_df.columns: - bin = self._bins[col] - x_train_dict[col] = self._replace_woe(x_df[col], bin[1], bin[2]) - return self._lr.predict_proba(pd.DataFrame.from_dict(x_train_dict)) + print(col_name, bin_cols[j], + round(coe[0][i] * w * self._factor, 0))