MNIST
Setup
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
# NOTE: `%matplotlib inline` is an IPython magic, not Python syntax; uncomment
# the next line only when this cell runs inside a Jupyter notebook.
# %matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "classification"


def save_fig(fig_id, tight_layout=True):
    """Save the current matplotlib figure as a 300-dpi PNG.

    The image is written to
    ``<PROJECT_ROOT_DIR>/images/<CHAPTER_ID>/<fig_id>.png``.

    Args:
        fig_id: Base name of the image file, without the ``.png`` extension.
        tight_layout: When true, ``plt.tight_layout()`` is applied first.
    """
    image_dir = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
    # Create the target directory up front so saving does not fail on a
    # fresh checkout (written without exist_ok to stay Python 2 compatible).
    if not os.path.isdir(image_dir):
        os.makedirs(image_dir)
    path = os.path.join(image_dir, fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)


# ### Training a binary classifier
# ### Performance evaluation
# - Implement cross-validation yourself with StratifiedKFold
# - cross_val_score() returns the accuracy (evaluation score)
# Returns the accuracy score of each of the 3 cross-validation folds.
cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring='accuracy')
混淆矩阵
- cross_val_predict()返回预测结果
# Predict against the binary y_train_5 labels: the confusion matrix and the
# precision/recall scores all compare y_train_pred with y_train_5.
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)
- confusion_matrix()返回混淆矩阵
# Compare the true labels (y_train_5) with the cross-validated predictions.
confusion_matrix(y_train_5, y_train_pred)
RP曲线
- 精确率
# Precision of y_train_pred measured against the y_train_5 labels.
precision_score(y_train_5, y_train_pred)
- 召回率
# Recall of y_train_pred measured against the y_train_5 labels.
recall_score(y_train_5, y_train_pred)
- F1分数
# F1 score of y_train_pred measured against the y_train_5 labels.
f1_score(y_train_5, y_train_pred)
- 制作RP曲线
cross_val_predict获得分数
# First step for the MNIST binary classifier: get the decision scores, in
# preparation for choosing thresholds.
# When cross_val_predict is given `method`, it returns the output of that
# method instead of class predictions.
y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3,
                             method='decision_function')
# For stochastic gradient descent, decision_function returns scores.
# For a random forest, predict_proba returns probabilities, which can also be
# sliced and used as scores (see the ROC section below).
# precision_recall_curve computes precision and recall.
Next, calculate
from sklearn.metrics import precision_recall_curve

precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)
# precisions and recalls each hold one more element than thresholds
# (len(thresholds) + 1), so they must be sliced with [:-1] before being
# plotted against thresholds.
绘制RP曲线,寻找最优阈值
Third, plot
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall as functions of the decision threshold.

    Draws on the current matplotlib figure; it neither creates nor shows one.

    Args:
        precisions: Precision values; the last element is dropped.
        recalls: Recall values; the last element is dropped.
        thresholds: Threshold values used for the x axis.
    """
    # precision_recall_curve returns one more precision/recall value than
    # thresholds, so the trailing entry is cut to align the series.
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision", linewidth=2)
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall", linewidth=2)
    plt.xlabel("Threshold", fontsize=16)
    plt.legend(loc="upper left", fontsize=16)
    plt.ylim([0, 1])
# Draw the precision/recall-versus-threshold figure, save it, then show it.
plt.figure(figsize=(8, 4))
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.xlim([-700000, 700000])
save_fig("precision_recall_vs_threshold_plot")
plt.show()
优化,阈值调整,重新训练
# Changed threshold: predict positive only when the score exceeds 70000.
y_train_pred_90 = (y_scores > 70000)
ROC曲线
概念:
ROC曲线绘制的是召回率(即真正类率TPR)相对于假正类率FPR(错误被分为正类的负类实例比率)的关系
得到score
# predict_proba yields one probability column per class. Training against the
# binary y_train_5 labels makes column 1 the positive-class probability, which
# is used as the score.
y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_5, cv=3,
                                    method='predict_proba')
y_scores_forest = y_probas_forest[:, 1]
roc_curve
# ROC curve points for y_scores measured against the y_train_5 labels.
# NOTE: this rebinds `thresholds`, replacing the value assigned by the earlier
# precision_recall_curve call.
fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)
plot
def plot_roc_curve(fpr, tpr, label=None):
    """Plot a ROC curve together with the diagonal reference line.

    Draws on the current matplotlib figure; it neither creates nor shows one.

    Args:
        fpr: False positive rates, used for the x axis.
        tpr: True positive rates, used for the y axis.
        label: Optional legend label for the curve.
    """
    plt.plot(fpr, tpr, linewidth=2, label=label)
    # Dashed black diagonal from (0, 0) to (1, 1) as a visual reference.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)


plt.figure(figsize=(8, 6))
plot_roc_curve(fpr, tpr)
# save_fig("roc_curve_plot")
plt.show()
- 另类:ROC AUC曲线下面积