VB.net 2010 视频教程 VB.net 2010 视频教程 python基础视频教程
SQL Server 2008 视频教程 c#入门经典教程 Visual Basic从入门到精通视频教程
当前位置:
首页 > 编程开发 > 数据分析 >
  • 机器学习第3章分类

机器学习实战:基于Scikit-Learn和TensorFlow的笔记

参考:作者的Jupyter Notebook
Chapter 3 – Classification

  1. 获取MNIST数据集的代码:

    def sort_by_target(mnist, n_train=60000):
        """Sort the train and test partitions of ``mnist`` by label, in place.

        The first ``n_train`` rows are treated as the training partition and
        the remaining rows as the test partition. Each partition is reordered
        independently so its targets are ascending; rows with equal targets
        keep their original relative order.

        Args:
            mnist: Object exposing ``data`` and ``target`` arrays that support
                NumPy-style slicing and integer-array indexing.
            n_train: Number of leading rows that form the training partition
                (defaults to 60000, the split used for MNIST here).

        Returns:
            None. ``mnist.data`` and ``mnist.target`` are modified in place.
        """
        # A stable argsort yields the same order as sorting (target, index)
        # tuples, without building a Python list, and it also works when a
        # partition is empty.
        train_order = np.argsort(mnist.target[:n_train], kind="stable")
        test_order = np.argsort(mnist.target[n_train:], kind="stable") + n_train
        # Fancy indexing copies the selected rows before they are written back,
        # so reading and assigning the same array is safe.
        mnist.data[:n_train] = mnist.data[train_order]
        mnist.target[:n_train] = mnist.target[train_order]
        mnist.data[n_train:] = mnist.data[test_order]
        mnist.target[n_train:] = mnist.target[test_order]
    # `np` is used throughout these snippets but no import of it is shown;
    # import it here so the script runs as written.
    import numpy as np
    from sklearn.datasets import fetch_openml
    # NOTE: the plotting snippets further down also expect `mpl` (matplotlib)
    # and `plt` (matplotlib.pyplot) to be in scope.
    # NOTE(review): if the installed scikit-learn returns a pandas DataFrame
    # from fetch_openml() by default, pass as_frame=False so the positional
    # indexing used below works - confirm against the installed version.
    mnist = fetch_openml('mnist_784', version=1, cache=True)
    mnist.target = mnist.target.astype(np.int8) # fetch_openml() returns targets as strings
    sort_by_target(mnist) # fetch_openml() returns an unsorted dataset
    
  2. 查看这些数组

    #print(mnist["data"], mnist["target"])
    #print(mnist.data.shape)
    # Pull the feature matrix and the label vector out of the dataset.
    X, y = mnist["data"], mnist["target"]
    #print(X.shape)
    #print(y.shape)
    
    # Take the sample at index 36000 and view it as a 28x28 image.
    some_digit = X[36000]
    some_digit_image = some_digit.reshape(28, 28)
    plt.imshow(some_digit_image, cmap = mpl.cm.binary,
            interpolation="nearest")
    plt.axis("off")
    #plt.show()
    #print(y[36000])
    
  3. MNIST数据集中的部分数字图像

    def plot_digits(instances, images_per_row=10, **options):
        """Draw a grid of 28x28 digit images on the current matplotlib axes.

        Args:
            instances: Sequence of flat arrays, each reshapeable to (28, 28).
            images_per_row: Maximum number of images per grid row; capped at
                ``len(instances)``.
            **options: Extra keyword arguments forwarded to ``plt.imshow``.
        """
        side = 28
        total = len(instances)
        per_row = min(total, images_per_row)
        n_rows = (total - 1) // per_row + 1
        tiles = [digit.reshape(side, side) for digit in instances]
        # One blank tile wide enough to fill the unused cells of the last row
        # (zero width when the grid is already full).
        blank_cells = n_rows * per_row - total
        tiles.append(np.zeros((side, side * blank_cells)))
        # Glue each row's tiles side by side, then stack the rows vertically.
        strips = [
            np.concatenate(tiles[start:start + per_row], axis=1)
            for start in range(0, n_rows * per_row, per_row)
        ]
        grid = np.concatenate(strips, axis=0)
        plt.imshow(grid, cmap=mpl.cm.binary, **options)
        plt.axis("off")
    
    plt.figure(figsize=(9,9))
    # Sample rows at regular strides from three index ranges of X and stack
    # them into one array of example images.
    example_images = np.r_[X[:12000:600], X[13000:30600:600], X[30600:60000:590]]
    plot_digits(example_images, images_per_row=10)
    #save_fig("more_digits_plot")
    #plt.show()
    
  4. 给数据集洗牌

    # The first 60000 rows form the training set, the remainder the test set.
    X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
    # Apply one random permutation to both X_train and y_train so that images
    # and labels stay aligned.
    # NOTE(review): no seed is set in the visible code, so the order differs
    # between runs.
    shuffle_index = np.random.permutation(60000)
    X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]
    
  5. 训练一个二元分类器,为此分类任务创建目标向量:

    # Boolean target vectors for the binary "is this digit a 5?" task.
    y_train_5 = (y_train == 5)  # True for all 5s, False for all other digits.
    y_test_5 = (y_test == 5)
    
  6. 创建一个SGDClassifier(随机梯度下降分类器)并在整个训练集上进行训练:

    from sklearn.linear_model import SGDClassifier
    # tol=None disables the convergence check, so exactly max_iter epochs run.
    # It replaces tol=-np.infty: the np.infty alias was removed in NumPy 2.0,
    # and newer scikit-learn releases validate tol as non-negative.
    sgd_clf = SGDClassifier(max_iter=5, tol=None, random_state=42)
    sgd_clf.fit(X_train, y_train_5)
    #print(sgd_clf.fit(X_train, y_train_5))
    # The fitted model can now be used to detect images of the digit 5:
    sgd_clf.predict([some_digit])
    #print(sgd_clf.predict([some_digit]))
    
  7. 交叉验证

    from sklearn.model_selection import cross_val_score
    # Run the 3-fold cross-validation once and print that result, instead of
    # fitting every fold twice (one discarded call plus one printed call).
    sgd_cv_accuracy = cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring="accuracy")
    print(sgd_cv_accuracy)
    
    # The loop below does roughly what cross_val_score() did above and prints
    # the same kind of per-fold accuracy.
    from sklearn.model_selection import StratifiedKFold
    from sklearn.base import clone
    # random_state is omitted: without shuffle=True it has no effect, and newer
    # scikit-learn releases raise an error when it is set in that case.
    skfolds = StratifiedKFold(n_splits=3)
    for train_index, test_index in skfolds.split(X_train, y_train_5):
        # Fit a fresh, unfitted copy of the classifier on each fold.
        clone_clf = clone(sgd_clf)
        X_train_folds = X_train[train_index]
        y_train_folds = (y_train_5[train_index])
        X_test_fold = X_train[test_index]
        y_test_fold = (y_train_5[test_index])
    
        clone_clf.fit(X_train_folds, y_train_folds)
        y_pred = clone_clf.predict(X_test_fold)
        # Vectorised count of correct predictions (same value as the builtin
        # sum over the boolean array, without a Python-level loop).
        n_correct = (y_pred == y_test_fold).sum()
        print(n_correct / len(y_pred))
    
  8. 一个蠢笨的分类器(不是我说的),它将每张图都分类成“非5”:

    from sklearn.base import BaseEstimator
    class Never5Classifier(BaseEstimator):
        """Baseline classifier that predicts "not 5" for every input."""

        def fit(self, X, y=None):
            """Learn nothing; return ``self`` per the scikit-learn estimator convention.

            Args:
                X: Training samples (ignored).
                y: Training targets (ignored).

            Returns:
                This estimator instance, so calls can be chained.
            """
            return self

        def predict(self, X):
            """Return an all-False boolean array of shape ``(len(X), 1)``."""
            return np.zeros((len(X), 1), dtype=bool)
    # Cross-validated accuracy of the Never5Classifier baseline.
    never_5_clf = Never5Classifier()
    print(cross_val_score(never_5_clf, X_train, y_train_5, cv=3, scoring="accuracy"))
    
  9. 混淆矩阵:评估分类器性能的更好方法是混淆矩阵。

    from sklearn.model_selection import cross_val_predict
    y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)
    # cross_val_predict() also performs K-fold cross-validation, but instead of
    # evaluation scores it returns the predictions made on each test fold, so
    # every instance gets a "clean" prediction (made by a model that did not
    # see it during training).
    from sklearn.metrics import confusion_matrix
    #confusion_matrix(y_train_5, y_train_pred)
    #print(confusion_matrix(y_train_5, y_train_pred))
    # Using the true labels as predictions simulates a perfect classifier.
    y_train_perfect_predictions = y_train_5
    #print(confusion_matrix(y_train_5, y_train_perfect_predictions))
    
  10. 精度和召回率

    # precision = TP / (TP + FP): TP is the number of true positives, FP the number of false positives.
    # recall = TP / (TP + FN): FN is the number of false negatives.
    from sklearn.metrics import precision_score, recall_score
    print(precision_score(y_train_5, y_train_pred))  # precision: 4344 / (4344 + 1307)
    print(recall_score(y_train_5, y_train_pred))  # recall: 4344 / (4344 + 1077)
    
    # F1 score: F1 = 2 / (1/precision + 1/recall) = TP / (TP + (FN + FP) / 2)
    from sklearn.metrics import f1_score
    print(f1_score(y_train_5, y_train_pred))
    
  11. 精度/召回率权衡:阈值

    # Get the raw decision score for one sample and threshold it by hand.
    y_scores = sgd_clf.decision_function([some_digit])
    print(y_scores)
    threshold = 0
    y_some_digit_pred = (y_scores > threshold)
    print(y_some_digit_pred)
    # Raise the threshold
    threshold = 200000
    y_some_digit_pred_a = (y_scores > threshold)
    print(y_some_digit_pred_a)
    
  12. 决定使用什么阈值

    # Get the decision scores of all instances in the training set
    y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3, method="decision_function")
    # Compute precision and recall for all possible thresholds
    from sklearn.metrics import precision_recall_curve
    precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)
    def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
        """Plot precision and recall as functions of the threshold with Matplotlib.

        The last element of ``precisions`` and of ``recalls`` is dropped before
        plotting (presumably because precision_recall_curve returns one more
        precision/recall value than thresholds - confirm against its docs).
        """
        curves = (
            (precisions, "b--", "Precision"),
            (recalls, "g-", "Recall"),
        )
        for values, line_format, legend_name in curves:
            plt.plot(thresholds, values[:-1], line_format, label=legend_name)
        plt.xlabel("Threshold")
        plt.legend(loc="upper left")
        plt.ylim([0, 1])
    plt.figure(figsize=(8, 4))
    plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
    plt.xlim([-700000, 700000])
    plt.show()
    #print((y_train_pred == (y_scores > 0)).all())
    # Predict "5" only when the decision score exceeds 70000.
    y_train_pred_90 = (y_scores > 70000)
    from sklearn.metrics import precision_score, recall_score
    print(precision_score(y_train_5, y_train_pred_90)) # precision
    print(recall_score(y_train_5, y_train_pred_90)) # recall
    
  13. 精度和召回率的函数图PR

    def plot_precision_vs_recall(precisions, recalls):
        """Plot precision (y axis) against recall (x axis) on the unit square."""
        plt.plot(recalls, precisions, "b-", linewidth=2)
        axis_labels = ((plt.xlabel, "Recall"), (plt.ylabel, "Precision"))
        for set_label, text in axis_labels:
            set_label(text, fontsize=16)
        plt.axis([0, 1, 0, 1])
    
    # Draw the precision-versus-recall curve in its own figure.
    plt.figure(figsize=(8, 6))
    plot_precision_vs_recall(precisions, recalls)
    plt.show()
    
  14. ROC曲线(受试者工作特征曲线):真正类率和假正类率

    from sklearn.metrics import roc_curve
    # False-positive rates, true-positive rates and thresholds for the SGD
    # decision scores; note that this rebinds `thresholds`.
    fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)
    def plot_roc_curve(fpr, tpr, label=None):
        """Plot a ROC curve plus the dashed diagonal reference line.

        Args:
            fpr: False-positive rates (x values).
            tpr: True-positive rates (y values).
            label: Optional legend label for the curve.
        """
        unit = [0, 1]
        plt.plot(fpr, tpr, linewidth=2, label=label)
        plt.plot(unit, unit, 'k--')  # dashed diagonal from (0, 0) to (1, 1)
        plt.axis(unit + unit)
        label_font = 16
        plt.xlabel('False Positive Rate', fontsize=label_font)
        plt.ylabel('True Positive Rate', fontsize=label_font)
    # A stray, never-closed ''' used to precede these lines and turned the rest
    # of the script into an unterminated string literal; it has been removed.
    # NOTE(review): if the plot was meant to stay disabled, comment these lines
    # out with '#' as done elsewhere in this file.
    plt.figure(figsize=(8, 6))
    plot_roc_curve(fpr, tpr)
    plt.show()
    from sklearn.metrics import roc_auc_score
    print(roc_auc_score(y_train_5, y_scores))
    
  15. 训练一个RandomForestClassifier分类器,并比较它和SGDClassifier分类器的ROC曲线和ROC AUC分数。

    from sklearn.ensemble import RandomForestClassifier
    forest_clf = RandomForestClassifier(n_estimators=10, random_state=42)
    # Request class probabilities via predict_proba instead of decision scores.
    y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_5, cv=3, method="predict_proba")
    y_scores_forest = y_probas_forest[:, 1] # score = proba of positive class
    fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5,y_scores_forest)
    # Overlay the SGD curve (dotted) and the random forest curve in one figure.
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, "b:", linewidth=2, label="SGD")
    plot_roc_curve(fpr_forest, tpr_forest, "Random Forest")
    plt.legend(loc="lower right", fontsize=16)
    plt.show()
    from sklearn.metrics import roc_auc_score
    print(roc_auc_score(y_train_5, y_scores_forest))
    
  16. 多类别分类器,用SGDClassifier试试

    # Try it with SGDClassifier: fit on the full multiclass labels this time.
    sgd_clf.fit(X_train, y_train)
    sgd_clf.predict([some_digit])
    #print(sgd_clf.predict([some_digit]))
    some_digit_scores = sgd_clf.decision_function([some_digit])
    #print(some_digit_scores)
    #print(np.argmax(some_digit_scores))
    #print(sgd_clf.classes_)
    #print(sgd_clf.classes_[5])
    
    # The code below uses the one-versus-one (OvO) strategy to build a
    # multiclass classifier on top of SGDClassifier:
    from sklearn.multiclass import OneVsOneClassifier
    # tol=None disables the convergence check, so exactly max_iter epochs run.
    # It replaces tol=-np.infty: the np.infty alias was removed in NumPy 2.0,
    # and newer scikit-learn releases validate tol as non-negative.
    ovo_clf = OneVsOneClassifier(SGDClassifier(max_iter=5, tol=None, random_state=42))
    ovo_clf.fit(X_train, y_train)
    ovo_clf.predict([some_digit])
    len(ovo_clf.estimators_)
    #print(ovo_clf.predict([some_digit]))
    #print(len(ovo_clf.estimators_))
    
  17. 训练RandomForestClassifier

    from sklearn.model_selection import cross_val_score
    forest_clf.fit(X_train, y_train)
    #print(forest_clf.predict([some_digit]))
    #print(forest_clf.predict_proba([some_digit]))  # list of class probabilities
    #print(cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy"))  # accuracy
    # Apply simple scaling (standardisation) to the inputs
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
    #print(cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring="accuracy"))
    
  18. 使用Matplotlib的matshow()函数来查看混淆矩阵

    # Cross-validated multiclass predictions on the scaled inputs; note that
    # this rebinds y_train_pred.
    y_train_pred = cross_val_predict(sgd_clf, X_train_scaled, y_train, cv=3)
    conf_mx = confusion_matrix(y_train, y_train_pred)
    #print(conf_mx)
    # Use Matplotlib's matshow() to view the confusion matrix as an image
    #plt.matshow(conf_mx, cmap=plt.cm.gray)
    #save_fig("confusion_matrix_plot", tight_layout=False)
    
  19. 你需要将混淆矩阵中的每个值除以相应类别中的图片数量,这样你比较的就是错误率而不是错误的绝对值

    # Divide each row by its total so the cells hold rates rather than counts.
    row_sums = conf_mx.sum(axis=1, keepdims=True)
    norm_conf_mx = conf_mx / row_sums
    # Fill the diagonal with zeros to keep only the errors, then redraw:
    np.fill_diagonal(norm_conf_mx, 0)
    plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
    #save_fig("confusion_matrix_errors_plot", tight_layout=False)
    
  20. 看看数字3和数字5的例子:

    cl_a, cl_b = 3, 5
    # Naming: X_<true><predicted>, e.g. X_ab holds images whose true class is
    # cl_a but which were predicted as cl_b.
    X_aa = X_train[(y_train == cl_a) & (y_train_pred == cl_a)]
    X_ab = X_train[(y_train == cl_a) & (y_train_pred == cl_b)]
    X_ba = X_train[(y_train == cl_b) & (y_train_pred == cl_a)]
    X_bb = X_train[(y_train == cl_b) & (y_train_pred == cl_b)]
    
    # 2x2 grid of subplots, one per group, showing up to 25 images each.
    plt.figure(figsize=(8,8))
    plt.subplot(221); plot_digits(X_aa[:25], images_per_row=5)
    plt.subplot(222); plot_digits(X_ab[:25], images_per_row=5)
    plt.subplot(223); plot_digits(X_ba[:25], images_per_row=5)
    plt.subplot(224); plot_digits(X_bb[:25], images_per_row=5)
    #save_fig("error_analysis_digits_plot")
    
  21. 多标签分类

    # This code builds a y_multilabel array holding two target labels per digit
    # image: the first says whether the digit is large (7, 8 or 9), the second
    # whether it is odd.
    from sklearn.neighbors import KNeighborsClassifier
    y_train_large = (y_train >= 7)
    y_train_odd = (y_train % 2 == 1)
    y_multilabel = np.c_[y_train_large, y_train_odd]
    
    knn_clf = KNeighborsClassifier()
    knn_clf.fit(X_train, y_multilabel)
    #print(knn_clf.fit(X_train, y_multilabel))
    # The lines above create a KNeighborsClassifier instance (it supports
    # multilabel classification; not every classifier does) and train it with
    # the multi-target array. Now make a prediction and note the two labels it
    # outputs:
    knn_clf.predict([some_digit])    # the digit 5 is indeed not large (False) and is odd (True).
    #print(knn_clf.predict([some_digit]))
    
  22. 下面这段代码计算所有标签的平均F1分数:

    from sklearn.metrics import f1_score
    # Cross-validated multilabel predictions; n_jobs=-1 is passed to run the
    # folds in parallel.
    y_train_knn_pred = cross_val_predict(knn_clf, X_train, y_multilabel, cv=3, n_jobs=-1)
    # Macro-averaged F1 over the labels; the value is discarded unless the
    # print below is enabled.
    f1_score(y_multilabel, y_train_knn_pred, average="macro")
    #print(f1_score(y_multilabel, y_train_knn_pred, average="macro"))
    
  23. 多输出分类(多输出-多类别分类)

    # Start again by creating the training and test sets, using NumPy's
    # randint() to add noise to the pixel intensities of the MNIST images.
    # The goal is to restore each noisy image to its original.
    noise = np.random.randint(0, 100, (len(X_train), 784))
    X_train_mod = X_train + noise
    noise = np.random.randint(0, 100, (len(X_test), 784))
    X_test_mod = X_test + noise
    # The targets are the clean, un-noised images themselves.
    y_train_mod = X_train
    y_test_mod = X_test
    
    some_index = 5500
    #plt.subplot(121); plot_digit(X_test_mod[some_index])
    #plt.subplot(122); plot_digit(y_test_mod[some_index])
    #save_fig("noisy_digit_example_plot")
    
  24. 清洗这张图片:

    # Train the KNN model to map noisy images to their clean originals, then
    # predict the clean version of one noisy test image.
    knn_clf.fit(X_train_mod, y_train_mod)
    clean_digit = knn_clf.predict([X_test_mod[some_index]])
    # plot_digit() is not defined anywhere in this file, so draw the result
    # with the existing plot_digits() helper; it iterates over the rows of
    # clean_digit (one predicted image here).
    plot_digits(clean_digit, interpolation="nearest")
    # save_fig() is not defined in this file either; left disabled like the
    # other save_fig calls.
    #save_fig("cleaned_digit_example_plot")
    plt.show()

相关教程