-
Machine Learning, Chapter 2: An End-to-End Machine Learning Project
Notes on Hands-On Machine Learning with Scikit-Learn and TensorFlow
Reference: the author's Jupyter Notebook
Chapter 2 – End-to-end Machine Learning project
-
Download the Data
- Open VS Code, create a new Python file, and enter the following code to download housing.tgz and extract housing.csv into this directory
import os
import tarfile
from six.moves import urllib

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = "datasets/housing"
HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH + "/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

fetch_housing_data()
Once the data has been downloaded, you can comment out the call to fetch_housing_data().
-
Take a Quick Look at the Data Structure
- Load the data with pandas
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)
The function returns a pandas DataFrame object containing all the data.
- Call the DataFrame's head() method to view the first 5 rows (since we are using VS Code, the output looks slightly different from the book's); comment it out once you have inspected it
housing = load_housing_data()
print(housing.head())
There are 10 attributes in total: longitude, latitude, housing_median_age, total_rooms, total_bedrooms, population, households, median_income, median_house_value, and ocean_proximity.
-
The info() method gives a quick description of the dataset, in particular the total number of rows, each attribute's type, and the number of non-null values
housing.info()  # info() prints its report directly, so there is no need to wrap it in print()
-
Use the value_counts() method to see how many categories exist and how many districts fall under each category
print(housing["ocean_proximity"].value_counts())
-
The describe() method shows a summary of the numerical attributes
print(housing.describe())
-
Call the hist() method on the whole dataset to plot a histogram for each attribute
import matplotlib.pyplot as plt

housing.hist(bins=50, figsize=(50, 15))
plt.show()
-
Create a Test Set
- In theory, creating a test set is quite simple: just pick some instances at random, typically 20% of the dataset, and set them aside:
import numpy as np

def split_train_test(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

train_set, test_set = split_train_test(housing, 0.2)
print(len(train_set), "train +", len(test_set), "test")
- But this is not perfect: run it again and it will generate a different test set! Over time, you (or your machine learning algorithm) will end up seeing the whole dataset, which is exactly what you want to avoid when creating a test set. A common solution is to use each instance's identifier to decide whether or not it goes into the test set (assuming each instance has a unique and immutable identifier)
import hashlib

def test_set_check(identifier, test_ratio, hash):
    return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio

def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):
    ids = data[id_column]
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio, hash))
    return data.loc[~in_test_set], data.loc[in_test_set]

#housing_with_id = housing.reset_index()
#housing_with_id["id"] = housing["longitude"] * 1000 + housing["latitude"]
#train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "id")

from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)  # the keyword is random_state, not random
- Stratified sampling
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5) housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True) from sklearn.model_selection import StratifiedShuffleSplit split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42) for train_index, test_index in split.split(housing, housing["income_cat"]): strat_train_set = housing.loc[train_index] strat_test_set = housing.loc[test_index] print(housing["income_cat"].value_counts() / len(housing)) for set in (strat_train_set, strat_test_set): set.drop(["income_cat"], axis=1, inplace=True)
-
Explore and Visualize the Data
-
Create a copy
housing = strat_train_set.copy()
- Visualize the geographical data
#housing.plot(kind="scatter", x="longitude", y="latitude") #housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1) housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4, s=housing["population"] / 100, label="population", c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,) plt.legend() plt.show()
- Look for correlations
#corr_matrix = housing.corr()
#print(corr_matrix["median_house_value"].sort_values(ascending=False))

from pandas.plotting import scatter_matrix  # in newer pandas this lives in pandas.plotting, not pandas.tools.plotting

attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))

housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1)
plt.show()
-
Experiment with attribute combinations
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"] housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"] housing["population_per_household"]=housing["population"]/housing["households"] corr_matrix = housing.corr() print(corr_matrix["median_house_value"].sort_values(ascending=False))
-
Prepare the Data for Machine Learning Algorithms
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()
-
Data cleaning: pick one of four options
#housing.dropna(subset=["total_bedrooms"])    # option 1
#housing.drop("total_bedrooms", axis=1)       # option 2
#median = housing["total_bedrooms"].median()
#housing["total_bedrooms"].fillna(median)     # option 3

# option 4: Scikit-Learn's imputer, telling it to replace each attribute's missing values with that attribute's median
from sklearn.impute import SimpleImputer  # differs from the book: the API has evolved (Imputer is now SimpleImputer)

imputer = SimpleImputer(strategy="median")              # create an imputer instance
housing_num = housing.drop("ocean_proximity", axis=1)   # copy of the data without the text attribute ocean_proximity
imputer.fit(housing_num)                                # fit the imputer instance to the training data
#print(imputer.statistics_)
#print(housing_num.median().values)
X = imputer.transform(housing_num)                      # replace missing values with the learned medians
housing_tr = pd.DataFrame(X, columns=housing_num.columns)  # put the result back into a pandas DataFrame
-
Handle Text and Categorical Attributes
# First convert these text labels to numbers; Scikit-Learn provides a transformer for this task: LabelEncoder
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
housing_cat = housing["ocean_proximity"]
housing_cat_encoded = encoder.fit_transform(housing_cat)
#print(housing_cat_encoded)
#print(encoder.classes_)

# Scikit-Learn provides a OneHotEncoder that converts integer categorical values into one-hot vectors
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1, 1))
#print(housing_cat_1hot.toarray())

# The LabelBinarizer class performs both transformations in one shot
from sklearn.preprocessing import LabelBinarizer
encoder = LabelBinarizer()
housing_cat_1hot = encoder.fit_transform(housing_cat)
print(housing_cat_1hot)
-
Custom Transformers
from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)
-
Transformation Pipelines
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
housing_num_tr = num_pipeline.fit_transform(housing_num)
#print(housing_num_tr)

from sklearn.compose import ColumnTransformer

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])
housing_prepared = full_pipeline.fit_transform(housing)
#print(housing_prepared)
#print(housing_prepared.shape)
-
Select and Train a Model
- Train a linear regression model:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
#print(lin_reg)

# try it out on a few training instances
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
#print("Predictions:", lin_reg.predict(some_data_prepared))
#print("Labels:", list(some_labels))
#print(some_data_prepared)
- Use Scikit-Learn's mean_squared_error function to measure the regression model's RMSE on the whole training set:
from sklearn.metrics import mean_squared_error

housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
#print(lin_rmse)

from sklearn.metrics import mean_absolute_error

lin_mae = mean_absolute_error(housing_labels, housing_predictions)
#print(lin_mae)
- Let's train a DecisionTreeRegressor (a decision tree).
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared, housing_labels)

housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
#print(tree_rmse)  # the model has probably badly overfit the training data
- Use cross-validation for a better evaluation
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
                         scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

#display_scores(tree_rmse_scores)
- Compute the same scores for the linear regression model
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                             scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
#display_scores(lin_rmse_scores)
- Random forest model: RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(n_estimators=10, random_state=42)
forest_reg.fit(housing_prepared, housing_labels)

housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
#print(forest_rmse)

from sklearn.model_selection import cross_val_score
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                                scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
#display_scores(forest_rmse_scores)

scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                         scoring="neg_mean_squared_error", cv=10)
#print(pd.Series(np.sqrt(-scores)).describe())
-
Fine-Tune the Model
-
Grid Search
# You can use Scikit-Learn's GridSearchCV to do the exploring for you. All you need to do is tell it which
# hyperparameters you want to experiment with and which values to try, and it will evaluate every possible
# combination of hyperparameter values using cross-validation.
# The following code searches for the best combination of hyperparameter values for a RandomForestRegressor.
# When you have no idea what value a hyperparameter should take, a simple approach is to try consecutive powers of 10.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},                 # try 12 (3 x 4) combinations of hyperparameters
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},  # then try 6 (2 x 3) combinations with bootstrap set to False
]
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(housing_prepared, housing_labels)
#print(grid_search.best_params_)
#print(grid_search.best_estimator_)

cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)
print(pd.DataFrame(grid_search.cv_results_))

# Randomized search
# Ensemble methods
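The two trailing comments above are only placeholders. As a minimal sketch of the randomized-search alternative (assuming scipy is installed; the parameter ranges below are illustrative choices, not values prescribed by the book):

from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

# sample 10 random hyperparameter combinations instead of exhaustively trying a grid
param_distribs = {
    'n_estimators': randint(low=1, high=200),
    'max_features': randint(low=1, high=8),
}
rnd_search = RandomizedSearchCV(RandomForestRegressor(random_state=42),
                                param_distributions=param_distribs,
                                n_iter=10, cv=5,
                                scoring='neg_mean_squared_error', random_state=42)
rnd_search.fit(housing_prepared, housing_labels)
#print(rnd_search.best_params_)

Randomized search is preferable when the hyperparameter search space is large, since the compute budget (n_iter) is set independently of how many values each hyperparameter could take.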
-
Analyze the Best Models and Their Errors
feature_importances = grid_search.best_estimator_.feature_importances_
#print(feature_importances)

# Display these importance scores next to their corresponding attribute names:
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
#cat_encoder = cat_pipeline.named_steps["cat_encoder"]  # old solution
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)
#print(sorted(zip(feature_importances, attributes), reverse=True))

# Evaluate the system on the test set
from sklearn.metrics import mean_squared_error

final_model = grid_search.best_estimator_
X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
#print(final_rmse)
-
Launch, Monitor, and Maintain the System
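This step is mostly discussion in the book. A minimal sketch of persisting the fine-tuned model so a production system can load it later, assuming the joblib package is available (the file name final_model.pkl is an arbitrary choice):

import joblib

# save the fine-tuned model found by grid search
joblib.dump(final_model, "final_model.pkl")

# ... later, in the serving code, reload it and reuse the same preparation pipeline ...
loaded_model = joblib.load("final_model.pkl")
#predictions = loaded_model.predict(full_pipeline.transform(X_test))

In practice the full_pipeline should be saved alongside the model (or wrapped together with it in a single Pipeline) so that new data is prepared exactly the same way at prediction time.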