Fine-Tuning, Analyzing, and Testing the Model

Grid Search

# Find the best hyperparameter combination for the random forest regressor
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},

    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},

]
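# 3 × 4 + 2 × 3 = 18 hyperparameter combinations in total; with cv=5 below,
# GridSearchCV trains the model 18 × 5 = 90 times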
forest_reg = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                          scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)

# View the best parameter combination
grid_search.best_params_

# Get the best estimator
grid_search.best_estimator_

# Let's look at the score of each hyperparameter combination tested during the grid search:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

pd.DataFrame(grid_search.cv_results_)

Random Search

## Random search
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {
    'n_estimators': randint(low=1, high=200),
    'max_features': randint(low=1, high=8),
}

forest_reg = RandomForestRegressor(random_state=42)
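# 10 sampled combinations (n_iter=10) × 5 CV folds = 50 training rounds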
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                               n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)
rnd_search.fit(housing_prepared, housing_labels)

cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

Analyze the Best Model and Its Errors

RandomForestRegressor can indicate the relative importance of each attribute

features_importance = grid_search.best_estimator_.feature_importances_
features_importance

# Recap from the data preparation step:
# num_attribs = list(housing_num)
# cat_attribs = ["ocean_proximity"]

# full_pipeline = ColumnTransformer([
#         ("num", num_pipeline, num_attribs),
#         ("cat", OneHotEncoder(), cat_attribs),
#     ])

cat_encoder = full_pipeline.named_transformers_["cat"]  # the fitted OneHotEncoder inside the ColumnTransformer
list(cat_encoder.categories_[0])
#['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN']

# Display the importance scores next to their corresponding attribute names
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
#cat_encoder = cat_pipeline.named_steps["cat_encoder"] # old solution
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(features_importance, attributes), reverse=True)

  • With this information, you can try dropping some of the less useful features (for example, only one ocean_proximity category appears to be really useful here, so you could try dropping the others); a minimal sketch follows after this list

  • You can also look at the specific errors the system makes, try to understand why it makes them, and think about how to fix them (by adding extra features, dropping uninformative ones, cleaning up outliers, and so on)
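To make the first point concrete, here is a minimal sketch of keeping only the most important features. It assumes the features_importance array, the attributes list, and the prepared training matrix housing_prepared from above; the cutoff k = 5 is an arbitrary value chosen for illustration, not part of the original workflow.

import numpy as np

k = 5  # arbitrary cutoff, for illustration only
top_k_idx = np.argsort(features_importance)[-k:]  # indices of the k largest importances

# keep only those columns of the prepared training matrix
housing_prepared_top_k = housing_prepared[:, top_k_idx]

# the corresponding attribute names
[attributes[i] for i in top_k_idx]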

Evaluate the System on the Test Set

Procedure:

  • Get the predictors and labels from the test set
  • Run full_pipeline to transform the data (call transform(), not fit_transform())
  • Evaluate the final model on the test set
final_model = grid_search.best_estimator_

X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()

X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

from sklearn.metrics import mean_squared_error

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

final_rmse