Grid Search
```python
# Find the best hyperparameter values for the random forest regressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import numpy as np
import pandas as pd

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]

forest_reg = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)

# Best parameter combination
grid_search.best_params_

# Best estimator
grid_search.best_estimator_

# Score of every hyperparameter combination tested during the grid search
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

pd.DataFrame(grid_search.cv_results_)
```
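Once the fit above has completed, `best_score_` gives the mean cross-validation score of the best combination directly (a negative MSE, because of the chosen scoring). A minimal sketch converting it back to an RMSE:

```python
import numpy as np

# Cross-validation RMSE of the best parameter combination found by the grid search.
best_rmse = np.sqrt(-grid_search.best_score_)
print(grid_search.best_params_, best_rmse)
```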
Random Search
```python
## Random search
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {
    'n_estimators': randint(low=1, high=200),
    'max_features': randint(low=1, high=8),
}

forest_reg = RandomForestRegressor(random_state=42)
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=10, cv=5,
                                scoring='neg_mean_squared_error',
                                random_state=42)
rnd_search.fit(housing_prepared, housing_labels)

cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)
```
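To get a feel for what `param_distribs` actually produces, here is a minimal sketch sampling from one of the `scipy.stats.randint` distributions used above; `RandomizedSearchCV` draws values like these for each of its `n_iter` iterations:

```python
from scipy.stats import randint

# randint(low, high) is a discrete uniform distribution over [low, high);
# rvs() draws samples from it, just as RandomizedSearchCV does internally.
n_estimators_dist = randint(low=1, high=200)
print(n_estimators_dist.rvs(size=10, random_state=42))
```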
Analyze the Best Models and Their Errors
RandomForestRegressor can indicate the relative importance of each attribute.
```python
features_importance = grid_search.best_estimator_.feature_importances_
features_importance

# Recap of what was defined earlier:
# num_attribs = list(housing_num)
# cat_attribs = ["ocean_proximity"]
# full_pipeline = ColumnTransformer([
#     ("num", num_pipeline, num_attribs),
#     ("cat", OneHotEncoder(), cat_attribs),
# ])

list(cat_encoder.categories_[0])
# ['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN']

# Display the importance scores next to their corresponding attribute names
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
# cat_encoder = cat_pipeline.named_steps["cat_encoder"]  # old solution
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(features_importance, attributes), reverse=True)
```
With this information, you may want to try dropping some of the less useful features (for example, in this case apparently only one of the ocean_proximity categories is really useful, so you could try dropping the others).
Then you should also look at the specific errors your system makes, try to understand why it makes them, and figure out what could fix the problem (adding extra features, removing uninformative ones, cleaning up outliers, etc.).
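If you do decide to drop the least useful features, one possible approach (a sketch; the `top_k_feature_indices` helper and the choice of `k` are hypothetical, not part of the original notes) is to rank the columns by the importance scores computed above and keep only the top k:

```python
import numpy as np

def top_k_feature_indices(feature_importances, k):
    # Indices of the k most important features, in ascending column order (hypothetical helper).
    return np.sort(np.argpartition(feature_importances, -k)[-k:])

# Example: keep only the 5 most important columns of the prepared training data.
k = 5
top_k = top_k_feature_indices(features_importance, k)
housing_prepared_top_k = housing_prepared[:, top_k]
```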
Evaluate the System on the Test Set
Procedure:
- Get the predictors and labels from the test set
- Run full_pipeline to transform the data (call transform(), not fit_transform())
- Evaluate the final model on the test set
```python
from sklearn.metrics import mean_squared_error
import numpy as np

final_model = grid_search.best_estimator_

X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()

X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse
```
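The RMSE above is a single point estimate; if you also want an idea of how precise it is, one common option (a sketch using `scipy.stats`, not covered in the notes above) is a 95% confidence interval over the squared test errors:

```python
from scipy import stats

confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
# t-based confidence interval for the mean squared error, converted to RMSE bounds.
ci = np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                              loc=squared_errors.mean(),
                              scale=stats.sem(squared_errors)))
print(ci)
```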