KNN
缺点:每次预测都要遍历所有训练样本来计算距离,计算开销大。
注:以下的数据导入与处理均以 trainmat 为例,testmat 的处理方式相同。
数据格式
- 特征值-标签
- 回归问题
```
40920 8.326976 0.953952 largeDoses
14488 7.153469 1.673904 smallDoses
26052 1.441871 0.805124 didntLike
```

> x [n, 3]

> y [n,]

#### 算法
# Build the feature matrix (GDP per capita) and the target (life satisfaction)
# as column vectors.
X = np.c_[country_stats["GDP per capita"]]
y = np.c_[country_stats["Life satisfaction"]]

# k-nearest-neighbours regressor using the 3 closest samples.
model = sklearn.neighbors.KNeighborsRegressor(n_neighbors=3)

# Train the model
model.fit(X, y)

# Make a prediction for Cyprus
X_new = np.array([[22587.0]])  # Cyprus' GDP per capita
print(model.predict(X_new))  # outputs [[ 5.76666667]]

# ======================= Training and validating on the same set: bad practice =======================
# - There is also data with shape (20640, 16) and labels with shape (20640,)
### Evaluation
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Predictions on the same data the model is evaluated against (the training set).
housing_predictions = lin_reg.predict(housing_prepared)

# Use mean_squared_error to measure the RMSE over the whole training set.
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

# Use mean_absolute_error to measure the MAE over the whole training set.
lin_mae = mean_absolute_error(housing_labels, housing_predictions)
lin_mae
### Evaluation (2)
from sklearn.model_selection import cross_val_score


def display_score(scores):
    """Print the individual scores, then their mean and standard deviation."""
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())


# 10-fold cross-validation. The scorer returns *negated* MSE values, so the
# sign is flipped before taking the square root to obtain per-fold RMSE.
scores = cross_val_score(
    tree_reg,
    housing_prepared,
    housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
tree_rmse_scores = np.sqrt(-scores)

display_score(tree_rmse_scores)
数据格式
- 二进制图像
- 分类问题
eg:手写数字识别
```
00000000000000111000000000000000
00000000000001111000000000000000
00000000000111111110000000000000
00000000000111111111000000000000
00000000011111111111100000000000
00000000011111101111100000000000
00000000111111000011110000000000
00000000111111000011111000000000
00000001111110000000111100000000
00000001111100000000111100000000
00000011111100000000111100000000
00000011111100000000111100000000
00000011110000000000111100000000
00000001111000000000011110000000
00000001111000000000001111000000
00000001111100000000001111000000
00000011111100000000001111000000
00000011111000000000001111000000
00000011111000000000001111000000
00000011111000000000001110000000
00000001111100000000001111000000
00000000111100000000001111000000
00000000111100000000001111000000
00000000111100000000011111000000
00000000111100000000111110000000
00000000111100000001111100000000
00000000011100000011111100000000
00000000001111100111111100000000
00000000000111111111111000000000
00000000000111111111100000000000
00000000000111111111000000000000
00000000000011111100000000000000
```

> x [n, 32*32]

> y [n,]

#### 算法
# KNN classifier: time a 3-fold cross-validation on the small training subset.
from sklearn.neighbors import KNeighborsClassifier

# begin time
# time.clock() was removed in Python 3.8; time.perf_counter() is the
# replacement for measuring elapsed intervals.
start = time.perf_counter()

# Cross-validate the candidate configuration and report the mean fold score.
knn_clf = KNeighborsClassifier(n_neighbors=5, algorithm='kd_tree', weights='distance', p=3)
score = cross_val_score(knn_clf, X_train_small, y_train_small, cv=3)
print(score.mean())

# end time
elapsed = time.perf_counter() - start
print("Time used:", int(elapsed), "s")

# Recorded mean scores from earlier runs (annotations as originally noted;
# unannotated entries presumably follow the same order of settings -- TODO confirm):
# k=3
#   0.942300738697
#   0.946100822903  weights='distance'
#   0.950799888775  p=3
# k=5
#   0.939899237556
#   0.94259888029
# k=7
#   0.935395994386
#   0.938997377902
# k=9
#   0.933897851978
# Finally, train on the full training set and write a Kaggle submission file.
# Code template.
clf = knn_clf

# time.clock() was removed in Python 3.8; time.perf_counter() is the
# replacement for measuring elapsed intervals.
start = time.perf_counter()
clf.fit(X_train, y_train)
elapsed = time.perf_counter() - start
print("Training Time used:", int(elapsed / 60), "min")

result = clf.predict(X_test)
# Two columns: a 1-based ImageId and the predicted label cast to int.
result = np.c_[np.arange(1, len(result) + 1), result.astype(int)]
df_result = pd.DataFrame(result, columns=['ImageId', 'Label'])
df_result.to_csv('./results.knn.csv', index=False)

# end time
# NOTE(review): the timer is not restarted after training, so this figure is
# the cumulative time since `start` (fit + predict + CSV write), not the
# prediction time alone -- confirm whether that is intended.
elapsed = time.perf_counter() - start
print("Test Time used:", int(elapsed / 60), "min")