## 0x00 Overview
Competition: Computer Application Contest (DC house price prediction)
Team name: 0x01
Final rank: 51
Award: second prize

I played most of this competition myself and picked up a fair amount, so here are some notes.

## 0x01 Worth noting

- [Someone else's open-sourced solution from after the competition~](https://github.com/notplaid/prices)
- Experience shared in the chat logs (see my own notes)
- My own feature engineering
- Using Lgb and Xgb, learned and applied on the fly

**A processing flow chart someone else made:**

![flow chart](https://mmbiz.qpic.cn/mmbiz_png/E5ayiaFQcPF3riaR75qWyj4evW8RaO8TShicuuB21jsl7eUnxTib1RVVhNTNW6CcicFyx3TRhgS36AUmibnGZXbEib2sw/640?wx_fmt=png&tp=webp&wxfrom=5&wx_lazy=1&wx_co=1)

## 0x02 Thoughts

Clever tricks:

- 犬哥's removal of the validation set is the slickest move I have ever seen, bar none (it gave a big score boost)
- Simple but effective: drop early stopping, lower the learning rate, and increase the number of boosting rounds

## 0x03 Feature engineering

Here is my code for processing the 房屋朝向 (house orientation) column:
```python
def check_bool(arr, str_):
    # For each row (a list of orientation tokens), check whether str_ is one of the tokens
    bool_list = []
    for i in arr:
        if str_ in i:
            bool_list.append(True)
        else:
            bool_list.append(False)
    return bool_list


def split_map(str_):
    # '东 南' -> ['东', '南']
    arr = str_.strip().split(' ')
    return arr


def process_face(dataset):
    # One-hot encode the 房屋朝向 column into eight indicator columns (in place)
    temp = dataset['房屋朝向'].map(lambda x: split_map(x))
    dataset['东'] = 0
    dataset['南'] = 0
    dataset['西'] = 0
    dataset['北'] = 0
    dataset['东北'] = 0
    dataset['东南'] = 0
    dataset['西北'] = 0
    dataset['西南'] = 0
    bool_dong = check_bool(temp, '东')
    bool_nan = check_bool(temp, '南')
    bool_xi = check_bool(temp, '西')
    bool_bei = check_bool(temp, '北')
    bool_dongnan = check_bool(temp, '东南')
    bool_dongbei = check_bool(temp, '东北')
    bool_xinan = check_bool(temp, '西南')
    bool_xibei = check_bool(temp, '西北')
    dataset.loc[bool_dong, '东'] = 1
    dataset.loc[bool_nan, '南'] = 1
    dataset.loc[bool_xi, '西'] = 1
    dataset.loc[bool_bei, '北'] = 1
    dataset.loc[bool_dongnan, '东南'] = 1
    dataset.loc[bool_xibei, '西北'] = 1
    dataset.loc[bool_dongbei, '东北'] = 1
    dataset.loc[bool_xinan, '西南'] = 1
    dataset['东南'].fillna(0, inplace=True)
    dataset['东北'].fillna(0, inplace=True)
    dataset['西南'].fillna(0, inplace=True)
    dataset['西北'].fillna(0, inplace=True)
```
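A minimal usage sketch of `process_face` on a toy frame (the rows here are made up; the real competition data has many more columns):

```python
import pandas as pd

# Hypothetical example: '房屋朝向' holds space-separated orientation tokens
df = pd.DataFrame({'房屋朝向': ['东 南', '西南', '南 北']})
process_face(df)
print(df[['东', '南', '西', '北', '东南', '东北', '西南', '西北']])
# Row 0 gets 东=1, 南=1; row 1 gets only 西南=1 (exact token match); row 2 gets 南=1, 北=1
```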
## 0x04 Learning Xgb and Lgb
Here are the parameters I used in this competition~ (I was lazy and copied them straight from the Ipynb files, so they are a bit messy.)
### XGB
- 犬哥's (no validation set)
```python
import xgboost as xgb

# xgb_val = xgb.DMatrix(X_test, label=y_test)
xgb_train = xgb.DMatrix(train_data, label=train_result)
xgb_test = xgb.DMatrix(test_data)

# xgboost parameters
params = {
    'booster': 'gbtree',
    'objective': 'reg:linear',   # linear regression objective
    'n_estimators': 2000,
    'gamma': 0.2,                # controls post-pruning; larger is more conservative, usually 0.1 or 0.2
    'max_depth': 12,             # tree depth; deeper trees overfit more easily
    'reg_alpha': 3,
    'lambda': 5,                 # L2 regularisation on the weights; larger values make overfitting harder
    'subsample': 0.9,            # row subsampling of the training set
    'colsample_bytree': 0.6,     # column subsampling per tree
    'colsample_bylevel': 0.7,
    'min_child_weight': 7,       # minimum sum of hessian (h) in a leaf; default is 1.
                                 # For imbalanced 0-1 classification, if h is around 0.01,
                                 # min_child_weight=1 means a leaf needs at least ~100 samples.
                                 # This parameter matters a lot; smaller values overfit more easily.
    'silent': 1,                 # 1 = no training output; 0 is usually better for monitoring
    'eta': 0.05,                 # acts like the learning rate (e.g. 0.007)
    'seed': 2017,
    # 'nthread': 7,              # number of CPU threads
    # 'eval_metric': 'auc'
}
plst = list(params.items())
num_rounds = 10000  # number of boosting rounds

# watchlist = [(xgb_train, 'train'), (xgb_val, 'val')]
# early_stopping_rounds: with a large number of rounds, stop once the metric
# has not improved for the given number of rounds
model = xgb.train(plst, xgb_train, num_rounds, verbose_eval=50)
# model = xgb.train(plst, xgb_train, num_rounds, watchlist, early_stopping_rounds=300, verbose_eval=50)
model.save_model('xgb.model')  # save the trained model
print("Model training finished")
```
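Since this variant trains for a fixed number of rounds with no validation set (the 0x02 trick), here is a minimal sketch of how the test predictions might be written out afterwards; the submission file name and column names are my own assumptions, not the competition's actual template:

```python
import pandas as pd

# Predict on the DMatrix built from the test features above
preds = model.predict(xgb_test)

# Assumed submission layout (id + predicted price); adjust to the real template
submission = pd.DataFrame({'id': range(len(preds)), 'price': preds})
submission.to_csv('xgb_submission.csv', index=False)
```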
- Mine
"import xgboost as xgb\n", "xgb_model = xgb.XGBRegressor(\n", " colsample_bytree=0.4603,\n", " gamma=0.0468,\n", " learning_rate=0.05,\n", " max_depth=5,\n", " min_child_weight=1.7817,\n", " n_estimators=2200,\n", " reg_alpha=0.4640,\n", " reg_lambda=0.8571,\n", " subsample=0.5213,\n", " silent=1,\n", " random_state=7,\n", " nthread=-1)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n", " colsample_bytree=0.4603, gamma=0.0468, learning_rate=0.05,\n", " max_delta_step=0, max_depth=5, min_child_weight=1.7817,\n", " missing=None, n_estimators=2200, n_jobs=1, nthread=-1,\n", " objective='reg:linear', random_state=7, reg_alpha=0.464,\n", " reg_lambda=0.8571, scale_pos_weight=1, seed=None, silent=1,\n", " subsample=0.5213)" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "xgb_model.fit(x_train, y_train)"
### LGB
- 犬哥's
```python
# lgb
import lightgbm as lgb

train = lgb.Dataset(train_data, label=train_result)
# test = lgb.Dataset(X_test, label=y_test, reference=train)

# lightgbm parameters
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression_l2',
    'metric': 'l2',
    # 'objective': 'multiclass',
    # 'metric': 'multi_error',
    # 'num_class': 5,
    'min_child_weight': 3,
    'num_leaves': 2 ** 9,
    'lambda_l2': 5,
    # 'subsample': 0.9,
    'colsample_bytree': 0.7,
    'colsample_bylevel': 0.7,
    'learning_rate': 0.05,
    'tree_method': 'exact',
    'seed': 2017,
    'nthread': 4,
    # 'silent': False
}
num_round = 14000

# early_stopping_rounds: with a large number of rounds, stop once the metric
# has not improved for the given number of rounds
gbm = lgb.train(params, train, num_round,
                verbose_eval=50,
                # early_stopping_rounds=300,
                # valid_sets=[train, test]
                )
print("Model training finished")
```
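A short sketch of using the trained booster afterwards, assuming `test_data` is the same raw feature matrix as in the XGB snippet; the model file name is my own choice:

```python
# LightGBM boosters predict directly from the raw feature matrix (no lgb.Dataset needed)
lgb_pred = gbm.predict(test_data)

gbm.save_model('lgb.model')  # persist the booster, mirroring the XGB snippet above
```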
- Mine
```python
params = {'boosting_type': 'gbdt',
          'num_leaves': 28,
          'max_depth': -1,
          'objective': 'regression',
          'learning_rate': 0.1,
          'seed': 2018,
          'num_threads': -1,
          'max_bin': 425,
          "metric": "rmse",
          # "lambda_l1": 0.1,
          "lambda_l2": 0.2,
          }

params2 = {
    'boosting_type': 'gbdt',
    'objective': 'regression_l2',
    'metric': 'l2',
    # 'objective': 'multiclass',
    # 'metric': 'multi_error',
    # 'num_class': 5,
    'min_child_weight': 3,
    # 'num_leaves': 2 ** 9,
    'num_leaves': 100,
    'lambda_l2': 5,
    # 'subsample': 0.9,
    'colsample_bytree': 0.7,
    'colsample_bylevel': 0.7,
    'learning_rate': 0.05,
    'tree_method': 'exact',
    'seed': 2017,
    'nthread': 4,
    # 'silent': False
}

params3 = {'boosting_type': 'gbdt',
           'num_leaves': 28,
           'max_depth': -1,
           'objective': 'regression',
           'learning_rate': 0.05,
           'seed': 2018,
           'num_threads': -1,
           'max_bin': 425,
           "metric": "rmse",
           # "lambda_l1": 0.1,
           "lambda_l2": 0.5,
           }

clf = lgb.train(params2,
                train_data,
                num_boost_round=14000,
                valid_sets=[train_data, val_data],
                valid_names=['train', 'valid'],
                early_stopping_rounds=100,
                feval=None,
                verbose_eval=50
                )
```
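Because this run does use a validation set with early stopping, predictions should come from the best iteration; a minimal sketch, assuming `val_data` was built as `lgb.Dataset(x_val, label=y_val, reference=train_data)`:

```python
# Predict with the iteration that scored best on the validation set
pred = clf.predict(test_data, num_iteration=clf.best_iteration)
print('best iteration:', clf.best_iteration)
```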
The champion's parameters are in my own notes; for the ones on Github, see Github~