Boston房价预测

Boston房价预测

输入数据要求

  • 回归问题
  • 标准化处理输入数据
  • k-折评估少样本模型
  • 滑动均值平滑epoch均方差观测图

输入数据是向量,对应13个特征,且每个特征都有不同的取值范围。标签是标量,代表房价中位数。样本数很少,只有404个训练样本和102个测试样本。

train_data.shape#(404, 13)
test_data.shape#(102, 13)
train_targets.shape#(404,)
### 数据导入
from keras.datasets import boston_housing

# Load the Boston housing dataset; load_data() returns a (train, test) pair,
# each of which is itself a (data, targets) pair.
train_split, test_split = boston_housing.load_data()
train_data, train_targets = train_split
test_data, test_targets = test_split
### 数据观测
# Inspect the array shapes; the trailing comments show the values the
# author recorded for each expression.
train_data.shape  # (404, 13)
test_data.shape  # (102, 13)
train_targets.shape  # (404,)
### 数据处理
# Feature-wise standardization: the goal is to even out the value ranges.
# Feeding a neural network inputs whose ranges differ widely is problematic.
mean = train_data.mean(axis=0)  # per-feature mean over the training samples
train_data -= mean  # in-place centring
std = train_data.std(axis=0)  # computed on the already centred training data
train_data /= std  # in-place scaling

# The test data is transformed with the mean/std computed from the training
# data above; no statistic is ever derived from the test set itself.
test_data -= mean
test_data /= std
### 模型搭建
#函数式编程返回编译好的模型
from keras import models
from keras import layers

def build_model():
    """Build and compile a fresh regression network.

    The same architecture has to be instantiated several times (once per
    cross-validation fold and once for the final fit), hence a factory
    function.

    Returns:
        A compiled Sequential model: two 64-unit ReLU Dense layers followed
        by a single linear output unit, optimised with RMSprop on MSE loss
        and reporting MAE as a metric.
    """
    # The input width is taken from the module-level training data.
    n_features = train_data.shape[1]
    network = models.Sequential([
        layers.Dense(64, activation='relu', input_shape=(n_features,)),
        layers.Dense(64, activation='relu'),
        # No activation on the last layer: the output is an unbounded scalar.
        layers.Dense(1),
    ])
    # Loss is mean squared error; mean absolute error is tracked as a metric.
    network.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])
    return network

模型训练与评估

  • k折交叉验证
  • <对少样本数据进行可靠的评估>

    #样本很少,验证分数会有很大波动,验证集的划分方式可能会造成验证分数上很大的方差
    import numpy as np

# K-fold cross-validation. With this few samples the validation score
# depends heavily on which samples land in the validation split, so the
# training data is cut into k partitions and each one is used as the
# validation set in turn; the per-fold validation MAE is collected.
k = 4
num_val_samples = len(train_data) // k
num_epochs = 100
all_scores = []
for i in range(k):
    print('processing fold #', i)

    # Prepare the validation data: data from partition #i
    val_data = train_data[i * num_val_samples: (i + 1) * num_val_samples]
    val_targets = train_targets[i * num_val_samples: (i + 1) * num_val_samples]

    # Prepare the training data: data from all other partitions
    partial_train_data = np.concatenate(
        [train_data[:i * num_val_samples],
         train_data[(i + 1) * num_val_samples:]],
        axis=0)
    partial_train_targets = np.concatenate(
        [train_targets[:i * num_val_samples],
         train_targets[(i + 1) * num_val_samples:]],
        axis=0)

    # Build the Keras model (already compiled); a fresh model per fold so
    # that no weights leak from one fold to the next.
    model = build_model()
    # Train the model (in silent mode, verbose=0)
    model.fit(partial_train_data, partial_train_targets,
              epochs=num_epochs, batch_size=1, verbose=0)
    # Evaluate the model on the validation data; evaluate() returns the
    # loss (MSE) followed by the compiled metric (MAE).
    val_mse, val_mae = model.evaluate(val_data, val_targets, verbose=0)
    all_scores.append(val_mae)

#all_scores

#[2.0750808349930412, 2.117215852926273, 2.9140411863232605, 2.4288365227161068]

#np.mean(all_scores)

#2.3837935992396706

模型评估

from keras import backend as K

# Some memory clean-up
K.clear_session()
# Train longer (500 epochs) and, to see how the model does at every epoch,
# record the per-epoch validation MAE of each fold.
num_epochs = 500
all_mae_histories = []
for i in range(k):
    print('processing fold #', i)
    # Prepare the validation data: data from partition #i
    val_data = train_data[i * num_val_samples: (i + 1) * num_val_samples]
    val_targets = train_targets[i * num_val_samples: (i + 1) * num_val_samples]

    # Prepare the training data: data from all other partitions
    partial_train_data = np.concatenate(
        [train_data[:i * num_val_samples],
         train_data[(i + 1) * num_val_samples:]],
        axis=0)
    partial_train_targets = np.concatenate(
        [train_targets[:i * num_val_samples],
         train_targets[(i + 1) * num_val_samples:]],
        axis=0)

    # Build the Keras model (already compiled)
    model = build_model()
    # Train the model (in silent mode, verbose=0)
    history = model.fit(partial_train_data, partial_train_targets,
                        validation_data=(val_data, val_targets),
                        epochs=num_epochs, batch_size=1, verbose=0)
    # Older Keras releases report the 'mae' metric under its long name,
    # newer ones under the exact name passed to compile(); accept both so
    # the lookup does not raise KeyError depending on the installed version.
    fold_history = history.history
    if 'val_mean_absolute_error' in fold_history:
        mae_history = fold_history['val_mean_absolute_error']
    else:
        mae_history = fold_history['val_mae']
    all_mae_histories.append(mae_history)


# For every epoch, average the validation MAE over the k fold models.
# all_mae_histories holds one list per fold, each with one value per epoch
# (4 x 500 with the settings used here).
average_mae_history = []
for epoch in range(num_epochs):
    epoch_scores = [fold_history[epoch] for fold_history in all_mae_histories]
    average_mae_history.append(np.mean(epoch_scores))
#average_mae_history.shape()
#[500,]

#绘制按轮次的验证分数
import matplotlib.pyplot as plt

# Plot the fold-averaged validation MAE against the 1-based epoch number.
epoch_axis = range(1, len(average_mae_history) + 1)
plt.plot(epoch_axis, average_mae_history)
plt.xlabel('Epochs')
plt.ylabel('Validation MAE')
plt.show()

# 得知前10个epoch异常,先删除前10点
#之后波动过大,采用滑动均值模型得到光滑曲线继续观测
def smooth_curve(points, factor=0.9):
    """Return an exponentially smoothed copy of *points*.

    The first value is kept unchanged; every later value is replaced by
    ``previous_smoothed * factor + value * (1 - factor)``.

    Args:
        points: Iterable of numeric values, consumed in order.
        factor: Weight given to the previous smoothed value.

    Returns:
        A new list with one smoothed value per input value (empty when
        *points* is empty).
    """
    result = []
    for value in points:
        if not result:
            # Nothing to blend with yet: the curve starts at the raw value.
            result.append(value)
            continue
        result.append(result[-1] * factor + value * (1 - factor))
    return result

# Skip the first 10 epochs, smooth the remainder and plot it against a
# 1-based index.
smooth_mae_history = smooth_curve(average_mae_history[10:])

smoothed_axis = range(1, len(smooth_mae_history) + 1)
plt.plot(smoothed_axis, smooth_mae_history)
plt.xlabel('Epochs')
plt.ylabel('Validation MAE')
plt.show()

### 模型优化 >画图得知模型在80轮后过拟合了,所以训练到80轮时停止
# Start from a new compiled model so nothing carries over from the
# cross-validation runs.
model = build_model()
# Fit on the full training set (80 epochs, batches of 16, silent mode).
model.fit(
    train_data,
    train_targets,
    batch_size=16,
    epochs=80,
    verbose=0,
)
# evaluate() returns the loss (MSE) followed by the compiled metric (MAE).
test_mse_score, test_mae_score = model.evaluate(test_data, test_targets)
#test_mae_score
#2.5532484335057877

### 模型使用 >model.predict(test_data)#回归模型的输出是房价中位数的预测值,而不是概率(下面的示例输出取值在0到1之间,疑似取自二分类示例,并非本模型的真实输出)
array([[0.20031697],
       [0.99983263],
       [0.9702985 ],
       ...,
       [0.12498155],
       [0.05971182],
       [0.7418488 ]], dtype=float32)