Boston House Price Prediction

6 minute read

Regression models the relationship between input features (x) and a continuous target (y), so that we can predict y for unseen x. This time we are going to practice linear regression with the Boston house price data that come embedded in scikit-learn's datasets module.
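
As a minimal illustration of what "fitting a regression" means, here is a toy sketch that is separate from the Boston workflow below (the coefficients and noise level are made up for the example):

import numpy as np
from sklearn.linear_model import LinearRegression

# toy data: y is roughly 3*x + 2 plus some noise
rng = np.random.default_rng(0)
x = rng.uniform(0, 10, size=(50, 1))
y = 3 * x.ravel() + 2 + rng.normal(scale=1.0, size=50)

toy_model = LinearRegression().fit(x, y)
print(toy_model.coef_, toy_model.intercept_)  # close to [3.] and 2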

Useful functions

  • sklearn.metrics.mean_squared_error: computes the mean squared error (MSE), a standard regression metric

  • np.sqrt(x): element-wise square root of array x (used here to turn MSE into RMSE)

  • linear_model.coef_: the regression coefficients of a fitted linear model
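
A quick sketch of how the first two fit together (the numbers below are made-up placeholders, not Boston data):

from sklearn.metrics import mean_squared_error
import numpy as np

y_true = [3.0, 2.5, 4.0]
y_pred = [2.8, 2.7, 3.6]
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
print(mse, rmse)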

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.datasets as datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
BOSTON_DATA = datasets.load_boston()
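
Note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2. If the call above fails on a newer release, here is a sketch of pulling the same data from the original CMU source instead (this assumes the mirror is still reachable, and the names boston_data / boston_target are illustrative rather than a drop-in replacement for the Bunch object):

data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
boston_data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
boston_target = raw_df.values[1::2, 2]
# feature order matches the original dataset
feature_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']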

Simple EDA

# Load both the Boston data and the target, and convert them into a DataFrame.
def add_target_to_data(dataset):
    # make the raw dataset cleaner and easier to process -> use dataframe
    df = pd.DataFrame(dataset.data, columns=dataset.feature_names)
    # put the target data (price) to the dataframe we just made.
    print("Before adding target: ", df.shape)
    df['PRICE'] = dataset.target
    print("After adding target: {} \n {}\n".format(df.shape, df.head(2)))
    return df
"""
10 features as default.
Why didn't I put all the 13 features? Because n_row=2 and n_col=5 as default.
It will create 10 graphs for each features.
"""
def plotting_graph(df, features, n_row=2, n_col=5):
    fig, axes = plt.subplots(n_row, n_col, figsize=(16, 8))

    assert len(features) == n_row * n_col

    # Draw a regression graph using seaborn's regplot
    for i, feature in enumerate(features):
        row = int(i / n_col)
        col = i % n_col
        sns.regplot(x=feature, y='PRICE', data=df, ax=axes[row][col])

    plt.show()
def split_dataframe(df):
    label_data = df['PRICE']
    # everything except PRICE becomes the input
    # axis=1 --> drop a column (axis=0 or 'index' would drop rows)
    input_data = df.drop(['PRICE'], axis=1)

    # split! Set random_state if you want consistently same result
    input_train, input_eval, label_train, label_eval = train_test_split(input_data, label_data, test_size=0.3,
                                                                        random_state=42)

    return input_train, input_eval, label_train, label_eval
boston_df = add_target_to_data(BOSTON_DATA)
features = ['RM', 'ZN', 'INDUS', 'NOX', 'AGE', 'PTRATIO', 'LSTAT', 'RAD', 'CRIM', 'B']
plotting_graph(boston_df, features, n_row=2, n_col=5)
Before adding target:  (506, 13)
After adding target: (506, 14) 
       CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  296.0   
1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671  2.0  242.0   

   PTRATIO      B  LSTAT  PRICE  
0     15.3  396.9   4.98   24.0  
1     17.8  396.9   9.14   21.6  

(Figure: seaborn regplots of the 10 selected features against PRICE)

'''
The correlation coefficient ranges from -1 to 1. 
If the value is close to 1, it means that there is a strong positive correlation between the two variables.
When it is close to -1, the variables have a strong negative correlation.
'''
correlation_matrix = boston_df.corr().round(2)
correlation_matrix

Output:

CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT PRICE
CRIM 1.00 -0.20 0.41 -0.06 0.42 -0.22 0.35 -0.38 0.63 0.58 0.29 -0.39 0.46 -0.39
ZN -0.20 1.00 -0.53 -0.04 -0.52 0.31 -0.57 0.66 -0.31 -0.31 -0.39 0.18 -0.41 0.36
INDUS 0.41 -0.53 1.00 0.06 0.76 -0.39 0.64 -0.71 0.60 0.72 0.38 -0.36 0.60 -0.48
CHAS -0.06 -0.04 0.06 1.00 0.09 0.09 0.09 -0.10 -0.01 -0.04 -0.12 0.05 -0.05 0.18
NOX 0.42 -0.52 0.76 0.09 1.00 -0.30 0.73 -0.77 0.61 0.67 0.19 -0.38 0.59 -0.43
RM -0.22 0.31 -0.39 0.09 -0.30 1.00 -0.24 0.21 -0.21 -0.29 -0.36 0.13 -0.61 0.70
AGE 0.35 -0.57 0.64 0.09 0.73 -0.24 1.00 -0.75 0.46 0.51 0.26 -0.27 0.60 -0.38
DIS -0.38 0.66 -0.71 -0.10 -0.77 0.21 -0.75 1.00 -0.49 -0.53 -0.23 0.29 -0.50 0.25
RAD 0.63 -0.31 0.60 -0.01 0.61 -0.21 0.46 -0.49 1.00 0.91 0.46 -0.44 0.49 -0.38
TAX 0.58 -0.31 0.72 -0.04 0.67 -0.29 0.51 -0.53 0.91 1.00 0.46 -0.44 0.54 -0.47
PTRATIO 0.29 -0.39 0.38 -0.12 0.19 -0.36 0.26 -0.23 0.46 0.46 1.00 -0.18 0.37 -0.51
B -0.39 0.18 -0.36 0.05 -0.38 0.13 -0.27 0.29 -0.44 -0.44 -0.18 1.00 -0.37 0.33
LSTAT 0.46 -0.41 0.60 -0.05 0.59 -0.61 0.60 -0.50 0.49 0.54 0.37 -0.37 1.00 -0.74
PRICE -0.39 0.36 -0.48 0.18 -0.43 0.70 -0.38 0.25 -0.38 -0.47 -0.51 0.33 -0.74 1.00
sns.heatmap(correlation_matrix, cmap="YlGnBu")
plt.show()

(Figure: correlation heatmap of the Boston features)
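
Before reading the heatmap, it can help to rank the features by their correlation with the target. A small sketch using the correlation_matrix computed above:

# rank features by correlation with PRICE
print(correlation_matrix['PRICE'].sort_values(ascending=False))

# keep only the features whose absolute correlation with PRICE is at least 0.5
strong = correlation_matrix['PRICE'].drop('PRICE')
print(strong[strong.abs() >= 0.5])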

Prediction with Linear Regression

X_train, X_test, Y_train, Y_test = split_dataframe(boston_df)

# Load your machine learning model
model = LinearRegression()
# Train!
model.fit(X_train, Y_train)
# make prediction with unseen data!
pred = model.predict(X_test)
expectation = Y_test
# what is mse between the answer and your prediction?
lr_mse = mean_squared_error(expectation, pred)
lr_rmse = np.sqrt(lr_mse)

print('LR_MSE: {0:.3f}, LR_RMSE: {1:.3F}'.format(lr_mse, lr_rmse))
# Regression Coefficient
print('Regression Coefficients:', np.round(model.coef_, 1))
# sort coefficients from largest to smallest
coeff = pd.Series(data=model.coef_, index=X_train.columns).sort_values(ascending=False)
print(coeff)

Output:

LR_MSE: 21.517, LR_RMSE: 4.639
Regression Coefficients: [ -0.1   0.    0.    3.1 -15.4   4.1  -0.   -1.4   0.2  -0.   -0.9   0.
  -0.5]
RM          4.057199
CHAS        3.119835
RAD         0.242727
INDUS       0.049523
ZN          0.035809
B           0.011794
TAX        -0.008702
AGE        -0.010821
CRIM       -0.133470
LSTAT      -0.547113
PTRATIO    -0.910685
DIS        -1.385998
NOX       -15.417061
dtype: float64
plt.scatter(expectation, pred)
plt.plot([0, 50], [0, 50], '--k')
plt.xlabel('Expected price')
plt.ylabel('Predicted price')
plt.tight_layout()

(Figure: predicted vs. expected prices for linear regression)
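
RMSE on its own is hard to judge without knowing the scale of the target, so it is worth adding R² (the coefficient of determination) as a sanity check. A short sketch, reusing model, pred and Y_test from above:

from sklearn.metrics import r2_score

lr_r2 = r2_score(Y_test, pred)  # equivalently: model.score(X_test, Y_test)
print('LR_R2: {0:.3f}'.format(lr_r2))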

Prediction with other Regression methods

  • Ridge, Lasso and ElasticNet
  • Gradient Boosting Regressor
  • XG Boost
  • SGD Regressor

According to sklearn's official documentation:

“SGDRegressor is well suited for regression problems with a large number of training samples (> 10,000), for other problems we recommend Ridge, Lasso, or ElasticNet.”

from sklearn.linear_model import Ridge, Lasso, ElasticNet, SGDRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
# Try tuning the hyper-parameters
models = {
    "Ridge" : Ridge(),
    "Lasso" : Lasso(),
    "ElasticNet" : ElasticNet(),
    "Gradient Boosting" : GradientBoostingRegressor(),
    "SGD" : SGDRegressor(max_iter=1000, tol=1e-3), 
    "XGB" : XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)
}

pred_record = {}
for name, model in models.items():
    # Load your machine learning model
    curr_model = model
    # Train!
    curr_model.fit(X_train, Y_train)
    # make prediction with unseen data!
    pred = curr_model.predict(X_test)
    expectation = Y_test
    # what is mse between the answer and your prediction?
    mse = mean_squared_error(expectation, pred)
    rmse = np.sqrt(mse)

    print('{} MSE: {}, {} RMSE: {}'.format(name, mse, name, rmse))
    
    pred_record.update({name : pred})

Output:

Ridge MSE: 22.044053089861013, Ridge RMSE: 4.695109486461526
Lasso MSE: 25.639502928043992, Lasso RMSE: 5.063546477326341
ElasticNet MSE: 25.40519636428209, ElasticNet RMSE: 5.040356769543411
Gradient Boosting MSE: 7.737869423986623, Gradient Boosting RMSE: 2.781702612427616
SGD MSE: 1.5570858695910825e+28, SGD RMSE: 124783246855941.47
[20:06:46] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
XGB MSE: 82.71679422016705, XGB RMSE: 9.094877361469315
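
The astronomically large SGD error is a red flag about the inputs rather than the algorithm: SGDRegressor is very sensitive to feature scaling, and the raw Boston columns span wildly different ranges, so gradient descent diverges. A hedged sketch that standardizes the features first (the name scaled_sgd is illustrative):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

scaled_sgd = make_pipeline(StandardScaler(), SGDRegressor(max_iter=1000, tol=1e-3))
scaled_sgd.fit(X_train, Y_train)
scaled_pred = scaled_sgd.predict(X_test)
# with scaled inputs the RMSE should land in the same ballpark as Ridge
print(np.sqrt(mean_squared_error(Y_test, scaled_pred)))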
prediction = pred_record["SGD"]

plt.scatter(expectation, prediction)
plt.plot([0, 50], [0, 50], '--k')
plt.xlabel('Expected price')
plt.ylabel('Predicted price')
plt.tight_layout()

(Figure: predicted vs. expected prices for the SGD regressor)

prediction = pred_record["XGB"]

plt.scatter(expectation, prediction)
plt.plot([0, 50], [0, 50], '--k')
plt.xlabel('Expected price')
plt.ylabel('Predicted price')
plt.tight_layout()

(Figure: predicted vs. expected prices for XGBoost)
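
XGBoost's weak score here says more about the chosen hyper-parameters than about the algorithm: only 10 trees and a heavy alpha=10 L1 penalty leave the model underfit. A sketch with less aggressive settings and the non-deprecated objective (the values are illustrative, not tuned):

xgb_tuned = XGBRegressor(objective='reg:squarederror', n_estimators=300,
                         learning_rate=0.05, max_depth=4)
xgb_tuned.fit(X_train, Y_train)
print(np.sqrt(mean_squared_error(Y_test, xgb_tuned.predict(X_test))))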

prediction = pred_record["Gradient Boosting"]

plt.scatter(expectation, prediction)
plt.plot([0, 50], [0, 50], '--k')
plt.xlabel('Expected price')
plt.ylabel('Predicted price')
plt.tight_layout()

(Figure: predicted vs. expected prices for gradient boosting)
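
Instead of eyeballing one scatter plot at a time, the recorded predictions can also be compared in a single chart. A small sketch that turns pred_record into an RMSE bar chart (it assumes pred_record and expectation from the loop above):

rmse_scores = {name: np.sqrt(mean_squared_error(expectation, p))
               for name, p in pred_record.items()}
# SGD is dropped because its diverged RMSE would dwarf every other bar
pd.Series(rmse_scores).drop('SGD').sort_values().plot(kind='barh')
plt.xlabel('RMSE')
plt.tight_layout()
plt.show()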

A Little Taster Session with a Neural Network

from tensorflow import keras
from tensorflow.keras.layers import Dense, Activation

def neural_net():
    model = keras.Sequential()
    model.add(Dense(512, input_dim=BOSTON_DATA.data.shape[1]))
    model.add(Activation('relu'))
    model.add(Dense(256))
    model.add(Activation('relu'))
    model.add(Dense(128))
    model.add(Activation('relu'))
    model.add(Dense(64))
    model.add(Activation('relu'))
    model.add(Dense(1))
    return model
model = neural_net()
model.summary()

Output:

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_31 (Dense)             (None, 512)               7168      
_________________________________________________________________
activation_24 (Activation)   (None, 512)               0         
_________________________________________________________________
dense_32 (Dense)             (None, 256)               131328    
_________________________________________________________________
activation_25 (Activation)   (None, 256)               0         
_________________________________________________________________
dense_33 (Dense)             (None, 128)               32896     
_________________________________________________________________
activation_26 (Activation)   (None, 128)               0         
_________________________________________________________________
dense_34 (Dense)             (None, 64)                8256      
_________________________________________________________________
activation_27 (Activation)   (None, 64)                0         
_________________________________________________________________
dense_35 (Dense)             (None, 1)                 65        
=================================================================
Total params: 179,713
Trainable params: 179,713
Non-trainable params: 0
_________________________________________________________________
# 'accuracy' is not meaningful for regression, so track mean absolute error instead
model.compile(loss='mse', optimizer='adam', metrics=['mae'])
history = model.fit(X_train, Y_train, epochs=100)

loss, test_mae = model.evaluate(X_test, Y_test)
print('Test Loss (MSE) : {:.4f} | Test MAE : {:.4f}'.format(loss, test_mae))
prediction = model.predict(X_test)

plt.scatter(expectation, prediction)
plt.plot([0, 50], [0, 50], '--k')
plt.xlabel('Expected price')
plt.ylabel('Predicted price')
plt.tight_layout()

(Figure: predicted vs. expected prices for the neural network)
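
The network above trains on raw, unscaled features for a fixed 100 epochs with no validation. A hedged sketch that standardizes the inputs and adds early stopping (names such as scaled_model are illustrative, and the patience value is a guess rather than a tuned setting):

from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import EarlyStopping

scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

scaled_model = neural_net()
scaled_model.compile(loss='mse', optimizer='adam', metrics=['mae'])
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
scaled_model.fit(X_train_s, Y_train, epochs=200, validation_split=0.2,
                 callbacks=[early_stop], verbose=0)
print(scaled_model.evaluate(X_test_s, Y_test))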
