House price example

$$ \newcommand{\eg}{{\it e.g.}} \newcommand{\ie}{{\it i.e.}} \newcommand{\argmin}{\operatornamewithlimits{argmin}} \newcommand{\mc}{\mathcal} \newcommand{\mb}{\mathbb} \newcommand{\mf}{\mathbf} \newcommand{\minimize}{{\text{minimize}}} \newcommand{\diag}{{\text{diag}}} \newcommand{\cond}{{\text{cond}}} \newcommand{\rank}{{\text{rank }}} \newcommand{\range}{{\mathcal{R}}} \newcommand{\null}{{\mathcal{N}}} \newcommand{\tr}{{\text{trace}}} \newcommand{\dom}{{\text{dom}}} \newcommand{\dist}{{\text{dist}}} \newcommand{\R}{\mathbf{R}} \newcommand{\SM}{\mathbf{S}} \newcommand{\ball}{\mathcal{B}} \newcommand{\bmat}[1]{\begin{bmatrix}#1\end{bmatrix}} $$

EE787: Machine learning, Kyung Hee University.
Jong-Han Kim (jonghank@khu.ac.kr)

In [3]:
using LinearAlgebra, Random, Statistics;
using DataFrames, CSV;
using PyPlot, Printf;

# load data
house_price_data = CSV.read("train.csv", DataFrame)

# remove outliers
house_price_data = house_price_data[house_price_data.GrLivArea .< 4000, :];

# output: log of the sale price
target = "SalePrice";
v = house_price_data[!,Symbol(target)];
v = log.(v);

# select features
feature_list = ["LotArea", "YearBuilt", "YearRemodAdd", "TotalBsmtSF",
    "GrLivArea", "1stFlrSF", "2ndFlrSF", "BedroomAbvGr", 
    "KitchenAbvGr", "Fireplaces", "GarageArea", "WoodDeckSF", 
    "HalfBath", "TotRmsAbvGrd", "OverallCond", "OverallQual"];
feature_names = deepcopy(feature_list)

n = size(house_price_data,1)
U = zeros(n);                                 # placeholder column, dropped below
for feature in feature_list;
    #println(feature);
    U = [U house_price_data[!,Symbol(feature)]];
end;

U = U[:,2:end];                               # drop the placeholder column

# additional (one-hot) features
more_oh_features = ["Neighborhood", "BldgType", "KitchenQual"];

# map a categorical column to a one-hot (indicator) matrix, one column per category
function to_one_hot(u)
    idx_to_category = unique(u)
    category_to_idx = Dict(idx_to_category[i] => i for i = 1:length(idx_to_category))
    X_one_hot = zeros(length(u), length(idx_to_category))

    for i = 1:length(u)
        X_one_hot[i, category_to_idx[u[i]]] = 1
    end

    return X_one_hot
end
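# e.g. to_one_hot(["a", "b", "a"]) gives [1.0 0.0; 0.0 1.0; 1.0 0.0]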

for feature in more_oh_features;
    U = [U to_one_hot(house_price_data[!,Symbol(feature)])];
    feature_names = [feature_names; unique(house_price_data[!,Symbol(feature)])] 
end;

# additional (numerical) features
more_num_features = ["GarageCars"];
for feature in more_num_features;
    U = [U house_price_data[!,Symbol(feature)]];
    feature_names = [feature_names; feature]     
end;

# advanced features
adv_log_features = ["LotArea", "GrLivArea"];
for feature in adv_log_features;
    U = [U log.(house_price_data[!,Symbol(feature)])];
end;
feature_names = [feature_names; "logLotArea"; "logGrLivArea"];

# indicator (0/1) features based on living-area and lot-area thresholds
GrLivArea_1000_plus = (house_price_data[!,:GrLivArea] .> 1000)*1;
GrLivArea_600_minus = (house_price_data[!,:GrLivArea] .<  600)*1;
LotArea_6000_plus = (house_price_data[!,:LotArea] .> 6000)*1;
LotArea_4000_minus = (house_price_data[!,:LotArea] .< 4000)*1;
U = [U GrLivArea_1000_plus GrLivArea_600_minus LotArea_6000_plus LotArea_4000_minus];
feature_names = [feature_names; "GrLivArea_1000_plus"; "GrLivArea_600_minus"; "LotArea_6000_plus"; "LotArea_4000_minus"];

n,d = size(U)

print((n,d))

# data split
Random.seed!(100);
rp = randperm(n);
split_ratio = 0.7;
n_train = round(Int, n*split_ratio);
n_test = n - n_train;

train_id = rp[1:n_train];
test_id = rp[n_train+1:end];

U_train = U[train_id,:];
v_train = v[train_id];
U_test = U[test_id,:];
v_test = v[test_id];
(1456, 57)
In [2]:
# your code here
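
# For reference, a minimal sketch of one possible approach (not the assigned
# solution): ridge-regularized least squares on the training split above.
# The regularization weight lambda and the appended constant-offset column
# are illustrative assumptions.
A_train = [U_train ones(n_train)];            # append a constant offset feature
A_test  = [U_test ones(n_test)];
lambda  = 1e-4;                               # illustrative ridge weight
theta   = (A_train'*A_train + lambda*I) \ (A_train'*v_train);

rmse(a, b) = sqrt(mean((a .- b).^2));
@printf("Train RMSE: %.4f\n", rmse(A_train*theta, v_train));
@printf("Test RMSE:  %.4f\n", rmse(A_test*theta, v_test));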