using LinearAlgebra, Random, Statistics;
using DataFrames, CSV;
using PyPlot, Printf;
# load data
# Materialise the CSV through `CSV.File`: the one-argument `CSV.read(path)` form is
# rejected by current CSV.jl releases, which require an explicit sink type.
house_price_data = DataFrame(CSV.File("train.csv"))
# remove outliers: keep only the rows whose GrLivArea is below 4000
house_price_data = house_price_data[house_price_data.GrLivArea .< 4000, :]
# raw output: the regression target `v` is the elementwise log of the target column
target = "SalePrice"
v = log.(house_price_data[!, Symbol(target)])
# select features
feature_list = [
    "LotArea", "YearBuilt", "YearRemodAdd", "TotalBsmtSF",
    "GrLivArea", "1stFlrSF", "2ndFlrSF", "BedroomAbvGr",
    "KitchenAbvGr", "Fireplaces", "GarageArea", "WoodDeckSF",
    "HalfBath", "TotRmsAbvGrd", "OverallCond", "OverallQual",
]
# column labels for `U`; extended whenever columns are appended to `U`
feature_names = copy(feature_list)
n = size(house_price_data, 1)
# Build the feature matrix with one column per entry of `feature_list`.
# A single `hcat` is used instead of a top-level `for` loop that reassigns `U`:
# such a loop only works in the REPL/notebooks and raises `UndefVarError` when the
# file is executed as a script, because `U` becomes a loop-local variable there.
# The leading n×0 Float64 block promotes the element type exactly as the previous
# `zeros(n)` seed column did, without having to strip a dummy column afterwards.
numeric_columns = [house_price_data[!, Symbol(feature)] for feature in feature_list]
U = hcat(zeros(n, 0), numeric_columns...)
# additional (one-hot) features
more_oh_features = ["Neighborhood", "BldgType", "KitchenQual"]
"""
    to_one_hot(u)

Return the one-hot encoding of the collection `u` as a `Float64` matrix.

The result has one row per element of `u` and one column per distinct value,
with columns ordered by first appearance (the order of `unique(u)`). Entry
`[i, j]` is `1` when `u[i]` equals the `j`-th distinct value and `0` otherwise.
"""
function to_one_hot(u)
    categories = unique(u)
    # map every distinct value to the column that represents it
    column_of = Dict(category => j for (j, category) in enumerate(categories))
    X_one_hot = zeros(length(u), length(categories))
    for (row, value) in enumerate(u)
        X_one_hot[row, column_of[value]] = 1
    end
    return X_one_hot
end
# Every group of extra columns below is appended with one `hcat`/`vcat` call instead
# of a top-level `for` loop that reassigns the globals `U` and `feature_names`: such
# loops only work in the REPL/notebooks and raise `UndefVarError` when the file is
# run as a script, because the assigned names become loop-local variables there.

# one-hot columns for the categorical features listed in `more_oh_features`
oh_columns = [house_price_data[!, Symbol(feature)] for feature in more_oh_features]
U = hcat(U, map(to_one_hot, oh_columns)...)
# `to_one_hot` orders its columns by `unique`, so the labels follow the same order
feature_names = vcat(feature_names, map(unique, oh_columns)...)
# additional (numerical) features
more_num_features = ["GarageCars"]
num_columns = [house_price_data[!, Symbol(feature)] for feature in more_num_features]
U = hcat(U, num_columns...)
feature_names = vcat(feature_names, more_num_features)
# advanced features: elementwise log of selected columns
adv_log_features = ["LotArea", "GrLivArea"]
log_columns = [log.(house_price_data[!, Symbol(name)]) for name in adv_log_features]
U = hcat(U, log_columns...)
# labels are derived from the list so they cannot drift from the columns added above
feature_names = vcat(feature_names, "log" .* adv_log_features)
# advanced features: 0/1 threshold indicators on GrLivArea and LotArea
GrLivArea_1000_plus = Int.(house_price_data[!, :GrLivArea] .> 1000)
GrLivArea_600_minus = Int.(house_price_data[!, :GrLivArea] .< 600)
LotArea_6000_plus = Int.(house_price_data[!, :LotArea] .> 6000)
LotArea_4000_minus = Int.(house_price_data[!, :LotArea] .< 4000)
U = hcat(
    U, GrLivArea_1000_plus, GrLivArea_600_minus, LotArea_6000_plus, LotArea_4000_minus
)
feature_names = vcat(
    feature_names,
    ["GrLivArea_1000_plus", "GrLivArea_600_minus", "LotArea_6000_plus", "LotArea_4000_minus"],
)
# n = number of rows of U, d = number of columns of U
n, d = size(U)
print(size(U))
# data split: shuffle the row indices with a fixed seed so the split is reproducible
Random.seed!(100)
rp = randperm(n)
split_ratio = 0.7
# the first `n_train` shuffled indices form the training set, the rest the test set
n_train = round(Int, split_ratio * n)
n_test = n - n_train
train_id, test_id = rp[1:n_train], rp[(n_train + 1):end]
# the trailing `:` when indexing the vector `v` yields one-column matrices
U_train, v_train = U[train_id, :], v[train_id, :]
U_test, v_test = U[test_id, :], v[test_id, :]