# load diabetes.data
using LinearAlgebra
using PyPlot
using CSV
Diabete_Data = CSV.read(download("https://web.stanford.edu/~hastie/Papers/LARS/diabetes.data"))
n,d = size(Diabete_Data)
d = d-1
# train-test split
using Random
Random.seed!(787)
rp = randperm(n)
split_ratio = 0.6
n_train = round(Int, n*split_ratio)
n_test = n - n_train
train_id = rp[1:n_train]
test_id = rp[n_train+1:end]
X = zeros(n,d)
y = zeros(n,1)
for i=1:d
X[:,i] = Diabete_Data[:,i]
end
X = [ones(n,1) X]
y = Diabete_Data[:,end]
X_train = X[train_id,:]
y_train = y[train_id]
X_test = X[test_id,:]
y_test = y[test_id]
println((n_train,n_test))
# compute the optimal theta
theta_opt = X_train\y_train
# achieved loss
trainMSE = norm(X_train*theta_opt-y_train)^2/n_train
testMSE = norm(X_test*theta_opt-y_test)^2/n_test
println("theta_opt: ", theta_opt)
println("trainMSE: ", trainMSE)
println("testMSE: ", testMSE)
# plot
using Printf
figure()
plot(y,y,"k")
plot(y_train,X_train*theta_opt,"o",alpha=0.4, label="train set: MSE=$(@sprintf("%.0f", trainMSE))")
plot(y_test,X_test*theta_opt,"o",alpha=0.4, label="test set: MSE=$(@sprintf("%.0f", testMSE))")
xlabel(L"$y$")
ylabel(L"predicted $y$")
legend()
title("prediction accuracy")
axis("square")
grid("on")