Linear regression is the oldest and most basic predictive modeling (or supervised learning) technique.
The Wages dataset is a simulated dataset based on a real dataset published in Data Analysis Using Regression and Multilevel/Hierarchical Models by Andrew Gelman and Jennifer Hill.
Variables include:
# Load the wages data and look at the distribution of earnings.
# The folder holding wage.csv differs by operating system, so pick it from the
# platform rather than calling setwd() twice: one of the two calls always
# fails, and setwd() changes the working directory for the whole session.
data_dir <- if (.Platform$OS.type == "windows") {
  "C:/Users/Downloads" # on Windows
} else {
  "/Users/Downloads" # on Mac
}
# Store the data as `wages`, the name the plotting and cleaning steps use.
wages <- read.csv(file.path(data_dir, "wage.csv"))

library(ggplot2)
# Histogram of earnings in bins of 5000
ggplot(data = wages, aes(x = earn)) +
  geom_histogram(binwidth = 5000, fill = "cadetblue")

# Remove negative earning
wages <- wages[wages$earn >= 0, ]
To guard against overfitting, we evaluate the model on data that was not used to build it. Here, we split the data 70-30 into train and test samples, set groups to 100 and use a seed of 1031.
# Split the wages data 70/30 into train and test samples.
set.seed(1031) # fixed seed so the partition is reproducible
library(caret)
# Partition on the outcome being modelled (wages$earn); `groups` sets how many
# quantile groups of the outcome the sampling is done within.
split <- createDataPartition(
  y = wages$earn,
  p = 0.7,
  list = FALSE,
  groups = 100
)
train <- wages[split, ]
test <- wages[-split, ]
# Boxplot of earnings in the train sample, labelled with the median
ggplot(train, aes(x = "", y = earn)) +
  geom_boxplot(
    fill = "cadetblue",
    outlier.color = "red",
    outlier.alpha = 0.5
  ) +
  geom_text(
    aes(x = "", y = median(train$earn), label = median(train$earn)),
    size = 3,
    hjust = 11
  ) +
  labs(x = "")

# Correlation between age and earnings in the train sample
with(train, cor(age, earn))

# Scatter plot of earnings against age with a fitted linear trend;
# the visible y range is limited to 0-200000
ggplot(train, aes(x = age, y = earn)) +
  geom_point() +
  geom_smooth(method = "lm", color = "steelblue3", size = 1.3) +
  coord_cartesian(ylim = c(0, 200000))
Estimate Regression Equation:
# Simple regression of earn on age, fitted on the train sample
model1 <- lm(formula = earn ~ age, data = train)

# In-sample predictions (no newdata supplied)
pred <- predict(object = model1)

# Actual vs. predicted earnings for rows 100 to 109 of the train sample
data.frame(
  earn = train[100:109, "earn"],
  prediction = pred[100:109]
)

summary(model1)
R-Squared
# Another way of acquiring R-squared: 1 - SSE / SST
sse <- sum((train$earn - pred)^2) # squared error of the model
sst <- sum((train$earn - mean(train$earn))^2) # squared error of the mean
model1_r2 <- 1 - sse / sst
model1_r2

# RMSE
rmse1 <- sqrt(mean((train$earn - pred)^2))
rmse1
Does age influence earn? Yes: the coefficient on age is positive, so older age is associated with higher earnings. The fitted equation is earn = model1$coef[1] + model1$coef[2] * age.
# Simple regression of earn on gender, fitted on the train sample
model2 <- lm(earn ~ gender, data = train)

# Inspect how gender is stored; levels() returns NULL unless it is a factor
class(train$gender)
levels(train$gender)

summary(model2)

# In-sample predictions, then R-squared and RMSE for model2
pred <- predict(model2)
sse2 <- sum((pred - train$earn)^2)
sst2 <- sum((mean(train$earn) - train$earn)^2)
# R-squared is 1 - SSE / SST; dividing sse2 by itself would always give 0
model2_r2 <- 1 - sse2 / sst2
model2_r2

rmse2 <- sqrt(mean((pred - train$earn)^2))
rmse2
# Multiple regression of earn on height, gender, race, ed and age
model <- lm(
  formula = earn ~ height + gender + race + ed + age,
  data = train
)
summary(model)

# Predict: Out of Sample
pred <- predict(model, newdata = test)
sse_test <- sum((test$earn - pred)^2)
# The baseline for the test sample is the mean of the train sample
sst_test <- sum((test$earn - mean(train$earn))^2)
model_r2_test <- 1 - sse_test / sst_test
model_r2_test
# Regression with a height-by-gender interaction. The interaction term has to
# be joined to the formula with `+`; after a comma it is taken as the `subset`
# argument of lm() instead of being part of the model.
model <- lm(earn ~ height + gender + height * gender, data = train)