The following code was run in support of the analysis presented in “Peer Assessment II,” but was not included there in order to limit the length of that report. These steps are preserved here as a reference for the analytical techniques used.
library(tidyverse)
library(pander)
load("/Users/cojamalo/Documents/Duke - Capstone/ames_train.Rdata")
This part of the code relies on functions written in my “Multiple Linear Regression Tools” repository on GitHub (row 146 of that code was changed to "row <- compare_fit(response, explan, data, target)[1:3,]"). First, a simple regression was fit for each explanatory variable under several candidate transformations, and the within-sample RMSE of each fit was determined (a simplified sketch of this comparison follows the first table below).
ames_fits = ames_train %>% mutate(log_price = log(price))
fits = find_best_trans(log_price, select(ames_fits, -price)) %>% arrange(RMSE)
## [1] "Processing, please wait....."
pandoc.table(head(fits))
| variable     | type        | terms | adj_R_2 | BIC    | AICc   | RMSE      |
|--------------|-------------|-------|---------|--------|--------|-----------|
| log_price    | Linear      | 1     | 1       | -65020 | -65034 | 5.225e-10 |
| log_price    | Square Root | 1     | 0.9997  | -6914  | -6929  | 1898      |
| log_price    | Cubic Root  | 1     | 0.9994  | -6329  | -6344  | 2533      |
| Overall.Qual | Linear      | 1     | 0.6855  | -32.3  | -47    | 45602     |
| Overall.Qual | Square      | 1     | 0.6665  | 26.35  | 11.65  | 46205     |
| Overall.Qual | Square Root | 1     | 0.6824  | -22.62 | -37.32 | 46844     |
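For reference, find_best_trans and compare_fit are defined in the external repository and are not reproduced here. A simplified sketch of the comparison performed for a single explanatory variable (hypothetical helper name; RMSE reported on the response scale only, and AICc omitted) is:
# Sketch only, not the actual helper: fit log_price against several
# transformations of one explanatory variable and collect in-sample fit
# statistics for each simple regression.
compare_trans_sketch <- function(data, explan, response = "log_price") {
    trans <- list(Linear = function(x) x,
                  Log = function(x) log(x),
                  `Square Root` = function(x) sqrt(x),
                  Square = function(x) x^2)
    purrr::map_dfr(names(trans), function(nm) {
        df  <- data.frame(y = data[[response]], x = trans[[nm]](data[[explan]]))
        fit <- lm(y ~ x, data = df)
        data.frame(variable = explan, type = nm,
                   adj_R_2 = summary(fit)$adj.r.squared,
                   BIC = BIC(fit),
                   RMSE = sqrt(mean(residuals(fit)^2)))
    })
}
# Example usage: compare_trans_sketch(ames_fits, "area") %>% arrange(RMSE)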
Next, any explanatory variable whose top three regression types are all non-linear is identified.
nonlin_cand = fits %>% group_by(variable) %>% filter(type != "Linear") %>% count() %>% filter(n > 2) %>% .$variable %>% droplevels %>% as.character
fits %>% .[fits$variable %in% nonlin_cand,] %>% group_by(variable) %>% top_n(1) %>% pandoc.table
| variable      | type        | terms | adj_R_2    | BIC   | AICc  | RMSE  |
|---------------|-------------|-------|------------|-------|-------|-------|
| area          | Log         | 1     | 0.5461     | 334.5 | 319.8 | 57937 |
| TotRms.AbvGrd | Log10       | 1     | 0.281      | 794.6 | 779.9 | 71285 |
| Lot.Area      | Log         | 1     | 0.1369     | 977.3 | 962.6 | 77165 |
| Lot.Area      | Log10       | 1     | 0.1369     | 977.3 | 962.6 | 77165 |
| Lot.Area      | Log2        | 1     | 0.1369     | 977.3 | 962.6 | 77165 |
| Overall.Cond  | Exp2        | 1     | 0.01251    | 1112  | 1097  | 82524 |
| Kitchen.AbvGr | Square      | 1     | 0.008471   | 1116  | 1101  | 82916 |
| BsmtFin.SF.2  | Square Root | 1     | -2.247e-05 | 1121  | 1107  | 83239 |
| Lot.Frontage  | Reciprical  | 1     | 0.0908     | 929.6 | 915.5 | 85406 |
| Mas.Vnr.Area  | Cube        | 1     | 0.07603    | 1041  | 1026  | 89984 |
| Garage.Yr.Blt | Log         | 1     | 0.3698     | 573.2 | 558.6 | 90987 |
The area variable in particular appears to be a good candidate for a log transformation.
Next, the RMSE curve and its derivative are plotted. This helps identify a good cutoff point for selecting the explanatory variables with the lowest RMSE values, which are likely to be the best predictors of log(price).
# One point per fitted model, ordered by increasing RMSE (154 fits after dropping NAs)
plot.data = data.frame(x=1:154,y=na.omit(fits$RMSE))
library(splines)
# Smooth the RMSE curve with a natural spline so its derivative is well-behaved
fit_ns <- lm(y ~ ns(x,26), plot.data); pred_ns <- predict(fit_ns, type="response")
cutoff_n = 24
RMSE = predict(fit_ns, newdata=data.frame(x=cutoff_n),type="response")
ggplot(plot.data, aes(x=x,y)) +
    geom_point() +
    xlim(1,160) +
    geom_line(aes(y = pred_ns), color = "green", lwd = 1) +
    geom_vline(xintercept=cutoff_n, color ="red") +
    geom_hline(yintercept=RMSE, color ="red") +
    geom_text(aes(6,RMSE,label = format(RMSE, digits = 3), vjust = -1), color ="red")
# Numerical derivative of the smoothed RMSE curve, evaluated at midpoints between x values
derivative.data = data.frame(x=rowMeans(embed(plot.data$x,2)), y=diff(pred_ns)/diff(plot.data$x))
ggplot(derivative.data, aes(x,y)) +
    geom_line() +
    geom_vline(xintercept=cutoff_n)
In this case, the second local minimum of the derivative was chosen as the cutoff point; beyond it, the remaining explanatory variables no longer offer appreciably low RMSE values.
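The cutoff itself was chosen by inspecting the derivative plot. As a sketch (not part of the original analysis), the local minima of the smoothed derivative can also be located programmatically:
# Local minima of the smoothed derivative: points where its slope changes
# from negative to positive. Indices refer to rows of derivative.data.
d <- derivative.data$y
local_min <- which(diff(sign(diff(d))) > 0) + 1
derivative.data[local_min, ]  # the second local minimum was used as the cutoff above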
The xgboost method first outlined at https://cojamalo.github.io/DATA-JHU-Machine-Learning-1/machine-learning.html is used to identify promising interaction terms; in this case, xgboost is run with its regression objective.
library(xgboost)
# Prep data for xgboost
train_x_num = as.matrix(ames_train %>% mutate(log_area = log(area)) %>% select(-price,-area) %>% select_if(is.numeric)) * 1.0
train_x_label = as.numeric(ames_train$price) * 1.0
train_x_matrix = xgb.DMatrix(data = train_x_num, label = train_x_label)
# Fit the model
bst <- xgboost(data = train_x_matrix,
nround = 100, # default 100
eta = 0.1, # default 0.3
max.depth = 6, # default = 6
gamma = 0, # default 0, if train error >>> test error, bring gamma into action
min_child_weight = 1, # default = 1
subsample = 1, # default = 1
colsample_bytree = 1, # default 1
objective = "reg:linear",
eval_metric = "rmse" )
## [1] train-rmse:179902.156250
## [2] train-rmse:162884.500000
## [3] train-rmse:147557.296875
## [4] train-rmse:133716.515625
## [5] train-rmse:121283.289062
## ... (iterations 6 through 95 omitted) ...
## [96] train-rmse:4736.674805
## [97] train-rmse:4695.234863
## [98] train-rmse:4683.152344
## [99] train-rmse:4645.882812
## [100] train-rmse:4627.557617
# plot the most important features
xgb.plot.importance(xgb.importance(colnames(train_x_num, do.NULL = TRUE, prefix = "col"), model = bst), top_n = 28)
# Dump the model to file for Xgbfi
# Build the feature map (index, name, type) required by Xgbfi; "q" marks a quantitative feature
featureList <- names(ames_train %>% mutate(log_area = log(area)) %>% select(-price,-area) %>% select_if(is.numeric))
featureVector <- c()
for (i in 1:length(featureList)) {
    featureVector[i] <- paste(i-1, featureList[i], "q", sep="\t")
}
fmap_path = "/Users/cojamalo/Documents/GitHub/xgbfi/bin/fmap.txt"
dump_path = "/Users/cojamalo/Documents/GitHub/xgbfi/bin/xgb.dump"
write.table(featureVector, fmap_path, row.names=FALSE, quote = FALSE, col.names = FALSE)
xgb.dump(model = bst, fname = dump_path, fmap = fmap_path, with_stats = TRUE)
## [1] TRUE
The following was then run from the local command line: cd ~/Documents/GitHub/xgbfi/bin; mono XgbFeatureInteractions.exe
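Equivalently, this step can be launched from within R, assuming mono is installed and the Xgbfi paths above are in place:
# Run Xgbfi without leaving R (assumes mono is available on the PATH)
system("cd ~/Documents/GitHub/xgbfi/bin && mono XgbFeatureInteractions.exe")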
library(xlsx)
xlsx_path = '/Users/cojamalo/Documents/GitHub/xgbfi/bin/XgbFeatureInteractions.xlsx'
# Read the per-depth interaction sheets produced by Xgbfi and tag each with its interaction order
depth0 = read.xlsx(xlsx_path, sheetIndex = 1) %>% tbl_df %>% mutate(interact_order = 1)
depth1 = read.xlsx(xlsx_path, sheetIndex = 2) %>% tbl_df %>% mutate(interact_order = 2)
depth2 = read.xlsx(xlsx_path, sheetIndex = 3) %>% tbl_df %>% mutate(interact_order = 3)
interact = bind_rows(depth0, depth1, depth2)
interact$interact_order = factor(interact$interact_order)
gains = interact %>% select(Interaction, Gain) %>% arrange(desc(Gain))
# Repeat the spline-smoothing / derivative cutoff technique on the interaction Gain curve
plot.data = data.frame(x=1:nrow(gains),y=gains$Gain)
library(splines)
fit_ns <- lm(y ~ ns(x,40), plot.data); pred_ns <- predict(fit_ns, type="response")
cutoff_n = 23
RMSE = predict(fit_ns, newdata=data.frame(x=cutoff_n),type="response")
ggplot(plot.data, aes(x=x,y)) +
    geom_point() +
    xlim(1,nrow(gains)) +
    geom_line(aes(y = pred_ns), color = "green", lwd = 1) +
    geom_vline(xintercept=cutoff_n, color ="red") +
    geom_hline(yintercept=RMSE, color ="red") +
    geom_text(aes(6,RMSE,label = format(RMSE, digits = 3), vjust = -1), color ="red")
derivative.data = data.frame(x=rowMeans(embed(plot.data$x,2)), y=diff(pred_ns)/diff(plot.data$x))
ggplot(derivative.data, aes(x,y)) +
    geom_line() +
    geom_vline(xintercept=cutoff_n)
A cutoff for information gain was selected using the derivative technique mentioned above.
ggplot(interact %>% filter(Gain >= 2.28e+12) %>% mutate(Interaction = reorder(Interaction, Gain)),
       aes(y=Gain, x=Interaction)) +
    geom_bar(stat = "identity") +
    theme(axis.text.x = element_text(angle = 60, hjust = 1))
The top interaction terms were then plotted.
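For reference, an interaction identified this way would then enter a candidate linear model along the following lines. The variable pair shown is illustrative only and is not taken from the results above.
# Illustrative only: add a hypothetical two-way interaction to a simple
# linear model for log(price) and check the adjusted R-squared.
fit_int <- lm(log(price) ~ log(area) * Overall.Qual, data = ames_train)
summary(fit_int)$adj.r.squared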