The following code was run in support of the analysis presented in “Peer Assessment II,” but was not included there in order to limit the length of that report. These steps are saved here as a reference for the analytical techniques used.

library(tidyverse)
library(pander)
load("/Users/cojamalo/Documents/Duke - Capstone/ames_train.Rdata")

Finding the Best Fits

This part of the code relies on functions from my “Multiple Linear Regression Tools” repository on GitHub (line 146 of that code was changed to “row <- compare_fit(response, explan, data, target)[1:3,]” so that the top three fits per variable are kept). First, fits of each regression type for each explanatory variable were calculated and the within-sample RMSE of each determined.
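The repository functions themselves are not reproduced here. As a rough illustration only, the core idea behind a compare_fit-style helper is sketched below (compare_trans_rmse is a hypothetical name covering just a few transformations; the actual repository functions also report adjusted R-squared, BIC, and AICc, as in the tables that follow):

# Hypothetical sketch of a compare_fit-style helper: fit a simple linear
# model for several transformations of one numeric explanatory variable
# and report the in-sample RMSE of each fit
compare_trans_rmse <- function(response, explan, data) {
    trans <- list(Linear        = function(x) x,
                  Log           = function(x) log(x),
                  `Square Root` = function(x) sqrt(x),
                  Square        = function(x) x^2)
    sapply(trans, function(f) {
        fit <- lm(data[[response]] ~ f(data[[explan]]))
        sqrt(mean(residuals(fit)^2))  # in-sample RMSE
    })
}
# e.g. compare_trans_rmse("log_price", "area", ames_fits)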

# Fit each regression type for every explanatory variable against log_price
# and sort the results by within-sample RMSE
ames_fits = ames_train %>% mutate(log_price = log(price))
fits = find_best_trans(log_price, select(ames_fits, -price)) %>% arrange(RMSE)

[1] "Processing, please wait....."

pandoc.table(head(fits))
  variable        type       terms   adj_R_2    BIC      AICc      RMSE
------------- ------------- ------- --------- -------- -------- -----------
 log_price     Linear          1     1          -65020   -65034   5.225e-10
 log_price     Square Root     1     0.9997     -6914    -6929    1898
 log_price     Cubic Root      1     0.9994     -6329    -6344    2533
 Overall.Qual  Linear          1     0.6855     -32.3    -47      45602
 Overall.Qual  Square          1     0.6665     26.35    11.65    46205
 Overall.Qual  Square Root     1     0.6824     -22.62   -37.32   46844


(The log_price rows simply fit the response to itself, so they are ignored.) Then, any explanatory variable for which none of its top three regression types is linear is identified.

# Variables whose top three fit types are all non-linear
nonlin_cand = fits %>%
    group_by(variable) %>%
    filter(type != "Linear") %>%
    count() %>%
    filter(n > 2) %>%
    .$variable %>%
    droplevels %>%
    as.character
# One representative fit per candidate (top_n ranks by the last column, RMSE)
fits %>%
    .[fits$variable %in% nonlin_cand,] %>%
    group_by(variable) %>%
    top_n(1) %>%
    pandoc.table
   variable         type       terms   adj_R_2      BIC     AICc    RMSE
--------------- -------------- ------- ------------ ------- ------- -------
 area            Log              1     0.5461       334.5   319.8   57937
 TotRms.AbvGrd   Log10            1     0.281        794.6   779.9   71285
 Lot.Area        Log              1     0.1369       977.3   962.6   77165
 Lot.Area        Log10            1     0.1369       977.3   962.6   77165
 Lot.Area        Log2             1     0.1369       977.3   962.6   77165
 Overall.Cond    Exp2             1     0.01251      1112    1097    82524
 Kitchen.AbvGr   Square           1     0.008471     1116    1101    82916
 BsmtFin.SF.2    Square Root      1     -2.247e-05   1121    1107    83239
 Lot.Frontage    Reciprical       1     0.0908       929.6   915.5   85406
 Mas.Vnr.Area    Cube             1     0.07603      1041    1026    89984
 Garage.Yr.Blt   Log              1     0.3698       573.2   558.6   90987

The area variable in particular appears to be a good candidate for a log transformation.
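As a quick sanity check of that candidate, the single-predictor fits can be compared directly (a sketch using base lm; note that this in-sample RMSE is on the log_price scale, which may differ from the scale reported by the repository functions):

# Sketch: compare a fit of log_price on area with and without the log
# transformation (in-sample RMSE on the log_price scale)
fit_lin <- lm(log_price ~ area, data = ames_fits)
fit_log <- lm(log_price ~ log(area), data = ames_fits)
c(linear = sqrt(mean(residuals(fit_lin)^2)),
  log    = sqrt(mean(residuals(fit_log)^2)))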


Then, the RMSE curve is plotted along with its derivative. This helps to identify a good cutoff point for selecting explanatory variables with low RMSE, since such variables are likely to be better predictors of log(price).

# Smooth the sorted RMSE values with a natural spline
plot.data = data.frame(x=1:154, y=na.omit(fits$RMSE))
library(splines)
fit_ns <- lm(y ~ ns(x, 26), plot.data)
pred_ns <- predict(fit_ns, type="response")

# Candidate cutoff rank and the smoothed RMSE at that rank
cutoff_n = 24
RMSE = predict(fit_ns, newdata=data.frame(x=cutoff_n), type="response")
ggplot(plot.data, aes(x=x,y)) +
    geom_point() + 
    xlim(1,160) +
    geom_line(aes(y = pred_ns), color = "green", lwd = 1) +
    geom_vline(xintercept=cutoff_n, color ="red") +
    geom_hline(yintercept=RMSE, color ="red") +
    geom_text(aes(6,RMSE,label = format(RMSE, digits = 3), vjust = -1), color ="red")

# Numerically differentiate the smoothed curve at the interval midpoints
derivative.data = data.frame(x=rowMeans(embed(plot.data$x,2)), y=diff(pred_ns)/diff(plot.data$x))

ggplot(derivative.data, aes(x,y)) +
    geom_line() +
    geom_vline(xintercept=cutoff_n)

In this case, the second local minimum of the derivative was chosen as a good cutoff point. After this point, the remaining explanatory variables no longer have meaningfully low RMSE values.
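The local minima were read off the derivative plot here, but they can also be located programmatically with a sign-change test on the smoothed derivative (a minimal sketch using the derivative.data computed above):

# Sketch: locate local minima of the smoothed derivative. A slope sign
# change from negative to positive marks a local minimum.
d <- derivative.data$y
minima <- which(diff(sign(diff(d))) == 2) + 1
derivative.data$x[minima]  # candidate cutoffs; the second corresponds to cutoff_n = 24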


Identifying Interaction Terms

The xgboost method first outlined at https://cojamalo.github.io/DATA-JHU-Machine-Learning-1/machine-learning.html is used to identify promising interaction terms; in this case, the regression form of the algorithm is used.

library(xgboost)
# Prep data for xgboost
train_x_num = as.matrix(ames_train %>%
                            mutate(log_area = log(area)) %>%
                            select(-price, -area) %>%
                            select_if(is.numeric)) * 1.0
train_x_label = as.numeric(ames_train$price) * 1.0
train_x_matrix = xgb.DMatrix(data = train_x_num, label = train_x_label)
# Fit the model
bst <- xgboost(data = train_x_matrix,
               nround = 100, # default 100
               eta = 0.1, # default 0.3
               max.depth = 6, # default = 6 
               gamma = 0, # default 0, if train error >>> test error, bring gamma into action
               min_child_weight = 1, # default = 1
               subsample = 1, # default = 1
               colsample_bytree = 1, # default 1
               objective = "reg:linear",
               eval_metric = "rmse" )
## [1]   train-rmse:179902.156250 
## [2]   train-rmse:162884.500000 
## [3]   train-rmse:147557.296875 
## [4]   train-rmse:133716.515625 
## [5]   train-rmse:121283.289062 
## ...
## [98]  train-rmse:4683.152344 
## [99]  train-rmse:4645.882812 
## [100] train-rmse:4627.557617


# plot the most important features
xgb.plot.importance(xgb.importance(colnames(train_x_num), model = bst), top_n = 28)


# Dump the model to file for Xgbfi
featureList <- names(ames_train %>% mutate(log_area = log(area)) %>% select(-price, -area) %>% select_if(is.numeric))
# Build the feature map in Xgbfi's format: zero-based index, feature name,
# and feature type ("q" for quantitative)
featureVector <- c() 
for (i in 1:length(featureList)) { 
  featureVector[i] <- paste(i-1, featureList[i], "q", sep="\t") 
}
fmap_path = "/Users/cojamalo/Documents/GitHub/xgbfi/bin/fmap.txt"
dump_path = "/Users/cojamalo/Documents/GitHub/xgbfi/bin/xgb.dump"
write.table(featureVector, fmap_path, row.names=FALSE, quote = FALSE, col.names = FALSE)
xgb.dump(model = bst, fname = dump_path, fmap = fmap_path, with_stats = TRUE)
## [1] TRUE


The Xgbfi executable was then run from the local command line: cd ~/Documents/GitHub/xgbfi/bin; mono XgbFeatureInteractions.exe
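The same step can also be launched from within R via system(), assuming mono and the Xgbfi binaries are present at that path:

# Sketch: invoke Xgbfi from R rather than a separate terminal
system("cd ~/Documents/GitHub/xgbfi/bin && mono XgbFeatureInteractions.exe")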


library(xlsx)
xlsx_path = '/Users/cojamalo/Documents/GitHub/xgbfi/bin/XgbFeatureInteractions.xlsx'
# Xgbfi writes one sheet per interaction depth: single features, then
# two-way and three-way interactions
depth0 = read.xlsx(xlsx_path, sheetIndex = 1) %>% tbl_df %>% mutate(interact_order = 1)
depth1 = read.xlsx(xlsx_path, sheetIndex = 2) %>% tbl_df %>% mutate(interact_order = 2)
depth2 = read.xlsx(xlsx_path, sheetIndex = 3) %>% tbl_df %>% mutate(interact_order = 3)


interact = bind_rows(depth0, depth1, depth2)
interact$interact_order = factor(interact$interact_order)


gains = interact %>% select(Interaction, Gain) %>% arrange(desc(Gain))

# Repeat the spline-smoothing cutoff technique, this time on the sorted
# interaction gains rather than RMSE values
plot.data = data.frame(x=1:nrow(gains), y=gains$Gain)
library(splines)
fit_ns <- lm(y ~ ns(x, 40), plot.data)
pred_ns <- predict(fit_ns, type="response")

cutoff_n = 23
gain_cut = predict(fit_ns, newdata=data.frame(x=cutoff_n), type="response")
ggplot(plot.data, aes(x=x, y)) +
    geom_point() + 
    xlim(1, nrow(gains)) +
    geom_line(aes(y = pred_ns), color = "green", lwd = 1) +
    geom_vline(xintercept=cutoff_n, color ="red") +
    geom_hline(yintercept=gain_cut, color ="red") +
    geom_text(aes(6, gain_cut, label = format(gain_cut, digits = 3), vjust = -1), color ="red")

derivative.data = data.frame(x=rowMeans(embed(plot.data$x,2)), y=diff(pred_ns)/diff(plot.data$x))

ggplot(derivative.data, aes(x,y)) +
    geom_line() +
    geom_vline(xintercept=cutoff_n)

A cutoff for information gain was selected using the derivative technique mentioned above.


ggplot(interact %>% filter(Gain >= 2.28e+12) %>% mutate(Interaction = reorder(Interaction, Gain)),
       aes(y=Gain, x=Interaction)) +
    geom_bar(stat = "identity") +
    theme(axis.text.x = element_text(angle = 60, hjust = 1))

The top interaction terms were then plotted.