The following code was run in support of the analysis presented in “Peer Assessment II,” but was not included there in order to limit the length of that report. These steps are preserved here as a reference for the analytical techniques used.
library(tidyverse)
library(pander)
load("/Users/cojamalo/Documents/Duke - Capstone/ames_train.Rdata")
This part of the code relies on functions written in my “Multiple Linear Regression Tools” repository on GitHub (row 146 of that code was changed to "row <- compare_fit(response, explan, data, target)[1:3,]"). First, a simple regression was fit for each explanatory variable under several candidate transformations, and the within-sample RMSE of each fit was determined (a simplified sketch of this comparison follows the first table below).
ames_fits = ames_train %>% mutate(log_price = log(price))
fits = find_best_trans(log_price, select(ames_fits, -price)) %>% arrange(RMSE)
## [1] "Processing, please wait....."
pandoc.table(head(fits))
| variable     | type        | terms | adj_R_2 | BIC    | AICc   | RMSE      |
|--------------|-------------|-------|---------|--------|--------|-----------|
| log_price    | Linear      | 1     | 1       | -65020 | -65034 | 5.225e-10 |
| log_price    | Square Root | 1     | 0.9997  | -6914  | -6929  | 1898      |
| log_price    | Cubic Root  | 1     | 0.9994  | -6329  | -6344  | 2533      |
| Overall.Qual | Linear      | 1     | 0.6855  | -32.3  | -47    | 45602     |
| Overall.Qual | Square      | 1     | 0.6665  | 26.35  | 11.65  | 46205     |
| Overall.Qual | Square Root | 1     | 0.6824  | -22.62 | -37.32 | 46844     |
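For reference, find_best_trans and compare_fit are defined in the external repository and are not reproduced here. A simplified sketch of the comparison performed for a single explanatory variable (hypothetical helper name; RMSE reported on the response scale only, and AICc omitted) is:
# Sketch only, not the actual helper: fit log_price against several
# transformations of one explanatory variable and collect in-sample fit
# statistics for each simple regression.
compare_trans_sketch <- function(data, explan, response = "log_price") {
    trans <- list(Linear = function(x) x,
                  Log = function(x) log(x),
                  `Square Root` = function(x) sqrt(x),
                  Square = function(x) x^2)
    purrr::map_dfr(names(trans), function(nm) {
        df  <- data.frame(y = data[[response]], x = trans[[nm]](data[[explan]]))
        fit <- lm(y ~ x, data = df)
        data.frame(variable = explan, type = nm,
                   adj_R_2 = summary(fit)$adj.r.squared,
                   BIC = BIC(fit),
                   RMSE = sqrt(mean(residuals(fit)^2)))
    })
}
# Example usage: compare_trans_sketch(ames_fits, "area") %>% arrange(RMSE)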
Next, any explanatory variable whose top three regression types are all non-linear is identified.
nonlin_cand = fits %>% group_by(variable) %>% filter(type != "Linear") %>% count() %>% filter(n > 2) %>% .$variable %>% droplevels %>% as.character
fits %>% .[fits$variable %in% nonlin_cand,] %>% group_by(variable) %>% top_n(1) %>% pandoc.table
| variable      | type        | terms | adj_R_2    | BIC   | AICc  | RMSE  |
|---------------|-------------|-------|------------|-------|-------|-------|
| area          | Log         | 1     | 0.5461     | 334.5 | 319.8 | 57937 |
| TotRms.AbvGrd | Log10       | 1     | 0.281      | 794.6 | 779.9 | 71285 |
| Lot.Area      | Log         | 1     | 0.1369     | 977.3 | 962.6 | 77165 |
| Lot.Area      | Log10       | 1     | 0.1369     | 977.3 | 962.6 | 77165 |
| Lot.Area      | Log2        | 1     | 0.1369     | 977.3 | 962.6 | 77165 |
| Overall.Cond  | Exp2        | 1     | 0.01251    | 1112  | 1097  | 82524 |
| Kitchen.AbvGr | Square      | 1     | 0.008471   | 1116  | 1101  | 82916 |
| BsmtFin.SF.2  | Square Root | 1     | -2.247e-05 | 1121  | 1107  | 83239 |
| Lot.Frontage  | Reciprical  | 1     | 0.0908     | 929.6 | 915.5 | 85406 |
| Mas.Vnr.Area  | Cube        | 1     | 0.07603    | 1041  | 1026  | 89984 |
| Garage.Yr.Blt | Log         | 1     | 0.3698     | 573.2 | 558.6 | 90987 |
The area variable in particular appears to be a good candidate for a log transformation.
Next, the RMSE curve and its derivative are plotted. This helps identify a good cutoff point for selecting the explanatory variables with the lowest RMSE values, which are likely to be the best predictors of log(price).
# One point per fitted model, ordered by increasing RMSE (154 fits after dropping NAs)
plot.data = data.frame(x=1:154,y=na.omit(fits$RMSE))
library(splines)
# Smooth the RMSE curve with a natural spline so its derivative is well-behaved
fit_ns <- lm(y ~ ns(x,26), plot.data); pred_ns <- predict(fit_ns, type="response")
cutoff_n = 24
RMSE = predict(fit_ns, newdata=data.frame(x=cutoff_n),type="response")
ggplot(plot.data, aes(x=x,y)) +
    geom_point() +
    xlim(1,160) +
    geom_line(aes(y = pred_ns), color = "green", lwd = 1) +
    geom_vline(xintercept=cutoff_n, color ="red") +
    geom_hline(yintercept=RMSE, color ="red") +
    geom_text(aes(6,RMSE,label = format(RMSE, digits = 3), vjust = -1), color ="red")
# Numerical derivative of the smoothed RMSE curve, evaluated at midpoints between x values
derivative.data = data.frame(x=rowMeans(embed(plot.data$x,2)), y=diff(pred_ns)/diff(plot.data$x))
ggplot(derivative.data, aes(x,y)) +
    geom_line() +
    geom_vline(xintercept=cutoff_n)
In this case, the second local minimum of the derivative was chosen as the cutoff point; beyond it, the remaining explanatory variables no longer offer appreciably low RMSE values.
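The cutoff itself was chosen by inspecting the derivative plot. As a sketch (not part of the original analysis), the local minima of the smoothed derivative can also be located programmatically:
# Local minima of the smoothed derivative: points where its slope changes
# from negative to positive. Indices refer to rows of derivative.data.
d <- derivative.data$y
local_min <- which(diff(sign(diff(d))) > 0) + 1
derivative.data[local_min, ]  # the second local minimum was used as the cutoff above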
The xgboost method first outlined at https://cojamalo.github.io/DATA-JHU-Machine-Learning-1/machine-learning.html is used to identify promising interaction terms; in this case, xgboost is run with its regression objective.
library(xgboost)
# Prep data for xgboost
train_x_num = as.matrix(ames_train %>% mutate(log_area = log(area)) %>% select(-price,-area) %>% select_if(is.numeric)) * 1.0
train_x_label = as.numeric(ames_train$price) * 1.0
train_x_matrix = xgb.DMatrix(data = train_x_num, label = train_x_label)
# Fit the model
bst <- xgboost(data = train_x_matrix,
nround = 100, # default 100
eta = 0.1, # default 0.3
max.depth = 6, # default = 6
gamma = 0, # default 0, if train error >>> test error, bring gamma into action
min_child_weight = 1, # default = 1
subsample = 1, # default = 1
colsample_bytree = 1, # default 1
objective = "reg:linear",
eval_metric = "rmse" )
## [1] train-rmse:179902.156250
## [2] train-rmse:162884.500000
## [3] train-rmse:147557.296875
## [4] train-rmse:133716.515625
## [5] train-rmse:121283.289062
## ... (iterations 6 through 95 omitted) ...
## [96] train-rmse:4736.674805
## [97] train-rmse:4695.234863
## [98] train-rmse:4683.152344
## [99] train-rmse:4645.882812
## [100] train-rmse:4627.557617
# plot the most important features
xgb.plot.importance(xgb.importance(colnames(train_x_num, do.NULL = TRUE, prefix = "col"), model = bst), top_n = 28)
# Dump the model to file for Xgbfi
# Build the feature map (index, name, type) required by Xgbfi; "q" marks a quantitative feature
featureList <- names(ames_train %>% mutate(log_area = log(area)) %>% select(-price,-area) %>% select_if(is.numeric))
featureVector <- c()
for (i in 1:length(featureList)) {
    featureVector[i] <- paste(i-1, featureList[i], "q", sep="\t")
}
fmap_path = "/Users/cojamalo/Documents/GitHub/xgbfi/bin/fmap.txt"
dump_path = "/Users/cojamalo/Documents/GitHub/xgbfi/bin/xgb.dump"
write.table(featureVector, fmap_path, row.names=FALSE, quote = FALSE, col.names = FALSE)
xgb.dump(model = bst, fname = dump_path, fmap = fmap_path, with_stats = TRUE)
## [1] TRUE
The following was then run from the local command line: cd ~/Documents/GitHub/xgbfi/bin; mono XgbFeatureInteractions.exe
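Equivalently, this step can be launched from within R, assuming mono is installed and the Xgbfi paths above are in place:
# Run Xgbfi without leaving R (assumes mono is available on the PATH)
system("cd ~/Documents/GitHub/xgbfi/bin && mono XgbFeatureInteractions.exe")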
library(xlsx)
xlsx_path = '/Users/cojamalo/Documents/GitHub/xgbfi/bin/XgbFeatureInteractions.xlsx'
# Read the per-depth interaction sheets produced by Xgbfi and tag each with its interaction order
depth0 = read.xlsx(xlsx_path, sheetIndex = 1) %>% tbl_df %>% mutate(interact_order = 1)
depth1 = read.xlsx(xlsx_path, sheetIndex = 2) %>% tbl_df %>% mutate(interact_order = 2)
depth2 = read.xlsx(xlsx_path, sheetIndex = 3) %>% tbl_df %>% mutate(interact_order = 3)
interact = bind_rows(depth0, depth1, depth2)
interact$interact_order = factor(interact$interact_order)
gains = interact %>% select(Interaction, Gain) %>% arrange(desc(Gain))
# Repeat the spline-smoothing / derivative cutoff technique on the interaction Gain curve
plot.data = data.frame(x=1:nrow(gains),y=gains$Gain)
library(splines)
fit_ns <- lm(y ~ ns(x,40), plot.data); pred_ns <- predict(fit_ns, type="response")
cutoff_n = 23
RMSE = predict(fit_ns, newdata=data.frame(x=cutoff_n),type="response")
ggplot(plot.data, aes(x=x,y)) +
    geom_point() +
    xlim(1,nrow(gains)) +
    geom_line(aes(y = pred_ns), color = "green", lwd = 1) +
    geom_vline(xintercept=cutoff_n, color ="red") +
    geom_hline(yintercept=RMSE, color ="red") +
    geom_text(aes(6,RMSE,label = format(RMSE, digits = 3), vjust = -1), color ="red")
derivative.data = data.frame(x=rowMeans(embed(plot.data$x,2)), y=diff(pred_ns)/diff(plot.data$x))
ggplot(derivative.data, aes(x,y)) +
    geom_line() +
    geom_vline(xintercept=cutoff_n)
A cutoff for information gain was selected using the derivative technique mentioned above.
ggplot(interact %>% filter(Gain >= 2.28e+12) %>% mutate(Interaction = reorder(Interaction, Gain)),
       aes(y=Gain, x=Interaction)) +
    geom_bar(stat = "identity") +
    theme(axis.text.x = element_text(angle = 60, hjust = 1))
The top interaction terms were then plotted.
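For reference, an interaction identified this way would then enter a candidate linear model along the following lines. The variable pair shown is illustrative only and is not taken from the results above.
# Illustrative only: add a hypothetical two-way interaction to a simple
# linear model for log(price) and check the adjusted R-squared.
fit_int <- lm(log(price) ~ log(area) * Overall.Qual, data = ames_train)
summary(fit_int)$adj.r.squared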