Trees

Tree classifiers in R

# Packages used throughout this post
library(data.table)
library(tree)
library(readr)
library(dplyr)

# Read the semicolon-delimited Uber/Lyft rides dataset
NY <- read_delim("C:/Users/D/Desktop/NY.csv", ";", escape_double = FALSE, trim_ws = TRUE)
attach(NY)

# Derive two factors: a high/low price indicator and the cab type
NY$hp <- as.factor(ifelse(NY$price <= 13.52, "Low", "High"))
NY$ub <- as.factor(ifelse(NY$cab_type == 'Uber', "Uber", "Lyft"))
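Before growing any trees, it is worth a quick look at how the two derived factors are distributed; a small sketch (the 13.52 cutoff is simply the price threshold used above):

table(NY$ub)   # class balance between Uber and Lyft rides
table(NY$hp)   # rides below/above the price threshold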

The first tree for the Uber and Lyft dataset

tree.di <- tree(ub ~ price+distance, data=NY)
summary(tree.di)
plot(tree.di)
text(tree.di, cex=.75)
title(main = "Unpruned Classification Tree")

Here’s the plot for our basic tree: [plot of the unpruned classification tree]

Plotting the partitions of the trees

price.bi <- quantile(NY$price, 0:4/4)
cut.prices <- cut(NY$price, price.bi, include.lowest=TRUE)

# Partition plot for the basic tree
plot(NY$price, NY$distance, col=hcl(10:2/11)[cut.prices], pch=15,
     xlab="Price", ylab="Distance", main='Partition tree: Uber vs Lyft', cex=.75)
partition.tree(tree.di, ordvars=c("price","distance"), add=TRUE, col='blue', cex=.7)

# Partition plot for the stylized tree (tree.2 is fitted in the next section)
plot(NY$price, NY$distance, col=hcl(10:2/11)[cut.prices], pch=15,
     xlab="Price", ylab="Distance", main='Partition best tree', cex=.75)
partition.tree(tree.2, ordvars=c("price","distance"), add=TRUE, col='blue', cex=.7)

# Partition plot for the pruned tree (pruned.tree is fitted further below)
plot(NY$price, NY$distance, col=hcl(10:2/11)[cut.prices], pch=15,
     xlab="Price", ylab="Distance", main='Partition pruned tree', cex=.75)
partition.tree(pruned.tree, ordvars=c("price","distance"), add=TRUE, col='blue', cex=.7)

Here’s the partition plot for our basic tree: [partition plot: Uber vs Lyft]

A more stylized tree, with mindev = 0.001

# mindev = 0.001: a node is split as long as its deviance is at least 0.1% of the
# root deviance, so this lower threshold grows a much larger tree
tree.2 <- tree(ub ~ price+distance, data=NY, mindev=0.001)
summary(tree.2)
plot(tree.2)
text(tree.2, cex=.6)
title(main = "Unpruned Classification Tree")

Here’s the plot for our more stylized tree: [plot of the larger, unpruned classification tree]

# Partition plot for the pruned tree (pruned.tree is fitted further below)
plot(NY$price, NY$distance, col=hcl(10:2/11)[cut.prices], pch=15,
     xlab="Price", ylab="Distance", main='Pruned partition tree: Uber vs Lyft',
     cex=.75)
partition.tree(pruned.tree, ordvars=c("price","distance"), add=TRUE)

Here’s the partition plot for our stylized tree: [partition plot with the finer splits]

Making the predictions

# 70/30 train/test split
set.seed(45)
intrain <- sample(1:nrow(NY), 0.7*nrow(NY))
trainny <- NY[intrain,]
testny  <- NY[-intrain,]
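A quick check that the random split keeps the class balance roughly the same in both sets (a small sketch; nothing later depends on it):

nrow(trainny); nrow(testny)      # sizes of the two sets
prop.table(table(trainny$ub))    # class proportions in the training set
prop.table(table(testny$ub))     # class proportions in the test set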

predt1 <- predict(tree.di, testny)
predt <- predict(tree.2, testny) # gives the probability for each class
head(predt)

Point prediction

Let’s translate the probability output into categorical (class) predictions.

# Return the index of the largest element (ties would return more than one index)
maxidx <- function(arr) {
    return(which(arr == max(arr)))
}

# Columns follow the factor levels, i.e. "Lyft" then "Uber"
idx <- apply(predt1, 1, maxidx)
prediction <- c('Lyft', 'Uber')[idx]
table(prediction, testny$cab_type)
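An equivalent, more compact way to get the same point predictions is base R’s max.col(), which returns the column index of each row’s maximum; a sketch (prediction2 is just an illustrative name):

# Pick the class with the highest predicted probability for each row
prediction2 <- colnames(predt1)[max.col(predt1, ties.method = "first")]
table(prediction2, testny$cab_type)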

Another way to show the data is to colour every observation and overlay the partition; however, with this many points it takes a long time to render.

plot(NY$price, NY$distance, pch=19, col=as.numeric(NY$ub))
partition.tree(tree.di, label="cab_type", add=TRUE)
legend("topleft", legend=levels(NY$ub),
       col=1:length(levels(NY$ub)), pch=19)

Pruned tree

# Prune the large tree back to 15 terminal nodes
pruned.tree <- prune.tree(tree.2, best=15)
plot(pruned.tree)
text(pruned.tree, cex=.6)
title(main='Pruned best tree')
pruned.pre <- predict(pruned.tree, testny, type="class")
table(pruned.pre, testny$ub)
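The overall accuracy can be read straight off the diagonal of this confusion matrix; a small sketch (conf.mat is just an illustrative name):

# Correct predictions sit on the diagonal of the confusion matrix
conf.mat <- table(pruned.pre, testny$ub)
sum(diag(conf.mat)) / sum(conf.mat)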

Here’s the plot for our pruned tree: [plot of the pruned classification tree]

Here’s the partition plot for our pruned tree: [partition plot of the pruned tree]

The tree package can also do K-fold cross-validation, via cv.tree(), to find the best tree. Here, let’s use all the variables and all the samples.

# 10-fold cross-validation (the default) over the cost-complexity pruning sequence
cv.model <- cv.tree(tree.2)
plot(cv.model)
title(main='CV best tree')

cv.model$dev
# Tree size(s) with the lowest cross-validated deviance
best.size <- cv.model$size[which(cv.model$dev == min(cv.model$dev))]
best.size
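The next step hard-codes best = 35; an alternative is to feed the CV-selected size straight back into the pruning call. A sketch (cv.best is an illustrative name, and max() simply guards against ties in the minimum deviance):

# Prune to the size chosen by the deviance-based cross-validation above
cv.best <- prune.tree(tree.2, best = max(best.size))
summary(cv.best)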

# Prune by misclassification rate, keeping 35 terminal nodes
cv.pruned <- prune.misclass(tree.2, best=35)
summary(cv.pruned)

predtcv <- predict(cv.pruned, testny)
head(predtcv)

idx <- apply(predtcv, c(1), maxidx)
prediction <- c('Lyft', 'Uber')[idx]
table(prediction, testny$cab_type)

Predictions and accuracy

trpred = predict(tree.2, trainny, type = "class")
tspred = predict(tree.2, testny, type = "class")

table(predicted = trpred, actual = trainny$ub)
table(predicted = tspred, actual = testny$ub)

accuracy = function(actual, predicted) {
    mean(actual == predicted)
}

Train accuracy

accuracy(predicted = trpred, actual = trainny$ub)

Test accuracy

accuracy(predicted = tspred, actual = testny$ub)

Accuracy results

                Train      Test
Stylized tree   0.7860975  0.7865336
Pruned tree     0.8778819  0.8764496

It is easy to see that the tree has been over-fit, even though the test set performs slightly better than the train set.

Cross Validation

We will now use cross-validation to find a tree by considering trees of different sizes which have been pruned from our original tree.

set.seed(45)
# Cross-validate with misclassification rate as the pruning criterion
NYtree_cv = cv.tree(tree.2, FUN = prune.misclass)
plot(NYtree_cv)

Size of the tree with the minimum CV error

best.size2 <- NYtree_cv$size[which(NYtree_cv$dev==min(NYtree_cv$dev))] 
best.size2

Misclassification rate of each tree

# dev here is the total number of CV misclassifications, so divide by the number
# of observations tree.2 was fitted on (the full dataset) to get a rate
NYtree_cv$dev/nrow(NY)
plot(NYtree_cv$size, NYtree_cv$dev/nrow(NY), type = "b",
     xlab = "Tree Size", ylab = "CV Misclassification Rate")

Pruned tree

Train dataset

NYprune_trn = predict(pruned.tree, trainny, type = "class")
table(predicted = NYprune_trn, actual = trainny$ub)
accuracy(predicted = NYprune_trn, actual = trainny$ub)

Test dataset

NYprune_tst = predict(pruned.tree, testny, type = "class")
table(predicted = NYprune_tst, actual = testny$ub)
accuracy(predicted = NYprune_tst, actual = testny$ub)
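Putting the numbers together, here is a small sketch that builds the accuracy table shown earlier from the prediction objects defined above (acc_table is just an illustrative name):

# Accuracy of the stylized and pruned trees on the train and test sets
acc_table <- data.frame(
  Train = c(accuracy(predicted = trpred,      actual = trainny$ub),
            accuracy(predicted = NYprune_trn, actual = trainny$ub)),
  Test  = c(accuracy(predicted = tspred,      actual = testny$ub),
            accuracy(predicted = NYprune_tst, actual = testny$ub)),
  row.names = c("Stylized tree", "Pruned tree")
)
acc_table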

The train set performs almost as well as before, and there is a small improvement on the test set, but it is still clear that we have over-fit.

Trees tend to do this. We will look at several ways to fix it, including bagging, boosting, and random forests, in the next posts.

This is the end of Part 3.
