Feature Importance on MNIST with feature counts

Select a subset of 3’s and 5’s from MNIST

Here we train a forest using RerF with the RandMatImagePatch option with patch.min = 1 and patch.max = 5.

## Get a random subsample, 100 each of 3's and 5's
threes <- which(mnist$Ytrain %in% 3)
fives  <- which(mnist$Ytrain %in% 5)
numsub <- c(threes, fives)

Ytrain <- mnist$Ytrain[numsub]
Xtrain <- mnist$Xtrain[numsub,]
Ytest <- mnist$Ytest[mnist$Ytest %in% c(3,5)]
Xtest <- mnist$Xtest[mnist$Ytest %in% c(3,5),]
# p is number of dimensions, d is the number of random features to evaluate, iw is image width, ih is image height, patch.min is min width of square patch to sample pixels from, and patch.max is the max width of square patch
p <- ncol(Xtrain)
d <- ceiling(sqrt(p))
iw <- sqrt(p)
ih <- iw
patch.min <- 1L
patch.max <- 5L

forestS <- RerF(Xtrain, Ytrain, num.cores = numCores, FUN = RandMatImagePatch,
               paramList = list(p = p, d = d, iw = iw, ih = ih,
                                pwMin = patch.min, pwMax = patch.max), max.depth = 8)

predS <- Predict(Xtest, forestS, num.cores = numCores)
(mnist.error.rate <- mean(predS != Ytest))
## [1] 0.01892744
forestRerF <- RerF(Xtrain, Ytrain, num.cores = numCores, FUN = RandMatBinary,
               paramList = list(p = p, d = d), max.depth = 8)


predRerF <- Predict(Xtest, forestRerF, num.cores = numCores)
(mnist.error.rate <- mean(predRerF != Ytest))
## [1] 0.02208202
forestRF <- RerF(Xtrain, Ytrain, num.cores = numCores, FUN = RandMatRF,
               paramList = list(p = p, d = d), max.depth = 8)


predRF <- Predict(Xtest, forestRF, num.cores = numCores)
(mnist.error.rate <- mean(predRF != Ytest))
## [1] 0.02050473
system.time({
FeatImpS <- FeatureImportance(forestS, num.cores = numCores, type = "C")
})
## Message: Computing feature importance as counts of unique feature combinations.

##     user   system  elapsed
## 1229.062    6.560  314.241
ni <- matrix(0, length(FeatImpS$features), ncol(Xtrain))

for(i in 1:length(FeatImpS$features)){
  ni[i, FeatImpS$features[[i]]] <- FeatImpS$imp[i]
}

NN <- matrix(apply(ni, 2, sum) / nrow(ni), 28, 28, byrow = FALSE)
saveRDS(NN, file = "NNsrerf.rds")

system.time({
FeatImpRerF <- FeatureImportance(forestRerF, num.cores = numCores, type = "C")
})
## Message: Computing feature importance as counts of unique feature combinations.

##     user   system  elapsed
## 1750.585   11.082  444.860
nir <- matrix(0, length(FeatImpRerF$features), ncol(Xtrain))

for(i in 1:length(FeatImpRerF$features)){
  nir[i, FeatImpRerF$features[[i]]] <- FeatImpRerF$imp[i]
}

NNr <- matrix(apply(nir, 2, sum) / nrow(nir), 28, 28, byrow = FALSE)
saveRDS(NNr, file = "NNrerf.rds")

system.time({
FeatImpRF <- FeatureImportance(forestRF, num.cores = numCores, type = "C")
})
## Message: Computing feature importance as counts of unique feature combinations.

##    user  system elapsed
##  14.579   0.323   5.549
nirf <- matrix(0, length(FeatImpRF$features), ncol(Xtrain))

for(i in 1:length(FeatImpRF$features)){
  nirf[i, FeatImpRF$features[[i]]] <- FeatImpRF$imp[i]
}

NNrf <- matrix(apply(nirf, 2, sum) / nrow(nirf), 28, 28, byrow = FALSE)
saveRDS(NNrf, file = "NNrf.rds")

plot the average 3 and 5 from the training set

the3s <- Xtrain[Ytrain == 3, ]
the5s <- Xtrain[Ytrain == 5, ]

sum3 <- matrix(apply(the3s, 2, sum) / sum(Ytrain == 3), 28, 28, byrow = FALSE)
sum5 <- matrix(apply(the5s, 2, sum) / sum(Ytrain == 5), 28, 28, byrow = FALSE)

g <- expand.grid(x = 1:28, y = 1:28)
gg <- rbind(g, g)

nn <- as.vector(t(NN[28:1, ]))
nnrerf <- as.vector(t(NNr[28:1, ]))
nnrf <- as.vector(t(NNrf[28:1, ]))

s3 <- as.vector(t(sum3[28:1,]))
s5 <- as.vector(t(sum5[28:1,]))
s3m5 <- abs(s3 - s5)

Z <- data.frame(g, weight = c(nn, nnrerf, nnrf, s3, s5, s3m5), Alg =  rep(c("MF", "Sporf", "RF", "Average 3", "Average 5", "x3m5"), each = length(nn)))

sc0 <- scale_fill_gradientn(colours = viridis(255))
sc1 <- scale_fill_gradientn(colours = inferno(255))

a1 <- ggplot(data = Z[ Z$Alg == "Average 3",    ], aes(x = x, y = y, fill = weight)) + geom_raster() + theme_void() + guides(fill = FALSE) + sc1 + ggtitle("Average 3")
a2 <- ggplot(data = Z[ Z$Alg == "Average 5", ], aes(x = x, y = y, fill = weight)) + geom_raster() + theme_void() + guides(fill = FALSE) + sc1 + ggtitle("Average 5")
a3 <- ggplot(data = Z[ Z$Alg == "x3m5", ], aes(x = x, y = y, fill = weight)) + geom_raster() + theme_void() + guides(fill = FALSE) + sc1 + ggtitle("abs(avg(3) - avg(5))")

grid.arrange(a1, a2, a3, ncol=3)

image0

#ggslackr(grid.arrange(a1, a2, a3, ncol=3), channels="#manifold-forest")

Feature heatmap

These are the features that S-RerF and RerF used, plotted as averaged heatmaps.

p1 <- ggplot(data = Z[ Z$Alg == "MF",    ], aes(x = x, y = y, fill = weight)) + geom_raster() + theme_void() + sc1 + ggtitle("MF")
p2 <- ggplot(data = Z[ Z$Alg == "Sporf", ], aes(x = x, y = y, fill = weight)) + geom_raster() + theme_void() + sc1 + ggtitle("Sporf")
p3 <- ggplot(data = Z[ Z$Alg == "RF",    ], aes(x = x, y = y, fill = weight)) + geom_raster() + theme_void() + sc1 + ggtitle("RF")

grid.arrange(p1, p2, p3, ncol=3)

image1