abundance <- read.table(file = "../data/microbial-abundance.txt", header = T, sep = "\t")
31 Random Forest Uygulaması
Mikrobiyal çokluk verisini yükleyelim:
Metadata bilgisini yükleyelim:
metadata <- read.table(file = "../data/microbial-metadata.txt", header = T, stringsAsFactors = FALSE)
Elimizdeki veriyi transpoze edelim:
r1<-data.frame(t(abundance))
r1$Condition<-as.factor(metadata$condition)
Şimdi random forest modelini kuralım. Bunun için elimizdeki bütün verileri kullanalım öncelikle:
randomForest 4.7-1.1
Type rfNews() to see new features/changes/bug fixes.
model <- randomForest(Condition ~ ., data = r1, importance = TRUE)
Özet istatistiklerine bakalım:
model
Call:
randomForest(formula = Condition ~ ., data = r1, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 15
OOB estimate of error rate: 45.45%
Confusion matrix:
Healthy Periodontitis class.error
Healthy 9 3 0.25
Periodontitis 7 3 0.70
Peki en önemli bakteriler nedir? En önemli 10 bakteriye bakalım:
imp <- data.frame(importance(model))
head(imp[order(imp$MeanDecreaseGini, decreasing = T),],n = 10)
Healthy Periodontitis MeanDecreaseAccuracy
Dialister_micraerophilus 3.9312606 3.4464722 4.27673784
Actinomyces_johnsonii 1.4664294 1.0829525 1.33433718
Actinomyces_timonensis 2.4647855 2.1019663 2.49307424
Actinomyces_dentalis -0.6642363 -0.7074606 -0.48203301
Treponema_socranskii 1.8602869 1.1783295 1.84332194
Tannerella_forsythia 1.8016563 0.8895730 1.68904143
Parvimonas_micra 0.1040162 -0.3266335 0.10381890
Slackia_exigua 1.6260685 1.8828667 2.01030885
Treponema_lecithinolyticum 1.3144425 -1.2989525 0.23605700
Prevotella_nanceiensis -0.8013031 1.3770905 0.09568491
MeanDecreaseGini
Dialister_micraerophilus 0.4228851
Actinomyces_johnsonii 0.3672267
Actinomyces_timonensis 0.2972309
Actinomyces_dentalis 0.2385421
Treponema_socranskii 0.2143127
Tannerella_forsythia 0.2121673
Parvimonas_micra 0.2051567
Slackia_exigua 0.1797016
Treponema_lecithinolyticum 0.1731005
Prevotella_nanceiensis 0.1619972
Biz ise iterative feature elimination isminde bir metod kullandık. Her adımda, modelin başarısına en az katkı sunan bakteriyi veri setinden çıkarttık.
Bu kod uzun sürdüğü için internet sitesi kapsamında çalıştırmadık.
library(magrittr)
library(dplyr)
library(reshape2)
data_o<-r1
data_o$Condition<-r1$Condition
errors <- c()
sig_models <- c()
models <- list()
#data_o<-r1[1:22, 1:50]
N<-ncol(data_o)-6
for (i in 0:N){
if (i == 0){
cat("Recursive feature elemination script has started...\n")
cat("Turn:", i, "Remaining species:", (ncol(data_o)-1), "\n")
turn <- paste0("turn", i)
importance <- c()
for (j in 1:10){
model <- randomForest(Condition ~ ., data = data_o, importance = TRUE)
imp<-data.frame(model$importance)
imp$Feat<-rownames(imp)
importance <- rbind(importance, imp)
errors<-rbind(errors, c(turn,model$err.rate[nrow(model$err.rate),]))
}
a<-importance%>%group_by(Feat) %>% select(MeanDecreaseGini) %>% summarise(MDG=mean(MeanDecreaseGini))
least <- as.character(head(a[order(a$MDG),],n=1)[,1])
}
if (i > 0) {
turn <- paste0("turn", i)
importance <- c()
cat("Turn:", i, " ")
data_o <- data_o[,-which(colnames(data_o) %in% least)]
cat("We eleminated the least informative feature:", least, " ")
cat("Remaining species:", (ncol(data_o)-1), "\n")
models[[i]] <- list()
for (j in 1:10){
model <- randomForest(Condition ~ ., data = data_o, importance = TRUE)
imp<-data.frame(model$importance)
imp$Feat<-rownames(imp)
importance <- rbind(importance, imp)
errors<-rbind(errors, c(turn,model$err.rate[nrow(model$err.rate),]))
if (model$err.rate[nrow(model$err.rate),1] < 0.25) {
models[[i]][[j]]<-model
sig_models <- rbind(sig_models, c(i, j, model$err.rate[nrow(model$err.rate),1]))
}
}
least_old <- least
a<-importance%>%group_by(Feat) %>% select(MeanDecreaseGini) %>% summarise(MDG=mean(MeanDecreaseGini))
least <- as.character(head(a[order(a$MDG),],n=1)[,1])
}
}
errors <- data.frame(errors, stringsAsFactors = F)
colnames(errors)[1] <- "Turn"
errors[,2] <- as.numeric(errors[,2])
errors[,3] <- as.numeric(errors[,3])
errors[,4] <- as.numeric(errors[,4])
errors_melt <- melt(errors)
colnames(errors_melt)<-c("Turn", "Type", "Value")
sig_models<-data.frame(sig_models, stringsAsFactors = F)
colnames(sig_models) <- c("Turn", "Try", "OOB")