abundance <- read.table(file = "../data/microbial-abundance.txt", header = T, sep = "\t")
32 Random Forest Uygulaması
Mikrobiyal çokluk verisini yükleyelim:
Metadata bilgisini yükleyelim:
metadata <- read.table(file = "../data/microbial-metadata.txt", header = T, stringsAsFactors = FALSE)
Elimizdeki veriyi transpoze edelim:
r1<-data.frame(t(abundance))
r1$Condition<-as.factor(metadata$condition)
Şimdi random forest modelini kuralım. Bunun için elimizdeki bütün verileri kullanalım öncelikle:
randomForest 4.7-1.2
Type rfNews() to see new features/changes/bug fixes.
model <- randomForest(Condition ~ ., data = r1, importance = TRUE)
Özet istatistiklerine bakalım:
model
Call:
randomForest(formula = Condition ~ ., data = r1, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 15
OOB estimate of error rate: 36.36%
Confusion matrix:
Healthy Periodontitis class.error
Healthy 10 2 0.1666667
Periodontitis 6 4 0.6000000
Peki en önemli bakteriler nedir? En önemli 10 bakteriye bakalım:
imp <- data.frame(importance(model))
head(imp[order(imp$MeanDecreaseGini, decreasing = T),],n = 10)
Healthy Periodontitis MeanDecreaseAccuracy
Dialister_micraerophilus 4.6946689 4.3342388 5.0134534
Actinomyces_johnsonii 2.4312450 0.8664638 1.9057908
Actinomyces_timonensis 3.3025388 3.4286476 3.6188451
Parvimonas_micra 1.2218737 2.0156952 2.1235188
Tannerella_forsythia 1.8825255 1.2371834 1.3307104
Actinomyces_dentalis 1.1519809 2.2856582 2.4426027
Actinomyces_israelii 1.5177937 -0.1825803 0.6022520
Slackia_exigua 0.6078062 0.7221222 0.5284703
Treponema_socranskii -0.4628694 -1.9205196 -1.4391476
Capnocytophaga_granulosa 0.8147541 2.1557336 1.6549543
MeanDecreaseGini
Dialister_micraerophilus 0.4743429
Actinomyces_johnsonii 0.3727650
Actinomyces_timonensis 0.2852672
Parvimonas_micra 0.2818522
Tannerella_forsythia 0.1950669
Actinomyces_dentalis 0.1948322
Actinomyces_israelii 0.1869708
Slackia_exigua 0.1711312
Treponema_socranskii 0.1700700
Capnocytophaga_granulosa 0.1634297
Biz ise iterative feature elimination isminde bir metod kullandık. Her adımda, modelin başarısına en az katkı sunan bakteriyi veri setinden çıkarttık.
Bu kod uzun sürdüğü için internet sitesi kapsamında çalıştırmadık.
library(magrittr)
library(dplyr)
library(reshape2)
data_o<-r1
data_o$Condition<-r1$Condition
errors <- c()
sig_models <- c()
models <- list()
#data_o<-r1[1:22, 1:50]
N<-ncol(data_o)-6
for (i in 0:N){
if (i == 0){
cat("Recursive feature elemination script has started...\n")
cat("Turn:", i, "Remaining species:", (ncol(data_o)-1), "\n")
turn <- paste0("turn", i)
importance <- c()
for (j in 1:10){
model <- randomForest(Condition ~ ., data = data_o, importance = TRUE)
imp<-data.frame(model$importance)
imp$Feat<-rownames(imp)
importance <- rbind(importance, imp)
errors<-rbind(errors, c(turn,model$err.rate[nrow(model$err.rate),]))
}
a<-importance%>%group_by(Feat) %>% select(MeanDecreaseGini) %>% summarise(MDG=mean(MeanDecreaseGini))
least <- as.character(head(a[order(a$MDG),],n=1)[,1])
}
if (i > 0) {
turn <- paste0("turn", i)
importance <- c()
cat("Turn:", i, " ")
data_o <- data_o[,-which(colnames(data_o) %in% least)]
cat("We eleminated the least informative feature:", least, " ")
cat("Remaining species:", (ncol(data_o)-1), "\n")
models[[i]] <- list()
for (j in 1:10){
model <- randomForest(Condition ~ ., data = data_o, importance = TRUE)
imp<-data.frame(model$importance)
imp$Feat<-rownames(imp)
importance <- rbind(importance, imp)
errors<-rbind(errors, c(turn,model$err.rate[nrow(model$err.rate),]))
if (model$err.rate[nrow(model$err.rate),1] < 0.25) {
models[[i]][[j]]<-model
sig_models <- rbind(sig_models, c(i, j, model$err.rate[nrow(model$err.rate),1]))
}
}
least_old <- least
a<-importance%>%group_by(Feat) %>% select(MeanDecreaseGini) %>% summarise(MDG=mean(MeanDecreaseGini))
least <- as.character(head(a[order(a$MDG),],n=1)[,1])
}
}
errors <- data.frame(errors, stringsAsFactors = F)
colnames(errors)[1] <- "Turn"
errors[,2] <- as.numeric(errors[,2])
errors[,3] <- as.numeric(errors[,3])
errors[,4] <- as.numeric(errors[,4])
errors_melt <- melt(errors)
colnames(errors_melt)<-c("Turn", "Type", "Value")
sig_models<-data.frame(sig_models, stringsAsFactors = F)
colnames(sig_models) <- c("Turn", "Try", "OOB")