abundance <- read.table(file = "../data/microbial-abundance.txt", header = T, sep = "\t")34 Random Forest Uygulaması
Mikrobiyal çokluk verisini yükleyelim:
Metadata bilgisini yükleyelim:
metadata <- read.table(file = "../data/microbial-metadata.txt", header = T, stringsAsFactors = FALSE)Elimizdeki veriyi transpoze edelim:
r1<-data.frame(t(abundance))
r1$Condition<-as.factor(metadata$condition)Şimdi random forest modelini kuralım. Bunun için elimizdeki bütün verileri kullanalım öncelikle:
randomForest 4.7-1.2
Type rfNews() to see new features/changes/bug fixes.
model <- randomForest(Condition ~ ., data = r1, importance = TRUE)Özet istatistiklerine bakalım:
model
Call:
randomForest(formula = Condition ~ ., data = r1, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 15
OOB estimate of error rate: 40.91%
Confusion matrix:
Healthy Periodontitis class.error
Healthy 9 3 0.25
Periodontitis 6 4 0.60
Peki en önemli bakteriler nedir? En önemli 10 bakteriye bakalım:
imp <- data.frame(importance(model))
head(imp[order(imp$MeanDecreaseGini, decreasing = T),],n = 10) Healthy Periodontitis MeanDecreaseAccuracy
Dialister_micraerophilus 3.9596713 3.21887735 4.0149134
Parvimonas_micra 1.6805118 1.10086473 1.6059380
Treponema_lecithinolyticum 1.6335894 2.29101973 2.4244474
Actinomyces_johnsonii 2.5660692 2.27558381 2.6919707
Actinomyces_dentalis 1.6227818 2.13832473 2.2133062
Treponema_denticola 0.5956181 -1.39162329 -0.6612598
Slackia_exigua 1.8225554 2.36546059 2.2460879
Streptococcus_mitis -0.6021475 -0.35297892 -0.1139190
Tannerella_forsythia 1.3779833 1.12633652 1.4580328
Streptococcus_anginosus 1.8177815 0.05263172 1.2036592
MeanDecreaseGini
Dialister_micraerophilus 0.3645126
Parvimonas_micra 0.2496278
Treponema_lecithinolyticum 0.2418009
Actinomyces_johnsonii 0.2357353
Actinomyces_dentalis 0.2303441
Treponema_denticola 0.2259190
Slackia_exigua 0.2204276
Streptococcus_mitis 0.1711704
Tannerella_forsythia 0.1612940
Streptococcus_anginosus 0.1611792
Biz ise iterative feature elimination isminde bir metod kullandık. Her adımda, modelin başarısına en az katkı sunan bakteriyi veri setinden çıkarttık.
Bu kod uzun sürdüğü için internet sitesi kapsamında çalıştırmadık.
library(magrittr)
library(dplyr)
library(reshape2)
data_o<-r1
data_o$Condition<-r1$Condition
errors <- c()
sig_models <- c()
models <- list()
#data_o<-r1[1:22, 1:50]
N<-ncol(data_o)-6
for (i in 0:N){
if (i == 0){
cat("Recursive feature elemination script has started...\n")
cat("Turn:", i, "Remaining species:", (ncol(data_o)-1), "\n")
turn <- paste0("turn", i)
importance <- c()
for (j in 1:10){
model <- randomForest(Condition ~ ., data = data_o, importance = TRUE)
imp<-data.frame(model$importance)
imp$Feat<-rownames(imp)
importance <- rbind(importance, imp)
errors<-rbind(errors, c(turn,model$err.rate[nrow(model$err.rate),]))
}
a<-importance%>%group_by(Feat) %>% select(MeanDecreaseGini) %>% summarise(MDG=mean(MeanDecreaseGini))
least <- as.character(head(a[order(a$MDG),],n=1)[,1])
}
if (i > 0) {
turn <- paste0("turn", i)
importance <- c()
cat("Turn:", i, " ")
data_o <- data_o[,-which(colnames(data_o) %in% least)]
cat("We eleminated the least informative feature:", least, " ")
cat("Remaining species:", (ncol(data_o)-1), "\n")
models[[i]] <- list()
for (j in 1:10){
model <- randomForest(Condition ~ ., data = data_o, importance = TRUE)
imp<-data.frame(model$importance)
imp$Feat<-rownames(imp)
importance <- rbind(importance, imp)
errors<-rbind(errors, c(turn,model$err.rate[nrow(model$err.rate),]))
if (model$err.rate[nrow(model$err.rate),1] < 0.25) {
models[[i]][[j]]<-model
sig_models <- rbind(sig_models, c(i, j, model$err.rate[nrow(model$err.rate),1]))
}
}
least_old <- least
a<-importance%>%group_by(Feat) %>% select(MeanDecreaseGini) %>% summarise(MDG=mean(MeanDecreaseGini))
least <- as.character(head(a[order(a$MDG),],n=1)[,1])
}
}
errors <- data.frame(errors, stringsAsFactors = F)
colnames(errors)[1] <- "Turn"
errors[,2] <- as.numeric(errors[,2])
errors[,3] <- as.numeric(errors[,3])
errors[,4] <- as.numeric(errors[,4])
errors_melt <- melt(errors)
colnames(errors_melt)<-c("Turn", "Type", "Value")
sig_models<-data.frame(sig_models, stringsAsFactors = F)
colnames(sig_models) <- c("Turn", "Try", "OOB")