abundance <- read.table(file = "../data/microbial-abundance.txt", header = T, sep = "\t")32 Random Forest Uygulaması
Mikrobiyal çokluk verisini yükleyelim:
Metadata bilgisini yükleyelim:
metadata <- read.table(file = "../data/microbial-metadata.txt", header = T, stringsAsFactors = FALSE)Elimizdeki veriyi transpoze edelim:
r1<-data.frame(t(abundance))
r1$Condition<-as.factor(metadata$condition)Şimdi random forest modelini kuralım. Bunun için elimizdeki bütün verileri kullanalım öncelikle:
randomForest 4.7-1.2
Type rfNews() to see new features/changes/bug fixes.
model <- randomForest(Condition ~ ., data = r1, importance = TRUE)Özet istatistiklerine bakalım:
model
Call:
randomForest(formula = Condition ~ ., data = r1, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 15
OOB estimate of error rate: 40.91%
Confusion matrix:
Healthy Periodontitis class.error
Healthy 9 3 0.25
Periodontitis 6 4 0.60
Peki en önemli bakteriler nedir? En önemli 10 bakteriye bakalım:
imp <- data.frame(importance(model))
head(imp[order(imp$MeanDecreaseGini, decreasing = T),],n = 10) Healthy Periodontitis MeanDecreaseAccuracy
Dialister_micraerophilus 4.4566174 4.020227e+00 4.5167624
Actinomyces_johnsonii 2.3694607 1.320213e+00 2.2388322
Actinomyces_timonensis 2.6645869 7.065571e-01 2.0515524
Slackia_exigua 1.2096930 4.909659e-01 1.1620018
Actinomyces_dentalis 1.9540761 6.351029e-01 1.5830441
Treponema_denticola 1.0760778 6.804408e-01 0.8850252
Treponema_lecithinolyticum -0.4541508 -9.226195e-01 -0.4017545
Eggerthia_catenaformis 1.5561700 -1.342692e-16 1.4442876
Capnocytophaga_granulosa 1.2491638 1.187760e+00 1.3930281
Prevotella_nanceiensis -0.5129564 -2.142277e+00 -1.7873332
MeanDecreaseGini
Dialister_micraerophilus 0.3432882
Actinomyces_johnsonii 0.3306379
Actinomyces_timonensis 0.2622430
Slackia_exigua 0.2411580
Actinomyces_dentalis 0.1767030
Treponema_denticola 0.1624388
Treponema_lecithinolyticum 0.1599246
Eggerthia_catenaformis 0.1546082
Capnocytophaga_granulosa 0.1514691
Prevotella_nanceiensis 0.1507955
Biz ise iterative feature elimination isminde bir metod kullandık. Her adımda, modelin başarısına en az katkı sunan bakteriyi veri setinden çıkarttık.
Bu kod uzun sürdüğü için internet sitesi kapsamında çalıştırmadık.
library(magrittr)
library(dplyr)
library(reshape2)
data_o<-r1
data_o$Condition<-r1$Condition
errors <- c()
sig_models <- c()
models <- list()
#data_o<-r1[1:22, 1:50]
N<-ncol(data_o)-6
for (i in 0:N){
if (i == 0){
cat("Recursive feature elemination script has started...\n")
cat("Turn:", i, "Remaining species:", (ncol(data_o)-1), "\n")
turn <- paste0("turn", i)
importance <- c()
for (j in 1:10){
model <- randomForest(Condition ~ ., data = data_o, importance = TRUE)
imp<-data.frame(model$importance)
imp$Feat<-rownames(imp)
importance <- rbind(importance, imp)
errors<-rbind(errors, c(turn,model$err.rate[nrow(model$err.rate),]))
}
a<-importance%>%group_by(Feat) %>% select(MeanDecreaseGini) %>% summarise(MDG=mean(MeanDecreaseGini))
least <- as.character(head(a[order(a$MDG),],n=1)[,1])
}
if (i > 0) {
turn <- paste0("turn", i)
importance <- c()
cat("Turn:", i, " ")
data_o <- data_o[,-which(colnames(data_o) %in% least)]
cat("We eleminated the least informative feature:", least, " ")
cat("Remaining species:", (ncol(data_o)-1), "\n")
models[[i]] <- list()
for (j in 1:10){
model <- randomForest(Condition ~ ., data = data_o, importance = TRUE)
imp<-data.frame(model$importance)
imp$Feat<-rownames(imp)
importance <- rbind(importance, imp)
errors<-rbind(errors, c(turn,model$err.rate[nrow(model$err.rate),]))
if (model$err.rate[nrow(model$err.rate),1] < 0.25) {
models[[i]][[j]]<-model
sig_models <- rbind(sig_models, c(i, j, model$err.rate[nrow(model$err.rate),1]))
}
}
least_old <- least
a<-importance%>%group_by(Feat) %>% select(MeanDecreaseGini) %>% summarise(MDG=mean(MeanDecreaseGini))
least <- as.character(head(a[order(a$MDG),],n=1)[,1])
}
}
errors <- data.frame(errors, stringsAsFactors = F)
colnames(errors)[1] <- "Turn"
errors[,2] <- as.numeric(errors[,2])
errors[,3] <- as.numeric(errors[,3])
errors[,4] <- as.numeric(errors[,4])
errors_melt <- melt(errors)
colnames(errors_melt)<-c("Turn", "Type", "Value")
sig_models<-data.frame(sig_models, stringsAsFactors = F)
colnames(sig_models) <- c("Turn", "Try", "OOB")