abundance <- read.table(file = "../data/microbial-abundance.txt", header = T, sep = "\t")34 Random Forest Uygulaması
Mikrobiyal çokluk verisini yükleyelim:
Metadata bilgisini yükleyelim:
metadata <- read.table(file = "../data/microbial-metadata.txt", header = T, stringsAsFactors = FALSE)Elimizdeki veriyi transpoze edelim:
r1<-data.frame(t(abundance))
r1$Condition<-as.factor(metadata$condition)Şimdi random forest modelini kuralım. Bunun için elimizdeki bütün verileri kullanalım öncelikle:
randomForest 4.7-1.2
Type rfNews() to see new features/changes/bug fixes.
model <- randomForest(Condition ~ ., data = r1, importance = TRUE)Özet istatistiklerine bakalım:
model
Call:
randomForest(formula = Condition ~ ., data = r1, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 15
OOB estimate of error rate: 40.91%
Confusion matrix:
Healthy Periodontitis class.error
Healthy 9 3 0.25
Periodontitis 6 4 0.60
Peki en önemli bakteriler nedir? En önemli 10 bakteriye bakalım:
imp <- data.frame(importance(model))
head(imp[order(imp$MeanDecreaseGini, decreasing = T),],n = 10) Healthy Periodontitis MeanDecreaseAccuracy
Dialister_micraerophilus 4.3466915 4.1407390 4.6066539
Actinomyces_johnsonii 1.2312389 1.9575961 1.6794823
Actinomyces_timonensis 2.6943878 3.0411191 3.2013352
Tannerella_forsythia 2.0711635 0.4684896 1.2956422
Lactobacillus_salivarius -0.1426819 -0.1723294 -0.3063354
Parvimonas_micra 0.5751791 0.2015821 0.5976765
Actinomyces_dentalis 0.8430825 1.4672665 1.1154979
Treponema_lecithinolyticum 1.5115115 1.5536373 1.8212572
Capnocytophaga_granulosa 1.7829903 1.5892571 1.9408092
Treponema_socranskii 3.0277679 2.7874773 3.0805485
MeanDecreaseGini
Dialister_micraerophilus 0.4281339
Actinomyces_johnsonii 0.3223357
Actinomyces_timonensis 0.2064536
Tannerella_forsythia 0.2049289
Lactobacillus_salivarius 0.1986888
Parvimonas_micra 0.1982667
Actinomyces_dentalis 0.1826756
Treponema_lecithinolyticum 0.1825805
Capnocytophaga_granulosa 0.1730525
Treponema_socranskii 0.1673328
Biz ise iterative feature elimination isminde bir metod kullandık. Her adımda, modelin başarısına en az katkı sunan bakteriyi veri setinden çıkarttık.
Bu kod uzun sürdüğü için internet sitesi kapsamında çalıştırmadık.
library(magrittr)
library(dplyr)
library(reshape2)
data_o<-r1
data_o$Condition<-r1$Condition
errors <- c()
sig_models <- c()
models <- list()
#data_o<-r1[1:22, 1:50]
N<-ncol(data_o)-6
for (i in 0:N){
if (i == 0){
cat("Recursive feature elemination script has started...\n")
cat("Turn:", i, "Remaining species:", (ncol(data_o)-1), "\n")
turn <- paste0("turn", i)
importance <- c()
for (j in 1:10){
model <- randomForest(Condition ~ ., data = data_o, importance = TRUE)
imp<-data.frame(model$importance)
imp$Feat<-rownames(imp)
importance <- rbind(importance, imp)
errors<-rbind(errors, c(turn,model$err.rate[nrow(model$err.rate),]))
}
a<-importance%>%group_by(Feat) %>% select(MeanDecreaseGini) %>% summarise(MDG=mean(MeanDecreaseGini))
least <- as.character(head(a[order(a$MDG),],n=1)[,1])
}
if (i > 0) {
turn <- paste0("turn", i)
importance <- c()
cat("Turn:", i, " ")
data_o <- data_o[,-which(colnames(data_o) %in% least)]
cat("We eleminated the least informative feature:", least, " ")
cat("Remaining species:", (ncol(data_o)-1), "\n")
models[[i]] <- list()
for (j in 1:10){
model <- randomForest(Condition ~ ., data = data_o, importance = TRUE)
imp<-data.frame(model$importance)
imp$Feat<-rownames(imp)
importance <- rbind(importance, imp)
errors<-rbind(errors, c(turn,model$err.rate[nrow(model$err.rate),]))
if (model$err.rate[nrow(model$err.rate),1] < 0.25) {
models[[i]][[j]]<-model
sig_models <- rbind(sig_models, c(i, j, model$err.rate[nrow(model$err.rate),1]))
}
}
least_old <- least
a<-importance%>%group_by(Feat) %>% select(MeanDecreaseGini) %>% summarise(MDG=mean(MeanDecreaseGini))
least <- as.character(head(a[order(a$MDG),],n=1)[,1])
}
}
errors <- data.frame(errors, stringsAsFactors = F)
colnames(errors)[1] <- "Turn"
errors[,2] <- as.numeric(errors[,2])
errors[,3] <- as.numeric(errors[,3])
errors[,4] <- as.numeric(errors[,4])
errors_melt <- melt(errors)
colnames(errors_melt)<-c("Turn", "Type", "Value")
sig_models<-data.frame(sig_models, stringsAsFactors = F)
colnames(sig_models) <- c("Turn", "Try", "OOB")