31  Random Forest Uygulaması

Mikrobiyal çokluk verisini yükleyelim:

abundance <- read.table(file = "../data/microbial-abundance.txt", header = T, sep = "\t")

Metadata bilgisini yükleyelim:

metadata <- read.table(file = "../data/microbial-metadata.txt", header = T, stringsAsFactors = FALSE)

Elimizdeki veriyi transpoze edelim:

r1<-data.frame(t(abundance))
r1$Condition<-as.factor(metadata$condition)

Şimdi random forest modelini kuralım. Bunun için elimizdeki bütün verileri kullanalım öncelikle:

randomForest 4.7-1.1
Type rfNews() to see new features/changes/bug fixes.
model <- randomForest(Condition ~ ., data = r1, importance = TRUE)

Özet istatistiklerine bakalım:

model

Call:
 randomForest(formula = Condition ~ ., data = r1, importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 15

        OOB estimate of  error rate: 45.45%
Confusion matrix:
              Healthy Periodontitis class.error
Healthy             9             3        0.25
Periodontitis       7             3        0.70

Peki en önemli bakteriler nedir? En önemli 10 bakteriye bakalım:

imp <- data.frame(importance(model))
head(imp[order(imp$MeanDecreaseGini, decreasing = T),],n = 10)
                              Healthy Periodontitis MeanDecreaseAccuracy
Dialister_micraerophilus    3.9312606     3.4464722           4.27673784
Actinomyces_johnsonii       1.4664294     1.0829525           1.33433718
Actinomyces_timonensis      2.4647855     2.1019663           2.49307424
Actinomyces_dentalis       -0.6642363    -0.7074606          -0.48203301
Treponema_socranskii        1.8602869     1.1783295           1.84332194
Tannerella_forsythia        1.8016563     0.8895730           1.68904143
Parvimonas_micra            0.1040162    -0.3266335           0.10381890
Slackia_exigua              1.6260685     1.8828667           2.01030885
Treponema_lecithinolyticum  1.3144425    -1.2989525           0.23605700
Prevotella_nanceiensis     -0.8013031     1.3770905           0.09568491
                           MeanDecreaseGini
Dialister_micraerophilus          0.4228851
Actinomyces_johnsonii             0.3672267
Actinomyces_timonensis            0.2972309
Actinomyces_dentalis              0.2385421
Treponema_socranskii              0.2143127
Tannerella_forsythia              0.2121673
Parvimonas_micra                  0.2051567
Slackia_exigua                    0.1797016
Treponema_lecithinolyticum        0.1731005
Prevotella_nanceiensis            0.1619972

Biz ise iterative feature elimination isminde bir metod kullandık. Her adımda, modelin başarısına en az katkı sunan bakteriyi veri setinden çıkarttık.

Bu kod uzun sürdüğü için internet sitesi kapsamında çalıştırmadık.

library(magrittr)
library(dplyr)
library(reshape2)

data_o<-r1
data_o$Condition<-r1$Condition
errors <- c()
sig_models <- c()
models <- list()
#data_o<-r1[1:22, 1:50]


N<-ncol(data_o)-6

for (i in 0:N){
  if (i == 0){
    cat("Recursive feature elemination script has started...\n")
    cat("Turn:", i, "Remaining species:", (ncol(data_o)-1), "\n")
    turn <- paste0("turn", i)
    importance <- c()
    for (j in 1:10){
      model <- randomForest(Condition ~ ., data = data_o, importance = TRUE)
      imp<-data.frame(model$importance)
      imp$Feat<-rownames(imp)
      importance <- rbind(importance, imp)
      errors<-rbind(errors, c(turn,model$err.rate[nrow(model$err.rate),]))
    }
    a<-importance%>%group_by(Feat) %>% select(MeanDecreaseGini) %>% summarise(MDG=mean(MeanDecreaseGini))
    least <- as.character(head(a[order(a$MDG),],n=1)[,1])
  }
  
  if (i > 0) {
    turn <- paste0("turn", i)
    importance <- c()
    cat("Turn:", i, " ")
    data_o <- data_o[,-which(colnames(data_o) %in% least)]
    cat("We eleminated the least informative feature:", least, " ")
    cat("Remaining species:", (ncol(data_o)-1), "\n")
    models[[i]] <- list()
    for (j in 1:10){
      
      model <- randomForest(Condition ~ ., data = data_o, importance = TRUE)
      
      imp<-data.frame(model$importance)
      imp$Feat<-rownames(imp)
      importance <- rbind(importance, imp)
      errors<-rbind(errors, c(turn,model$err.rate[nrow(model$err.rate),]))
      
      if (model$err.rate[nrow(model$err.rate),1] < 0.25) {
        models[[i]][[j]]<-model
        sig_models <- rbind(sig_models, c(i, j, model$err.rate[nrow(model$err.rate),1]))
      }
    }
    least_old <- least
    
    a<-importance%>%group_by(Feat) %>% select(MeanDecreaseGini) %>% summarise(MDG=mean(MeanDecreaseGini))
    least <- as.character(head(a[order(a$MDG),],n=1)[,1])
    
  }
}

errors <- data.frame(errors, stringsAsFactors = F)
colnames(errors)[1] <- "Turn"
errors[,2] <- as.numeric(errors[,2])
errors[,3] <- as.numeric(errors[,3])
errors[,4] <- as.numeric(errors[,4])

errors_melt <- melt(errors)

colnames(errors_melt)<-c("Turn", "Type", "Value")

sig_models<-data.frame(sig_models, stringsAsFactors = F)
colnames(sig_models) <- c("Turn", "Try", "OOB")