31  Random Forest Uygulaması

Mikrobiyal çokluk verisini yükleyelim:

abundance <- read.table(file = "../data/microbial-abundance.txt", header = T, sep = "\t")

Metadata bilgisini yükleyelim:

metadata <- read.table(file = "../data/microbial-metadata.txt", header = T, stringsAsFactors = FALSE)

Elimizdeki veriyi transpoze edelim:

r1<-data.frame(t(abundance))
r1$Condition<-as.factor(metadata$condition)

Şimdi random forest modelini kuralım. Bunun için elimizdeki bütün verileri kullanalım öncelikle:

randomForest 4.7-1.2
Type rfNews() to see new features/changes/bug fixes.
model <- randomForest(Condition ~ ., data = r1, importance = TRUE)

Özet istatistiklerine bakalım:

model

Call:
 randomForest(formula = Condition ~ ., data = r1, importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 15

        OOB estimate of  error rate: 36.36%
Confusion matrix:
              Healthy Periodontitis class.error
Healthy            10             2   0.1666667
Periodontitis       6             4   0.6000000

Peki en önemli bakteriler nedir? En önemli 10 bakteriye bakalım:

imp <- data.frame(importance(model))
head(imp[order(imp$MeanDecreaseGini, decreasing = T),],n = 10)
                                 Healthy Periodontitis MeanDecreaseAccuracy
Dialister_micraerophilus       3.4605337    3.28077146           4.06264304
Tannerella_forsythia           2.4692701    2.11791090           2.60973740
Actinomyces_johnsonii          1.7998675    2.80138369           2.52749809
Parvimonas_micra               2.8123249    2.54198355           2.89894008
Actinomyces_timonensis         2.2986913    0.38336710           1.81612122
Slackia_exigua                 0.1764761   -0.58039447          -0.04380699
Prevotella_nanceiensis         0.7201146    0.06595427           0.62407498
Streptococcus_anginosus        0.4782008    1.18822502           0.77293453
Pseudoramibacter_alactolyticus 1.4388862   -0.30416411           1.12325286
Treponema_denticola            1.9880735    0.12846952           1.27244473
                               MeanDecreaseGini
Dialister_micraerophilus              0.3112896
Tannerella_forsythia                  0.2846642
Actinomyces_johnsonii                 0.2830902
Parvimonas_micra                      0.2665447
Actinomyces_timonensis                0.2018933
Slackia_exigua                        0.2018031
Prevotella_nanceiensis                0.1830533
Streptococcus_anginosus               0.1741785
Pseudoramibacter_alactolyticus        0.1607551
Treponema_denticola                   0.1493345

Biz ise iterative feature elimination isminde bir metod kullandık. Her adımda, modelin başarısına en az katkı sunan bakteriyi veri setinden çıkarttık.

Bu kod uzun sürdüğü için internet sitesi kapsamında çalıştırmadık.

library(magrittr)
library(dplyr)
library(reshape2)

data_o<-r1
data_o$Condition<-r1$Condition
errors <- c()
sig_models <- c()
models <- list()
#data_o<-r1[1:22, 1:50]


N<-ncol(data_o)-6

for (i in 0:N){
  if (i == 0){
    cat("Recursive feature elemination script has started...\n")
    cat("Turn:", i, "Remaining species:", (ncol(data_o)-1), "\n")
    turn <- paste0("turn", i)
    importance <- c()
    for (j in 1:10){
      model <- randomForest(Condition ~ ., data = data_o, importance = TRUE)
      imp<-data.frame(model$importance)
      imp$Feat<-rownames(imp)
      importance <- rbind(importance, imp)
      errors<-rbind(errors, c(turn,model$err.rate[nrow(model$err.rate),]))
    }
    a<-importance%>%group_by(Feat) %>% select(MeanDecreaseGini) %>% summarise(MDG=mean(MeanDecreaseGini))
    least <- as.character(head(a[order(a$MDG),],n=1)[,1])
  }
  
  if (i > 0) {
    turn <- paste0("turn", i)
    importance <- c()
    cat("Turn:", i, " ")
    data_o <- data_o[,-which(colnames(data_o) %in% least)]
    cat("We eleminated the least informative feature:", least, " ")
    cat("Remaining species:", (ncol(data_o)-1), "\n")
    models[[i]] <- list()
    for (j in 1:10){
      
      model <- randomForest(Condition ~ ., data = data_o, importance = TRUE)
      
      imp<-data.frame(model$importance)
      imp$Feat<-rownames(imp)
      importance <- rbind(importance, imp)
      errors<-rbind(errors, c(turn,model$err.rate[nrow(model$err.rate),]))
      
      if (model$err.rate[nrow(model$err.rate),1] < 0.25) {
        models[[i]][[j]]<-model
        sig_models <- rbind(sig_models, c(i, j, model$err.rate[nrow(model$err.rate),1]))
      }
    }
    least_old <- least
    
    a<-importance%>%group_by(Feat) %>% select(MeanDecreaseGini) %>% summarise(MDG=mean(MeanDecreaseGini))
    least <- as.character(head(a[order(a$MDG),],n=1)[,1])
    
  }
}

errors <- data.frame(errors, stringsAsFactors = F)
colnames(errors)[1] <- "Turn"
errors[,2] <- as.numeric(errors[,2])
errors[,3] <- as.numeric(errors[,3])
errors[,4] <- as.numeric(errors[,4])

errors_melt <- melt(errors)

colnames(errors_melt)<-c("Turn", "Type", "Value")

sig_models<-data.frame(sig_models, stringsAsFactors = F)
colnames(sig_models) <- c("Turn", "Try", "OOB")