31  Random Forest Uygulaması

Mikrobiyal çokluk verisini yükleyelim:

abundance <- read.table(file = "../data/microbial-abundance.txt", header = T, sep = "\t")

Metadata bilgisini yükleyelim:

metadata <- read.table(file = "../data/microbial-metadata.txt", header = T, stringsAsFactors = FALSE)

Elimizdeki veriyi transpoze edelim:

r1<-data.frame(t(abundance))
r1$Condition<-as.factor(metadata$condition)

Şimdi random forest modelini kuralım. Bunun için elimizdeki bütün verileri kullanalım öncelikle:

randomForest 4.7-1.2
Type rfNews() to see new features/changes/bug fixes.
model <- randomForest(Condition ~ ., data = r1, importance = TRUE)

Özet istatistiklerine bakalım:

model

Call:
 randomForest(formula = Condition ~ ., data = r1, importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 15

        OOB estimate of  error rate: 36.36%
Confusion matrix:
              Healthy Periodontitis class.error
Healthy             9             3        0.25
Periodontitis       5             5        0.50

Peki en önemli bakteriler nedir? En önemli 10 bakteriye bakalım:

imp <- data.frame(importance(model))
head(imp[order(imp$MeanDecreaseGini, decreasing = T),],n = 10)
                                   Healthy Periodontitis MeanDecreaseAccuracy
Dialister_micraerophilus        3.70015760    3.57878256            4.1884250
Actinomyces_johnsonii           2.41579532    2.10713712            2.6916909
Actinomyces_timonensis          2.60876910    2.26147786            2.8083695
Actinomyces_dentalis            0.00000000   -0.41626403           -0.9877713
Streptococcus_mitis             0.04085896    1.85013207            0.9858540
Prevotella_nanceiensis          1.18411623    0.05123169            0.6534588
Eggerthia_catenaformis          1.54157103    2.19858068            2.3690041
Actinomyces_israelii            1.83995656    1.91673567            2.1466009
Dialister_invisus               1.47394905    0.79511951            1.4397188
Pseudoramibacter_alactolyticus -1.17775662    0.46862417           -1.0705513
                               MeanDecreaseGini
Dialister_micraerophilus              0.3696511
Actinomyces_johnsonii                 0.3496876
Actinomyces_timonensis                0.2643389
Actinomyces_dentalis                  0.2409224
Streptococcus_mitis                   0.2060490
Prevotella_nanceiensis                0.1852884
Eggerthia_catenaformis                0.1712090
Actinomyces_israelii                  0.1704371
Dialister_invisus                     0.1700341
Pseudoramibacter_alactolyticus        0.1699905

Biz ise iterative feature elimination isminde bir metod kullandık. Her adımda, modelin başarısına en az katkı sunan bakteriyi veri setinden çıkarttık.

Bu kod uzun sürdüğü için internet sitesi kapsamında çalıştırmadık.

library(magrittr)
library(dplyr)
library(reshape2)

data_o<-r1
data_o$Condition<-r1$Condition
errors <- c()
sig_models <- c()
models <- list()
#data_o<-r1[1:22, 1:50]


N<-ncol(data_o)-6

for (i in 0:N){
  if (i == 0){
    cat("Recursive feature elemination script has started...\n")
    cat("Turn:", i, "Remaining species:", (ncol(data_o)-1), "\n")
    turn <- paste0("turn", i)
    importance <- c()
    for (j in 1:10){
      model <- randomForest(Condition ~ ., data = data_o, importance = TRUE)
      imp<-data.frame(model$importance)
      imp$Feat<-rownames(imp)
      importance <- rbind(importance, imp)
      errors<-rbind(errors, c(turn,model$err.rate[nrow(model$err.rate),]))
    }
    a<-importance%>%group_by(Feat) %>% select(MeanDecreaseGini) %>% summarise(MDG=mean(MeanDecreaseGini))
    least <- as.character(head(a[order(a$MDG),],n=1)[,1])
  }
  
  if (i > 0) {
    turn <- paste0("turn", i)
    importance <- c()
    cat("Turn:", i, " ")
    data_o <- data_o[,-which(colnames(data_o) %in% least)]
    cat("We eleminated the least informative feature:", least, " ")
    cat("Remaining species:", (ncol(data_o)-1), "\n")
    models[[i]] <- list()
    for (j in 1:10){
      
      model <- randomForest(Condition ~ ., data = data_o, importance = TRUE)
      
      imp<-data.frame(model$importance)
      imp$Feat<-rownames(imp)
      importance <- rbind(importance, imp)
      errors<-rbind(errors, c(turn,model$err.rate[nrow(model$err.rate),]))
      
      if (model$err.rate[nrow(model$err.rate),1] < 0.25) {
        models[[i]][[j]]<-model
        sig_models <- rbind(sig_models, c(i, j, model$err.rate[nrow(model$err.rate),1]))
      }
    }
    least_old <- least
    
    a<-importance%>%group_by(Feat) %>% select(MeanDecreaseGini) %>% summarise(MDG=mean(MeanDecreaseGini))
    least <- as.character(head(a[order(a$MDG),],n=1)[,1])
    
  }
}

errors <- data.frame(errors, stringsAsFactors = F)
colnames(errors)[1] <- "Turn"
errors[,2] <- as.numeric(errors[,2])
errors[,3] <- as.numeric(errors[,3])
errors[,4] <- as.numeric(errors[,4])

errors_melt <- melt(errors)

colnames(errors_melt)<-c("Turn", "Type", "Value")

sig_models<-data.frame(sig_models, stringsAsFactors = F)
colnames(sig_models) <- c("Turn", "Try", "OOB")