# ====================================================================
#
# Copyright 2025, PBL Netherlands Environmental Assessment Agency
# See the copyright notice at the end of this file.
#
# ====================================================================


# these functions are used to fit the random forest-models

# functions included in this file
# 
#     - FitRF
#     - FitRF_forCVs
#     - FitRF_forProjections

FitRF <- function(Training_data){
  ### Fit Random Forest model
  # Ntree: Number of trees to grow. 
  # This should not be set to too small a number, 
  # to ensure that every input row gets predicted at least a few times
  
  # Mtry: Number of variables randomly sampled as candidates at each split. 
  # Note that the default values are different for classification (sqrt(p) 
  # where p is number of variables in x) and regression (p/3)
  set.seed(3457)
  RFmodel <- rfsrc(HabitatNames ~ ., 
                 data = Training_data, 
                 ntree = 100)
  
  return(RFmodel)
} 

FitRF_forCVs <- function(dt_Observations_forRF){
  
  # make a list of all habitat names
  HabitatNames <- sort(unique(dt_Observations_forRF$HabitatNames))

  # load the table with all characteristic species per habitat
  dt_charact_species <- fread(species_special)
  
  # per habitat fit and evaluate RF model
  for(HabitatName in HabitatNames){
    WriteLogFile(ln= paste0("1. Start fitting CV-RF for ", HabitatName),
                 file.path(user_dir,base_dir,species_out_dir,LogFile))
    
    # select only the characteristic species from the input table and binarize the column with the habitat names
    dt_Habitat <- Select_CharacterSpecies(dt_charact_species, 
                                         HabitatName = HabitatName, 
                                         dt_Input_forRF = dt_Observations_forRF)
    
    # only continue cross validation if the function return any characteristic species 
    if(ncol(dt_Habitat) > 1){
       # divide all vegetation plots into 5 groups for cross validation
       set.seed(123)
       CVnr <- as.vector(sample(c(1:5), nrow(dt_Habitat), replace = TRUE))
       Rownr <- as.vector(c(1:nrow(dt_Habitat)))
       CVpool <- as.data.table(cbind(Rownr,CVnr))
       
       # Start cross validation
       for( i in c(1:5)) {
         # make training and validation set for the observations
         Training_data <- dt_Habitat[CVpool[CVnr != i,Rownr],]
         Validation_data <- dt_Habitat[CVpool[CVnr == i,Rownr],]
         
         ### Fit Random Forest model
         RF_cv <- FitRF(Training_data)
         
         # predict random forest model on validation data
         predicted_RFcv <- Project_RF(RF.model = RF_cv, 
                                      Projection_data = Validation_data)
         
         # evalauate predictions with MCC-value per habitat
         dt_AUC.cv <- Evaluate_RFmodel(HabitatName = HabitatName,
                                       RF_Pred_PoOs = predicted_RFcv$RF_Pred_PoOs,
                                       RF_Obs_Habitats = predicted_RFcv$RF_Obs_Habitats)
         
         if(i == 1){
           DT <- dt_AUC.cv
         } else {
           DT <- cbind(DT,dt_AUC.cv$AUC)
         }
       }
       
     } else {
       DT <- as.data.table(matrix(data = c(HabitatName,rep(NA,5)), nrow = 1, ncol = 6))
     }

     # calculate mean AUC-value per habitat
     colnames(DT) <- c("HabitatNames", paste0("AUC_cv",1:5))
     DT <- DT[ ,lapply(.SD,as.numeric), by = "HabitatNames" ]
     DT[, MeanAUC := rowMeans(.SD), .SDcols = paste0("AUC_cv",1:5)]
      
     if(HabitatName == HabitatNames[1]){
       dt_AllAUCs <- DT
     } else {
       dt_AllAUCs <- rbind(dt_AllAUCs,DT)
     }
  }
    
  # write table with MCC-values to hard disk
  write.csv(dt_AllAUCs, file.path(user_dir, base_dir, species_out_dir, "Habitats_CV_AUC.csv"))
  
  return(dt_AllAUCs)
}

FitRF_forProjections <- function(dt_Observations_forRF, HabitatName){
  
  # per habitat fit RF model on all data
  WriteLogFile(ln= paste0("3. Start fitting RF with 100% of the data for ", HabitatName),
               file.path(user_dir,base_dir,species_out_dir,LogFile))
    
  # select only the characteristic species from the input table and binarize the column with the habitat names
  dt_Habitat <- Select_CharacterSpecies(dt_charact_species, 
                                        HabitatName = HabitatName, 
                                        dt_Input_forRF = dt_Observations_forRF)
    
  ### Fit Random Forest model
  RFmodel <- FitRF(dt_Habitat)
  
  return(RFmodel)
}

Select_CharacterSpecies <- function(dt_charact_species, HabitatName, dt_Input_forRF){
  # select only the rows on the focal habitat
  select_species <- unique(dt_charact_species[`Habitat code` == HabitatName,Species ])
  select_species <- FormatSpeciesNames_forRF(select_species)
  # remove species that are not in the input table for fitting RF
  all_species_withSDMs <- names(dt_Input_forRF)
  select_species <- select_species[select_species %in% all_species_withSDMs]
  # select only the columns in the data with characteristic species
  if("HabitatNames" %in% names(dt_Input_forRF)){
    select_columns <- c("HabitatNames",select_species)
    dt_Habitat <- dt_Input_forRF[,..select_columns]
    
    # binarize the column with the habitat names
    Habtitat_binarized <- as.numeric(dt_Habitat$HabitatNames == HabitatName)
    dt_Habitat <- dt_Habitat[,HabitatNames := Habtitat_binarized ]
  } else {
    dt_Habitat <- dt_Input_forRF[,..select_species]
  }
  
  return(dt_Habitat[])
}





# ====================================================================
#
# Copyright 2025, PBL Netherlands Environmental Assessment Agency
# 
# This source code of the BioScore model is owned by PBL Netherlands Environmental Assessment Agency. 
# It is not permitted to copy, redistribute, remix, transform, and build upon the material without written approval of PBL. 
# Permission for commercial purposes will not be granted. 
# This code is published to improve the transparency of the models used by PBL, 
# but without any warranty for fitness for any other purpose. 
# After approval of PBL to use the code, PBL will not provide any support.
#
# 
# ====================================================================

