# ====================================================================
#
# Copyright 2025, PBL Netherlands Environmental Assessment Agency
# See the copyright notice at the end of this file.
#
# ====================================================================


# these functions are used to evaluate the random forest-models

# functions included in this file
# 
#     - Evaluate_CV_RF
#     - Evaluate_RFpredictions


Evaluate_RFmodel <- function(HabitatName,RF_Pred_PoOs,RF_Obs_Habitats){
  # predicted values
  Fit <- RF_Pred_PoOs
  # observed values
  Obs <- RF_Obs_Habitats
  
  # if there are no presences in the test set, the AUC can't be calculated
  if(sum(Obs) == 0){
    AUC.RF <- NA
  } else{
    # calculate AUC
    AUC.stats <- FindOptimStat(metric.eval='ROC',fit=Fit, obs=Obs, nb.thresh = 100)
    AUC.RF <- AUC.stats$best.stat
  }
    
  # save AUC-value in table
  dt_AUC <-  matrix(c(as.character(HabitatName),round(AUC.RF, digits = 5)),
                         nrow = 1, ncol=2,
                         dimnames = list(NULL, c("HabitatNames","AUC")))
  
  return(as.data.table(dt_AUC))
  }
  

Evaluate_RFpredictions <- function(PlotLocations){
  # Calculate optimal threshold value per habitat and calculate MCC of habitat range map.
  WriteLogFile(ln= paste0("6. Start calculating the AUC values and cut off values for the predicted habitat ranges"),
               file.path(user_dir,base_dir,species_out_dir,LogFile))
  
  # first make a further selection of vegetation plots used to validate the habitat maps
  
  # load raw data file with all plot locations
  PlotLocations_rawdata <- fread(plots, quote="", sep = "\t")  
  
  # Plots measured between 1990 and 2020 are selected.
  Year_of_recording <- as.numeric(substr(PlotLocations_rawdata$`Date of recording`,7,10))
  PlotLocations_rawdata <- PlotLocations_rawdata[, Year := Year_of_recording]
  PlotLocations_rawdata <- PlotLocations_rawdata[Year >= MinYear]
  # Plots with a known location uncertainty larger than 1000m are removed
  PlotLocations_rawdata <- PlotLocations_rawdata[`Location uncertainty (m)` <= 1000 | is.na(`Location uncertainty (m)`)]
  
  # select only those plots which are used to fit RF-model which also meet the above criteria
  PlotSelection <- unique(PlotLocations_rawdata$PlotID)
  PlotLocations <- PlotLocations[PlotID %in% PlotSelection]
  
  # define the x and y coordinates of the plots
  spdf_PlotLocations <- SpatialPointsDataFrame( PlotLocations[,c("x","y")], 
                                                PlotLocations[,c("PlotID","HabitatNames")],
                                                proj4string=CRS("+proj=longlat +datum=WGS84"))
  
  # change coordinate system to overlap with species range maps
  spdf_PlotLocations <- spTransform(spdf_PlotLocations, crs("+proj=laea +lat_0=52 +lon_0=10 +x_0=4321000 +y_0=3210000 +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +units=m +no_defs +type=crs"))
  
  # save table with plots to hard disk
#  dt_PlotLocations <- as.data.table(spdf_PlotLocations)
#  colnames(dt_PlotLocations) <- c("PlotID", "HabitatNames", "x", "y")
#  write.csv(dt_PlotLocations, file.path(user_dir, base_dir, species_out_dir, "Plots_to evaluate_Habitats_predictions.csv"))
  
  ### Start snowfall
  sfInit(parallel = TRUE, cpus =  min(10, parallel::detectCores()))
  
  # Export packages
  sfLibrary('data.table', character.only = TRUE)
  sfLibrary('raster', character.only = TRUE)
  sfLibrary('terra', character.only = TRUE)
  sfLibrary('pROC', character.only = TRUE)
  
  # Export global variables
  sfExport("spdf_PlotLocations")
  sfExport("FindOptimStat")
  sfExport("bm_FindOptimStat.check.args")
  sfExport("fun_testIf01")
  sfExport("fun_testIfPosNum")
  sfExport("guess_scale")
  sfExport("bm_CalculateStat")
  sfExport("contingency_table_check")
  sfExport("get_optim_value")
  sfExport("PathScenario_IntRes")
  sfExport("WriteLogFile")
  sfExport("user_dir")
  sfExport("base_dir")
  sfExport("species_out_dir")
  sfExport("LogFile")
  
  dt_MCC_SDMs <- sfClusterApplyLB(HabitatNames, function(HabitatName){
    
    WriteLogFile(ln= paste0("6. start ", HabitatName),
                 file.path(user_dir,base_dir,species_out_dir,LogFile))
    
    
    # make a raster out of the habitat prediction
    # first read table from hard disk.
    predicted_RF <- fread(file.path(PathScenario_IntRes,paste0(HabitatName,".csv")))   
    HabitatPrediction <- cbind(predicted_RF[,.(x,y)], predicted_RF[,.(predicted_RF)])
    coordinates(HabitatPrediction) <- ~x+y
    gridded(HabitatPrediction) <- TRUE
    r_HabitatPrediction <- raster::raster(HabitatPrediction)
    
    # extract the observed habitats from the vegetation plots and combine it with the cell values
    Habitat_OBSandPRED <- extract(r_HabitatPrediction, spdf_PlotLocations, method = 'simple', sp = TRUE)
    # clean the table
    Habitat_OBSandPRED <- as.data.table(Habitat_OBSandPRED) 
    Habitat_OBSandPRED <- na.omit(Habitat_OBSandPRED)
    
    # add a unique code to each 1-km grid cell
    # this code is used to randomly select plots per grid cell
    x_round <- round((Habitat_OBSandPRED$coords.x1 -500)/1000,0)
    y_round <- round((Habitat_OBSandPRED$coords.x2-500)/1000,0)
    GridNr <- paste0(x_round,y_round)
    Habitat_OBSandPRED[,GridNr := GridNr]
    
    # select randomly one presence and absence per 1-km grid cell
    # first randomly sort observation table
    set.seed(99) #set seed 
    Habitat_OBSandPRED <- Habitat_OBSandPRED[sample(nrow(Habitat_OBSandPRED)),]
    # make a table with only the presences
    Presences <- Habitat_OBSandPRED[HabitatNames == HabitatName]
    # make a list of all grid cell numbers where the habitat is present
    GridNrList <- as.character(Presences[,GridNr])
    # make a list with all unique grid cell numbers
    GridUnique <- unique(GridNrList)
    # Note the number of the row with the first found grid cell number in it listed in 'GridUnique'.
    Rownr_Presences <- match(GridUnique,GridNrList)
    # select only the randomly selected grid cells from the table
    Presences <- Presences[Rownr_Presences]
    
    # make a table with only the absences
    Absences <- Habitat_OBSandPRED[HabitatNames != HabitatName]
    # make a list of all grid cell numbers where the habitat is absent
    GridNrList <- as.character(Absences[,GridNr])
    # make a list with all unique grid cell numbers
    GridUnique <- unique(GridNrList)
    # Note the number of the row with the first found grid cell number in it listed in 'GridUnique'.
    Rownr_Absence <- match(GridUnique,GridNrList)
    # select only the randomly selected grid cells from the table
    Absences <- Absences[Rownr_Absence]

    # merge the tables with the selected presences and absences
    Habitat_OBSandPRED <- rbind(Presences,Absences)
    
    # calculate the prevalence of the habitat types within the study area
    # count nr of presences
    NPres <- sum(as.numeric(Habitat_OBSandPRED$HabitatNames == HabitatName))
    # count nr of km grid cells with vegetation plots
    NrOfObsCells <- length(unique(Habitat_OBSandPRED$GridNr))
    # calculate species prevalence
    Prev <- NPres/NrOfObsCells 
    
    
    # predicted values
    Fit <- unlist(Habitat_OBSandPRED[,c(predicted_RF)] )
    #summary(Fit)
    # observed values
    Obs <- as.numeric(Habitat_OBSandPRED$HabitatNames == HabitatName)
    #table(Obs)
    
    if(sum(Obs) == 0){
      MCC.cutoff <- NA
      AUC.prediction <- NA
    } else {
      # calculate cut off value by maximizing MCC as a threshold 
      MCC.stats <- FindOptimStat(metric.eval='MCC',fit=Fit, obs=Obs, nb.thresh = 100)
      MCC.cutoff <- MCC.stats$cutoff

      # Calculate the AUC of the binarized predictions
      AUC.prediction <- as.numeric(pROC::auc(Obs, Fit, direction = "<", levels = c(0, 1)))
    }
    
    # save MCC values and cutoff values in table
    dt_MCC <-  as.data.table(matrix(unlist(c(as.character(HabitatName),round(AUC.prediction, digits = 5),MCC.cutoff, round(Prev, digits = 6))), 
                                    nrow = 1, ncol=4,
                                    dimnames = list(NULL, c("HabitatNames","AUC","cutoff","Prevalence"))))
    
    return(dt_MCC)
  })
  
  ### Stop snowfall
  sfStop(nostop = FALSE)
  
  # write as a table to the hard drive
  dt_MCC_SDMs <- rbindlist(dt_MCC_SDMs)
  write.csv(dt_MCC_SDMs, file.path(user_dir, base_dir, species_out_dir, "Habitats_predictions_AUC.csv"))
  
  return(dt_MCC_SDMs)
} 




# ====================================================================
#
# Copyright 2025, PBL Netherlands Environmental Assessment Agency
# 
# This source code of the BioScore model is owned by PBL Netherlands Environmental Assessment Agency. 
# It is not permitted to copy, redistribute, remix, transform, and build upon the material without written approval of PBL. 
# Permission for commercial purposes will not be granted. 
# This code is published to improve the transparency of the models used by PBL, 
# but without any warranty for fitness for any other purpose. 
# After approval of PBL to use the code, PBL will not provide any support.
#
# 
# ====================================================================

