# ====================================================================
#
# Copyright 2025, PBL Netherlands Environmental Assessment Agency
# See the copyright notice at the end of this file.
#
# ====================================================================


# these functions are used to format observations of species to be used to fit the random forest-model

# functions included in this file
#     - SelectSpecies_forRF
#     - SelectPlots_forRF
#     - FormatObservations_forRF

SelectSpecies_forRF <- function(){
  # load species observation data
  system.time(dt_species_data <- fread(file.path(user_dir,base_dir, species_in_dir), quote = "", sep = "\t"))
  
  # select only species for which SDMs are going to be used in the projections
  SpeciesNames <- list.files(file.path(user_dir,base_dir,binarized_SDMs,"current"))
  SpeciesNames <- substr(SpeciesNames, 1, nchar(SpeciesNames) - 4)
  dt_species_data <- dt_species_data[`Matched concept` %in% SpeciesNames]
  # clean table
  dt_species_data <- dt_species_data[,.(PlotObservationID, `Matched concept`)]
  colnames(dt_species_data) <- c("PlotID", "SpeciesNames")
  
  dt_species_data <- unique(dt_species_data)
  
  WriteLogFile(ln= paste0(length(SpeciesNames), " species included in the analysis."),
               file.path(user_dir,base_dir,species_out_dir,LogFile))
  
  return(dt_species_data)
  
}

SelectPlots_forRF <- function(species_plots){
  
  
  WriteLogFile(ln="Start formatting species observations for fitting of random forest.",
               file.path(user_dir,base_dir,species_out_dir,LogFile))
  
  # load file with all plot locations
  PlotLocations <- fread(file.path(user_dir, base_dir, plots), quote="", sep = "\t")  
  
  # remove "!" from the habitat names
  expert_system_new <- sub("!|!!", "", PlotLocations$`Expert System`)
  expert_system_new <- sub("!|!!", "", expert_system_new)
  PlotLocations[ ,`Expert System` := expert_system_new]
  
  ### dublicate plots assigned to multiple habitat types
  # note rows numbers which should be duplicated
  RowNr_to_duplicate <- grep(",",PlotLocations$`Expert System`)
  WriteLogFile(ln= paste0(length(RowNr_to_duplicate), " vegetation plots are dublicated, as multiple habitats are assigned."),
               file.path(user_dir,base_dir,species_out_dir,LogFile))
  
  # note the number of times they should be duplicated
  HTsToDuplicate <- strsplit(PlotLocations[RowNr_to_duplicate,`Expert System`], ",")
  NrOfDuplicates <- unlist(lapply(HTsToDuplicate,length))
  # dublicate rows
  dublicated_rownrs <- lapply(c(1:length(RowNr_to_duplicate)), FUN = function(x){
    rep(RowNr_to_duplicate[x], NrOfDuplicates[x])
  })
  DublicatedPlots <- PlotLocations[unlist(dublicated_rownrs)]
  ### change the habitat types of the duplicated table so that each row is assigned 1 habitat type
  DublicatedPlots[, `Expert System` := unlist(HTsToDuplicate)]
  
  # remove the old rows and replace them with the new rows
  PlotLocations <- PlotLocations[!c(1:nrow(PlotLocations)) %in% RowNr_to_duplicate]
  PlotLocations <- rbind(PlotLocations,DublicatedPlots)
  
  # remove plots which are located in maritime wetlands, fresh water, marine water, man-made or unknown habtiats 
  # and remove plots which couldn’t be classified
  AllTypes <- unique(PlotLocations[,`Expert System`])
  Remove_Habitats <- AllTypes[grep(paste(DropHabitats_plots, collapse = "|") ,AllTypes)] 
  PlotLocations <- copy(PlotLocations[!(`Expert System` %in% c(Remove_Habitats,"","?"))])
  
  # remove plots which are assigned to level 1 or level 2 EUNIS habitat types
  Habitats <- unique(PlotLocations[,`Expert System`])
  Habitats <- Habitats[nchar(Habitats) == 3]
  PlotLocations <- PlotLocations[`Expert System` %in% Habitats]
  
  WriteLogFile(ln= paste0(nrow(PlotLocations), " vegetation plots included in model fitting, including dublicates."),
               file.path(user_dir,base_dir,species_out_dir,LogFile))
  WriteLogFile(ln= paste0("Habitat types included in analysis: ", paste((sort(Habitats)), collapse = ", ")),
               file.path(user_dir,base_dir,species_out_dir,LogFile))
  
  
  # clean the table with information about the plots
  PlotLocations <- PlotLocations[,.SD, .SDcols = c("PlotID", "Longitude", "Latitude", "Expert System")]
  colnames(PlotLocations) <- c("PlotID", "x", "y", "HabitatNames")
  
  return(PlotLocations)
  
}

FormatObservations_forRF <- function(PlotLocations, dt_species_data){
  # select the species information of the plots
  plots <- unique(PlotLocations$PlotID)
  dt_species_data <- dt_species_data[PlotID %in% plots]
  # change format of species table to fit input of RF
  dt_species_data[,observed := 1L]
  # split dcast in two parts, as the table exceeds the maximum size for the function
  plots_part1 <- unique(dt_species_data$PlotID)[1:(length(unique(dt_species_data$PlotID))/2)] #part1
  plots_part2 <- unique(dt_species_data$PlotID)[(length(unique(dt_species_data$PlotID))/2 +1) : length(unique(dt_species_data$PlotID))]#part2
  # dcast part 1 and part 2
  part1 <- dcast.data.table(dt_species_data[PlotID %in% plots_part1], 
                            PlotID ~ SpeciesNames, 
                            value.var = "observed", 
                            fill = 0L, 
                            drop = FALSE)
  part2 <- dcast.data.table(dt_species_data[PlotID %in% plots_part2], 
                            PlotID ~ SpeciesNames, 
                            value.var = "observed", 
                            fill = 0L, 
                            drop = FALSE)
  columnnames <- intersect(names(part1),names(part2))
  dt_species_data_wide <- merge(part1,part2, by = columnnames,all =TRUE)  
  # merge plot en species data
  setkey(dt_species_data_wide, PlotID)
  setkey(PlotLocations, PlotID)
  dt_HTandSpecies <- merge(PlotLocations, dt_species_data_wide, by = "PlotID",all =TRUE)
  # change NA-values to 0    
  for(i in 1:ncol(dt_HTandSpecies)){
    set(dt_HTandSpecies,which(is.na(dt_HTandSpecies[[i]])),i,0)}
  
  # remove redundant variables from table
  dt_HTandSpecies <- dt_HTandSpecies[, ':=' (PlotID = NULL, x = NULL, y = NULL)]
  
  # change column types to factors
  columns <- names(dt_HTandSpecies)
  dt_HTandSpecies[,(columns) := lapply(.SD,factor), .SDcols = columns]
  # change names of species so that the model can run.
  dt_HTandSpecies <- FormatSpeciesNames_forRF(dt_HTandSpecies)
  
  rm(part1)
  rm(part2)
  rm(dt_species_data_wide)
  gc()
  
  return(dt_HTandSpecies)
}

FormatSpeciesNames_forRF <- function(dt_species_wide){
  # change names of species so that the model can run.
  if(is.data.table(dt_species_wide) == TRUE){
    columns <- names(dt_species_wide)
  } 
  if(is.character(dt_species_wide) == TRUE){
    columns <- dt_species_wide
  }
  columns_NotSpecies <- columns[columns %in% c("HabitatNames","x","y")] 
  # also list names of species as used in EVA
  #SpeciesNames_EVA <- columns[!columns %in% columns_NotSpecies] 
  new_names <- sub(" ","_" , columns)
  new_names <- sub(" aggr.","_aggr" ,new_names)
  new_names <- sub(" subsp. ","_" ,new_names)
  new_names <- sub(" var. ","_" ,new_names)
  new_names <- sub(" ","_" ,new_names)
  new_names <- sub("-","_" ,new_names)
  
  # replace new names with old names
  if(is.data.table(dt_species_wide) == TRUE){
    names(dt_species_wide) <- new_names}
  if(is.character(dt_species_wide) == TRUE){
    dt_species_wide <- new_names
  }
  
  return(dt_species_wide)
}





# ====================================================================
#
# Copyright 2025, PBL Netherlands Environmental Assessment Agency
# 
# This source code of the BioScore model is owned by PBL Netherlands Environmental Assessment Agency. 
# It is not permitted to copy, redistribute, remix, transform, and build upon the material without written approval of PBL. 
# Permission for commercial purposes will not be granted. 
# This code is published to improve the transparency of the models used by PBL, 
# but without any warranty for fitness for any other purpose. 
# After approval of PBL to use the code, PBL will not provide any support.
#
# 
# ====================================================================

