# ====================================================================
#
# Copyright 2025, PBL Netherlands Environmental Assessment Agency
# See the copyright notice at the end of this file.
#
# ====================================================================


# Functions for loading and formatting the environmental data:
#     1. 	SelectPlots: Select the plots that are used for fitting.
#     2. 	ExtractVarValues: 
#         Load the environmental rasters listed in an ecosystem-specific overview table,
#       	extract the environmental values at the plot locations from these rasters,
#       	and write the resulting table to hard disk.
#     3. 	VIFanalysis: Keep only the environmental variables in the table whose VIF is below a given threshold value (e.g. 5 or 10).




# SelectPlots: read the table with plot locations and keep only the plots that
# are usable for fitting.
#
# Args:
#   species_plots: path of a tab-separated file with one row per plot. The
#     columns `Expert System`, `Date of recording` and
#     `Location uncertainty (m)` are used here.
#
# Returns:
#   A data.table with the retained plots and an added numeric column `Year`.
#
# Uses objects that are defined outside this function: species_out_dir,
# LogFile, DropHabitats_plots, MinYear and WriteLogFile().
SelectPlots <- function(species_plots){
  
  WriteLogFile(paste(species_out_dir,LogFile,sep="/"),ln="Start formatting variables for fitting of regressions.")
  
  # load file with all plot locations
  PlotLocations <- fread(species_plots, quote="", sep = "\t")  
  
  # Remove plots whose habitat type matches one of the patterns in
  # DropHabitats_plots, and plots with an empty or unknown ("?") habitat type.
  AllTypes <- unique(PlotLocations[,`Expert System`])
  if (length(DropHabitats_plots) > 0) {
    Remove_Habitats <- AllTypes[grep(paste(DropHabitats_plots, collapse = "|") ,AllTypes)] 
  } else {
    # Without this guard an empty pattern list collapses to "", a pattern that
    # matches every habitat type, so all plots would be dropped.
    Remove_Habitats <- character(0)
  }
  PlotLocations <- copy(PlotLocations[!(`Expert System` %in% c(Remove_Habitats,"","?"))])
  
  # The year is read from characters 7-10 of the recording date
  # (presumably a dd.mm.yyyy style date; verify against the input file).
  Year_of_recording <- as.numeric(substr(PlotLocations$`Date of recording`,7,10))
  PlotLocations <- PlotLocations[, Year := Year_of_recording]
  
  # Keep plots recorded in or after MinYear. No upper bound is applied here.
  # Rows with a missing Year are dropped as well (NA in a data.table filter
  # counts as FALSE).
  PlotLocations <- PlotLocations[Year >= MinYear]
  
  # Keep plots with a location uncertainty of at most 1000 m, or with an
  # unknown uncertainty.
  PlotLocations <- PlotLocations[`Location uncertainty (m)` <= 1000 | is.na(`Location uncertainty (m)`)]

  return(PlotLocations)
  
}


# ExtractVarValues: sample the environmental rasters at the plot locations.
#
# Args:
#   TableWithOverviewVariables: file name, inside user_dir/base_dir/var_fit_dir,
#     of the overview table. Its column `current` lists the raster files to
#     load and its column `SharedNameVariable` the names given to the layers.
#
# Returns:
#   A data.table with PlotID, Year, location uncertainty, the extracted
#   variable values and the point coordinates. Rows with a missing value in
#   any of the environmental variables are removed.
#
# Side effect: the table is written to "AllVars_ForPlots.csv" in the same
# directory as the overview table.
#
# Uses objects that are defined outside this function: user_dir, base_dir,
# var_fit_dir and PlotLocations.
ExtractVarValues <- function(TableWithOverviewVariables){
  
  # helper: full path of a file in the directory with the fitting variables
  in_var_dir <- function(file_name){
    file.path(user_dir, base_dir,var_fit_dir,file_name)
  }
  
  var_overview <- fread(in_var_dir(TableWithOverviewVariables))
  
  # one raster layer per listed file, combined into a single stack
  load_layer <- function(raster_file){
    raster(in_var_dir(raster_file))
  }
  env_stack <- stack(lapply(var_overview$current, load_layer))
  names(env_stack) <- var_overview$SharedNameVariable
  
  # build spatial points from the plot coordinates (given in EPSG:4326) ...
  plot_coords <- PlotLocations[,c("Longitude","Latitude")]
  plot_attributes <- PlotLocations[,c("PlotID","Year","Location uncertainty (m)")]
  plot_points <- SpatialPointsDataFrame( plot_coords, 
                                         plot_attributes,
                                         proj4string = CRS("EPSG:4326"))
  # ... and reproject them to the coordinate system of the rasters
  plot_points <- spTransform(plot_points, crs(env_stack))
  
  # read the raster values at each point and attach them to the point data
  sampled_points <- raster::extract( env_stack, plot_points, method='simple', sp=TRUE)
  env_table <- as.data.table(sampled_points)
  
  # The last two columns are taken to be the point coordinates and are named
  # accordingly. Note that the points were reprojected to the raster
  # coordinate system above.
  n_columns <- length(names(env_table))
  names(env_table)[(n_columns -1): n_columns] <- c("Longitude", "Latitude")
  
  # drop plots that lack a value for at least one environmental variable
  env_table <- na.omit(env_table, cols = var_overview$SharedNameVariable)
  
  # store the table so later runs can skip the extraction step
  write.csv(env_table, file = in_var_dir("AllVars_ForPlots.csv"))
  
  return(env_table)
}


# VIFanalysis: stepwise removal of collinear environmental variables based on
# the variance inflation factor (VIF).
#
# The variable with the highest VIF is removed and the VIF values are
# recalculated, until all remaining variables have a VIF below the threshold
# or only one variable is left.
#
# Args:
#   VIFthresholdValue: variables are removed while the highest VIF is equal to
#     or larger than this value.
#
# Returns:
#   A list with
#     AllVars:   data.table with the retained variables plus PlotID, Year,
#                Longitude and Latitude.
#     Variables: character vector with the names of the retained variables.
#
# Side effects: writes "CorrelationPressures.csv" (Spearman correlations
# between all variables) to species_out_dir and logs every removed variable.
#
# Uses objects that are defined outside this function: user_dir, base_dir,
# var_fit_dir, TableWithOverviewVariables, PlotLocations_Env, species_out_dir,
# LogFile and WriteLogFile().
VIFanalysis <- function(VIFthresholdValue){
  # select only the environmental variables from the table 
  OverviewVariables <- fread(file.path(user_dir, base_dir,var_fit_dir,TableWithOverviewVariables))
  VIFinput <- PlotLocations_Env[, OverviewVariables$SharedNameVariable, with=FALSE]
  
  # calculate correlations
  cortable <- round(cor(VIFinput,y=NULL,use="complete.obs",method="spearman"),2)
  cortable <- as.data.table(cortable, keep.rownames = TRUE)
  write.csv(cortable, file.path(species_out_dir,"CorrelationPressures.csv"))
  
  # calculate the VIF values
  set.seed(99) #set seed
  VIFresults <- usdm::vif(as.data.frame(VIFinput ))
  # as.character: guards against the names being returned as a factor
  Variables <- as.character(VIFresults$Variables)
  
  # Remove variables one at a time while the highest VIF is equal to or larger
  # than the threshold value. At least one variable is always kept.
  while(length(Variables) > 1 && max(VIFresults$VIF) >= VIFthresholdValue){
    # which.max selects a single variable, also when several variables share
    # the highest VIF (e.g. perfectly collinear variables, or the last two
    # remaining variables, which typically have the same VIF). Removing all
    # tied variables at once would discard more than needed and could leave
    # no variable at all.
    RemoveIndex <- which.max(VIFresults$VIF)
    RemoveVariable <- Variables[RemoveIndex]
    print(paste0("remove ", RemoveVariable))
    WriteLogFile(paste(species_out_dir,LogFile,sep="/"),ln= paste0("The variable ", RemoveVariable," has a VIF >= ", VIFthresholdValue,". Remove from analysis."))
    # remove variable from table
    Variables <- Variables[-RemoveIndex]
    VIFinput <- VIFinput[, Variables, with=FALSE]
    # calculate the VIF values again; a VIF needs at least two variables
    if(length(Variables) > 1){
      VIFresults <- usdm::vif(as.data.frame(VIFinput ))
      Variables <- as.character(VIFresults$Variables)
    }
  }

  # select only these variables which are included in the analysis.
  AllVars <- PlotLocations_Env[ , c(Variables, "PlotID", "Year", "Longitude", "Latitude"), with=FALSE]
  
  return(list(AllVars = AllVars, Variables = Variables))
  
}


# ====================================================================
#
# Copyright 2025, PBL Netherlands Environmental Assessment Agency
# 
# This source code of the BioScore model is owned by PBL Netherlands Environmental Assessment Agency. 
# It is not permitted to copy, redistribute, remix, transform, and build upon the material without written approval of PBL. 
# Permission for commercial purposes will not be granted. 
# This code is published to improve the transparency of the models used by PBL, 
# but without any warranty for fitness for any other purpose. 
# After approval of PBL to use the code, PBL will not provide any support.
# 
# ====================================================================


