# ====================================================================
#
# Copyright 2025, PBL Netherlands Environmental Assessment Agency
# See the copyright notice at the end of this file.
#
# ====================================================================


# functions for selecting presences and absences
#   1. a function to select the observations of a species from the species data table
#   2. a function to make an image of the plot locations


SelectSpeciesObservations <- function(myRespName){
  # load species presence data
  PresenceData <-  SpeciesData$dt_species_data[Species == myRespName] 
  
  # merge species data and env data
  setkey(PresenceData,PlotID)
  setkey(VariableData$AllVars,PlotID)
  SpeciesEnvVar <- merge(VariableData$AllVars, PresenceData, by = "PlotID", all.x = TRUE)
  
  # make new variables called x en y
  SpeciesEnvVar$x <- SpeciesEnvVar$Longitude
  SpeciesEnvVar$y <- SpeciesEnvVar$Latitude 
  
  # add a new variable which shows whether a species is observed
  SpeciesEnvVar[, observed := 0]
  SpeciesEnvVar[Species == myRespName, observed := 1]
  
  
  # write error to logfile, when there are no presence values in table: SpeciesEnvVar
  if(sum(SpeciesEnvVar$observed) == 0) { 
    WriteLogFile(paste(user_dir, base_dir, species_out_dir,LogFile,sep="/"),ln=paste0("ERROR: no presence values selected for ", myRespName,"."))
  }
  
  # make a selection of the presence records. max 1 random presence value per km cell
  # select only presence records
  Presences <- subset(SpeciesEnvVar, SpeciesEnvVar$observed == 1) 
  
  # first make a unique number per 1km grid cell. 
  Presences$xRound <- round((Presences$x -500)/1000,0)
  Presences$yRound <- round((Presences$y-500)/1000,0)
  Presences$GridNr <- paste0(Presences$xRound,Presences$yRound)
  
  # Randomly sort observation table
  set.seed(99) #set seed 
  ObsRandom <- Presences[sample(nrow(Presences)),]
  # make a list of all grid cell numbers where the species is present
  GridNrList <- as.character(ObsRandom$GridNr)
  # make a list with all unique grid cell numbers
  GridUnique <- unique(GridNrList)
  # Note the number of the row with the first found grid cell number in it listed in 'GridUnique'.
  Rownr <- match(GridUnique,GridNrList)
  
  # make a table with only the selected rows
  ObsSelect_Presences <- ObsRandom[Rownr,]
  
  # make fit and test set of resp. 80% and 20% of the presence
  # Randomly sort table with presences
  set.seed(1) #set seed
  presences_Random <- ObsSelect_Presences[sample(nrow(ObsSelect_Presences)),]
  
  # select first 80% of rows for fitting model
  Fit_presences_Random <- presences_Random[1:floor(nrow(presences_Random)*0.8),]
  # select last 20% of rows for testing data
  Test_presences_Random <- presences_Random[c((floor(nrow(presences_Random)*0.8)+1):nrow(presences_Random)),]
  
  # count nr of presences
  NPres_fit <- nrow( Fit_presences_Random)
  NPres_test <- nrow(Test_presences_Random)
  
  min_observations <- (length(VariableData$Variables)* min_obs_per_var) / 0.8
  
  #### !!!!ONLY CONTINUE MODELLING IF THERE ARE ENOUGH OBSERVATION
  RUN <- sum(NPres_fit,NPres_test) >= min_observations
  if(RUN == FALSE){
    WriteLogFile(paste(user_dir, base_dir, species_out_dir,LogFile,sep="/"),ln=paste0("ERROR: only ", sum(NPres_fit,NPres_test)," presence values selected for ", myRespName,". Stop script!"))
  }  
  
  # random selection of absence values. 
  # select a random observation within each 1000m grid cell.
  
  # select only abs
  Absences <- subset(SpeciesEnvVar, SpeciesEnvVar$observed == 0) 
  
  # first make a unique number per 5km grid cell. 
  Absences$xRound <- round((Absences$x -500)/1000,0)
  Absences$yRound <- round((Absences$y-500)/1000,0)
  Absences$GridNr <- paste0(Absences$xRound,Absences$yRound)
  
  # remove absences located in grid cells in which a presence is recorded. 
  Absences <- Absences[!GridNr %in% unique(Presences$GridNr)] 
  
  # Randomly sort observation table
  set.seed(88) #set seed 
  ObsRandom <- Absences[sample(nrow(Absences)),]
  # make a list of all grid cell numbers where the species is present
  GridNrList <- as.character(ObsRandom$GridNr)
  # make a list with all unique grid cell numbers
  GridUnique <- unique(GridNrList)
  # Note the number of the row with the first found grid cell number in it listed in 'GridUnique'.
  Rownr <- match(GridUnique,GridNrList)
  # make a table with only the selected rows
  ObsSelect_Absences <- ObsRandom[Rownr,]
  
  # for GLMM and GAMM 10000 absences are selected, or an amount equal to the number of presences when nr of pres > 10000
  ObsSelect_Absences_10k <- ObsSelect_Absences[1: max(10000,(NPres_fit+NPres_test)),] 
  # for BRT 1000 absences are selected, or an amount equal to the number of presences when nr of pres > 1000
  ObsSelect_Absences_1k <- ObsSelect_Absences[1: max(1000,(NPres_fit+NPres_test)),]
  if(modeltesting == TRUE){
    ObsSelect_Absences_10k <- ObsSelect_Absences[1: 1000,] 
    ObsSelect_Absences_1k <- ObsSelect_Absences[1: 100,]  
  }
  
  
  
  # Randomly sort table with absences of GAM and GLM
  set.seed(2) #set seed 
  Absences_10k_random <- ObsSelect_Absences_10k[sample(nrow(ObsSelect_Absences_10k)),]
  # select first 80% of rows for fitting model
  Fit_Absences_GLM_GAM <- Absences_10k_random[1:floor(nrow(Absences_10k_random)*0.8),]
  # select last 20% of rows for testing data
  Test_Absences_GLM_GAM <- Absences_10k_random[c((floor(nrow(Absences_10k_random)*0.8)+1):nrow(Absences_10k_random)),]
  
  # Randomly sort table with absences of BRT
  set.seed(3) #set seed 
  Absences_1k_random <- ObsSelect_Absences_1k[sample(nrow(ObsSelect_Absences_1k)),]
  # select first 80% of rows for fitting model
  Fit_Absences_BRT <- Absences_1k_random[1:floor(nrow(Absences_1k_random)*0.8),]
  # select last 20% of rows for testing data
  # count nr of absences and presences
  NAbs_GLM_GAM_fit <- nrow(Fit_Absences_GLM_GAM)  
  NAbs_GLM_GAM_test <- nrow(Test_Absences_GLM_GAM)
  NAbs_BRT_fit <- nrow(Fit_Absences_BRT)  
  
  # append the tables with the presence and the absence values
  AllData_GLM_GAM_fit <- rbind( Fit_presences_Random, Fit_Absences_GLM_GAM)
  AllData_test <- rbind( Test_presences_Random, Test_Absences_GLM_GAM)
  AllData_BRT_fit <- rbind( Fit_presences_Random, Fit_Absences_BRT)
  
  return(list(AllData_GLM_GAM_fit = AllData_GLM_GAM_fit,
         AllData_BRT_fit = AllData_BRT_fit,
         AllData_test = AllData_test,
         NPres_fit = NPres_fit,
         NPres_test = NPres_test,
         NAbs_GLM_GAM_fit = NAbs_GLM_GAM_fit,
         NAbs_GLM_GAM_test = NAbs_GLM_GAM_test,
         NAbs_BRT_fit = NAbs_BRT_fit,
         ContinueAnalysis = RUN )
         )
  
}



ImagesOfPlotLocations <- function(myRespName, ObservationData,PlotLocations.dir){
  
  if (!file.exists(file.path(PlotLocations.dir, myRespName))){ 
    dir.create(file.path(PlotLocations.dir, myRespName))}
  
  
  # save plots locations in image
  # GLMM and GAMM
  resp.filename <- file.path(PlotLocations.dir, myRespName, paste0("Plots_fit_GLM_GAM", myRespName, ".png"))
  png(file = resp.filename,
      width = 200, height = 150, units = 'mm', pointsize = 8, res = 400)
  
  par(mar=c(4,4,4,4))
  
  plotlocations <- plot(ObservationData$AllData_GLM_GAM_fit[observed == 0,x],ObservationData$AllData_GLM_GAM_fit[observed == 0,y], col = "red", 
                        xlim = c(2500000, 7200000),
                        ylim = c(1400000, 5500000),
                        main = paste("plots used to fit GLM and GAM of ", myRespName),
                        xlab= "x-coordinate",
                        ylab= "y-coordinate")
  plotlocations <- points(ObservationData$AllData_GLM_GAM_fit[observed == 1,x],ObservationData$AllData_GLM_GAM_fit[observed == 1,y], col = "blue", 
                          xlim = c(2500000, 7200000),
                          ylim = c(1400000, 5500000))
  legend("right", c("Absence", "Presence"), fill=c("red", "blue"))
  dev.off()
  
  # BRT
  resp.filename <- file.path(PlotLocations.dir, myRespName, paste0("Plots_fit_BRT_", myRespName, ".png"))
  png(file = resp.filename,
      width = 200, height = 150, units = 'mm', pointsize = 8, res = 400)
  
  par(mar=c(4,4,4,4))
  
  plotlocations <- plot(ObservationData$AllData_BRT_fit[observed == 0,x],ObservationData$AllData_BRT_fit[observed == 0,y], col = "red", 
                        xlim = c(2500000, 7200000),
                        ylim = c(1400000, 5500000),
                        main = paste("plots used to fit BRT of ", myRespName),
                        xlab= "x-coordinate",
                        ylab= "y-coordinate")
  plotlocations <- points(ObservationData$AllData_BRT_fit[observed == 1,x],ObservationData$AllData_BRT_fit[observed == 1,y], col = "blue", 
                          xlim = c(2500000, 7200000),
                          ylim = c(1400000, 5500000))
  legend("right", c("Absence", "Presence"), fill=c("red", "blue"))
  dev.off()
  
  
  # TESTING DATASET
  resp.filename <- file.path(PlotLocations.dir, myRespName, paste0("Plots_TEST_", myRespName, ".png"))
  png(file = resp.filename,
      width = 200, height = 150, units = 'mm', pointsize = 8, res = 400)
  
  par(mar=c(4,4,4,4))
  
  plotlocations <- plot(ObservationData$AllData_test[observed == 0,x],ObservationData$AllData_test[observed == 0,y], col = "red", 
                        xlim = c(2500000, 7200000),
                        ylim = c(1400000, 5500000),
                        main = paste("plots used to test BRT, GLM and GAM of ", myRespName),
                        xlab= "x-coordinate",
                        ylab= "y-coordinate")
  plotlocations <- points(ObservationData$AllData_test[observed == 1,x],ObservationData$AllData_test[observed == 1,y], col = "blue", 
                          xlim = c(2500000, 7200000),
                          ylim = c(1400000, 5500000))
  legend("right", c("Absence", "Presence"), fill=c("red", "blue"))
  dev.off()
}

# ====================================================================
#
# Copyright 2025, PBL Netherlands Environmental Assessment Agency
# 
# This source code of the BioScore model is owned by PBL Netherlands Environmental Assessment Agency. 
# It is not permitted to copy, redistribute, remix, transform, and build upon the material without written approval of PBL. 
# Permission for commercial purposes will not be granted. 
# This code is published to improve the transparency of the models used by PBL, 
# but without any warranty for fitness for any other purpose. 
# After approval of PBL to use the code, PBL will not provide any support.
# 
# ====================================================================

