# INLACyANLakeData.R ----
# Author: Mark Myer
# Purpose: To model CyAN lake data "high risk" cell count incidence
# Revision: August 19, 2019
# R Version 3.5.1 "Feather Spray"

#Make sure you install all of these with install.packages() first
#INLA needs to be installed with install.packages("INLA", repos=c(getOption("repos"), INLA="https://inla.r-inla-download.org/R/testing"), dep=TRUE)
library(readr)
library(dplyr)
library(tidyr)
library(magrittr)
library(rgdal)
library(rgeos)
library(sp)
library(raster)
library(lubridate)
library(INLA)
INLA:::inla.dynload.workaround()
library(ROCR)
library(maptools)
library(splancs)
library(fields)
library(ggplot2)
library(RColorBrewer)
library(gridExtra)

#setwd("") #Set your working directory

#Read in the data
data_avg <- read.csv("Supplemental4_LakeData.csv")
mutate(data_avg,
       Week = as.numeric(substr(data_avg$Time, 5, 6)),
       Year = as.numeric(substr(data_avg$Time, 1, 4)),
       Bloom = as.numeric(data_avg$Bloom)) -> data_avg

poly <- readOGR(dsn=getwd(), layer="FL_POLYshapefiles") %>%
  spTransform("+proj=aea +lat_1=29.5 +lat_2=45.5 +lat_0=23 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +units=km +no_defs")
statelines <- readOGR(dsn=getwd(), layer="state_48")

#### Split the data into MERIS-era and OLCI-era records
dat.split2 <- split(data_avg, data_avg$Year < 2016)
data_MERIS <- dat.split2[[2]] #Rows for which the conditional in split() is TRUE, i.e. Year < 2016 (MERIS era)
data_OLCI <- dat.split2[[1]]  #Rows for which the conditional in split() is FALSE, i.e. Year >= 2016 (OLCI era)
data_avg <- data_OLCI

#Grab out the most recent data from 2019, and see if we can predict it. In this case it's Week 22: May 27 through June 2
dat.split <- split(data_avg, data_avg$Year == 2019 & data_avg$Week == max(dplyr::select(filter(data_avg, Year == 2019), Week)))
data_avg <- dat.split[[1]] #All the rows for which the conditional in split() is FALSE
dat.pred <- dat.split[[2]] #All the rows for which the conditional in split() is TRUE, i.e. the ones from the max week in 2019

#This is where you would put your data frame of 113 observations with forecast weather data to predict a future week;
#eliminate the split step above and store your new data in dat.pred
#dat.pred <- [your dataframe for predictions here]

data_avg <- data_MERIS #Use the MERIS-era data for model fitting; dat.pred above remains the OLCI prediction week

#Partition the eval data into 4/5ths training data and 1/5th validation data
#Note that this partitioning is random, so your results upon replication may vary slightly
dat.eval <- data_avg[sample(nrow(data_avg), (nrow(data_avg)*0.8)),] #22256 observations

#Function to get the randomly held-out rows of data_avg that aren't in dat.eval
not.in <- function(x1, x2, ...) {
  x1.p <- do.call("paste", x1)
  x2.p <- do.call("paste", x2)
  x1[!x1.p %in% x2.p,]
}
dat.val <- not.in(data_avg, dat.eval) #2775 observations

data_avg <- rbind(dat.eval, dat.val, dat.pred) #Put them back together but with validation and prediction data at the end
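#Optional alternative (not part of the original workflow): drawing row indices directly
#avoids the paste-and-match step in not.in() and is a common way to make a random split.
#If used, run it in place of the sample()/not.in() step above, before dat.pred is appended.
#set.seed(1)                                                #optional, for a reproducible split
#train.idx <- sample(nrow(data_avg), floor(nrow(data_avg) * 0.8))
#dat.eval  <- data_avg[train.idx, ]                         #training rows
#dat.val   <- data_avg[-train.idx, ]                        #held-out validation rows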
#Get some spatial data before scaling
loc = cbind(data_avg$LONG/1000, data_avg$LAT/1000)

#Scale variables
data_avg[,5:10] <- data_avg %>% dplyr::select(5:10) %>% transmute_all(base::scale)

#No time variables, just a simple binomial regression
formula1 = Bloom ~ LONG + LAT + Atemp + Wtemp + Precip + Area + dMean
result1 <- inla(formula1, data=data_avg, family="binomial",
                control.compute=list(dic=TRUE, cpo=TRUE))
summary(result1)

#Add in an AR1-structured time covariate on week within the year
hyprior = list(theta1 = list(prior="pc.prec", param=c(0.5, 0.05)),
               theta2 = list(prior="pc.cor1", param=c(0.1, 0.9)))
formula2 = Bloom ~ LONG + LAT + Atemp + Wtemp + Precip + Area + dMean + f(Week, model="ar1", hyper=hyprior)
result2 <- inla(formula2, data=data_avg, family="binomial",
                control.predictor=list(compute=TRUE, link=1),
                control.inla=list(int.strategy='eb'),
                control.family=list(link="logit"),
                verbose=TRUE,
                control.compute=list(dic=TRUE, cpo=TRUE))
summary(result2)

#Model with a spatial component, a temporal component by week, and partitioned into training and validation data
max.edge = diff(range(loc[,1]))/15
bound.outer = diff(range(loc[,1]))/3
(mesh <- inla.mesh.2d(boundary=poly, loc=loc, max.edge=c(3,5)*max.edge, cutoff=3, offset=c(max.edge, bound.outer)))$n
spde <- inla.spde2.matern(mesh)
A <- inla.spde.make.A(mesh, loc=as.matrix(loc))
idx <- inla.spde.make.index('s', mesh$n)
stk <- inla.stack(data=list(Bloom=c(dat.eval$Bloom, rep(NA, times=nrow(dat.val)+nrow(dat.pred)))), #Put NA for validation and prediction data
                  A=list(A, 1),
                  effects=list(idx, list(data.frame(cbind(data_avg[,c(5:12, 14)])))))

formula3 = Bloom ~ -1 + Atemp + Wtemp + Precip + Area + dMean + f(Week, model="ar1", hyper=hyprior) + f(s, model=spde)
result3 <- inla(formula3, family='binomial',
                control.family=list(link="logit"),
                data=inla.stack.data(stk),
                control.predictor=list(A=inla.stack.A(stk), compute=TRUE, link=1),
                control.inla=list(int.strategy='eb'),
                verbose=TRUE,
                control.compute=list(dic=TRUE, cpo=TRUE))
summary(result3)

#Extract the spatial results for the range and variance and plot
result3.spatial <- inla.spde2.result(result3, "s", spde)
plot(result3.spatial[["marginals.range.nominal"]][[1]], type = "l",
     main = "Nominal range, posterior density") #Correlation range around 20 km
plot(result3.spatial[["marginals.variance.nominal"]][[1]], type = "l",
     main = "Nominal variance, posterior density") #Variance around 13. Means an S.D. of about sqrt(13) ~ 3.6 log-odds. That is a powerful effect.
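#Optional comparison (not part of the original script): dic=TRUE and cpo=TRUE were requested for all
#three fits above, so the models can be compared directly. Lower DIC and a larger (less negative)
#sum of log-CPO both favour a model. Note that result3 was fitted with the validation and prediction
#responses set to NA, so its scores are computed on fewer observations than result1 and result2.
c(model1 = result1$dic$dic, model2 = result2$dic$dic, model3 = result3$dic$dic)
c(model1 = sum(log(result1$cpo$cpo), na.rm = TRUE),
  model2 = sum(log(result2$cpo$cpo), na.rm = TRUE),
  model3 = sum(log(result3$cpo$cpo), na.rm = TRUE))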
mean.range = inla.emarginal(function(x) x, result3.spatial$marginals.range.nominal[[1]])
mean.range2 = inla.emarginal(function(x) x^2, result3.spatial$marginals.range.nominal[[1]])
std.range = sqrt(mean.range2 - mean.range^2)
ci.range = inla.qmarginal(c(0.025, 0.5, 0.975), result3.spatial$marginals.range.nominal[[1]])

mean.variance = inla.emarginal(function(x) x, result3.spatial$marginals.variance.nominal[[1]])
mean.variance2 = inla.emarginal(function(x) x^2, result3.spatial$marginals.variance.nominal[[1]])
std.variance = sqrt(mean.variance2 - mean.variance^2)
ci.variance = inla.qmarginal(c(0.025, 0.5, 0.975), result3.spatial$marginals.variance.nominal[[1]])

#Plot the AR1 time series coefficients
wktrend2 <- result2$summary.random$Week
plot(wktrend2[,1:2], type='l', xlab="Week", ylab="Time Coefficient")
abline(h=0, lty=3)
wktrend3 <- result3$summary.random$Week
plot(wktrend3[,1:2], type='l', xlab="Week", ylab="Time Coefficient")
abline(h=0, lty=3)

#Compute the variance for the AR1 component (the 'temporal variance'), transforming the precision hyperparameter
ar1.variance = inla.tmarginal(function(x) (1/x), result3$marginals.hyperpar[[1]])
mean.ar1.variance = inla.emarginal(function(x) x, ar1.variance)
mean.ar1.variance2 = inla.emarginal(function(x) x^2, ar1.variance)
mean.ar1.variance.stdv = sqrt(mean.ar1.variance2 - mean.ar1.variance^2)

#### Grab the AR1 correlation parameter
ar1 = inla.tmarginal(function(x) x, result3$marginals.hyperpar[[2]])
mean.ar1 = inla.emarginal(function(x) x, ar1)
mean.ar1.2 = inla.emarginal(function(x) x^2, ar1)
mean.ar1.stdv = sqrt(mean.ar1.2 - mean.ar1^2)
ci.ar1 = inla.qmarginal(c(0.025, 0.5, 0.975), ar1)
ci.ar1.variance = inla.qmarginal(c(0.025, 0.5, 0.975), ar1.variance)

#Assemble a summary table of the hyperparameters
g <- matrix(nrow=4, ncol=5)
colnames(g) <- c("Mean", "SD", "0.025", "0.5", "0.975")
row.names(g) <- c("Temporal Variance", "Spatial Variance", "Spatial Correlation Range", "AR(1)")
g[1,1] <- mean.ar1.variance
g[1,2] <- mean.ar1.variance.stdv
g[1,3:5] <- ci.ar1.variance
g[2,1] <- mean.variance
g[2,2] <- std.variance
g[2,3:5] <- ci.variance
g[3,1] <- mean.range
g[3,2] <- std.range
g[3,3:5] <- ci.range
g[4,1] <- mean.ar1
g[4,2] <- mean.ar1.stdv
g[4,3:5] <- ci.ar1

#Plot the weekly AR1 effect with its 95% credible interval
weekvalues.3 <- data.frame(week = range(data_avg$Week)[1]:range(data_avg$Week)[2],
                           mean = result3$summary.random$Week$mean,
                           lower = result3$summary.random$Week$`0.025quant`,
                           upper = result3$summary.random$Week$`0.975quant`)
tiff(filename="weekvalues.tiff", width=4, height=3, units="in", res=300, pointsize=8, compression="lzw")
ggplot(weekvalues.3) +
  geom_line(aes(y=mean, x=week, colour=""), size=1) +
  geom_ribbon(aes(ymin=lower, ymax=upper, x=week, fill=""), alpha=0.3) +
  geom_hline(yintercept=0, lty=3) +
  scale_colour_manual("", values="blue") +
  scale_fill_manual("", values="grey50") +
  labs(x="Week", y="Log Odds") +
  scale_x_continuous(breaks=c(0,12,24,36,48)) +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(colour = "black"),
        legend.position="none",
        axis.title.x = element_text(size=rel(1.2)), #, face="bold"),
        axis.title.y = element_text(size=rel(1.2))  #, face="bold")
  )
dev.off()

#Plot the SPDE spatial component of the model
border <- spTransform(poly, CRS("+proj=aea +lat_1=29.5 +lat_2=45.5 +lat_0=23 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +units=km +no_defs"))
florida <- spTransform(statelines[statelines@data$STATE_NAME=="Florida",], CRS("+proj=aea +lat_1=29.5 +lat_2=45.5 +lat_0=23 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +units=km +no_defs"))
ob <- unionSpatialPolygons(border, rep(1, nrow(border))) #This simplifies and unifies the boundary for faster processing
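#Optional quick check (not part of the original script): plot the dissolved boundary to confirm
#the union worked before building the projection grid below.
plot(ob, axes = TRUE)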
bound <- inla.sp2segment(ob) #Converts the polygon boundary to an inla.mesh.segment object, which is a series of line segments
stepsize <- 0.1 #This defines the grid as made of squares 0.1 km on a side
nxy <- round(c(diff(range(bound$loc[,1]))/stepsize, diff(range(bound$loc[,2]))/stepsize)) #Number of pixels in each direction, based on the size of the study area
projgrid <- inla.mesh.projector(mesh, xlim=range(bound$loc[,1]), ylim=range(bound$loc[,2]), dims=nxy) #Define a projection grid over the dims of our study area
simplemesh = inla.mesh.2d(boundary=bound, max.edge=1e9) #This is for NA'ing out the parts that aren't in the mesh

#Extract the values of the projected field at the centroid of each water body and plot those instead
#This is a more correct representation of the field value used for the predictions at each water body
centroids <- read.csv("LakeDataCentroids_9-26.csv")[,-1] #Remove the extraneous index column
centroids$LONG = round(centroids$LONG/1000, 1)
centroids$LAT = round(centroids$LAT/1000, 1)
x = round(projgrid$x, 1)
y = round(projgrid$y, 1)
all = expand.grid(x, y, KEEP.OUT.ATTRS = F)
names(all) = c("LONG", "LAT")
grid.proj = inla.mesh.project(projgrid, result3$summary.random[['s']][['mean']]) #Project the spatial values onto the grid
plotdata = all %>% mutate(field = as.numeric(grid.proj)) #Transfer the values from the grid to a data.frame
centroid_values = merge(plotdata, centroids, by=c('LONG', 'LAT'))[,-c(1,2)] #Merge the spatial values to each centroid location
border <- merge(border, centroid_values, by='COMID', all=T) #Append the field value from the centroid to each polygon in the 'border' shapefile
nrow(border[which(is.na(border$field)),]) #Check whether any lakes didn't get a field value
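#Optional look (not part of the original workflow): a quick raster-style view of the projected
#spatial field before it is attached to the lake polygons, using the fields package loaded above.
image.plot(projgrid$x, projgrid$y, grid.proj,
           xlab = "Easting (km)", ylab = "Northing (km)")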
#Plot lakes colored by the value of the spatial field
par(mai=c(0.62, 0.52, 0.32, 0.62))

#Scaling parameters to zoom the resulting map
scale.parameter = 0.6 #Scaling parameter: less than 1 zooms in, more than 1 zooms out
xshift = 200 #Shift to the right, in map units
yshift = 50 #Shift upward, in map units
original.bbox = border@bbox #Bounding box of the Spatial* object

#Define the changes to map edges based on the above values
edges = original.bbox
edges[1,] <- (edges[1,] - mean(edges[1,])) * scale.parameter + mean(edges[1,]) + xshift
edges[2,] <- (edges[2,] - mean(edges[2,])) * scale.parameter + mean(edges[2,]) + yshift

#Define some map objects to stick on the image
scale = list("SpatialPolygonsRescale", layout.scale.bar(), scale = 50, fill = c("transparent","black"), offset = c(1375, 575))
arrow = list("SpatialPolygonsRescale", layout.north.arrow(), offset = c(1500, 810), scale = 25)
text = list("sp.text", c(1400, 585), "50 km")

#Create the plot
#tiff(filename="SpatialEffect.tiff",width=6,height=5,units="in",res=300,pointsize=8,compression="lzw")
spplot(border, "field", col.regions = rev(brewer.pal(n=9, name="Spectral")), cuts=8, main="", scales=list(draw=F),
       xlim = edges[1,], ylim = edges[2,], sp.layout=list(scale, arrow, text, florida))
grid.text("", x=unit(0.99, "npc"), y=unit(0.50, "npc"), rot=90)
#dev.off()

rm(grid.proj, projgrid, plotdata, all) #Save some memory

#Check the predictive power of the model on the 20% held-out observations and on the most recent predictions
responses <- result3$summary.fitted.values$mean[1:nrow(data_avg)]
holdout.vals <- responses[(nrow(dat.eval)+1):(nrow(data_avg)-nrow(dat.pred))] #Predicted values at the index positions of the validation data (recall that validation holdouts were appended between eval and preds)
holdout.obs <- dat.val$Bloom
prediction.vals <- responses[(nrow(dat.eval)+nrow(dat.val)+1):nrow(data_avg)] #Predicted values at the index positions of the prediction data
prediction.vals.025 <- result3$summary.fitted.values$`0.025quant`[(nrow(dat.eval)+nrow(dat.val)+1):nrow(data_avg)] #Obtain the 95% credible interval for predictions
prediction.vals.975 <- result3$summary.fitted.values$`0.975quant`[(nrow(dat.eval)+nrow(dat.val)+1):nrow(data_avg)]
prediction.vals.sd <- result3$summary.fitted.values$`sd`[(nrow(dat.eval)+nrow(dat.val)+1):nrow(data_avg)]
prediction.vals.se <- prediction.vals.sd/sqrt(103) #Note: 103 is hard-coded here as the number of prediction observations
prediction.obs <- dat.pred$Bloom

#Create an ROC curve, determine the optimal cutoff point using Youden's index, and then calculate the AUC value with the optimal cutoff
roc.val <- prediction(holdout.vals, holdout.obs)
roc.perf.val <- performance(roc.val, measure="tpr", x.measure="fpr") #"tpr" means true positive rate, "fpr" is false positive rate
roc.pred <- prediction(prediction.vals, prediction.obs)
roc.perf.pred <- performance(roc.pred, measure="tpr", x.measure="fpr")

#This function finds the optimal cutoff point according to Youden's index (this balances type I and II errors while attempting to minimize both)
opt.cut = function(perf, val){
  cut.ind = mapply(FUN=function(x, y, p){
    d = (x - 0)^2 + (y - 1)^2
    ind = which(d == min(d))
    c(sensitivity = y[[ind]], specificity = 1 - x[[ind]], cutoff = p[[ind]])
  }, perf@x.values, perf@y.values, val@cutoffs)
}
cutoff <- opt.cut(roc.perf.val, roc.val)
cutoff[3] #Prints the optimal cutoff point
#0.2581 OLCI, 0.29 MERIS
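#Optional (not part of the original script): Youden's J implied by the selected cutoff,
#computed from the sensitivity and specificity returned by opt.cut() above (J = sensitivity + specificity - 1)
cutoff[1] + cutoff[2] - 1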
#Obtain the AUC value
auc.perf.val = performance(roc.val, measure = "auc")
auc.perf.pred = performance(roc.pred, measure = "auc")
#auc.perf.val@y.values
#auc.perf.pred@y.values

#Plot the ROC curve with the optimal cutoff
tiff(filename="ROCRFig.tiff", width=3, height=3, units="in", res=300, pointsize=8, compression="lzw")
plot(roc.perf.val, colorize=T, lwd=2, lty=1, cex.lab=1.2, colorkey.pos="right")
points(x=1.01-cutoff[2], y=cutoff[1], cex=2, pch=20, col="black") #0.01 is added to the x coordinate to scoot the point a little over so it's centered on the line
abline(a=0, b=1)
dev.off()

#Make a confusion matrix with our selected optimal cutoff point
#Rows are the predicted class (FALSE, TRUE); columns are the observed class (0, 1)
confmatrix.val <- as.matrix(table(factor(holdout.vals>=cutoff[3], levels=c(F,T)), holdout.obs))
confmatrix.pred <- as.matrix(table(factor(prediction.vals>=cutoff[3], levels=c(F,T)), prediction.obs))
confmatrix.val
confmatrix.pred

#Specificity = TN/(TN+FP); sensitivity = TP/(TP+FN); accuracy = (TP+TN)/total
spec.val <- confmatrix.val[1,1]/(confmatrix.val[1,1]+confmatrix.val[2,1])
sens.val <- confmatrix.val[2,2]/(confmatrix.val[1,2]+confmatrix.val[2,2])
acc.val <- (confmatrix.val[2,2]+confmatrix.val[1,1])/(confmatrix.val[1,2]+confmatrix.val[2,2]+confmatrix.val[1,1]+confmatrix.val[2,1])
auc.perf.val@y.values
spec.val
sens.val
acc.val

spec.pred <- confmatrix.pred[1,1]/(confmatrix.pred[1,1]+confmatrix.pred[2,1])
sens.pred <- confmatrix.pred[2,2]/(confmatrix.pred[1,2]+confmatrix.pred[2,2])
acc.pred <- (confmatrix.pred[2,2]+confmatrix.pred[1,1])/(confmatrix.pred[1,2]+confmatrix.pred[2,2]+confmatrix.pred[1,1]+confmatrix.pred[2,1])
auc.perf.pred@y.values
spec.pred
sens.pred
acc.pred

#Assemble a summary table of holdout and prediction performance
m <- matrix(ncol=2, nrow=4)
colnames(m) <- c("Holdout", "Prediction")
row.names(m) <- c("AUC", "Spec", "Sens", "ACC")
m[1,1] <- auc.perf.val@y.values[[1]]
m[1,2] <- auc.perf.pred@y.values[[1]]
m[2,1] <- spec.val
m[2,2] <- spec.pred
m[3,1] <- sens.val
m[3,2] <- sens.pred
m[4,1] <- acc.val
m[4,2] <- acc.pred

#Visualize the most-recent-week predictions by plotting the predicted probability of a 100,000+ cells/mL bloom
prediction.comids <- data.frame(COMID = dat.pred$COMID, Bloom = prediction.vals, Bloom.025 = prediction.vals.025, Bloom.975 = prediction.vals.975)
prediction.comids2 <- data.frame(COMID = dat.pred$COMID, Bloom = prediction.vals, Bloom.SE = prediction.vals.se, Bloom.SD = prediction.vals.sd, OBS = prediction.obs)
border <- merge(border, prediction.comids2, by='COMID', all=T) #Append the prediction value to each polygon in the 'border' shapefile

#Plot the predictions
tiff(filename="PredictionMap.tiff", width=18, height=5, units="in", res=300, pointsize=8, compression="lzw")
# grid.arrange(
#   spplot(border, "Bloom.025", col.regions = brewer.pal(n=9, name="BuGn"), cuts=8, main="0.025 Quantile", scales=list(draw=F),
#          xlim = edges[1,], ylim = edges[2,], sp.layout=list(scale, arrow, text, florida)),
#   spplot(border, "Bloom", col.regions = brewer.pal(n=9, name="BuGn"), cuts=8, main="Odds of a Severe Bloom\n5/27/19 - 6/2/19", scales=list(draw=F),
#          xlim = edges[1,], ylim = edges[2,], sp.layout=list(scale, arrow, text, florida)),
#   spplot(border, "Bloom.975", col.regions = brewer.pal(n=9, name="BuGn"), cuts=8, main="0.975 Quantile", scales=list(draw=F),
#          xlim = edges[1,], ylim = edges[2,], sp.layout=list(scale, arrow, text, florida)),
#   ncol=3)

######## Plot predictions and SD ####################################
grid.arrange(
  spplot(border, "Bloom", col.regions = rev(brewer.pal(n=9, name="Spectral")), cuts=8, main="", scales=list(draw=F),
         xlim = edges[1,], ylim = edges[2,], sp.layout=list(scale, arrow, text, florida)),
  spplot(border, "Bloom.SD", col.regions = rev(brewer.pal(n=9, name="Spectral")), cuts=8, main="", scales=list(draw=F),
         xlim = edges[1,], ylim = edges[2,], sp.layout=list(scale, arrow, text, florida)),
  ncol=2)
grid.text("Spatial Effect (log-odds)", x=unit(0.99, "npc"), y=unit(0.50, "npc"), rot=90) #Include this if you want; it may be redundant
dev.off()
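#Optional (not part of the original script): write the per-lake predictions and credible intervals
#to a CSV for sharing; the file name here is illustrative only.
#write.csv(prediction.comids, "Week22_BloomPredictions.csv", row.names = FALSE)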
="Spectral")), cuts=8, main="", scales=list(draw=F), xlim = edges[1,], ylim = edges[2,], sp.layout=list(scale, arrow, text, florida)), ncol=2) grid.text("Spatial Effect (log-odds)", x=unit(0.99, "npc"), y=unit(0.50, "npc"), rot=90) #Include this if you want, I think it's redundant dev.off() #To aid in interpreting LONG and LAT in result1, whose units are standard deviations of distance from the centroid of all observations, plot a map centerpoint <- data.frame(LONG = attributes(data_avg$LONG)$'scaled:center', LAT = attributes(data_avg$LAT)$'scaled:center') coordinates(centerpoint) <- ~LONG+LAT proj4string(centerpoint) <- crs(statelines) #tiff(filename="coordinate_sd_illustration.tiff",width=4,height=4,units="in",res=300,pointsize=8,compression="lzw") plot(statelines[statelines@data$STATE_NAME=="Florida",]) plot(centerpoint, add=TRUE) #Add segments that represent 1 unit distance for LONG and LAT coefficients segments(x0=rep(centerpoint@coords[1], times=2), y0=rep(centerpoint@coords[2], times=2), x1=c(centerpoint@coords[1] + attributes(data_avg$LONG)$'scaled:scale', centerpoint@coords[1]), y1=c(centerpoint@coords[2], centerpoint@coords[2] + attributes(data_avg$LAT)$'scaled:scale'), col="blue", lwd=3) #dev.off()