# INLACyANLakeData.R ----
# Author: Mark Myer
# Purpose: To model CyAN lake data "high risk" cell count incidence
# Revision: August 19, 2019
# R Version 3.5.1 "Feather Spray"

#Make sure you install all of these with install.packages() first
#INLA needs to be installed with install.packages("INLA", repos=c(getOption("repos"), INLA="https://inla.r-inla-download.org/R/testing"), dep=TRUE)
library(readr)
library(dplyr)
library(tidyr)
library(magrittr)
library(rgdal)
library(rgeos)
library(sp)
library(raster)
library(lubridate)
library(INLA)
INLA:::inla.dynload.workaround()
library(ROCR)
library(maptools)
library(splancs)
library(fields)
library(ggplot2)
library(RColorBrewer)
library(gridExtra)

#setwd("") #Set your working directory

#Read in the data
data_avg <- read.csv("Supplemental4_LakeData.csv")
mutate(data_avg,
       Week = as.numeric(substr(data_avg$Time, 5, 6)),
       Year = as.numeric(substr(data_avg$Time, 1, 4)),
       Bloom = as.numeric(data_avg$Bloom)) -> data_avg

poly <- readOGR(dsn=getwd(), layer="FL_POLYshapefiles") %>%
  spTransform("+proj=aea +lat_1=29.5 +lat_2=45.5 +lat_0=23 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +units=km +no_defs")
statelines <- readOGR(dsn=getwd(), layer="state_48")

#### Split the data into MERIS-era and OLCI-era records
dat.split2 <- split(data_avg, data_avg$Year < 2016)
data_MERIS <- dat.split2[[2]] #Rows for which the conditional in split() is TRUE, i.e. Year < 2016 (MERIS era)
data_OLCI <- dat.split2[[1]]  #Rows for which the conditional in split() is FALSE, i.e. Year >= 2016 (OLCI era)
data_avg <- data_OLCI

#Grab out the most recent data from 2019, and see if we can predict it. In this case it's Week 22: May 27 through June 2
dat.split <- split(data_avg, data_avg$Year == 2019 & data_avg$Week == max(dplyr::select(filter(data_avg, Year == 2019), Week)))
data_avg <- dat.split[[1]] #All the rows for which the conditional in split() is FALSE
dat.pred <- dat.split[[2]] #All the rows for which the conditional in split() is TRUE, i.e. the ones from the max week in 2019

#This is where you would put your data frame of 113 observations with forecast weather data to predict a future week;
#eliminate the split step above and store your new data in dat.pred
#dat.pred <- [your dataframe for predictions here]

data_avg <- data_MERIS #Use the MERIS-era data for model fitting; dat.pred above remains the OLCI prediction week

#Partition the eval data into 4/5ths training data and 1/5th validation data
#Note that this partitioning is random, so your results upon replication may vary slightly
dat.eval <- data_avg[sample(nrow(data_avg), (nrow(data_avg)*0.8)),] #22256 observations

#Function to get the randomly held-out rows of data_avg that aren't in dat.eval
not.in <- function(x1, x2, ...) {
  x1.p <- do.call("paste", x1)
  x2.p <- do.call("paste", x2)
  x1[!x1.p %in% x2.p,]
}
dat.val <- not.in(data_avg, dat.eval) #2775 observations

data_avg <- rbind(dat.eval, dat.val, dat.pred) #Put them back together but with validation and prediction data at the end
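#Optional alternative (not part of the original workflow): drawing row indices directly
#avoids the paste-and-match step in not.in() and is a common way to make a random split.
#If used, run it in place of the sample()/not.in() step above, before dat.pred is appended.
#set.seed(1)                                                #optional, for a reproducible split
#train.idx <- sample(nrow(data_avg), floor(nrow(data_avg) * 0.8))
#dat.eval  <- data_avg[train.idx, ]                         #training rows
#dat.val   <- data_avg[-train.idx, ]                        #held-out validation rows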
#Get some spatial data before scaling
loc = cbind(data_avg$LONG/1000, data_avg$LAT/1000)

#Scale variables
data_avg[,5:10] <- data_avg %>% dplyr::select(5:10) %>% transmute_all(base::scale)

#No time variables, just a simple binomial regression
formula1 = Bloom ~ LONG + LAT + Atemp + Wtemp + Precip + Area + dMean
result1 <- inla(formula1, data=data_avg, family="binomial",
                control.compute=list(dic=TRUE, cpo=TRUE))
summary(result1)

#Add in an AR1-structured time covariate on week within the year
hyprior = list(theta1 = list(prior="pc.prec", param=c(0.5, 0.05)),
               theta2 = list(prior="pc.cor1", param=c(0.1, 0.9)))
formula2 = Bloom ~ LONG + LAT + Atemp + Wtemp + Precip + Area + dMean + f(Week, model="ar1", hyper=hyprior)
result2 <- inla(formula2, data=data_avg, family="binomial",
                control.predictor=list(compute=TRUE, link=1),
                control.inla=list(int.strategy='eb'),
                control.family=list(link="logit"),
                verbose=TRUE,
                control.compute=list(dic=TRUE, cpo=TRUE))
summary(result2)

#Model with a spatial component, a temporal component by week, and partitioned into training and validation data
max.edge = diff(range(loc[,1]))/15
bound.outer = diff(range(loc[,1]))/3
(mesh <- inla.mesh.2d(boundary=poly, loc=loc, max.edge=c(3,5)*max.edge, cutoff=3, offset=c(max.edge, bound.outer)))$n
spde <- inla.spde2.matern(mesh)
A <- inla.spde.make.A(mesh, loc=as.matrix(loc))
idx <- inla.spde.make.index('s', mesh$n)
stk <- inla.stack(data=list(Bloom=c(dat.eval$Bloom, rep(NA, times=nrow(dat.val)+nrow(dat.pred)))), #Put NA for validation and prediction data
                  A=list(A, 1),
                  effects=list(idx, list(data.frame(cbind(data_avg[,c(5:12, 14)])))))

formula3 = Bloom ~ -1 + Atemp + Wtemp + Precip + Area + dMean + f(Week, model="ar1", hyper=hyprior) + f(s, model=spde)
result3 <- inla(formula3, family='binomial',
                control.family=list(link="logit"),
                data=inla.stack.data(stk),
                control.predictor=list(A=inla.stack.A(stk), compute=TRUE, link=1),
                control.inla=list(int.strategy='eb'),
                verbose=TRUE,
                control.compute=list(dic=TRUE, cpo=TRUE))
summary(result3)

#Extract the spatial results for the range and variance and plot
result3.spatial <- inla.spde2.result(result3, "s", spde)
plot(result3.spatial[["marginals.range.nominal"]][[1]], type = "l",
     main = "Nominal range, posterior density") #Correlation range around 20 km
plot(result3.spatial[["marginals.variance.nominal"]][[1]], type = "l",
     main = "Nominal variance, posterior density") #Variance around 13. Means an S.D. of about sqrt(13) ~ 3.6 log-odds. That is a powerful effect.
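#Optional comparison (not part of the original script): dic=TRUE and cpo=TRUE were requested for all
#three fits above, so the models can be compared directly. Lower DIC and a larger (less negative)
#sum of log-CPO both favour a model. Note that result3 was fitted with the validation and prediction
#responses set to NA, so its scores are computed on fewer observations than result1 and result2.
c(model1 = result1$dic$dic, model2 = result2$dic$dic, model3 = result3$dic$dic)
c(model1 = sum(log(result1$cpo$cpo), na.rm = TRUE),
  model2 = sum(log(result2$cpo$cpo), na.rm = TRUE),
  model3 = sum(log(result3$cpo$cpo), na.rm = TRUE))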
mean.range = inla.emarginal(function(x) x, result3.spatial$marginals.range.nominal[[1]])
mean.range2 = inla.emarginal(function(x) x^2, result3.spatial$marginals.range.nominal[[1]])
std.range = sqrt(mean.range2 - mean.range^2)
ci.range = inla.qmarginal(c(0.025, 0.5, 0.975), result3.spatial$marginals.range.nominal[[1]])

mean.variance = inla.emarginal(function(x) x, result3.spatial$marginals.variance.nominal[[1]])
mean.variance2 = inla.emarginal(function(x) x^2, result3.spatial$marginals.variance.nominal[[1]])
std.variance = sqrt(mean.variance2 - mean.variance^2)
ci.variance = inla.qmarginal(c(0.025, 0.5, 0.975), result3.spatial$marginals.variance.nominal[[1]])

#Plot the AR1 time series coefficients
wktrend2 <- result2$summary.random$Week
plot(wktrend2[,1:2], type='l', xlab="Week", ylab="Time Coefficient")
abline(h=0, lty=3)
wktrend3 <- result3$summary.random$Week
plot(wktrend3[,1:2], type='l', xlab="Week", ylab="Time Coefficient")
abline(h=0, lty=3)

#Compute the variance for the AR1 component (the 'temporal variance'), transforming the precision hyperparameter
ar1.variance = inla.tmarginal(function(x) (1/x), result3$marginals.hyperpar[[1]])
mean.ar1.variance = inla.emarginal(function(x) x, ar1.variance)
mean.ar1.variance2 = inla.emarginal(function(x) x^2, ar1.variance)
mean.ar1.variance.stdv = sqrt(mean.ar1.variance2 - mean.ar1.variance^2)

#### Grab the AR1 correlation parameter
ar1 = inla.tmarginal(function(x) x, result3$marginals.hyperpar[[2]])
mean.ar1 = inla.emarginal(function(x) x, ar1)
mean.ar1.2 = inla.emarginal(function(x) x^2, ar1)
mean.ar1.stdv = sqrt(mean.ar1.2 - mean.ar1^2)
ci.ar1 = inla.qmarginal(c(0.025, 0.5, 0.975), ar1)
ci.ar1.variance = inla.qmarginal(c(0.025, 0.5, 0.975), ar1.variance)

#Assemble a summary table of the hyperparameters
g <- matrix(nrow=4, ncol=5)
colnames(g) <- c("Mean", "SD", "0.025", "0.5", "0.975")
row.names(g) <- c("Temporal Variance", "Spatial Variance", "Spatial Correlation Range", "AR(1)")
g[1,1] <- mean.ar1.variance
g[1,2] <- mean.ar1.variance.stdv
g[1,3:5] <- ci.ar1.variance
g[2,1] <- mean.variance
g[2,2] <- std.variance
g[2,3:5] <- ci.variance
g[3,1] <- mean.range
g[3,2] <- std.range
g[3,3:5] <- ci.range
g[4,1] <- mean.ar1
g[4,2] <- mean.ar1.stdv
g[4,3:5] <- ci.ar1

#Plot the weekly AR1 effect with its 95% credible interval
weekvalues.3 <- data.frame(week = range(data_avg$Week)[1]:range(data_avg$Week)[2],
                           mean = result3$summary.random$Week$mean,
                           lower = result3$summary.random$Week$`0.025quant`,
                           upper = result3$summary.random$Week$`0.975quant`)
tiff(filename="weekvalues.tiff", width=4, height=3, units="in", res=300, pointsize=8, compression="lzw")
ggplot(weekvalues.3) +
  geom_line(aes(y=mean, x=week, colour=""), size=1) +
  geom_ribbon(aes(ymin=lower, ymax=upper, x=week, fill=""), alpha=0.3) +
  geom_hline(yintercept=0, lty=3) +
  scale_colour_manual("", values="blue") +
  scale_fill_manual("", values="grey50") +
  labs(x="Week", y="Log Odds") +
  scale_x_continuous(breaks=c(0,12,24,36,48)) +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(colour = "black"),
        legend.position="none",
        axis.title.x = element_text(size=rel(1.2)), #, face="bold"),
        axis.title.y = element_text(size=rel(1.2))  #, face="bold")
  )
dev.off()

#Plot the SPDE spatial component of the model
border <- spTransform(poly, CRS("+proj=aea +lat_1=29.5 +lat_2=45.5 +lat_0=23 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +units=km +no_defs"))
florida <- spTransform(statelines[statelines@data$STATE_NAME=="Florida",], CRS("+proj=aea +lat_1=29.5 +lat_2=45.5 +lat_0=23 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +units=km +no_defs"))
ob <- unionSpatialPolygons(border, rep(1, nrow(border))) #This simplifies and unifies the boundary for faster processing
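#Optional quick check (not part of the original script): plot the dissolved boundary to confirm
#the union worked before building the projection grid below.
plot(ob, axes = TRUE)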
bound <- inla.sp2segment(ob) #Converts the polygon boundary to an inla.mesh.segment object, which is a series of line segments
stepsize <- 0.1 #This defines the grid as made of squares 0.1 km on a side
nxy <- round(c(diff(range(bound$loc[,1]))/stepsize, diff(range(bound$loc[,2]))/stepsize)) #Number of pixels in each direction, based on the size of the study area
projgrid <- inla.mesh.projector(mesh, xlim=range(bound$loc[,1]), ylim=range(bound$loc[,2]), dims=nxy) #Define a projection grid over the dims of our study area
simplemesh = inla.mesh.2d(boundary=bound, max.edge=1e9) #This is for NA'ing out the parts that aren't in the mesh

#Extract the values of the projected field at the centroid of each water body and plot those instead
#This is a more correct representation of the field value used for the predictions at each water body
centroids <- read.csv("LakeDataCentroids_9-26.csv")[,-1] #Remove the extraneous index column
centroids$LONG = round(centroids$LONG/1000, 1)
centroids$LAT = round(centroids$LAT/1000, 1)
x = round(projgrid$x, 1)
y = round(projgrid$y, 1)
all = expand.grid(x, y, KEEP.OUT.ATTRS = F)
names(all) = c("LONG", "LAT")
grid.proj = inla.mesh.project(projgrid, result3$summary.random[['s']][['mean']]) #Project the spatial values onto the grid
plotdata = all %>% mutate(field = as.numeric(grid.proj)) #Transfer the values from the grid to a data.frame
centroid_values = merge(plotdata, centroids, by=c('LONG', 'LAT'))[,-c(1,2)] #Merge the spatial values to each centroid location
border <- merge(border, centroid_values, by='COMID', all=T) #Append the field value from the centroid to each polygon in the 'border' shapefile
nrow(border[which(is.na(border$field)),]) #Check whether any lakes didn't get a field value
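#Optional look (not part of the original workflow): a quick raster-style view of the projected
#spatial field before it is attached to the lake polygons, using the fields package loaded above.
image.plot(projgrid$x, projgrid$y, grid.proj,
           xlab = "Easting (km)", ylab = "Northing (km)")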
#Plot lakes colored by the value of the spatial field
par(mai=c(0.62, 0.52, 0.32, 0.62))

#Scaling parameters to zoom the resulting map
scale.parameter = 0.6 #Scaling parameter: less than 1 zooms in, more than 1 zooms out
xshift = 200 #Shift to the right, in map units
yshift = 50 #Shift upward, in map units
original.bbox = border@bbox #Bounding box of the Spatial* object

#Define the changes to map edges based on the above values
edges = original.bbox
edges[1,] <- (edges[1,] - mean(edges[1,])) * scale.parameter + mean(edges[1,]) + xshift
edges[2,] <- (edges[2,] - mean(edges[2,])) * scale.parameter + mean(edges[2,]) + yshift

#Define some map objects to stick on the image
scale = list("SpatialPolygonsRescale", layout.scale.bar(), scale = 50, fill = c("transparent","black"), offset = c(1375, 575))
arrow = list("SpatialPolygonsRescale", layout.north.arrow(), offset = c(1500, 810), scale = 25)
text = list("sp.text", c(1400, 585), "50 km")

#Create the plot
#tiff(filename="SpatialEffect.tiff",width=6,height=5,units="in",res=300,pointsize=8,compression="lzw")
spplot(border, "field", col.regions = rev(brewer.pal(n=9, name="Spectral")), cuts=8, main="", scales=list(draw=F),
       xlim = edges[1,], ylim = edges[2,], sp.layout=list(scale, arrow, text, florida))
grid.text("", x=unit(0.99, "npc"), y=unit(0.50, "npc"), rot=90)
#dev.off()

rm(grid.proj, projgrid, plotdata, all) #Save some memory

#Check the predictive power of the model on the 20% held-out observations and on the most recent predictions
responses <- result3$summary.fitted.values$mean[1:nrow(data_avg)]
holdout.vals <- responses[(nrow(dat.eval)+1):(nrow(data_avg)-nrow(dat.pred))] #Predicted values at the index positions of the validation data (recall that validation holdouts were appended between eval and preds)
holdout.obs <- dat.val$Bloom
prediction.vals <- responses[(nrow(dat.eval)+nrow(dat.val)+1):nrow(data_avg)] #Predicted values at the index positions of the prediction data
prediction.vals.025 <- result3$summary.fitted.values$`0.025quant`[(nrow(dat.eval)+nrow(dat.val)+1):nrow(data_avg)] #Obtain the 95% credible interval for predictions
prediction.vals.975 <- result3$summary.fitted.values$`0.975quant`[(nrow(dat.eval)+nrow(dat.val)+1):nrow(data_avg)]
prediction.vals.sd <- result3$summary.fitted.values$`sd`[(nrow(dat.eval)+nrow(dat.val)+1):nrow(data_avg)]
prediction.vals.se <- prediction.vals.sd/sqrt(103) #Note: 103 is hard-coded here as the number of prediction observations
prediction.obs <- dat.pred$Bloom

#Create an ROC curve, determine the optimal cutoff point using Youden's index, and then calculate the AUC value with the optimal cutoff
roc.val <- prediction(holdout.vals, holdout.obs)
roc.perf.val <- performance(roc.val, measure="tpr", x.measure="fpr") #"tpr" means true positive rate, "fpr" is false positive rate
roc.pred <- prediction(prediction.vals, prediction.obs)
roc.perf.pred <- performance(roc.pred, measure="tpr", x.measure="fpr")

#This function finds the optimal cutoff point according to Youden's index (this balances type I and II errors while attempting to minimize both)
opt.cut = function(perf, val){
  cut.ind = mapply(FUN=function(x, y, p){
    d = (x - 0)^2 + (y - 1)^2
    ind = which(d == min(d))
    c(sensitivity = y[[ind]], specificity = 1 - x[[ind]], cutoff = p[[ind]])
  }, perf@x.values, perf@y.values, val@cutoffs)
}
cutoff <- opt.cut(roc.perf.val, roc.val)
cutoff[3] #Prints the optimal cutoff point
#0.2581 OLCI, 0.29 MERIS
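#Optional (not part of the original script): Youden's J implied by the selected cutoff,
#computed from the sensitivity and specificity returned by opt.cut() above (J = sensitivity + specificity - 1)
cutoff[1] + cutoff[2] - 1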
#Obtain the AUC value
auc.perf.val = performance(roc.val, measure = "auc")
auc.perf.pred = performance(roc.pred, measure = "auc")
#auc.perf.val@y.values
#auc.perf.pred@y.values

#Plot the ROC curve with the optimal cutoff
tiff(filename="ROCRFig.tiff", width=3, height=3, units="in", res=300, pointsize=8, compression="lzw")
plot(roc.perf.val, colorize=T, lwd=2, lty=1, cex.lab=1.2, colorkey.pos="right")
points(x=1.01-cutoff[2], y=cutoff[1], cex=2, pch=20, col="black") #0.01 is added to the x coordinate to scoot the point a little over so it's centered on the line
abline(a=0, b=1)
dev.off()

#Make a confusion matrix with our selected optimal cutoff point
#Rows are the predicted class (FALSE, TRUE); columns are the observed class (0, 1)
confmatrix.val <- as.matrix(table(factor(holdout.vals>=cutoff[3], levels=c(F,T)), holdout.obs))
confmatrix.pred <- as.matrix(table(factor(prediction.vals>=cutoff[3], levels=c(F,T)), prediction.obs))
confmatrix.val
confmatrix.pred

#Specificity = TN/(TN+FP); sensitivity = TP/(TP+FN); accuracy = (TP+TN)/total
spec.val <- confmatrix.val[1,1]/(confmatrix.val[1,1]+confmatrix.val[2,1])
sens.val <- confmatrix.val[2,2]/(confmatrix.val[1,2]+confmatrix.val[2,2])
acc.val <- (confmatrix.val[2,2]+confmatrix.val[1,1])/(confmatrix.val[1,2]+confmatrix.val[2,2]+confmatrix.val[1,1]+confmatrix.val[2,1])
auc.perf.val@y.values
spec.val
sens.val
acc.val

spec.pred <- confmatrix.pred[1,1]/(confmatrix.pred[1,1]+confmatrix.pred[2,1])
sens.pred <- confmatrix.pred[2,2]/(confmatrix.pred[1,2]+confmatrix.pred[2,2])
acc.pred <- (confmatrix.pred[2,2]+confmatrix.pred[1,1])/(confmatrix.pred[1,2]+confmatrix.pred[2,2]+confmatrix.pred[1,1]+confmatrix.pred[2,1])
auc.perf.pred@y.values
spec.pred
sens.pred
acc.pred

#Assemble a summary table of holdout and prediction performance
m <- matrix(ncol=2, nrow=4)
colnames(m) <- c("Holdout", "Prediction")
row.names(m) <- c("AUC", "Spec", "Sens", "ACC")
m[1,1] <- auc.perf.val@y.values[[1]]
m[1,2] <- auc.perf.pred@y.values[[1]]
m[2,1] <- spec.val
m[2,2] <- spec.pred
m[3,1] <- sens.val
m[3,2] <- sens.pred
m[4,1] <- acc.val
m[4,2] <- acc.pred

#Visualize the most-recent-week predictions by plotting the predicted probability of a 100,000+ cells/mL bloom
prediction.comids <- data.frame(COMID = dat.pred$COMID, Bloom = prediction.vals, Bloom.025 = prediction.vals.025, Bloom.975 = prediction.vals.975)
prediction.comids2 <- data.frame(COMID = dat.pred$COMID, Bloom = prediction.vals, Bloom.SE = prediction.vals.se, Bloom.SD = prediction.vals.sd, OBS = prediction.obs)
border <- merge(border, prediction.comids2, by='COMID', all=T) #Append the prediction value to each polygon in the 'border' shapefile

#Plot the predictions
tiff(filename="PredictionMap.tiff", width=18, height=5, units="in", res=300, pointsize=8, compression="lzw")
# grid.arrange(
#   spplot(border, "Bloom.025", col.regions = brewer.pal(n=9, name="BuGn"), cuts=8, main="0.025 Quantile", scales=list(draw=F),
#          xlim = edges[1,], ylim = edges[2,], sp.layout=list(scale, arrow, text, florida)),
#   spplot(border, "Bloom", col.regions = brewer.pal(n=9, name="BuGn"), cuts=8, main="Odds of a Severe Bloom\n5/27/19 - 6/2/19", scales=list(draw=F),
#          xlim = edges[1,], ylim = edges[2,], sp.layout=list(scale, arrow, text, florida)),
#   spplot(border, "Bloom.975", col.regions = brewer.pal(n=9, name="BuGn"), cuts=8, main="0.975 Quantile", scales=list(draw=F),
#          xlim = edges[1,], ylim = edges[2,], sp.layout=list(scale, arrow, text, florida)),
#   ncol=3)

######## Plot predictions and SD ####################################
grid.arrange(
  spplot(border, "Bloom", col.regions = rev(brewer.pal(n=9, name="Spectral")), cuts=8, main="", scales=list(draw=F),
         xlim = edges[1,], ylim = edges[2,], sp.layout=list(scale, arrow, text, florida)),
  spplot(border, "Bloom.SD", col.regions = rev(brewer.pal(n=9, name="Spectral")), cuts=8, main="", scales=list(draw=F),
         xlim = edges[1,], ylim = edges[2,], sp.layout=list(scale, arrow, text, florida)),
  ncol=2)
grid.text("Spatial Effect (log-odds)", x=unit(0.99, "npc"), y=unit(0.50, "npc"), rot=90) #Include this if you want; it may be redundant
dev.off()
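#Optional (not part of the original script): write the per-lake predictions and credible intervals
#to a CSV for sharing; the file name here is illustrative only.
#write.csv(prediction.comids, "Week22_BloomPredictions.csv", row.names = FALSE)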
="Spectral")), cuts=8, main="", scales=list(draw=F), xlim = edges[1,], ylim = edges[2,], sp.layout=list(scale, arrow, text, florida)), ncol=2) grid.text("Spatial Effect (log-odds)", x=unit(0.99, "npc"), y=unit(0.50, "npc"), rot=90) #Include this if you want, I think it's redundant dev.off() #To aid in interpreting LONG and LAT in result1, whose units are standard deviations of distance from the centroid of all observations, plot a map centerpoint <- data.frame(LONG = attributes(data_avg$LONG)$'scaled:center', LAT = attributes(data_avg$LAT)$'scaled:center') coordinates(centerpoint) <- ~LONG+LAT proj4string(centerpoint) <- crs(statelines) #tiff(filename="coordinate_sd_illustration.tiff",width=4,height=4,units="in",res=300,pointsize=8,compression="lzw") plot(statelines[statelines@data$STATE_NAME=="Florida",]) plot(centerpoint, add=TRUE) #Add segments that represent 1 unit distance for LONG and LAT coefficients segments(x0=rep(centerpoint@coords[1], times=2), y0=rep(centerpoint@coords[2], times=2), x1=c(centerpoint@coords[1] + attributes(data_avg$LONG)$'scaled:scale', centerpoint@coords[1]), y1=c(centerpoint@coords[2], centerpoint@coords[2] + attributes(data_avg$LAT)$'scaled:scale'), col="blue", lwd=3) #dev.off()