# this script helps grab discharge data from Miller et al 2018 JAWRA published data set
# Data download comes from:
#   https://www.sciencebase.gov/catalog/item/59cbbd61e4b017cf314244e1
#
# To determine which zip files to download, use an EPA Level 3 Ecoregion map, or
# download the ecoregion spatial data (shapefile) and pair it with your own study
# system data to identify the level three ecoregion codes (##.##.##) you need.
# Level three ecoregion data can be located here:
#   https://www.epa.gov/eco-research/level-iii-and-iv-ecoregions-continental-united-states

# Load necessary data manipulation libraries
library(tidyverse)
library(magrittr)

# head directory holds all the Miller et al data downloaded and a .csv file with the
# list of NHDPlus V2 COMIDs that need discharge data. This script uses the following
# directory structure to access and manipulate the discharge data:
#
# 1. "data_manipulation_protocol" - (Head Directory) holds all other directories/files
#     1A. "csv" - (Miller et al data downloaded/extracted to here; lots of csv files)
#     1B. "input" - this directory holds the csv file with list of COMIDs to extract
#     1C. "output" - this directory holds the output summary file for the Q data

# Root of the protocol directory tree, given from the base drive (the "L:" drive
# here); the input, csv, and output paths in this script are built from it
head_dir <-
  "L:/Priv/Region10RARE/GIS/discharge/Miller_etal_2018_data_manipulation_protocol"

# Load the table of NHDPlus V2 COMIDs whose records are pulled from the Miller et al
# data. The csv should hold a single column whose header row is "COMID".
#  Swap "NAME_OF_TABLE_WITH_COMIDS" for the name of your own table
COMID_ls <- paste0(head_dir, "/input/", "NAME_OF_TABLE_WITH_COMIDS", ".csv") %>%
  read_csv()


# Collect the full path of every file found under the "csv" directory (searched
# recursively), i.e. the downloaded and extracted discharge files
Q_file_ls <- list.files(
  path = paste0(head_dir, "/csv"),
  recursive = TRUE,
  full.names = TRUE
)

# grab the file names that match the list of COMIDs for each site from the join tables
#
# COMIDs are first written out as plain digit strings. If the COMID column is numeric,
# R's default number-to-text coercion switches to scientific notation for round values
# (e.g. 100000 becomes "1e+05"), which would never match a file name. Missing COMIDs
# are dropped because an NA pattern would yield NA file names.
comid_num <- as.numeric(COMID_ls$COMID)
comid_chr <- sprintf("%.0f", comid_num[!is.na(comid_num)])

# Match against the file name only (not the directory path) and require the COMID to
# be a complete run of digits, optionally zero-padded. A plain substring search would
# let COMID 123 also pick up the files for 1234 or 91235, and would match every file
# whenever the COMID happens to appear somewhere in the directory path.
Q_file_basenames <- basename(Q_file_ls)
match_idx <- lapply(
   X = comid_chr,
   FUN = function(id) {
      grep(pattern = paste0("(^|[^0-9])0*", id, "([^0-9]|$)"), x = Q_file_basenames)
   }
)

# Report COMIDs without a discharge file so they are not silently left out
unmatched_comids <- unique(comid_chr[lengths(match_idx) == 0L])
if (length(unmatched_comids) > 0) {
   warning("No discharge file found for COMID(s): ",
           paste(unmatched_comids, collapse = ", "),
           call. = FALSE)
}

matching_Q_file_names <- Q_file_ls[unlist(match_idx)]

# if your list of COMIDs has any duplicates, this line will remove those so you're not
# importing the same file multiple times
matching_Q_file_names_nodup <- unique(matching_Q_file_names)

# Fail here with a clear message instead of further down on an empty table
if (length(matching_Q_file_names_nodup) == 0) {
   stop("None of the COMIDs matched a file under ", paste0(head_dir, "/csv"),
        call. = FALSE)
}

# Read every matched csv file and stack the resulting tables row-wise into one
dat <- matching_Q_file_names_nodup %>%
  lapply(read_csv) %>%
  bind_rows()

# ***** COLUMN NAMES AND DESCRIPTIONS FROM DATA IMPORT *****
# "COMID" = NHDPlus Version 2 COMID
# "AREA" = Flowline upstream watershed area from downstream flowline node (km^2)
# "Year" = year of discharge estimate
# "P10" = the 10th percentile of the random forest model's ensemble of 1000 predictions (cfs per km^2)
# "P50" = the 50th percentile of the random forest model's ensemble of 1000 predictions (cfs per km^2)
# "P90" = the 90th percentile of the random forest model's ensemble of 1000 predictions (cfs per km^2)
# "MEAN" = average of the random forest model's 1000 predictions (cfs per km^2)
# "P10_Q" = P10 * AREA for a pure discharge value (cfs); lower bound of prediction interval
# "P50_Q" = P50 * AREA for a pure discharge value (cfs); represents the median predicted discharge value.
# "P90_Q" = P90 * AREA for a pure discharge value (cfs); upper bound of prediction interval
# "Estimated.Q" = MEAN * AREA for the average discharge prediction from ensemble (cfs)
# "Month" = month for prediction


# DATA FILTERING for year and month ranges; data unit conversions and skew test for CIs
# Keeps years 1990-2015 and months 5-9, sets ci_test to 0 where MEAN lies strictly
# between P10 and P90 (1 otherwise), and converts the discharge columns from cfs to
# cms. The outer parentheses print the resulting table as well as assigning it.
(dat_filter <- dat %>%
   filter(Year >= 1990, Year <= 2015, Month >= 5, Month <= 9) %>%
   mutate(
      ci_test = if_else(P10 < MEAN & MEAN < P90, 0, 1),
      cmsQ = Estimated.Q * 0.0283168,  # 1 cfs = 0.0283168 cms
      cmsQkm2 = MEAN * 0.0283168       # 1 cms = 35.314666212661 cfs
      # USE THESE COMMENTED OUT WHEN THE 'ci_test' HAS 1S IN IT
      # cmsQ = P50_Q * 0.0283168,  # 1 cfs = 0.0283168 cms
      # cmsQkm2 = P50 * 0.0283168  # 1 cms = 35.314666212661 cfs
   ) %>%
   select(COMID, Year, Month, cmsQ, cmsQkm2, ci_test))


# ***** newly generated table column descriptions *****
# COMID = NHDPlus Version 2 COMID
# Year = year of discharge estimate
# Month = month for prediction
# cmsQ = raw discharge estimate in cubic meters per second
# cmsQkm2 = area normalized discharge estimate (cmsQ/AREA) in cms per km2
# ci_test = value of 0 or 1 where 0 indicates MEAN value is within the CI; this is a
#           data quality check. If you end up with 1s in this data column, you may
#           want to consider using the median prediction value instead of the mean.
#           If you go that route, use the two lines commented out in the above code
#           chunk/pipe that use P50_Q and P50 instead of the lines that use MEAN and
#           Estimated.Q (comment out MEAN and Estimated.Q). Below, a summary of the
#           ci_test column shows whether any 1s are present.

# Print a numeric summary of the ci_test flags; a Max. (or Mean) above 0 means at
# least one 1 exists in the column
dat_filter$ci_test %>%
  summary()

# what a good summary looks like with no 1s
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
# 0       0       0       0       0       0

# What a summary might look like if there are any 1s in the ci_test column
# Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
# 0.0000000 0.0000000 0.0000000 0.0007692 0.0000000 1.0000000


# Save the data as a new csv file for use in excel or other programs
# Just replace the 'OUTPUT_CSV_SUMMARY_FILE' with your name and the file will be
# written to the output directory
# The destination is passed positionally: the `path` argument name is deprecated in
# readr 1.4.0 and later (renamed to `file`), and the positional form works with both
write_csv(
   dat_filter,
   paste0(head_dir, "/output/", "OUTPUT_CSV_SUMMARY_FILE", ".csv")
)


