# R code for tables and figures of the manuscript "Measuring urban tree loss 
# dynamics across US residential landscapes" by Alessandro Ossola, Matthew Hopton, 
# US EPA/ORD/NRMRL/ORD/STD, 26 W. Martin Luther King Dr., Cincinnati, OH, 45268, USA.
# Corresponding Author, Email: hopton.matthew@epa.gov, T: +1 513 569 7718
# version 2/27/2017

# data from "DataDenverMilwaukee.xlsx" file "toR_subset_FINAL" tab
# data<-read.table("clipboard", header=T)
# attach(data)

# dataset preparation
.libPaths("C:/users/aossola/R/library")
den<-subset(data, City=="Denver" & MEANheight>0, select=c(Age:NcutStemsResAREA))
mil<-subset(data, City=="Milwaukee" & MEANheight>0, select=c(Age:NcutStemsResAREA))

# checking variable multicollinearity using Variance Inflation Factors (VIF) and excluding variables which have
# a correlation higher than ~0.67 (VIF threshold =3). Denver (den) and 
# Milwaukee (mil) data are run separately.
library(usdm)
den.MC<-subset(den, select=c(Age:PercentRes))
den.MC = data.frame(den.MC)
vif(den.MC)
vifstep(den.MC, th=3)
mil.MC<-subset(mil, select=c(Age:PercentRes))
mil.MC = data.frame(mil.MC)
vif(mil.MC)
vifstep(mil.MC, th=3)

# bringing along only the non collinear variables from VIF selections.
DEN<-subset(data, City=="Denver" & MEANheight>0, select=c(MEANheight, NcutStemsResAREA, Age , 
      FAMnonRatio, VacantUnits., BelowPoverty, Gini, Unemployed20_64, Rent, Decade, TcanAREAstart, ParcelSize, HousingDensity, PercentRes))
MIL<-subset(data, City=="Milwaukee" & MEANheight>0, select=c(MEANheight, NcutStemsResAREA,Age , 
	RaceDiv, FAMnonRatio, VacantUnits., Gini, Unemployed20_64, Rent, Decade, TcanAREAstart, HousingDensity, PercentRes))

# check dimensions datasets
dim(DEN)
dim(MIL)

####### running DENVER dataset ###################################################

library(lmtest) # bptest for testing heteroscedasticity
library(gvlma)  # global validation of linear model assumptions

####### linear models predicting n. cut stems per census tract (DEN$NcutStemsResAREA)
#global model with outliers
lm.den.N.all<-lm(sqrt(DEN$NcutStemsResAREA)~ log(DEN$Age+1) + log(DEN$FAMnonRatio+1) +
            log(DEN$VacantUnits.+1) + sqrt(DEN$BelowPoverty) + DEN$Gini + sqrt(DEN$Unemployed20_64) + 
            log(DEN$Rent+1) + (DEN$Decade) + (DEN$TcanAREAstart) + log(DEN$ParcelSize+1) + log(DEN$HousingDensity+1) +
            (DEN$PercentRes))
summary(lm.den.N.all)
plot(lm.den.N.all, which=2)
shapiro.test(residuals(lm.den.N.all))
gvlma(lm.den.N.all)

#removing outliers (obs 79 and 83) which have the highest N cut stems
DEN[c(79, 83), 1:12]
DEN1<-DEN[-c(79, 83),]

#global model without ouliers
lm.den.N.all1<-lm(sqrt(DEN1$NcutStemsResAREA)~ log(DEN1$Age+1) + log(DEN1$FAMnonRatio+1) +
            log(DEN1$VacantUnits.+1) + sqrt(DEN1$BelowPoverty) + DEN1$Gini + sqrt(DEN1$Unemployed20_64) + 
            log(DEN1$Rent+1) + (DEN1$Decade) + (DEN1$TcanAREAstart) + log(DEN1$ParcelSize+1) + log(DEN1$HousingDensity+1) +
            (DEN1$PercentRes))
summary(lm.den.N.all1)
plot(lm.den.N.all1, which=2)
shapiro.test(residuals(lm.den.N.all1))
gvlma(lm.den.N.all1)

#urban morphology model without ouliers
lm.den.N.env1<-lm(sqrt(DEN1$NcutStemsResAREA)~ (DEN1$Decade) + (DEN1$TcanAREAstart) + log(DEN1$ParcelSize+1) + log(DEN1$HousingDensity+1) +
            (DEN1$PercentRes))
summary(lm.den.N.env1)
plot(lm.den.N.env1, which=2)
shapiro.test(residuals(lm.den.N.env1))
gvlma(lm.den.N.env1)

#socio-eco model without ouliers
lm.den.N.socio1<-lm(sqrt(DEN1$NcutStemsResAREA)~ log(DEN1$Age+1) + log(DEN1$FAMnonRatio+1) +
            log(DEN1$VacantUnits.+1) + sqrt(DEN1$BelowPoverty) + DEN1$Gini + sqrt(DEN1$Unemployed20_64) + 
            log(DEN1$Rent+1))
summary(lm.den.N.socio1)
plot(lm.den.N.socio1, which=1)
shapiro.test(residuals(lm.den.N.socio1))
gvlma(lm.den.N.socio1)

### selecting best model based on AICc
library(AICcmodavg)
Cand.models<-list()
Cand.models[[1]]<-lm.den.N.all1
Cand.models[[2]]<-lm.den.N.env1
Cand.models[[3]]<-lm.den.N.socio1
aictab(cand.set = Cand.models, sort = TRUE)

# best model is lm.den.N.env1

####### linear models predicting stem height in each census tract (MEANheight)
DEN<-subset(data, City=="Denver" & MEANheight>0, select=c(MEANheight, NcutStemsResAREA, Age , 
      FAMnonRatio, VacantUnits., BelowPoverty, Gini, Unemployed20_64, Rent, Decade, TcanAREAstart, ParcelSize, HousingDensity, PercentRes))

#global model with outliers
lm.den.H.all<-lm(DEN$MEANheight ~ log(DEN$Age+1) + log(DEN$FAMnonRatio+1) +
            log(DEN$VacantUnits.+1) + sqrt(DEN$BelowPoverty) + DEN$Gini + sqrt(DEN$Unemployed20_64) + 
            log(DEN$Rent+1) + (DEN$Decade) + (DEN$TcanAREAstart) + log(DEN$ParcelSize+1) + log(DEN$HousingDensity+1) +
            (DEN$PercentRes))
summary(lm.den.H.all)
plot(lm.den.H.all, which=2)
shapiro.test(residuals(lm.den.H.all))
gvlma(lm.den.H.all)

#Removed outliers 105, 157, 170, which have the highest height

DEN[c(105, 157, 170), 1:12]
DEN1<-DEN[-c(105, 157, 170),]

#global model without outliers
lm.den.H.all1<-lm(DEN1$MEANheight ~ log(DEN1$Age+1) + log(DEN1$FAMnonRatio+1) +
            log(DEN1$VacantUnits.+1) + sqrt(DEN1$BelowPoverty) + DEN1$Gini + sqrt(DEN1$Unemployed20_64) + 
            log(DEN1$Rent+1) + (DEN1$Decade) + (DEN1$TcanAREAstart) + log(DEN1$ParcelSize+1) + log(DEN1$HousingDensity+1) +
            (DEN1$PercentRes))
summary(lm.den.H.all1)
plot(lm.den.H.all1, which=2)
shapiro.test(residuals(lm.den.H.all1))
gvlma(lm.den.H.all1)

#urban morphology model without outliers
lm.den.H.env1<-lm(DEN1$MEANheight ~ (DEN1$Decade) + (DEN1$TcanAREAstart) + log(DEN1$ParcelSize+1) + log(DEN1$HousingDensity+1) +
            (DEN1$PercentRes))
summary(lm.den.H.env1)
plot(lm.den.H.env1, which=2)
shapiro.test(residuals(lm.den.H.env1))
gvlma(lm.den.H.env1)

#socio-eco model without outliers
lm.den.H.socio1<-lm(DEN1$MEANheight ~ log(DEN1$Age+1) + log(DEN1$FAMnonRatio+1) +
            log(DEN1$VacantUnits.+1) + sqrt(DEN1$BelowPoverty) + DEN1$Gini + sqrt(DEN1$Unemployed20_64) + 
            log(DEN1$Rent+1))
summary(lm.den.H.socio1)
plot(lm.den.H.socio1, which=2)
shapiro.test(residuals(lm.den.H.socio1))
gvlma(lm.den.H.socio1)

library(AICcmodavg)
Cand.models<-list()
Cand.models[[1]]<-lm.den.H.all1
Cand.models[[2]]<-lm.den.H.env1
Cand.models[[3]]<-lm.den.H.socio1
aictab(cand.set = Cand.models, sort = TRUE)

# best model is lm.den.H.all1

####### running MILWAUKEE dataset ###################################################
library(lmtest) # bptest for testing heteroscedasticity
library(gvlma)  # global validation of linear model assumptions

####### linear models predicting n. cut stems per census tract (DEN$NcutStemsResAREA)

#global model with outliers
lm.mil.N<-lm(log(MIL$NcutStemsResAREA+1)~ log(MIL$Age+1) + log(MIL$RaceDiv+1) +
            log(MIL$FAMnonRatio+1) + log(MIL$VacantUnits.+1) + log(MIL$Gini+1) + log(MIL$Unemployed20_64+1) + 
            log(MIL$Rent+1) + (MIL$Decade) + log(MIL$TcanAREAstart+1)+ log(MIL$HousingDensity+1) + (MIL$PercentRes))
summary(lm.mil.N)
plot(lm.mil.N, which=2)
shapiro.test(residuals(lm.mil.N))
gvlma(lm.mil.N)
#bptest(lm.mil.N)

# Removed outliers (obs 72,108,118) which have the highest and lowest N cut stems
MIL[c(72,108,118), 1:12]
MIL1<-MIL[-c(72,108,118),]

#global model without outliers
lm.mil.N.all1<-lm(log(MIL1$NcutStemsResAREA+1)~ log(MIL1$Age+1) + log(MIL1$RaceDiv+1) +
            log(MIL1$FAMnonRatio+1) + log(MIL1$VacantUnits.+1) + log(MIL1$Gini+1) + log(MIL1$Unemployed20_64+1) + 
            log(MIL1$Rent+1) + (MIL1$Decade) + log(MIL1$TcanAREAstart+1)+ log(MIL1$HousingDensity+1) + (MIL1$PercentRes))
summary(lm.mil.N.all1)
plot(lm.mil.N.all1, which=2)
shapiro.test(residuals(lm.mil.N.all1))
gvlma(lm.mil.N.all1)

#urban morphology model without outliers
lm.mil.N.env1<-lm(log(MIL1$NcutStemsResAREA+1)~ (MIL1$Decade) + log(MIL1$TcanAREAstart+1)+ log(MIL1$HousingDensity+1) + 
                  (MIL1$PercentRes))
summary(lm.mil.N.env1)
plot(lm.mil.N.env1, which=2)
shapiro.test(residuals(lm.mil.N.env1))
gvlma(lm.mil.N.env1)

#socio-eco model without outliers
lm.mil.N.socio1<-lm(log(MIL1$NcutStemsResAREA+1)~ log(MIL1$Age+1) + log(MIL1$RaceDiv+1) +
            log(MIL1$FAMnonRatio+1) + log(MIL1$VacantUnits.+1) + log(MIL1$Gini+1) + log(MIL1$Unemployed20_64+1) + 
            log(MIL1$Rent+1))
summary(lm.mil.N.socio1)
plot(lm.mil.N.socio1, which=2)
shapiro.test(residuals(lm.mil.N.socio1))
gvlma(lm.mil.N.socio1)

Cand.models<-list()
Cand.models[[1]]<-lm.mil.N.all1
Cand.models[[2]]<-lm.mil.N.env1
Cand.models[[3]]<-lm.mil.N.socio1
aictab(cand.set = Cand.models, sort = TRUE)

# best model is lm.den.H.all1

####### linear models predicting stem height in each census tract (MEANheight)
MIL<-subset(data, City=="Milwaukee" & MEANheight>0, select=c(MEANheight, NcutStemsResAREA,Age , 
	RaceDiv, FAMnonRatio, VacantUnits., Gini, Unemployed20_64, Rent, Decade, TcanAREAstart, HousingDensity, PercentRes))

lm.mil.H<-lm((MIL$MEANheight)~ log(MIL$Age+1) + log(MIL$RaceDiv+1) +
            log(MIL$FAMnonRatio+1) + log(MIL$VacantUnits.+1) + log(MIL$Gini+1) + log(MIL$Unemployed20_64+1) + 
            log(MIL$Rent+1) + (MIL$Decade) + log(MIL$TcanAREAstart+1)+ log(MIL$HousingDensity+1) + (MIL$PercentRes))
summary(lm.mil.H)
plot(lm.mil.H, which=2)
shapiro.test(residuals(lm.mil.H))
gvlma(lm.mil.H)
#bptest(lm.mil.H)

# Removed outliers (obs 41,108,118) which have the highest and lowest N cut stems
MIL[c(41,108,118), 1:12]
MIL1<-MIL[-c(41,108,118),]

#global model without outliers
lm.mil.H.all1<-lm((MIL1$MEANheight)~ log(MIL1$Age+1) + log(MIL1$RaceDiv+1) +
            log(MIL1$FAMnonRatio+1) + log(MIL1$VacantUnits.+1) + log(MIL1$Gini+1) + log(MIL1$Unemployed20_64+1) + 
            log(MIL1$Rent+1) + (MIL1$Decade) + log(MIL1$TcanAREAstart+1)+ log(MIL1$HousingDensity+1) + (MIL1$PercentRes))
summary(lm.mil.H.all1)
plot(lm.mil.H.all1, which=2)
shapiro.test(residuals(lm.mil.H.all1))
gvlma(lm.mil.H.all1)

#environmental model without outliers
lm.mil.H.env1<-lm((MIL1$MEANheight)~ (MIL1$Decade) + log(MIL1$TcanAREAstart+1)+ log(MIL1$HousingDensity+1) + 
                  (MIL1$PercentRes))
summary(lm.mil.H.env1)
plot(lm.mil.H.env1, which=2)
shapiro.test(residuals(lm.mil.H.env1))
gvlma(lm.mil.H.env1)

#socio-eco model without outliers
lm.mil.H.socio1<-lm((MIL1$MEANheight)~ log(MIL1$Age+1) + log(MIL1$RaceDiv+1) +
            log(MIL1$FAMnonRatio+1) + log(MIL1$VacantUnits.+1) + log(MIL1$Gini+1) + log(MIL1$Unemployed20_64+1) + 
            log(MIL1$Rent+1))
summary(lm.mil.H.socio1)
plot(lm.mil.H.socio1, which=2)
shapiro.test(residuals(lm.mil.H.socio1))
gvlma(lm.mil.H.socio1)

library(AICcmodavg)
Cand.models<-list()
Cand.models[[1]]<-lm.mil.H.all1
Cand.models[[2]]<-lm.mil.H.env1
Cand.models[[3]]<-lm.mil.H.socio1
aictab(cand.set = Cand.models, sort = TRUE)

# best model is lm.mil.H.all1


####### code for producing Fig.4 ###################################################################
library(ggplot2)

den<-subset(data, City=="Denver")
mil<-subset(data, City=="Milwaukee")

dev.new(width=20, height=17)
p <- ggplot(data = data, aes(y = NcutStemsResAREA, x = TvolAREAstart, fill=City))
p + geom_point(aes(fill = factor(City)), shape=21, alpha=0.3, size = 4) + 
    scale_fill_manual(values=c("red", "cyan")) + geom_smooth(aes(color=City), method=lm, se=F) +
    theme_bw() + ylab(bquote('Tree stems removed (' *km^2*')'))   + 
    xlab(bquote('Vegetation volume ('*m^3~m^-2*') initial year'))  + ggtitle (" ") 

lmNvol.den<-lm((den$NcutStemsResAREA)~den$TvolAREAstart)
summary(lmNvol.den)
plot(lmNvol.den)
lmNvol.mil<-lm((mil$NcutStemsResAREA)~(mil$TvolAREAstart))
summary(lmNvol.mil)
plot(lmNvol.mil)

#
den<-subset(data, City=="Denver")
mil<-subset(data, City=="Milwaukee")

dev.new(width=20, height=17)
p <- ggplot(data = data, aes(y = NcutStemsResAREA, x = TcanAREAstart, fill=City))
p + geom_point(aes(fill = factor(City)), shape=21, alpha=0.3, size = 4) + 
    scale_fill_manual(values=c("red", "cyan")) + geom_smooth(aes(color=City), method=lm, se=FALSE) +
    theme_bw() + ylab(bquote('Tree stems removed (' *km^2*')'))   + 
    xlab(bquote('Canopy cover ('*m^2~m^-2*') initial year'))  + ggtitle (" ") 

lmNcan.den<-lm((den$NcutStemsResAREA)~den$TcanAREAstart)
summary(lmNcan.den)
plot(lmNcan.den)
lmNcan.mil<-lm((mil$NcutStemsResAREA)~(mil$TcanAREAstart))
summary(lmNcan.mil)
plot(lmNcan.mil)

####### code for producing Fig.5 ###################################################################
library(ggplot2)
####################NcutStems~age Figure 5A
myData <- aggregate(data$NcutStemsResAREA,
    by = list(Decade = Decade, City),
    FUN = function(x) c(mean = mean(x), sd = sd(x),
                        n = length(x)))
myData <- do.call(data.frame, myData)
myData$se <- myData$x.sd / sqrt(myData$x.n)
colnames(myData) <- c("Decade", "City", "mean", "sd", "n", "se")
myData$labels <- c("1940s or earlier", "1950s or earlier", "1960s or earlier", "1970s or earlier", 
     "1980s or earlier", "1990s or earlier", "2000s or earlier", "2005s or later", "1940s or earlier", 
     "1950s or earlier", "1960s or earlier", "1970s or earlier", "1980s or earlier", "1990s or earlier")

dodge <- position_dodge(width = 0.9)
limits <- aes(ymax = myData$mean + myData$se,
              ymin = myData$mean - myData$se)

dev.new(width=30, height=17)
p <- ggplot(data = myData, aes(x = labels, y = mean, fill=City))

p + geom_bar(stat = "identity", col="black", position=position_dodge(), size=.3) +
  geom_errorbar(limits, position = dodge, width = 0.25) +
  scale_fill_manual(values=alpha(c("red", "cyan"), 0.2)) +
  theme_bw() + xlab("Decade of maximum housing development") + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ylab(bquote('Tree stems removed (' *km^2*')'))  + ylim(0,170) +
  ggtitle (" ") +
  #ggtitle("Number of tree stems removed") + 
  geom_text(aes(label=myData$n), position=position_dodge(width=0.9), vjust=-6, size=2) 

####################MEANheight~age Figure 5B
myData <- aggregate(data$MEANheight,
    by = list(Decade = Decade, City),
    FUN = function(x) c(mean = mean(x), sd = sd(x),
                        n = length(x)))
myData <- do.call(data.frame, myData)
myData$se <- myData$x.sd / sqrt(myData$x.n)
colnames(myData) <- c("Decade", "City", "mean", "sd", "n", "se")
myData$labels <- c("1940s or earlier", "1950s or earlier", "1960s or earlier", "1970s or earlier", 
     "1980s or earlier", "1990s or earlier", "2000s or earlier", "2005s or later", "1940s or earlier", 
     "1950s or earlier", "1960s or earlier", "1970s or earlier", "1980s or earlier", "1990s or earlier")

dodge <- position_dodge(width = 0.9)
limits <- aes(ymax = myData$mean + myData$se,
              ymin = myData$mean - myData$se)

dev.new(width=30, height=17)
p <- ggplot(data = myData, aes(x = labels, y = mean, fill=City))

p + geom_bar(stat = "identity", col="black", position=position_dodge(), size=.3) +
  geom_errorbar(limits, position = dodge, width = 0.25) +
  scale_fill_manual(values=alpha(c("red", "cyan"), 0.2)) +
  theme_bw() + xlab("Decade of maximum housing development") + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ylab("Tree stem height (m)") + ylim(0,15) +
  ggtitle("Average stem height of trees removed") #+ 
  #geom_text(aes(label=myData$n), position=position_dodge(width=0.9), vjust=-6, size=2) 

####### code for producing Figure 3 #####################################################################

# data from "DataDenverMilwaukee.xlsx" file "treecut_all" tab
# data<-read.table("clipboard", header=T)
# attach(data)
library(ggplot2)

#now make your lovely plot
ggplot(data, aes(Height_m, fill = City)) + geom_density(alpha = 0.2) +
	xlim(0,25) + xlab("Height (m)") + ylab("Frequency") + theme_bw()

####### code for producing Appendix B ###################################################################

# data from "DataDenverMilwaukee.xlsx" file "CalibrationDEMsDSMs" tab
# data<-read.table("clipboard", header=T)
# attach(data)
library(scales)

par(mfrow=c(2,2))
plot(DEM2008, DEM2013, xlab="DEM 2008", ylab="DEM 2013", main="Denver", col=alpha("red", 0.3), pch=16, cex=1.4)
cor.test(DEM2008, DEM2013)
text(1640, 1740, expression(paste(rho, "=0.999, p<0.001")))

plot(nDSM2008, nDSM2013, xlab="nDSM 2008", ylab="nDSM 2013", main="Denver", xlim=c(3,14), ylim=c(3,14), col=alpha("red", 0.3), pch=16, cex=1.4)
cor.test(nDSM2008, nDSM2013)
text(7, 13, expression(paste(rho, "=0.986, p<0.001")))

plot(DEM2010, DEM2015, xlab="DEM 2010", ylab="DEM 2015", main="Milwaukee", col=alpha("cyan", 0.3), pch=16, cex=1.4)
cor.test(DEM2010, DEM2015)
text(205,245, expression(paste(rho, "=0.999, p<0.001")))

plot(nDSM2010, nDSM2015, xlab="nDSM 2010", ylab="nDSM 2015", main="Milwaukee", xlim=c(3,14), ylim=c(3,14), col=alpha("cyan", 0.3), pch=16, cex=1.4)
cor.test(nDSM2010, nDSM2015)
text(7,13, expression(paste(rho, "=0.976, p<0.001")))