-
Notifications
You must be signed in to change notification settings - Fork 1
/
TS_Functions.R
128 lines (114 loc) · 7.18 KB
/
TS_Functions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#Check to make sure you have all the needed packages installed
packages <- c("RCurl", "reshape2", "zoo","maps","maptools","reshape2")
if (length(setdiff(packages, rownames(installed.packages()))) > 0) {
install.packages(setdiff(packages, rownames(installed.packages())))
}
#platLAGOSLT.data extracts timeseries from LAGOS data that meet criteria set in function, plots the data, and returns a dataframe of the
#median summer values for the lakes with associated time series. The algorithm will automatically select the time period that has the most lakes
#and if more than one time period has the same (max) number of lakes, it will select the most recent time period. Function returns
#a list with the data for the time period and a vector of number of lakes with LT data through time.
plotLAGOSLT.data = function(data,TSlength,miss.freq,locos,param.name,set.year=NULL){
#Data columns (1) lagoslakeid, (2) sampledate [format="%m/%d/%Y"], (3) parameter value
#TSlength is length of time series desired
#missing.freq is amount of missing data within each TS as a percent (e.g., 20)
#locos is LAGOS locus data table
#param.name is string with parameter name for plotting purposes
#set.year is numeric if you want to choose starting year for example to match up with another time series. default is
#to choose most recent and set.year = NULL
require(maps)
require(maptools)
require(reshape2)
data.complete = data[complete.cases(data[,1:3]),]
names(data.complete)[3] = "value"
data.complete$DoY = as.numeric(strftime(as.Date(data.complete$sampledate,format="%m/%d/%Y"), format = "%j"))
data.complete$Year = as.numeric(strftime(as.Date(data.complete$sampledate,format="%m/%d/%Y"), format = "%Y"))
data.complete = data.complete[which(data.complete$DoY>=166 & data.complete$DoY<=258),] #subset to samples collected in summer
data.complete = aggregate(data.complete$value,by=list(data.complete$lagoslakeid,data.complete$Year),FUN=mean) #estimate mean summer value
names(data.complete) = c("lagoslakeid","year","value")
years.data = range(data.complete$year)
years.data = data.frame(year=seq(from=years.data[1],to=years.data[2],by=1),temp=NA)
lakes = unique(data.complete$lagoslakeid)
for(i in 1:length(lakes)){
if(i == 1){
temp.data = data.complete[which(data.complete$lagoslakeid==lakes[i]),]
data.full = merge(temp.data,years.data,all=TRUE)
data.full$lagoslakeid = lakes[i]
data.full = data.full[,-4]
} else {
temp.data = data.complete[which(data.complete$lagoslakeid==lakes[i]),]
temp.data = merge(temp.data,years.data,all=TRUE)
temp.data$lagoslakeid = lakes[i]
temp.data = temp.data[,-4]
data.full = rbind(data.full,temp.data)
}
}
data.wide <- dcast(data.full, lagoslakeid ~ year,mean)
ts.out = TS.Length(data = data.wide,record.length = TSlength,missing.freq = miss.freq)
if(is.null(set.year)) start.year = as.numeric(names(ts.out)[which(ts.out==max(ts.out))[length(which(ts.out==max(ts.out)))]]) else start.year = set.year
years.data = seq(from=start.year,to=start.year+TSlength-1,by=1)
#subset data to years of interest with correct record length
data.full = data.full[which(data.full$year %in% years.data),]
data.wide <- dcast(data.full, lagoslakeid ~ year,median)
data.wide$recordlength = rowSums(!is.na(data.wide[,2:(TSlength+1)]))
data.wide = data.wide[which(data.wide$recordlength>=(TSlength-(TSlength*miss.freq/100))),]
data.wide = merge(data.wide,locos[,c("lagoslakeid","nhd_lat","nhd_long")])
map.regions = c('wisconsin','minnesota','vermont','maine','michigan','missouri','rhode island','new york','iowa','illinois','indiana','ohio','new hampshire','pennsylvania','connecticut','massachusetts','new jersey')
map('state',region=map.regions,col=grey(.98),fill=TRUE,resolution = 0,mar=c(0,0,0,0),border=grey(.5),lty=5)
points(data.wide$nhd_long,data.wide$nhd_lat,pch=16,col=rgb(0,0,0,.65),cex=.75)
mtext(side=3,line=-1,paste(param.name," LT Data (",range(years.data)[1],":",range(years.data)[2],")",sep=""))
mtext(side=3,line=-2,paste(nrow(data.wide)," Lakes w/ ",(TSlength-(TSlength*miss.freq/100))," - ",TSlength," yrs of data",sep=""))
mtext(side=3,line=-3,paste(TSlength," yr record w/ max of ",miss.freq,"% missing data",sep=""))
return(list(data = data.wide,numObs = ts.out))
}
#Example
#out = plotLAGOSLT.data(data=LAGOS_Limno[,c("lagoslakeid","sampledate","tp")],TSlength = 25,miss.freq = 20,locos = LAGOS_Lakes,param.name = "Total P")
#TS.Length function for identifying optimal/maximum time series and time periods of record based on two input criteria
#record.length is length or long-term record to be analyzed or vector of potential long-term record lengths
#missing.freq is the extent of missing data allowed (single value or vector of values) as percent of data (e.g., 10%)
TS.Length = function(data,record.length,missing.freq){
require(reshape2)
require(zoo)
#Misc functions needed
RL <- function(data){sum(!is.na(data))}
count.records = function(data){sum(data>=missing.freq.t[j])}
out.data = data.frame(matrix(NA, nrow=length(record.length), ncol=length(missing.freq)))
row.names(out.data) = record.length
colnames(out.data) = missing.freq
for(x in 1:length(record.length)){ #loop through record lengths
new.data = data.frame(matrix(NA, nrow=nrow(data), ncol=ncol(data)-record.length[x]+1))
for(i in 1:nrow(data)){ #calculate number of observations within each time period
new.data[i,1] = data[i,1]
x.temp = as.numeric(data[i,2:ncol(data)])
new.data[i,2:ncol(new.data)] = rollapply(x.temp,width = record.length[x],FUN=RL)
}
missing.freq.t = ceiling(record.length[x] - (missing.freq/100*record.length[x]))
summary.data = data.frame(matrix(NA, nrow=length(missing.freq.t), ncol=ncol(new.data)))
for(j in 1:length(missing.freq.t)){
summary.data[j,1] = missing.freq.t[j]
if(ncol(new.data)>2) summary.data[j,2:ncol(summary.data)]=apply(new.data[,2:ncol(new.data)],2,count.records) else {
summary.data[j,2:ncol(summary.data)] = count.records(new.data[,2])
}
}
names(summary.data)[2:ncol(summary.data)] = names(data[2:ncol(summary.data)])
names(summary.data)[1] = "min_rec_len"
for(z in 1:nrow(summary.data)){
out.data[x,z] = max(summary.data[z,2:ncol(summary.data)])
}
}
if(length(record.length)==1){
return(summary.data)
break()
}
matplot(out.data,type="l",xlab="Maximum Years of Data",xaxt="n",ylab="Number of Lakes")
axis(side=1,at=c(1:nrow(out.data)),labels=row.names(out.data))
legend('topright',legend=missing.freq,lwd=2,col=c(1:length(missing.freq)),title="Freq. of Missing Data (%)")
return(out.data)
}
#Example of running code
#Example data file take from Oliver et al. 2017 (chla data)
#require(Rcurl)
#dat = read.csv(text=getURL("https://raw.githubusercontent.com/nrlottig/NRL_Functions/master/datafiles/data_optimize_ts_window.csv"), header=T)
#temp=TS.Length(data = dat,record.length = c(10:24),missing.freq = c(10,20,30,40))
#Run with a single length of time to extract table of when to start time series
#temp=TS.Length(data = dat,record.length = 20,missing.freq = c(10,20,30,40))
#Look at output file. 530 lakes have 14-20 years of data (30% max missing data) starting with 1994.