-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathAutomated datacleaning using constructed function.R
140 lines (66 loc) · 2.98 KB
/
Automated datacleaning using constructed function.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# Automated data cleaning using the functions constructed in last year's project.
#
#
# Save cleaned data in .rds format.
#
#
#
library(dataCleanFunc)
library(highfrequency)
# ---- TLT ----
# Clean the merged TLT CSV files and save the combined result as
# "dataTLT.rds" in the merged-files directory.
#
# The raw data is split over three files because of corrupt source files
# (split points 20120515 and 20150910); "MasterData.csv" covers
# 20150910 - 20191231. Every file goes through the same dataCleanFunc
# pipeline (nanRemover -> getDates -> timestamp -> timeseriesList) and the
# per-file results are concatenated in the order listed below.
# NOTE(review): `timestamp` is presumably the dataCleanFunc function that
# masks utils::timestamp -- confirm the package is attached before this runs.
setwd("C:/Users/emil7/Dropbox/Uni - Mathematics and Economics/Speciale - Master Thesis/Data cleaned/TLT/Merged files")

# One entry per input file. `cols` is optional; when present, only those
# columns of the raw file are kept before cleaning.
tlt_sources <- list(
  list(file = "MasterData - until 20120515 (not included).csv"),
  # Limited manually to the first 1670 columns to remove NA columns that
  # were produced by an error in this file.
  list(
    file = "MasterData - from 20120515 till 20150910 (not included).csv",
    cols = 1:1670
  ),
  list(file = "MasterData.csv")
)

# Run the cleaning pipeline once per file. The raw data and all
# intermediates are local to the function, so they are released as soon as
# each file is done and only the timeseriesList() result is retained.
tlt_parts <- lapply(tlt_sources, function(src) {
  raw <- read.csv(src[["file"]], header = TRUE)
  if (!is.null(src[["cols"]])) {
    raw <- raw[, src[["cols"]]]
  }
  cleaned <- nanRemover(raw)
  dates <- getDates(raw)
  stamps <- timestamp(cleaned, dates)
  timeseriesList(raw, cleaned, stamps)
})

# Collect the per-file results into one object (same as
# c(part1, part2, part3)) and write it to disk.
dataTLT <- do.call(c, tlt_parts)
rm(tlt_parts, tlt_sources)
saveRDS(dataTLT, "dataTLT.rds")
# ---- SPY ----
# Clean the merged SPY CSV files and save the combined result as
# "dataSPY.rds" in the merged-files directory.
#
# The raw data is split over four files because of corrupt source files
# (split points 20150512, 20161130 and 20190517); "MasterData.csv" holds the
# remaining data. Every file goes through the same dataCleanFunc pipeline
# (nanRemover -> getDates -> timestamp -> timeseriesList) and the per-file
# results are concatenated in the order listed below.
# NOTE(review): `timestamp` is presumably the dataCleanFunc function that
# masks utils::timestamp -- confirm the package is attached before this runs.
setwd("C:/Users/emil7/Dropbox/Uni - Mathematics and Economics/Speciale - Master Thesis/Data cleaned/SPY/Merged files")

# Input files, in the order their results are concatenated.
spy_files <- c(
  "MasterData until 20150512 (not included).csv",
  "MasterData from 20150512 till 20161130 (not included).csv",
  "MasterData from 20161130 until 20190517 (not included).csv",
  "MasterData.csv"
)

# Run the cleaning pipeline once per file. The raw data and all
# intermediates are local to the function, so they are released as soon as
# each file is done and only the timeseriesList() result is retained.
spy_parts <- lapply(spy_files, function(path) {
  raw <- read.csv(path, header = TRUE)
  cleaned <- nanRemover(raw)
  dates <- getDates(raw)
  stamps <- timestamp(cleaned, dates)
  timeseriesList(raw, cleaned, stamps)
})

# Collect the per-file results into one object (same as
# c(part1, part2, part3, part4)) and write it to disk.
S1 <- do.call(c, spy_parts)
rm(spy_parts, spy_files)
saveRDS(S1, "dataSPY.rds")