_aux.py
# import packages
import asyncio
import os
import sys

import aiohttp
import pandas
from tqdm import tqdm


async def get_hcd(session, url):
"""
The function requests the head content disposition from an URL and returns it together with the URL if an key error
occurs only the url is returned.
Parameters
----------
session: aiohttp.client.ClientSession
the client session
url: str
The URL from which the header information should be read.
Returns
-------
head_content_disposition + "__" + url: srt
The head content disposition plus the corresponding URL as one string.
url: str
the URL
"""
    # if the response carries no content-disposition header, a KeyError will occur
    try:
        # request only the head of the resource
        async with session.head(url) as resp:
            # select the content-disposition header
            head_content_disposition = resp.headers["content-disposition"]
            # return hcd + url as one string
            return head_content_disposition + "__" + url
    # if a KeyError occurs, return only the url
    except KeyError:
        return url
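

# Usage sketch: a minimal, hypothetical example of get_hcd on its own; the id
# 100000 is an arbitrary placeholder, not a known valid dataset id. The demo
# coroutine is only defined here, it is not executed on import.
async def _demo_get_hcd():
    async with aiohttp.ClientSession() as session:
        base = "https://geoportal.geoportal-th.de/gaialight-th/_apps/dladownload/download.php?type=op&id="
        # prints either "attachment; ...__<url>" or, after a KeyError, just the url
        print(await get_hcd(session, base + "100000"))
# to run the demo: asyncio.run(_demo_get_hcd())

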
# framework function for the request function
async def framework_requests(start=0, stop=0, list_of_ids=None):
    """
    Set up the framework for the request function, then call the request function repeatedly to get the
    content-disposition headers of the URLs.

    Parameters
    ----------
    start: int
        The number of the id part of the first URL to be checked.
    stop: int
        The number of the id part of the last URL to be checked.
    list_of_ids: list of str or None
        This parameter should not be set manually. It is only used for rechecking the KeyError URLs.

    Returns
    -------
    hcd__url_list: list of str
        A list with the content-disposition headers plus the corresponding URLs.
    url_key_error_list: list of str
        A list with all URLs where a KeyError occurred.
    """
    # set the timeout / time limit
    timeout = aiohttp.ClientTimeout(total=12000)
    # create one client session so that not every request opens a new connection
    async with aiohttp.ClientSession(timeout=timeout) as session:
        # create lists for the tasks, the hcd + url strings and the bare URLs (KeyError case)
        tasks = list()
        hcd__url_list = list()
        url_key_error_list = list()
        # check if there is a list of ids
        if list_of_ids is not None:
            # loop through the list
            for number in list_of_ids:
                # set the url
                url = "https://geoportal.geoportal-th.de/gaialight-th/_apps/dladownload/download.php?type=op&id=" + \
                      str(number)
                # append the task to the tasks list; asyncio.ensure_future schedules the execution of the task
                tasks.append(asyncio.ensure_future(get_hcd(session, url)))
        # if there is no list, there should be a range of values
        else:
            for number in range(start, stop):
                url = "https://geoportal.geoportal-th.de/gaialight-th/_apps/dladownload/download.php?type=op&id=" + \
                      str(number)
                tasks.append(asyncio.ensure_future(get_hcd(session, url)))
        # gather all future tasks (executions of the request function) and wait until they are finished
        hcd__url_all = await asyncio.gather(*tasks)
        # loop through the returns
        for hcd__url in hcd__url_all:
            # a bare URL (KeyError case) starts with "http", while an hcd + url string starts with the
            # content disposition (e.g. "attachment"), so sort the returns accordingly
            if hcd__url.startswith("http"):
                url_key_error_list.append(hcd__url)
            else:
                hcd__url_list.append(hcd__url)
        # return the hcd + url list and the bare URL (KeyError) list
        return hcd__url_list, url_key_error_list
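

# Usage sketch: a minimal, hypothetical call of framework_requests; the id range
# 100000-100010 is an arbitrary placeholder. framework_requests is a coroutine,
# so it has to be driven by asyncio.run (or an already running event loop):
# hcd_list, key_error_list = asyncio.run(framework_requests(start=100000, stop=100010))

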
def create_url_id_file(start, stop, out_path, number_of_requests=100):
    """
    Create CSV files with the id part of the URL, the year of data acquisition and the tile number as columns,
    or with only the URL (if it is a KeyError URL). To achieve this, first the framework function is called to
    get the content-disposition headers or, in case of a KeyError, the URL. Then the information is filtered
    and the CSV files are generated. Finally, the content of all non-KeyError CSV files is merged into one file.

    Parameters
    ----------
    start: int
        The number of the id part of the first URL to be checked.
    stop: int
        The number of the id part of the last URL to be checked.
    out_path: str
        Path to the folder where the output should be stored.
    number_of_requests: int
        Maximum number of concurrent requests (if performance is not important, keep the default value).

    Returns
    -------
    path_name: str
        The path to the URL id file.
    """
    # prevents "RuntimeError: Event loop is closed" with the Proactor event loop; the policy only exists on Windows
    if sys.platform.startswith("win"):
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    for i1 in tqdm(range(start, stop, number_of_requests)):
        # run the framework function for the current chunk of ids
        hcd__url_re_list, url_key_error_re_list = asyncio.run(framework_requests(start=i1, stop=i1+number_of_requests))
        # because the URLs are sometimes unreachable for a short time, every unreachable URL
        # is checked a second and, if necessary, even a third time
        for i2 in range(0, 2):
            if len(url_key_error_re_list) > 0:
                # create a list for the error url ids
                error_url_id_list = list()
                # get the url ids
                for error_url_re in url_key_error_re_list:
                    error_url_id = ""
                    # the id is the trailing digit block of the URL (at most 6 digits)
                    for i3 in range(6, 1, -1):
                        if error_url_re[-i3:].isdigit():
                            error_url_id = int(error_url_re[-i3:])
                            break
                    # append the url id to the url id list
                    error_url_id_list.append(error_url_id)
                # run the framework function again with the error url id list as input
                hcd__url_re_list2, url_key_error_re_list = \
                    asyncio.run(framework_requests(list_of_ids=error_url_id_list))
                # add the new hcd + url strings to the already existing hcd + url list
                hcd__url_re_list = hcd__url_re_list + hcd__url_re_list2
        # if the returned hcd + url list is not empty, loop through the list
        if len(hcd__url_re_list) != 0:
            # create lists for the url ids, the years and the tile numbers
            url_id_list = list()
            year_list = list()
            tile_number_list = list()
            for hcd__url_re in hcd__url_re_list:
                # get the url id (the trailing digit block, at most 6 digits)
                url_id = ""
                for i4 in range(6, 1, -1):
                    if hcd__url_re[-i4:].isdigit():
                        url_id = int(hcd__url_re[-i4:])
                        break
                # get the year (the four characters shortly after the "th" marker in the filename)
                th_position = hcd__url_re.find("th")
                year = hcd__url_re[th_position + 3:th_position + 7]
                # get the tile number (the eight characters after the "_32_" or "_32" marker)
                _32_position = hcd__url_re.find("_32_")
                if _32_position != -1:
                    tile_number = hcd__url_re[_32_position + 4:_32_position + 12]
                else:
                    _32_position = hcd__url_re.find("_32")
                    tile_number = hcd__url_re[_32_position + 3:_32_position + 11]
                # sort out some data that is not relevant
                if tile_number == "tachment":
                    continue
                # append the url id, the year and the tile number to the appropriate list
                url_id_list.append(url_id)
                year_list.append(year)
                tile_number_list.append(tile_number)
            # create a df with the url id, year and tile number as columns
            data = {"url_id": url_id_list, "year": year_list, "tile_number": tile_number_list}
            url_id_df = pandas.DataFrame(data)
            # set the name of the csv file
            name = str(i1) + "_" + str(i1+number_of_requests) + "_url_id_file.csv"
            # export the df as csv (os.path.join works with and without a trailing separator in out_path)
            url_id_df.to_csv(os.path.join(out_path, name), index=False)
        # if the bare URL (KeyError) list is not empty, export a csv file with the "error" urls
        if len(url_key_error_re_list) != 0:
            error_data = {"error_urls": url_key_error_re_list}
            error_df = pandas.DataFrame(error_data)
            name = str(i1) + "_" + str(i1+number_of_requests) + "_url_KeyError.csv"
            error_df.to_csv(os.path.join(out_path, name), index=False)
    # create a list for the filenames
    filenames_list = list()
    # loop through the files in the out_path directory
    for file in os.scandir(out_path):
        # DirEntry.name holds the plain filename (str(file) would give "<DirEntry '...'>")
        filename = file.name
        # check if the file is relevant
        if "KeyError" not in filename and "url_id_file" in filename:
            # add the filename to the filename list
            filenames_list.append(filename)
    try:
        # combine the content of all the files
        combined_content = pandas.concat([pandas.read_csv(os.path.join(out_path, filename))
                                          for filename in filenames_list])
        # create a csv file with the combined content
        path_name = os.path.join(out_path, "url_id_file.csv")
        combined_content.to_csv(path_name, index=False)
        return path_name
    except ValueError:
        print("There are no objects to concatenate. Probably there are no relevant files.")