-
Notifications
You must be signed in to change notification settings - Fork 0
/
run.py
127 lines (121 loc) · 4.35 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import os
from pathlib import Path
from dotenv import load_dotenv, set_key
from scripts import scrape, utils
STORAGE = ["local", "cloud"]
SCHEDULER_OPTIONS = ["daily", "weekly", "monthly"]
DEFAULT_URLS_PATH = "runtime_files/urls.txt"
DEFAULT_PAYLOAD_PATH = "runtime_files/payload.json"


def _collect_credentials() -> tuple:
    """Resolve Oxylabs credentials and persist them to a local `.env` file.

    Reads `OXY_USERNAME` / `OXY_PASSWORD` from the environment, prompting the
    user for any that are missing, then writes both back to `.env` so future
    runs can skip the prompt.

    Returns:
        (username, password) tuple of strings.
    """
    username = os.environ.get("OXY_USERNAME") or utils.ask("Oxylabs Username:")
    password = os.environ.get("OXY_PASSWORD") or utils.ask("Oxylabs Password:")
    env_file_path = Path(".env")
    # set_key requires the target file to exist; touch() is a no-op if it does.
    env_file_path.touch()
    set_key(env_file_path, key_to_set="OXY_USERNAME", value_to_set=username)
    set_key(env_file_path, key_to_set="OXY_PASSWORD", value_to_set=password)
    return username, password


def _collect_schedule_params(params: dict) -> None:
    """Ask whether scraping should recur and, if so, collect the schedule.

    Only meaningful for cloud storage. Mutates `params` in place, adding:
    `schedule` (bool) and — when scheduling is enabled — `frequency`,
    `time`, `end_datetime`, plus `weekday` (choice index) for weekly or
    `month_day` (int) for monthly frequency.
    """
    params["schedule"] = (
        utils.ask(
            "Do you want to schedule urls to be scraped repetitively?(y/n)",
        ).lower()
        in ("yes", "y")
    )
    if not params["schedule"]:
        return
    params["frequency"] = SCHEDULER_OPTIONS[
        utils.get_user_choice(
            "Select frequency.",
            ["Daily", "Weekly", "Monthly"],
        )
    ]
    params["time"] = utils.ask(
        "Specify hour of the day you wish to run scraping (e.g. 12):",
    )
    if params["frequency"] == "weekly":
        params["weekday"] = utils.get_user_choice(
            "Select on which day it should run.",
            [
                "Monday",
                "Tuesday",
                "Wednesday",
                "Thursday",
                "Friday",
                "Saturday",
                "Sunday",
            ],
        )
    elif params["frequency"] == "monthly":
        # NOTE(review): int() raises ValueError on non-numeric input — consider
        # validating/re-prompting; depends on whether utils.ask can loop.
        params["month_day"] = int(
            utils.ask("Specify day of the month requests should be run:")
        )
    params["end_datetime"] = utils.ask(
        "Specify date and time when scheduler should "
        "stop (e.g `2032-12-21 12:34:45`).\nIf you think you will stop "
        "it manually, still enter some date far in the future."
    )


def main() -> None:
    """Interactively collect scraping parameters and dispatch the job.

    Walks the user through credentials, input-file paths, storage target
    (local directory vs. cloud bucket) and — for cloud — an optional
    recurring schedule, then hands the assembled `params` dict to the
    matching `scrape` entry point.
    """
    creds_username, creds_password = _collect_credentials()
    params = {
        "creds_username": creds_username,
        "creds_password": creds_password,
        # Empty answers fall back to the bundled runtime_files defaults.
        "urls_path": utils.ask(
            "Enter path to file containing URLs. "
            "Leave empty if you wish to use default from `runtime_files/urls.txt`.",
        )
        or DEFAULT_URLS_PATH,
        "payload_path": utils.ask(
            "Enter path to file containing request payload. "
            "Leave empty if you wish to use default from `runtime_files/payload.json`.",
        )
        or DEFAULT_PAYLOAD_PATH,
        # get_user_choice returns the selected option's index into STORAGE.
        "storage": STORAGE[
            utils.get_user_choice(
                "Select where you wish to store the results.",
                ["Locally", "Cloud"],
            )
        ],
    }
    # Single dispatch on storage type: the original checked `== "cloud"` a
    # second time after the if/else, but STORAGE only contains two values.
    if params["storage"] == "local":
        params["output_path"] = utils.ask(
            "Full path to existing directory where results should be stored:",
        )
    else:
        params["output_path"] = utils.ask(
            "Path to cloud bucket and directory/partition "
            "where results should be stored:"
        )
        _collect_schedule_params(params)
    print("\n\033[1mParameters collected. Initiating jobs.\033[0m")
    if params["storage"] == "local":
        scrape.once_store_local(params)
    elif params["storage"] == "cloud":
        scrape.store_cloud(params)


if __name__ == "__main__":
    load_dotenv()
    main()