import logging
import platform
import urllib

from invoke import run, task

from elasticsearch import helpers

import scrapi.harvesters  # noqa
from scrapi import linter
from scrapi import registry
from scrapi import settings
from scrapi.processing.elasticsearch import es

logger = logging.getLogger()


@task
def server():
    run("python server.py")


@task
def reindex(src, dest):
    # Copy every document from src into dest, then drop the old index
    helpers.reindex(es, src, dest)
    es.indices.delete(src)
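
# Usage sketch (the index names "share_v1" and "share_v2" are hypothetical,
# not taken from this repo):
#
#     invoke reindex share_v1 share_v2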


@task
def alias(alias, index):
    # Strip existing aliases from the indices currently behind `alias`
    # (ignoring 404s), then bind `alias` to `index`
    es.indices.delete_alias(index=alias, name='_all', ignore=404)
    es.indices.put_alias(alias, index)
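
# Usage sketch ("share" and "share_v2" are hypothetical names): repointing an
# alias at a freshly built index lets searches cut over without downtime.
#
#     invoke alias share share_v2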


@task
def renormalize():
    run('python -m scripts.renormalize')


@task
def reset_search():
    # _shutdown is the Elasticsearch 1.x node-shutdown API
    run("curl -XPOST 'http://localhost:9200/_shutdown'")
    if platform.linux_distribution()[0] == 'Ubuntu':
        run("sudo service elasticsearch restart")
    elif platform.system() == 'Darwin':  # Mac OSX
        run('elasticsearch')


@task
def elasticsearch():
    '''Start a local elasticsearch server

    NOTE: Requires that elasticsearch is installed. See README for instructions
    '''
    if platform.linux_distribution()[0] == 'Ubuntu':
        run("sudo service elasticsearch restart")
    elif platform.system() == 'Darwin':  # Mac OSX
        run('elasticsearch')
    else:
        print(
            "Your system is not recognized; you will have to start elasticsearch manually")


@task
def test(cov=True, verbose=False):
    """
    Runs all tests in the 'tests/' directory
    """
    cmd = 'py.test tests'
    if verbose:
        cmd += ' -v'
    if cov:
        cmd += ' --cov-report term-missing --cov-config .coveragerc --cov scrapi'
    run(cmd, pty=True)
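
# Usage sketch: coverage reporting is on by default; add --verbose for
# per-test output.
#
#     invoke test --verbose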


@task
def requirements():
    run('pip install -r requirements.txt')


@task
def beat():
    from scrapi.tasks import app
    # Install the harvester schedule from the registry before starting beat
    app.conf['CELERYBEAT_SCHEDULE'] = registry.beat_schedule
    app.Beat().run()


@task
def worker():
    from scrapi.tasks import app
    app.worker_main(['worker', '--loglevel', 'info'])
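
# Usage sketch: beat and worker are both long-running, so each typically gets
# its own shell (or supervisor entry).
#
#     invoke beat      # enqueues harvester jobs on the registry's schedule
#     invoke worker    # consumes and runs the queued jobs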


@task
def harvester(harvester_name, async=False, days=1):
    settings.CELERY_ALWAYS_EAGER = not async
    from scrapi.tasks import run_harvester

    if not registry.get(harvester_name):
        raise ValueError('No such harvester {}'.format(harvester_name))

    run_harvester.delay(harvester_name, days_back=days)
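
# Usage sketch ("pubmed" is a hypothetical harvester name): run one harvester
# eagerly over the last 3 days, or queue it on Celery with --async.
#
#     invoke harvester pubmed --days=3
#     invoke harvester pubmed --days=3 --async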


@task
def harvesters(async=False, days=1):
    settings.CELERY_ALWAYS_EAGER = not async
    from scrapi.tasks import run_harvester

    exceptions = []
    for harvester_name in registry.keys():
        try:
            run_harvester.delay(harvester_name, days_back=days)
        except Exception as e:
            logger.exception(e)
            exceptions.append(e)

    logger.info("\n\nNumber of exceptions: {}".format(len(exceptions)))
    for exception in exceptions:
        logger.exception(exception)


@task
def check_archive(harvester=None, reprocess=False, async=False, days=None):
    settings.CELERY_ALWAYS_EAGER = not async

    # NOTE: `days` must be supplied; int(None) would raise a TypeError
    if harvester:
        from scrapi.tasks import check_archive as check
        check.delay(harvester, reprocess, days_back=int(days))
    else:
        from scrapi.tasks import check_archives
        check_archives.delay(reprocess, days_back=int(days))
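
# Usage sketch ("pubmed" is a hypothetical harvester name; --days is required,
# since the task calls int(days)):
#
#     invoke check_archive --harvester=pubmed --days=7 --reprocess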


@task
def lint_all():
    for name in registry.keys():
        lint(name)


@task
def lint(name):
    harvester = registry[name]
    try:
        linter.lint(harvester.harvest, harvester.normalize)
    except Exception as e:
        print('Harvester {} raised the following exception'.format(harvester.short_name))
        print(e)


@task
def provider_map():
    from scrapi.processing.elasticsearch import es

    for harvester_name, harvester in registry.items():
        # Embed each harvester's favicon as a percent-encoded base64 data URI
        # (str.encode('base64') is Python 2 only)
        es.index(
            'share_providers',
            harvester.short_name,
            body={
                'favicon': 'data:image/png;base64,' + urllib.quote(open("img/favicons/{}_favicon.ico".format(harvester.short_name), "rb").read().encode('base64')),
                'short_name': harvester.short_name,
                'long_name': harvester.long_name,
                'url': harvester.url
            },
            id=harvester.short_name,
            refresh=True
        )

    print(es.count('share_providers', body={'query': {'match_all': {}}})['count'])