-
Notifications
You must be signed in to change notification settings - Fork 0
/
upload.py
197 lines (161 loc) · 7.1 KB
/
upload.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
# encoding: utf-8
"""Script to upload job suggestions to Algolia.
It relies on environment variables to be set correctly:
ALGOLIA_APP_ID: the Algolia App to update
ALGOLIA_JOB_INDEX: the index to update in this App
ALGOLIA_API_KEY: an API key that has enough permissions to edit the index.
The script takes four arguments:
- a path to the csv file containing the definition of jobs in the ROME,
- a path to the csv file containing the definition of job groups in the ROME,
- a path to the file with correspondence from ROME job group to FAP codes,
- a path to the JSON file with job frequencies.
"""
import codecs
import collections
import json
import os
import re
import sys
import time
from algoliasearch import exceptions
from algoliasearch import search_client
import pandas
import rome_genderization
# Regular expression to match one line of the mapping of ROME to FAP codes.
# Matches strings like '"A1201","A1205" = "A0Z42"': one or more quoted ROME IDs
# (a capital letter followed by four digits), each optionally followed by a
# comma, then an equals sign and one quoted FAP code (letter, digit, letter,
# two digits). The whole string must match, from start to end.
# Named groups:
#   rome_ids: the full list of ROME IDs, quotes and commas included,
#   fap: the FAP code, without its quotes.
_ROME_FAP_MAPPING_REGEXP = re.compile(
    r'^(?P<rome_ids>(?:"[A-Z]\d{4}",?)+)\s*=\s*"(?P<fap>[A-Z]\d[A-Z]\d\d)"$')
# Regular expression to match an unaccented capital E in French text that
# should be accented (É). It has been computed empirically by testing on the
# full ROME.
# It matches the E in "Etat", "Ecrivain", "Evolution", "Energie", "Enigme" but
# not in "Entreprise", "Ethnologue", "Emoji", "Enivrer" nor "Euro".
# Only the "E" itself is matched: everything after it sits in a lookahead, so a
# substitution leaves the following letters untouched. The lookahead accepts:
#   - a single consonant, or a [cpt][hlr] pair, followed by a vowel,
#   - "n" followed by a vowel other than "a"/"i", or by "i" + non-"v", or by
#     "a" + non-"m",
#   - "m" followed by a vowel other than "o", or by "o" + non-"j".
# NOTE: this is a plain pattern string, not a compiled regular expression.
_UNACCENTED_E_REGEXP = (
    r'E(?=('
    '([bcdfghjklpqrstvz]|[cpt][hlr])[aeiouyéèêë]|'
    'n([eouyéèêë]|i[^v]|a[^m])|'
    'm([aeiuyéèêë]|o[^j])))')
_ExtraJob = collections.namedtuple('ExtraJob', ['code_ogr', 'code_ogr_origin', 'name'])
_EXTRA_JOBS = [
_ExtraJob('126549a', '126549', 'Product designer'),
]
def csv_to_dicts(
        csv_appellation, csv_code_rome, txt_fap_rome, json_jobs_frequency):
    """Build the job suggestions to upload from the source files.

    Args:
        csv_appellation: path to the CSV file of job names. The columns
            "code_ogr", "code_rome", "libelle_appellation_court" and
            "libelle_appellation_long" are used here.
        csv_code_rome: path to the CSV file of job groups. Only its
            "code_rome" and "libelle_rome" columns are kept.
        txt_fap_rome: path to the text file mapping ROME job groups to FAP
            codes.
        json_jobs_frequency: path to a JSON file holding an object keyed by
            "code_ogr" (as a string) whose values are the job frequencies.

    Returns:
        A list of dicts, one per job. Keys are the column names converted to
        camelCase, and entries whose value is null are dropped.

    Raises:
        ValueError: if the origin of one of the extra jobs does not match
            exactly one job.
    """
    # Read appellations from CSV.
    appellations = pandas.read_csv(csv_appellation)
    appellations['code_ogr'] = appellations['code_ogr'].astype(str)

    # Add missing accents.
    _add_accents(
        appellations, ('libelle_appellation_court', 'libelle_appellation_long'))

    # Genderize names.
    _genderize(appellations, 'libelle_appellation_court')
    _genderize(appellations, 'libelle_appellation_long')

    # Join with ROME names.
    code_rome = pandas.DataFrame(
        pandas.read_csv(csv_code_rome),
        columns=['code_rome', 'libelle_rome'])
    _add_accents(code_rome, ('libelle_rome',))
    suggestions = pandas.merge(
        appellations, code_rome, on='code_rome', how='left')

    # Join with FAP code when simple.
    rome_to_fap = _fap_rome_simple_mapping(txt_fap_rome)
    suggestions = pandas.merge(
        suggestions, rome_to_fap, on='code_rome', how='left')

    # Join with jobs frequency from external file. JSON is UTF-8 by
    # specification, so do not depend on the locale's default encoding.
    with open(json_jobs_frequency, encoding='utf-8') as jobs_frequency_file:
        jobs_frequency = json.load(jobs_frequency_file)
    # Jobs missing from the frequency file get a frequency of 0.
    suggestions['frequency'] = (
        suggestions['code_ogr'].map(jobs_frequency).fillna(0))

    # Add extra jobs: each one is a copy of an existing job (its "origin")
    # with its own code and name.
    extra_jobs = []
    for extra_job in _EXTRA_JOBS:
        origin = suggestions[suggestions.code_ogr == extra_job.code_ogr_origin]
        if len(origin) != 1:
            raise ValueError('Error while locating the origin for {}'.format(extra_job))
        new_job = origin.iloc[0, :].copy(deep=True)
        new_job['code_ogr'] = extra_job.code_ogr
        for size in ('court', 'long'):
            for gender in ('', '_masculin', '_feminin'):
                new_job['libelle_appellation_{}{}'.format(size, gender)] = extra_job.name
        extra_jobs.append(new_job)
    if extra_jobs:
        # DataFrame.append was removed in pandas 2.0: concatenate instead.
        suggestions = pandas.concat(
            [suggestions, pandas.DataFrame(extra_jobs)], ignore_index=True)

    # Switch properties to camelCase.
    mapping = {
        name: _snake_to_camel_case(name)
        for name in suggestions.columns.tolist()}
    suggestions.rename(columns=mapping, inplace=True)

    # Convert from pandas.DataFrame to Python list of dicts.
    records = suggestions.to_dict(orient='records')
    return [
        {k: v for k, v in record.items() if not pandas.isnull(v)}
        for record in records
    ]
def upload(csv_appellation, csv_code_rome, txt_fap_rome, json_jobs_frequency):
    """Upload jobs suggestions to Algolia.

    The suggestions are first saved in a temporary index, named after the
    target index and the current time, that gets the same settings as the
    target index. The temporary index is then moved to the target name, unless
    the DRY_RUN environment variable is set, in which case it is left as is.

    The Algolia app, API key and target index name are read from the
    ALGOLIA_APP_ID, ALGOLIA_API_KEY and ALGOLIA_JOB_INDEX (default: "jobs")
    environment variables.

    Args:
        csv_appellation: forwarded to csv_to_dicts.
        csv_code_rome: forwarded to csv_to_dicts.
        txt_fap_rome: forwarded to csv_to_dicts.
        json_jobs_frequency: forwarded to csv_to_dicts.

    Raises:
        exceptions.AlgoliaException: re-raised as is, after the temporary
            index has been deleted and the first 10 suggestions printed.
    """
    records = csv_to_dicts(
        csv_appellation, csv_code_rome, txt_fap_rome, json_jobs_frequency)

    app_id = os.getenv('ALGOLIA_APP_ID')
    api_key = os.getenv('ALGOLIA_API_KEY')
    algolia = search_client.SearchClient.create(app_id, api_key)

    live_name = os.getenv('ALGOLIA_JOB_INDEX', 'jobs')
    live_index = algolia.init_index(live_name)
    # Suffix the name with the current timestamp in hexadecimal.
    staging_name = '%s_%x' % (live_name, round(time.time()))
    staging_index = algolia.init_index(staging_name)

    save_options = {
        'autoGenerateObjectIDIfNotExist': True,
    }
    try:
        current_settings = live_index.get_settings()
        staging_index.set_settings(current_settings)
        staging_index.save_objects(records, save_options)

        # Everything is in place: replace the target index, except on dry runs.
        is_dry_run = os.getenv('DRY_RUN')
        if not is_dry_run:
            algolia.move_index(staging_name, live_name)
    except exceptions.AlgoliaException:
        # Clean up, and show a sample of what we tried to upload to help debug.
        staging_index.delete()
        print(json.dumps(records[:10], indent=2))
        raise
def _snake_to_camel_case(snake_name):
components = snake_name.split('_')
return components[0] + "".join(x.title() for x in components[1:])
def _genderize(data_frame, field, suffixes=('_masculin', '_feminin')):
    """Add genderized versions of a column to a pandas DataFrame.

    The DataFrame is modified in place: two columns are created (or
    overwritten), named after the original column plus a suffix.

    Args:
        data_frame: the DataFrame to update.
        field: the name of the column to genderize.
        suffixes: the suffixes of the new columns to create, the masculine one
            first and the feminine one second.
    """
    # rome_genderization.genderize is unpacked as a (masculine, feminine) pair.
    genderized = rome_genderization.genderize(data_frame[field])
    masculine_names, feminine_names = genderized

    masculine_column = field + suffixes[0]
    data_frame[masculine_column] = masculine_names

    feminine_column = field + suffixes[1]
    data_frame[feminine_column] = feminine_names
def _add_accents(data_frame, fields):
    """Add an accent on capitalized letters if needed.

    Most of the capitalized letters have no accent even if the French word
    would require one. This function fixes this by using heuristics: every
    capital "E" matched by _UNACCENTED_E_REGEXP is replaced by "É".

    Args:
        data_frame: the pandas DataFrame to update in place.
        fields: the names of the string columns to fix.
    """
    for field in fields:
        # regex=True is required: since pandas 2.0 the default is a literal
        # replacement, which would silently never match the pattern.
        data_frame[field] = data_frame[field].str.replace(
            _UNACCENTED_E_REGEXP, 'É', regex=True)
def _fap_rome_simple_mapping(txt_fap_rome):
    """Return mappings from ROME to FAP when non ambiguous.

    Many ROME job groups are included completely in one FAP group, so for them
    a mapping is possible from ROME ID to FAP. This function extracts those
    simple mappings: lines of the file that do not match
    _ROME_FAP_MAPPING_REGEXP are ignored.

    Args:
        txt_fap_rome: path of a file containing the official mapping, encoded
            in latin-1.

    Returns:
        a pandas DataFrame with two columns: "code_rome" and "code_fap". It
        has no rows (but still both columns) if no line of the file matches.
    """
    mapping = []
    with codecs.open(txt_fap_rome, 'r', 'latin-1') as fap_rome:
        for line in fap_rome:
            matches = _ROME_FAP_MAPPING_REGEXP.match(line.strip())
            if not matches:
                continue
            fap_id = matches.group('fap')
            # Extract the IDs from: "A1201","A1205","A1206"
            # Searching for the IDs themselves, rather than splitting on the
            # separator, also copes with a trailing or a missing comma, both
            # of which the regular expression accepts.
            rome_ids = re.findall(r'[A-Z]\d{4}', matches.group('rome_ids'))
            mapping.extend((rome_id, fap_id) for rome_id in rome_ids)
    # Name the columns at creation so that an empty mapping is valid as well.
    return pandas.DataFrame(mapping, columns=['code_rome', 'code_fap'])
if __name__ == '__main__':
    # Forward the command-line arguments, script name excluded, as the
    # positional arguments of upload.
    _cli_args = sys.argv[1:]
    upload(*_cli_args)