forked from NHSDigital/medicines-text-mining-tool
-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_notebooks.py
153 lines (123 loc) · 5.13 KB
/
run_notebooks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# Databricks notebook source
# Job parameters, surfaced as notebook widgets.
# source_dataset: which feed to process (restricted to the two known sources).
dbutils.widgets.dropdown('source_dataset', 'source_a', ['source_a', 'source_b'])
# raw_input_table: optional override for the input table; empty string means
# "use the per-source default" (resolved further down).
dbutils.widgets.text('raw_input_table', '', 'raw_input_table')
# db: target database that all intermediate and output tables are written to.
dbutils.widgets.text('db', 'test_epma_autocoding', 'db')
# batch_size: passed through to the preprocessing stage (kept as a string here).
dbutils.widgets.text('batch_size', '10', 'batch_size')
# notebook_root: repo root path, used to derive the git hash for versioning.
dbutils.widgets.text('notebook_root', '', 'notebook_root')
# COMMAND ----------
# MAGIC %run ./notebooks/_modules/epma_global/functions
# COMMAND ----------
# Resolve all pipeline parameters from the widgets. Each required value is
# asserted non-empty so a misconfigured job fails immediately rather than
# silently producing empty downstream tables.
MATCH_LOOKUP_FINAL_VERSION = find_git_hash_regex(dbutils.widgets.get('notebook_root'))
# assert MATCH_LOOKUP_FINAL_VERSION
SOURCE_DATASET = dbutils.widgets.get('source_dataset')
assert SOURCE_DATASET
DB = dbutils.widgets.get('db')
assert DB
BATCH_SIZE = dbutils.widgets.get('batch_size')
assert BATCH_SIZE
RAW_INPUT_TABLE = dbutils.widgets.get('raw_input_table')
if RAW_INPUT_TABLE == '':
    # No explicit input table supplied: fall back to the per-source default.
    # .get(..., '') preserves the original behaviour for an unrecognised
    # source dataset — the assert below fires instead of a KeyError here.
    _DEFAULT_RAW_INPUT_TABLES = {
        'source_b': 'test_epma_autocoding.source_b_sample_2',
        'source_a': 'epma.epmawspc2',
    }
    RAW_INPUT_TABLE = _DEFAULT_RAW_INPUT_TABLES.get(SOURCE_DATASET, '')
assert RAW_INPUT_TABLE
# Ground truth used by the (currently disabled) accuracy-calculating stage.
GROUND_TRUTH_TABLE = 'epma.epmawspc2'
assert GROUND_TRUTH_TABLE
# COMMAND ----------
# Fully-qualified table names, all rooted in the configured database DB.
# Constants - Exceptions and preprocessing
MATCH_LOOKUP_FINAL_TABLE = f'{DB}.match_lookup_final'
UNMAPPABLE_TABLE = f'{DB}.unmappable'
PREPROCESSING_OUTPUT_TABLE = f'{DB}._inter_preprocessed_inputs'
# Constants - Exact matching
EXACT_MATCH_OUTPUT_TABLE = f'{DB}._inter_exact_non_match'
EXACT_MATCH_MATCH_TABLE = f'{DB}._inter_match_lookup'
# Constants - Entity matching
# NOTE(review): the exact, entity, and fuzzy match-output constants all alias
# the same `_inter_match_lookup` table — presumably the stages accumulate
# matches into one lookup; confirm against the stage drivers.
ENTITY_MATCH_OUTPUT_TABLE = f'{DB}._inter_entity_non_match'
ENTITY_MATCH_MATCH_TABLE = f'{DB}._inter_match_lookup'
# Constants - fuzzy matching
# NOTE(review): "OUPUT" is a typo for "OUTPUT"; the name is kept as-is because
# it is referenced below in PIPELINE_CONFIG.
FUZZY_MATCH_OUPUT_TABLE = f'{DB}._inter_match_lookup'
FUZZY_NONLINKED_TABLE = f'{DB}._cache_fuzzy_non_linked'
FUZZY_NON_LINKED_NON_MATCH_TABLE = f'{DB}._cache_fuzzy_non_linked_non_match'
# Constants - accuracy calculating
ACCURACY_TABLE = f'{DB}.accuracy'
# Constants - run id
# Fresh run id for this pipeline execution — presumably derived from the
# existing match_id values in match_lookup_final; see get_new_run_id.
RUN_ID = get_new_run_id(MATCH_LOOKUP_FINAL_TABLE, 'match_id')
# COMMAND ----------
# MAGIC %run ./notebooks/_pipeline_execution/run_matching_pipeline
# COMMAND ----------
# Ordered stage configuration consumed by run_matching_pipeline. The zeroth
# entry describes the raw data inputs and must come first; each later entry
# configures one pipeline stage and is honoured only when 'execute' is True.
PIPELINE_CONFIG = [
    {  # Raw data inputs. Must be the zeroth stage
        'epma_table': RAW_INPUT_TABLE,
        'vtm_table': 'vtm',
        'vmp_table': 'vmp',
        'amp_table': 'amp',
        'parsed_vtm_table': '',  # parsed vtm data doesn't exist
        'parsed_vmp_table': 'vmp_parsed',
        'parsed_amp_table': 'amp_parsed',
    },
    {  # Stage 0: filter exceptions and normalise the raw input.
        'stage_id': 'exceptions_and_preprocessing',
        'notebook_location': './notebooks/0_exceptions_and_preprocessing/drivers/exceptions_and_preprocessing_driver',
        'raw_data_required': True,
        'source_dataset': SOURCE_DATASET,
        'unmappable_table': UNMAPPABLE_TABLE,
        'output_table': PREPROCESSING_OUTPUT_TABLE,  # Table to write non-match output to.
        'match_lookup_final_table': MATCH_LOOKUP_FINAL_TABLE,
        'run_id': RUN_ID,
        'batch_size': BATCH_SIZE,
        'execute': True,
    },
    {  # Stage 1: exact string matching against the reference tables.
        'stage_id': 'exact_match',
        'notebook_location': './notebooks/1_exact_match/drivers/exact_match_driver',
        'raw_data_required': True,
        'input_table': PREPROCESSING_OUTPUT_TABLE,  # Table to read previous stage non-match input from.
        'output_table': EXACT_MATCH_OUTPUT_TABLE,  # Table to write stage non-match output to.
        'match_table': EXACT_MATCH_MATCH_TABLE,  # Table to write stage matched output to.
        'execute': True,
    },
    {  # Stage 2: entity extraction on records the exact matcher missed.
        'stage_id': 'entity_matching',
        'notebook_location': './notebooks/2_entity_extraction/drivers/entity_extraction_driver',
        'raw_data_required': True,
        'input_table': EXACT_MATCH_OUTPUT_TABLE,  # Table to read previous stage non-match input from.
        'match_table': ENTITY_MATCH_MATCH_TABLE,  # Table to write matched output to.
        'output_table': ENTITY_MATCH_OUTPUT_TABLE,
        'unmappable_table': UNMAPPABLE_TABLE,
        'run_id': RUN_ID,
        'execute': True,
    },
    {  # Stage 3: fuzzy matching on everything still unmatched.
        'stage_id': 'fuzzy_matching',
        'notebook_location': './notebooks/3_fuzzy_matching/drivers/fuzzy_match_driver',
        'raw_data_required': True,
        'input_table': ENTITY_MATCH_OUTPUT_TABLE,  # Table to read previous stage non-match input from
        'output_table': FUZZY_MATCH_OUPUT_TABLE,
        'match_lookup_final_table': MATCH_LOOKUP_FINAL_TABLE,
        'unmappable_table': UNMAPPABLE_TABLE,
        'fuzzy_non_linked': FUZZY_NONLINKED_TABLE,
        'fuzzy_nonlinked_non_match_output': FUZZY_NON_LINKED_NON_MATCH_TABLE,
        'match_lookup_final_version': MATCH_LOOKUP_FINAL_VERSION,
        'run_id': RUN_ID,
        'execute': True,
    },
    # Stage 4 (accuracy calculating) is deliberately disabled; uncomment to
    # score the pipeline output against GROUND_TRUTH_TABLE.
    # {
    #     'stage_id': 'accuracy_calculating',
    #     'notebook_location': './notebooks/4_accuracy_calculating/drivers/accuracy-calculating-driver',
    #     'raw_data_required': True,
    #     'input_table': MATCH_LOOKUP_FINAL_TABLE,  # Table to read previous stage non-match input from
    #     'output_table': ACCURACY_TABLE,
    #     'ground_truth_table': GROUND_TRUTH_TABLE,
    #     'execute': True
    # }
]
# COMMAND ----------
# Execute every enabled stage in order; run_matching_pipeline is defined in
# the run_matching_pipeline notebook pulled in by the %run above.
run_matching_pipeline(PIPELINE_CONFIG)
# COMMAND ----------
# Inspect the pipeline outputs. Read via the DB-qualified constants so we look
# at the configured database, not whatever the Spark session's current default
# database happens to be — the original used the bare names
# "match_lookup_final" / "unmappable", which only match when the session
# default equals DB.
match_table = spark.table(MATCH_LOOKUP_FINAL_TABLE)
display(match_table)
# COMMAND ----------
unmap_table = spark.table(UNMAPPABLE_TABLE)
display(unmap_table)
# COMMAND ----------