-
Notifications
You must be signed in to change notification settings - Fork 2
/
generate_synthetic_patterns.py
98 lines (76 loc) · 2.83 KB
/
generate_synthetic_patterns.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Created on Feb 24, 2017
.. codeauthor: svitlana vakulenko
Generate synthetic dataset from a specific pattern for training a neural network.
For example:
1 City : Linz, Immigration : 200 .
2 City : Aspach, Immigration : 100 .
3 What is the immigration in Linz? 200 1
4 What is the immigration in Aspach? 100 2
'''
import random
# Pool of city names the generated table rows are drawn from.
CITIES = [
    'Wien',
    'Graz',
    'Linz',
    'Villach',
    'Salzburg',
    'Klagenfurt',
    'Bludenz',
    'Feldkirch',
]
# Number of tables (each followed by one question) written to the dataset.
N_TABLES = 10000
# Number of random values drawn for each numeric data field.
N_ROWS = 2
# Column names substituted into the textual patterns.
DATA_FIELDS = ['City', 'Immigration', 'Emmigration']

# 2 data fields: two table rows, then one question line of the form
# "<question>\t<answer>\t<number of the supporting row>".
PATTERN_1 = (
    '1 {} : {}, {} : {} .\n'
    '2 {} : {}, {} : {} .\n'
    '3 What is the {} in {}?\t{}\t{}\n'
)

# 3 data fields, same layout as above.
# NOTE(review): row 1 separates the 2nd and 3rd field with ' , ' while row 2
# uses ', ' -- looks unintentional; confirm before changing, since it alters
# the generated text.
PATTERN_2 = (
    '1 {} : {}, {} : {} , {} : {} .\n'
    '2 {} : {}, {} : {}, {} : {} .\n'
    '3 What is the {} in {}?\t{}\t{}\n'
)

# Unused draft of a pattern with fixed field names and several questions:
#   1 City : {}, Immigration : {}, Emmigration : {} .
#   2 City : {}, Immigration : {}, Emmigration : {} .
#   3 What is the immigration in {}?\t{}\t1
#   4 What is the emmigration in {}?\t{}\t2
#   5 What is the emmigration in {}?\t{}\t2
def generate_dataset(output_path, f):
    '''
    Write N_TABLES randomly generated samples to a text file.

    Every sample is PATTERN_2 filled in with two table rows (a city plus two
    random numbers each) and one question about a randomly chosen numeric
    field of a randomly chosen row. The question line carries the answer and
    the number (1 or 2) of the row that contains it, separated by tabs.

    output_path <String> file to write (overwritten if it exists); a missing
                         parent directory is created
    f <List> data fields for the patterns: f[0] labels the city column,
             f[1] and f[2] label the two numeric columns
    '''
    # Function-scope import: this module does not import `os` at the top.
    import os

    # Create the target directory if needed so the default './data/...' path
    # works on a fresh checkout (written without exist_ok to stay Python 2
    # compatible).
    out_dir = os.path.dirname(output_path)
    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    with open(output_path, 'w') as out_file:
        # generate N_TABLES random data samples
        # (`range` rather than `xrange` so this runs on Python 2 and 3)
        for _ in range(N_TABLES):
            # place holder values 1st field: two distinct cities; the first
            # pick is removed from the pool before the second one is drawn
            cities = CITIES[:]
            first_city = random.choice(cities)
            cities.remove(first_city)
            v0 = [first_city, random.choice(cities)]
            # place holder values 2nd field: random numbers in 10..19
            v1 = [random.randrange(10, 20) for _ in range(N_ROWS)]
            # place holder values 3rd field: random numbers in 10..19
            v2 = [random.randrange(10, 20) for _ in range(N_ROWS)]
            # container for the generated sample values, indexed
            # v[field][row]
            v = [v0, v1, v2]
            # choose data sample (row 0 or 1) to query at random
            s = random.randrange(0, 2)
            # choose data field to query at random: 1 or 2, never the city
            q = random.randrange(1, 3)
            # define textual pattern
            # two-field alternative:
            # pattern = PATTERN_1.format(f[0], v0[0], f[1], v1[0],
            #                            f[0], v0[1], f[1], v1[1],
            #                            f[1], v0[s], v1[s], s+1)
            pattern = PATTERN_2.format(
                f[0], v[0][0], f[1], v[1][0], f[2], v[2][0],
                f[0], v[0][1], f[1], v[1][1], f[2], v[2][1],
                f[q], v[0][s], v[q][s], s + 1)
            out_file.write(pattern)
if __name__ == '__main__':
    # Script entry point: write the synthetic dataset for DATA_FIELDS to the
    # default location.
    generate_dataset('./data/synth_data.txt', DATA_FIELDS)