-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathtext2sql_data.py
More file actions
70 lines (60 loc) · 2.91 KB
/
text2sql_data.py
File metadata and controls
70 lines (60 loc) · 2.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# coding=utf8
"""
Parsing data from https://github.com/jkkummerfeld/text2sql-data/tree/master/data
"""
import os
import json
import copy
def _fill_placeholders(text, sql, sentence_variables, query_variables):
    """Substitute every variable placeholder in a question and its SQL template.

    :param text: question text containing placeholder names
    :param sql: SQL template containing the same placeholder names
    :param sentence_variables: mapping of placeholder name -> value for this question
    :param query_variables: list of dicts with 'name' and 'example' keys, used
        as a fallback when the question supplies an empty value
    :return: tuple ``(text, sql)`` with the placeholders replaced
    """
    for name in sentence_variables:
        value = sentence_variables[name]
        if not value:
            # The question carries no concrete value: fall back to the example
            # attached to the query. The last matching entry wins.
            for variable in query_variables:
                if variable['name'] == name:
                    value = variable['example']
        text = text.replace(name, value)
        sql = sql.replace(name, value)
    return text, sql


def _bucket_for(split):
    """Map a split label to 'test', 'dev' or (for anything else) 'train'."""
    return split if split in ('test', 'dev') else 'train'


def _write_lines(path, lines):
    """Write *lines* to *path*, newline-separated, without a trailing newline."""
    with open(path, 'w') as f:
        f.write('\n'.join(lines))


def get_sql_data(basepath, raw_data_path):
    """Convert a text2sql-data JSON file into question- and query-split TSV files.

    Each entry of the JSON list contributes its first SQL variant
    (``entry['sql'][0]``). For every sentence of the entry the variable
    placeholders are replaced in both the question and the SQL, and the
    resulting ``"<question>\\t<sql>"`` line is routed twice: once by the
    sentence's ``'question-split'`` and once by the entry's ``'query-split'``
    (``'test'`` and ``'dev'`` go to those sets, anything else to train).

    Six files are written into ``basepath``:
    ``atis_sql_{question,query}_based_{train,dev,test}_2018.tsv``.

    :param basepath: directory that receives the generated TSV files
    :param raw_data_path: path of the JSON file to parse
    """
    with open(raw_data_path, 'r') as f:
        data = json.load(f)

    question_based = {'train': [], 'dev': [], 'test': []}
    query_based = {'train': [], 'dev': [], 'test': []}

    for d in data:
        sql_template = d['sql'][0]
        for s_dict in d['sentences']:
            question, sql = _fill_placeholders(
                s_dict['text'],
                sql_template,
                s_dict['variables'],
                d.get('variables', ()),
            )
            line = "%s\t%s" % (question, sql)
            question_based[_bucket_for(s_dict['question-split'])].append(line)
            query_based[_bucket_for(d['query-split'])].append(line)

    # Always build the output paths from the `basepath` argument rather than
    # from a module-level global, so the function also works when imported.
    for kind, datasets in (('question', question_based), ('query', query_based)):
        for split in ('train', 'dev', 'test'):
            file_name = 'atis_sql_%s_based_%s_2018.tsv' % (kind, split)
            _write_lines(os.path.join(basepath, file_name), datasets[split])
if __name__ == '__main__':
    # Script entry point: the raw ATIS JSON sits in the same directory that
    # receives the generated TSV files.
    base_path = os.path.join('data', 'atis')
    raw_data_path = os.path.join(base_path, 'atis.json')
    get_sql_data(base_path, raw_data_path)