-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathpreprocessor.py
More file actions
82 lines (71 loc) · 2.54 KB
/
preprocessor.py
File metadata and controls
82 lines (71 loc) · 2.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/env python
# coding: utf-8
from __future__ import print_function, absolute_import, division
import codecs
import glob
import os
import pandas as pd
from six.moves import configparser
class TextPreProcessor(object):
    """Namespace for text-cleaning helpers used when building the data frames."""

    @staticmethod
    def clean_text(text, word_level):
        """
        Placeholder for the text-cleaning step.

        No cleaning logic is implemented in this method: both arguments are
        accepted and ignored, and ``None`` is returned. The documented intent
        is to return the cleaned text joined by spaces.

        :param text: the string of text
        :param word_level: word_level to cut
        :return: ``None`` (no cleaned text is produced by this stub)
        """
        return None
def get_raw_data(data_pt, pair_tag, label_tag, is_test=False, is_dir=True):
    """
    Load question-pair data from one file or a directory and add cleaned columns.

    Every input file is parsed with ``read_csv``; the resulting frames are
    concatenated, the index is reset (the previous per-file index is kept as
    an ``index`` column), and four columns are added by running
    ``TextPreProcessor.clean_text`` over ``question1`` / ``question2``:
    ``q1_word`` and ``q2_word`` (``word_level=True``), ``q1_char`` and
    ``q2_char`` (``word_level=False``).

    :param data_pt: train or test data path, or the directory holding the
        ``*.csv`` data files when ``is_dir`` is true
    :param pair_tag: column name for the pair id (forwarded to ``read_csv``)
    :param label_tag: column name for the label (forwarded to ``read_csv``)
    :param is_test: forwarded to ``read_csv``
    :param is_dir: if the data_pt is directory
    :return: DataFrame with the parsed columns plus the four cleaned columns
    :raises ValueError: if ``is_dir`` is true and the directory contains no
        ``*.csv`` file
    """
    if is_dir:
        # glob yields matches in arbitrary order; sorting keeps the row order
        # of the combined frame reproducible across runs and machines.
        paths = sorted(glob.glob(os.path.join(data_pt, '*.csv')))
    else:
        paths = [data_pt]
    if not paths:
        raise ValueError('no *.csv files found in directory: {0}'.format(data_pt))

    # Concatenate once instead of growing the frame inside the loop, which
    # re-copies all previously loaded rows on every iteration.
    frames = [read_csv(pt, pair_tag, label_tag, is_test) for pt in paths]
    data = pd.concat(frames)
    data.reset_index(inplace=True)

    data['q1_word'] = data['question1'].apply(lambda x: TextPreProcessor.clean_text(x, True))
    data['q2_word'] = data['question2'].apply(lambda x: TextPreProcessor.clean_text(x, True))
    data['q1_char'] = data['question1'].apply(lambda x: TextPreProcessor.clean_text(x, False))
    data['q2_char'] = data['question2'].apply(lambda x: TextPreProcessor.clean_text(x, False))
    return data
def read_csv(data_pt, pair_tag, label_tag, is_test):
    """
    Parse one UTF-8, tab-separated question-pair file into a DataFrame.

    Each non-blank line must contain ``id, question1, question2, label``
    (four fields) or, when ``is_test`` is true, ``id, question1, question2``
    (three fields). The id field read from the file is not used: the
    ``pair_tag`` column holds the 1-based line number instead. Blank lines
    are skipped but still count towards the line number.

    :param data_pt: path of the file to read
    :param pair_tag: name of the output column holding the line number
    :param label_tag: name of the output column holding the label
    :param is_test: if true, lines carry no label and the label is padded
        with -1
    :return: DataFrame with columns ``pair_tag``, ``question1``,
        ``question2`` and ``label_tag``
    :raises ValueError: if a line has the wrong number of fields or its
        label is not an integer
    """
    expected_fields = 3 if is_test else 4
    lnums, q1s, q2s, labels = [], [], [], []
    with codecs.open(data_pt, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f, start=1):
            stripped = line.strip()
            if not stripped:
                # e.g. a trailing empty line at the end of the file
                continue
            fields = stripped.split('\t')
            if len(fields) != expected_fields:
                raise ValueError(
                    '{0}: line {1}: expected {2} tab-separated fields, got {3}'.format(
                        data_pt, i, expected_fields, len(fields)))
            lnums.append(i)
            # codecs.open already decodes to text on Python 2 and 3, so no
            # extra unicode() conversion is needed (it does not exist on 3).
            q1s.append(fields[1])
            q2s.append(fields[2])
            labels.append(-1 if is_test else int(fields[3]))
    return pd.DataFrame({
        pair_tag: lnums,
        'question1': q1s,
        'question2': q2s,
        label_tag: labels,
    })
class Conf(object):
    """Project-wide settings, evaluated once when the class body is executed."""

    # Column names used for the pair id and the label.
    pair_tag = 'pair_id'
    label_tag = 'is_duplicate'

    # Parser loaded from ``conf/config.conf``, resolved relative to the
    # directory that contains this module.
    config = configparser.ConfigParser()
    config.read(
        os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            'conf/config.conf',
        )
    )