-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathtinytokenizer.py
More file actions
78 lines (64 loc) · 2.52 KB
/
tinytokenizer.py
File metadata and controls
78 lines (64 loc) · 2.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
A tiny tokenizer
for multi-lingual data
author: Barbara Plank
# Note: needs the third-party "regex" package, because the standard "re" module does not match Indic vowel markers with \\w;
# this is a known issue, cf. http://stackoverflow.com/questions/12746458/
"""
import re
import regex
import sys
import argparse
# Command-line interface: one positional input file plus an optional flag
# that switches the output to one token per line (CoNLL-like).
parser = argparse.ArgumentParser(
    description="simple tokenizer, inspired by Christopher Pott's twitter tokenizer; expects one sentence per line",
)
parser.add_argument("infile", help="file with one sentence per line")
parser.add_argument(
    "--conll",
    action="store_true",  # absent -> False, present -> True
    help="conll format rather than one sentence per line",
)
class TinyTokenizer(object):
    """
    Regex-based tokenizer.

    One alternation pattern is compiled per instance. Tokens are the
    successive non-overlapping matches of that pattern; at any position the
    alternatives are tried in the order they are listed:

    1. two digit runs joined by a single ``,`` ``/`` ``.`` ``:`` or ``-``,
       with an optional leading sign;
    2. two word-character runs joined by one or more dashes;
    3. a run of word characters;
    4. any single non-whitespace character.
    """

    def __init__(self):
        # Raw string: the pattern is full of regex escapes (\d, \w, \S, \-)
        # which are not valid *string* escapes; in a plain literal they
        # trigger a SyntaxWarning on recent Python versions. The pattern text
        # itself is unchanged.
        pattern = r'''
        (?:[+\-]?\d+[,/.:-]\d+)   # Numbers, including fractions, decimals.
        |
        (?:[\w]+[-]+[\w]+)        # don't split words with dashes
        |
        (?:[\w_]+)                # Words without dashes
        |
        (?:\S)                    # Everything else that isn't whitespace.
        '''
        # use regex for safe unicode handling
        self.word_re = regex.compile(
            pattern, regex.VERBOSE | regex.UNICODE | regex.I
        )

    def tokenize(self, line):
        """
        Split ``line`` into tokens.

        :param line: the text to tokenize (a single sentence/line)
        :return: list of token strings in order of appearance; whitespace
                 never ends up inside a token, so an empty or all-whitespace
                 line yields an empty list
        """
        line = regex.sub(r"\s+", " ", line)  # remove extra spaces
        return [match.group() for match in self.word_re.finditer(line)]
def main():
    """
    Tokenize every line of the input file and print the result to stdout.

    Reads the command line via the module-level ``parser``. Each input line
    is stripped and tokenized; the tokens are printed either space-joined on
    one line or, with ``--conll``, one token per line followed by an empty
    line.

    As a sanity check the input line and the tokenized output are compared
    with all whitespace removed. If they differ, nothing is written to stdout
    for that line and a diagnostic is written to stderr instead.
    """
    args = parser.parse_args()
    tt = TinyTokenizer()
    # `with` guarantees the input file is closed, even if an error occurs.
    # NOTE(review): the file is decoded with the locale's default encoding;
    # pass an explicit encoding here if inputs are always UTF-8.
    with open(args.infile) as infile:
        for line in infile:
            line = line.strip()
            out = tt.tokenize(line)
            outline = " ".join(out)

            # Tokenization must only insert/remove whitespace, never alter
            # any other character. This is an explicit comparison rather than
            # an `assert` inside a bare `except`, so that it still runs under
            # `python -O` and unrelated errors are not misreported as
            # tokenization problems.
            squeezed_in = regex.sub(r"\s", "", line)
            squeezed_out = regex.sub(r"\s", "", outline)
            if squeezed_in != squeezed_out:
                print("==== CHECK FILE! ====", args.infile, file=sys.stderr)
                print("+"*20, file=sys.stderr)
                print("in: >>{}<<".format(line), file=sys.stderr)
                print("out: >>{}<<".format(outline), file=sys.stderr)
                print(squeezed_in, file=sys.stderr)
                print(squeezed_out, file=sys.stderr)
                continue

            if args.conll:
                for w in out:
                    print(w)
                print()  # blank line terminates the sentence
            else:
                print(outline)
if __name__ == "__main__":
    # Script entry point. sys.exit() with a string writes the message to
    # stderr and exits with status 1, so the failure is visible to callers;
    # the bare exit() helper is provided by the `site` module for interactive
    # use and would have reported success (status 0).
    if sys.version_info < (3, 0):
        sys.exit("needs python 3")
    main()