# pre_tokenizer.py

import argparse
import logging
import ntpath
import os
from typing import Optional, Set

from tqdm import tqdm


class PreTokenizer:
    """Pre-splits Hebrew text by detaching the prefixes listed in the rule lexicon."""

    _NOT_TO_SPLIT: Set[str]
    rule_file = 'bgupreflex_withdef.utf8'
    rule_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'rules', rule_file))

    def __init__(self, input_f, output_f, use_unichar=True, separator='', improved_mode=True):
        logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s",
                            handlers=[logging.FileHandler("PreTokenizer.log"), logging.StreamHandler()])
        self.logger = logging.getLogger("PreTokenizer")
        self.act = self.pre_tok_improved if improved_mode else self.pre_tok_classic
        # A closed set of common words that start with a prefix letter but must never be split.
        self._NOT_TO_SPLIT = {'של', 'שלכם', 'שלנו', 'שלהם', 'שלך', 'שלי', 'מי', 'מה'}
        self.rules = self.get_rules()
        if not use_unichar:
            # Keep only multi-character prefixes.
            self.rules = [r for r in self.rules if len(r[0]) > 1]
        self.separator = separator.strip('"\'')
        if separator.isdigit() or separator.isspace() or separator.isalpha():
            raise ValueError(f"The separator is {separator} - it cannot be a number, whitespace, or letters only.")
        # Map each prefix to its first listed split pattern.
        self.rule_d = {r[0]: r[1] for r in self.rules}
        self.prefix_rules = {r[0][0] for r in self.rules}  # the first letter of every known prefix
        self.logger.info(f"Input file: {input_f}")
        self.logger.info(f"Output file: {output_f}")
        if self.separator == "":
            self.logger.info("Not using a separator")
        else:
            self.logger.info(f"Separator used: {self.separator}")
        self.logger.info(f"Using unichar for separation: {use_unichar}")
        self.logger.info(f"Running in improved mode: {improved_mode}")

    @staticmethod
    def line2rule(line: str) -> Optional[tuple]:
        """Parse one lexicon line into a (prefix, split_pattern, ...) tuple, or None."""
        if len(line) < 2:
            return None
        lsplited = line.split()
        chars = lsplited.pop(0)
        # Candidate analyses: tokens that carry a '^' split marker (or that equal the prefix itself).
        chars_splits = list(set(w for w in lsplited if '^' in w or w == chars))
        res = [chars]
        for cs in chars_splits:
            css = [c for c in cs if c != "^"]
            if all(c in chars for c in css):
                res.append(cs)
        if len(res) < 2:
            return None  # no usable split pattern for this prefix
        return tuple(res)

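    # A sketch of the parse above on a hypothetical lexicon line (the real
    # bgupreflex_withdef.utf8 entries presumably also carry definitions, which
    # this parser ignores):
    #
    #     line2rule('וכש ו^כש')  ->  ('וכש', 'ו^כש')
    #
    # The first token is the raw prefix; any later token containing '^' whose
    # letters all occur in the prefix is kept as a split pattern.
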
    def get_rules(self, path=rule_path):
        rules = []
        with open(path, mode="r", encoding='utf-8') as f:
            for line in f:
                rule = self.line2rule(line)
                if rule is not None:
                    rules.append(rule)
        # Sort from the longest prefix to the shortest so the first match is always the longest.
        rules = sorted(rules, key=lambda x: len(x[0]), reverse=True)
        return rules

    def pre_tok_improved(self, text: str) -> str:
        res = ''
        for t in text.split():
            if any(t.startswith(c) for c in self.prefix_rules) and t not in self._NOT_TO_SPLIT:
                lp = self.get_longest_prefix(t)
                # Split only when at least two characters would remain after the prefix.
                if lp is None or len(t) < len(lp) + 2:
                    res += f" {t}"
                    continue
                rule = self.rule_d[lp]
                res += self.break_word(t, rule)
            else:
                res += f" {t}"
        return res[1:]  # drop the leading redundant space

    def pre_tok_classic(self, text: str) -> str:
        res = ''
        for t in text.split():
            if any(t.startswith(c) for c in self.prefix_rules):
                lp = self.get_longest_prefix(t)
                if lp is None:
                    res += f" {t}"
                    continue
                rule = self.rule_d[lp]
                res += self.break_word(t, rule)
            else:
                res += f" {t}"
        return res[1:]  # drop the leading redundant space

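    # The classic variant above differs from the improved one in two ways: it
    # has no _NOT_TO_SPLIT exemption list, and it does not require that two
    # characters remain after the prefix, so short function words may be split too.
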
    def get_longest_prefix(self, t):
        for r in self.rules:
            if t.startswith(r[0]):  # rules are sorted from the longest to the shortest
                return r[0]
        return None

    def break_word(self, word, rule):
        # Split the pattern on '^', re-join the sub-prefixes with the separator,
        # and append whatever follows the prefix as the suffix.
        sub_t = rule.split('^')
        suffix = word.split("".join(sub_t), 1)[1]
        return " " + f"{self.separator} ".join(sub_t) + f"{self.separator} " + suffix

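    # A minimal sketch of what break_word produces, assuming the hypothetical
    # pattern 'ו^כש' and separator '$$' (neither is taken from the actual rule file):
    #
    #     break_word('וכשהלך', 'ו^כש')  ->  ' ו$$ כש$$ הלך'
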
    def split_file(self, path, out_path=None):
        if out_path is None:
            # Default to [InputFile].splitted next to the input file.
            name = ntpath.basename(path) + ".splitted"
            out_path = os.path.abspath(os.path.join(os.path.dirname(path), name))
        # Stream line by line instead of buffering the whole output in memory.
        with open(path, mode="r", encoding='utf-8') as f, \
                open(out_path, mode="w", encoding='utf-8') as fo:
            for line in tqdm(f):
                fo.write(self.act(line) + "\n")

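
# A minimal usage sketch (file names are hypothetical):
#
#     pt = PreTokenizer('corpus.txt', 'corpus.txt.splitted', separator='$$')
#     pt.split_file('corpus.txt', 'corpus.txt.splitted')
#
# Each recognised prefix is detached and followed by the separator, so a word
# with a known prefix comes out as 'prefix$$ remainder'.
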
if __name__ == '__main__':
    def str2bool(v):
        if isinstance(v, bool):
            return v
        if v.lower() in ('yes', 'true', 't', 'y', '1'):
            return True
        elif v.lower() in ('no', 'false', 'f', 'n', '0'):
            return False
        else:
            raise argparse.ArgumentTypeError('Boolean value expected.')

    parser = argparse.ArgumentParser(description='Pre-split a Hebrew text.')
    parser.add_argument('input', type=str, help='The path to the input file.')
    parser.add_argument('output', type=str, nargs='?', default=None,
                        help='The path to the output file; if not supplied, the output defaults to '
                             '[InputFile].splitted')
    parser.add_argument('-unichar', type=str2bool, default=True,
                        help="If False, do not use single-character prefixes to break words. The default is True, "
                             "i.e. break also on single characters such as He (the 5th letter of the alphabet), "
                             "Bet (the 2nd letter), etc.")
    parser.add_argument('-improved', type=str2bool, default=True,
                        help="If True, do not split a closed set of common words, and split a word only when at "
                             "least two characters remain after the prefix. The default is True.")
    parser.add_argument('-separator', type=str, default='',
                        help="A sign placed after every detached prefix; for example, -separator $$ "
                             "turns a [to-ken] into 'to$$ ken'. The default is ''.")
    args = parser.parse_args()
    pt = PreTokenizer(args.input, args.output, args.unichar, args.separator, args.improved)
    pt.split_file(args.input, args.output)
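
# Example invocation (file names are hypothetical; quote the separator so the
# shell does not expand '$$' to the process id):
#
#     python pre_tokenizer.py corpus.txt corpus.txt.splitted -separator '$$'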