-
Notifications
You must be signed in to change notification settings - Fork 12
/
lxxmorph_utils.py
114 lines (101 loc) · 3.71 KB
/
lxxmorph_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
from unicodedata import normalize
from greekutils.beta2unicode import convert
# Book abbreviation -> numeric book id.  get_words() looks a verse line's
# book abbreviation up here and uses the number as the first (two-digit)
# component of each word's "ref" string.  The numbering is not contiguous
# and only the books listed here are supported.
book_to_num = {
    "Gen": 1,
    "Exod": 3,
    "Lev": 4,
    "Num": 5,
    "Deut": 6,
    "JoshB": 7,
    "JoshA": 8,
    "JudgB": 9,
    "JudgA": 10,
    "Ruth": 11,
    "1Sam/K": 12,
    "2Esdr": 19,
    "1Mac": 24,
    "Jonah": 42,
    "Nah": 44,
}
def convert_parse(parse):
    """Rewrite a raw parse code into the dotted form used downstream.

    The third character of ``parse`` selects the layout:

    * one of ``D``, ``I``, ``S``, ``O``, ``P``: a ``.`` is inserted after the
      first three characters (``"AAI3S"`` -> ``"AAI.3S"``);
    * ``N``: the code is kept as is (``"AAN"`` -> ``"AAN"``).

    Afterwards, if the second character is ``P`` and the first character is
    neither ``A`` nor ``F``, the second character is replaced by ``M``
    (``"PPI3S"`` -> ``"PMI.3S"``).
    NOTE(review): presumably the characters are tense / voice / mood and this
    folds passive into middle outside the aorist and future -- confirm.

    Args:
        parse: the raw parse string; must be at least three characters long.

    Returns:
        The converted parse string.

    Raises:
        ValueError: if the third character is not one of ``DISOPN``.
            (Previously this surfaced as an ``UnboundLocalError``.)
        IndexError: if ``parse`` has fewer than three characters.
    """
    selector = parse[2]
    if selector == "N":
        result = parse
    elif selector in "DISOP":
        result = parse[:3] + "." + parse[3:]
    else:
        raise ValueError(
            "unsupported parse code {!r}: third character must be one of "
            "'DISOPN'".format(parse))

    # Inserting the dot at index 3 leaves positions 0-2 untouched, so the
    # checks below see the same characters as in the raw code.
    if result[1] == "P" and result[0] not in "AF":
        result = result[0] + "M" + result[2:]
    return result
def _beta_to_unicode(field):
    """Convert one fixed-width text field to NFKC-normalised Unicode.

    The field is stripped, a single space is appended before it is handed to
    ``convert`` and the converted text is stripped again.
    NOTE(review): the appended space is presumably there so the converter can
    recognise the end of the word (e.g. for final sigma) -- confirm.
    """
    return normalize("NFKC", convert(field.strip() + " ").strip())


def get_words(filename):
    """Yield one dict per verb found in the morphology file ``filename``.

    The file is read as a sequence of verse groups: a reference line of the
    form ``<book> <chapter>:<verse>``, followed by one line per word, and
    terminated by a blank line.  Words are counted from 1 within a verse.  A
    verse number ending in one of the letters ``a``-``f`` has that letter
    dropped and does *not* reset the word counter, so its words continue the
    numbering of the preceding group.

    Only word lines whose character at column 25 is ``"V"`` are yielded.
    Each yielded dict has the keys:

    * ``line``: the stripped source line;
    * ``ref``: ``"BB.CCC.VVV.WWW"`` (book number from ``book_to_num``,
      chapter, verse, word index);
    * ``word``: columns 0-24, converted to Unicode;
    * ``type``: columns 25-27, stripped;
    * ``parse``: columns 29-34, stripped;
    * ``lemma``: columns 36-51, converted to Unicode;
    * ``preverb``: columns 53 onward, converted to Unicode.

    Raises:
        KeyError: if a verb occurs in a book missing from ``book_to_num``.
        ValueError: if a reference line is not ``<book> <chapter>:<verse>``.
        IndexError: if a word line is shorter than 26 characters.
    """
    expecting_ref = True
    # Start the counter at zero up front: a file whose very first verse
    # carries a letter suffix never reaches the reset below and would
    # otherwise fail on an unbound counter.
    word_index = 0
    with open(filename) as f:
        for raw_line in f:
            s = raw_line.strip()
            if expecting_ref:
                if not s:
                    # Tolerate extra blank lines between verse groups and at
                    # the end of the file instead of failing to unpack them.
                    continue
                book, chapter_verse = s.split()
                chapter, verse = chapter_verse.split(":")
                if verse[-1] in "abcdef":  # @@@
                    verse = verse[:-1]
                else:
                    word_index = 0
                expecting_ref = False
            elif s:
                word_index += 1
                if s[25] == "V":  # verb
                    yield {
                        "line": s,
                        "ref": "{:02d}.{:03d}.{:03d}.{:03d}".format(
                            book_to_num[book], int(chapter), int(verse),
                            word_index),
                        "word": _beta_to_unicode(s[:25]),
                        "type": s[25:28].strip(),
                        "parse": s[29:35].strip(),
                        "lemma": _beta_to_unicode(s[36:52]),
                        "preverb": _beta_to_unicode(s[53:]),
                    }
            else:
                # Blank line ends the verse group; expect a new reference.
                expecting_ref = True
# (lemma ending, stem endings) pairs: when ``part`` starts with "1" and the
# lemma has the given ending, a candidate stem with any of the listed
# endings is discarded.
_PART1_REJECTS = (
    ("έω", ("ο", "α", "η")),
    ("όω", ("ε", "α", "η", "{athematic}")),
    ("άω", ("η", "ε", "ο")),
    ("έομαι", ("ο",)),
    ("όομαι", ("ε",)),
    ("άομαι", ("η",)),
)


def _is_rejected_stem(stem, part, lemma):
    """Return True if ``stem`` matches one of the discard rules."""
    if stem.endswith(("0", "@")):  # rarely a real stem
        return True
    if (
        part[0] == "3"
        and lemma.endswith(("άω", "έω", "όω", "εύω"))
        and stem.endswith(("{root}", "{athematic}", "{2nd}"))
    ):
        return True
    if lemma.endswith(("ω", "ομαι")) and stem.endswith("{athematic}"):
        return True
    if part[0] == "1":
        for lemma_ending, stem_endings in _PART1_REJECTS:
            if lemma.endswith(lemma_ending) and stem.endswith(stem_endings):
                return True
    return False


def trim_multiples(stem_set, part, lemma, parts):
    """Reduce several candidate stems to an annotated summary string.

    Candidates matching a discard rule (see ``_is_rejected_stem``) are
    dropped.  Independently of that, when ``part`` is exactly ``"3-"`` and
    there are exactly two candidates, two pairings short-circuit the result:

    * ``X`` together with ``X{2nd}`` gives ``"X # @1 2nd?"``;
    * a second (sorted) candidate ending in ``ι{2nd}`` that equals the first
      one minus its last five characters plus ``ι{2nd}`` gives the first
      candidate followed by ``" # @1"``.

    Otherwise the result depends on how many candidates survived:

    * exactly one: ``"<stem> # @1"``;
    * none: the original ``stem_set`` formatted followed by ``" # @mm"``;
    * several: the surviving set formatted followed by ``" # @m"``.

    Args:
        stem_set: collection of candidate stem strings.
        part: key string; its first character and the whole value are
            inspected.  NOTE(review): presumably a principal-part key --
            confirm against callers.
        lemma: the lemma the stems belong to.
        parts: unused here; kept for interface compatibility.

    Returns:
        The annotated string described above.
    """
    survivors = {
        candidate
        for candidate in stem_set
        if not _is_rejected_stem(candidate, part, lemma)
    }

    if part == "3-" and len(stem_set) == 2:
        first, second = sorted(stem_set)
        if second == first + "{2nd}":
            return "{} # @1 2nd?".format(first)
        if second.endswith("ι{2nd}") and second == first[:-5] + "ι{2nd}":
            return "{} # @1".format(first)

    if len(survivors) == 1:
        return "{} # @1".format(survivors.pop())
    if not survivors:
        return "{} # @mm".format(stem_set)
    return "{} # @m".format(survivors)