-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtasks.py
150 lines (123 loc) · 4.63 KB
/
tasks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import sys
import os
import subprocess
from invoke import task
# Name of the run file written for each (language, retrieval model) pair.
OUTPUT_FILENAME_TEMPLATE = "{language}-{model}.txt"

# Retrieval model configurations used by the `search` task.  Each tuple
# lists the option names that are passed together (every name prefixed
# with `-`) to the search command for a single run.
SEARCH_MODELS = [
    ("bm25",),
    ("bm25.accurate",),
    ("bm25prf", "bm25"),
    ("qld",),
    ("rm3", "bm25"),
    ("rm3", "qld"),
    ("sdm", "bm25"),
    ("sdm", "qld"),
]
def check_language_arg(language):
    """
    Validates [language]: returns silently when it is `ja` or `en`,
    otherwise prints an error message and terminates the process
    with exit status 1.
    """
    if language in ('ja', 'en'):
        return
    print('Error: language option must be either ja or en.')
    sys.exit(1)
@task
def submodule(c):
    """
    Fetches anserini as a git submodule by running
    `git submodule update -i` through the context [c].
    """
    fetch_cmd = "git submodule update -i"
    c.run(fetch_cmd, pty=True)
@task(submodule)
def build(c, recompile=False):
    """
    Builds anserini with Maven (after the `submodule` pre-task) so that
    its executable files are available
    at `anserini/target/appassembler/bin`.

    The build is skipped when the anserini jar file already exists,
    unless [recompile] is set.
    """
    print("Trying to build anserini ...")

    # NOTE: the jar name is tied to the anserini version of the submodule.
    jar_exists = os.path.exists("anserini/target/anserini-0.6.1-SNAPSHOT.jar")

    if recompile or not jar_exists:
        outcome = c.run(
            "cd anserini && mvn clean package appassembler:assemble",
            pty=True,
        )
        if outcome.ok:
            print()
            print("Compiled anserini successfully.\n")
        return

    # An earlier build is present and no re-compilation was requested.
    print()
    print("Already compiled anserini. Stopped building it again.\n"
          "Set --recompile option for enforcing the re-compilation.\n")
@task(build)
def preprocess(c, language, input_filepath, output_dirpath):
    """
    Converts the collection file [input_filepath],
    which is assumed to be written in [language] (`ja` or `en`),
    into multiple JSONL files stored in [output_dirpath].
    """
    check_language_arg(language)

    banner = ("Start preprocessing {} collection '{}'\n"
              "and saving the results into {} ...\n")
    print(banner.format(language, input_filepath, output_dirpath))

    # Only the preprocessor module of the requested language is imported.
    if language == 'ja':
        from baselines.preprocessor_ja import Preprocessor
    else:
        from baselines.preprocessor_en import Preprocessor

    Preprocessor(output_dirpath).run(input_filepath)
@task(build)
def index(c, language, input_dirpath, output_dirpath, threads=4):
    """
    Indexes the JSONL files found at [input_dirpath],
    which are assumed to be written in [language] (`ja` or `en`),
    and stores the index at [output_dirpath]
    using [threads] indexing threads.
    """
    check_language_arg(language)

    template = (
        "anserini/target/appassembler/bin/IndexCollection "
        "-collection JsonCollection -generator LuceneDocumentGenerator "
        "-storePositions -storeDocvectors -storeRawDocs "
        "-language {} -input {} -index {} -threads {}"
    )
    outcome = c.run(
        template.format(language, input_dirpath, output_dirpath, threads),
        pty=True,
    )
    if not outcome.ok:
        return

    print()
    print("Created the index successfully.\n")
@task(build)
def search(c, language, index_dirpath, topic_filepath, output_dirpath):
    """
    Retrieves documents from [index_dirpath] of [language] (`ja` or `en`)
    for queries in [topic_filepath],
    and outputs the results into [output_dirpath].

    One run is executed for every entry of `SEARCH_MODELS`, and each run is
    written to a file named after `OUTPUT_FILENAME_TEMPLATE`.
    [output_dirpath] is created (including missing parent directories)
    when it does not exist yet.
    """
    check_language_arg(language)

    # makedirs(exist_ok=True) also creates missing parent directories and
    # avoids the race between checking for the directory and creating it.
    os.makedirs(output_dirpath, exist_ok=True)

    # Options shared by all runs; note the trailing space before the
    # model-specific options appended below.
    base_cmd = (
        "anserini/target/appassembler/bin/SearchCollection "
        "-topicreader TsvString "
        "-language {} -index {} -topics {} "
    ).format(language, index_dirpath, topic_filepath)

    for model in SEARCH_MODELS:
        # e.g. ("rm3", "bm25") -> file name part "rm3+bm25"
        model_name = "+".join(model)
        output_filename = OUTPUT_FILENAME_TEMPLATE.format(language=language,
                                                          model=model_name)
        output_filepath = os.path.join(output_dirpath, output_filename)

        # e.g. ("rm3", "bm25") -> command options "-rm3 -bm25"
        cmd = base_cmd + "-{} -output {}".format(" -".join(model),
                                                 output_filepath)
        result = c.run(cmd, pty=True)
        if result.ok:
            print()
            print("Retrieved datasets by {} and saved them into {}.\n"
                  .format(model_name, output_filepath))
@task
def ntcirify(c, input_filepath, output_filepath, sysdesc="official baseline"):
    """
    Reads a TREC format run file and transforms it to an NTCIR format run file.

    TREC:  [TOPIC_ID] Q0 [DATASET_ID] [RANK] [SCORE] [RUN_NAME]
    NTCIR: [TOPIC_ID] 0 [DATASET_ID] [RANK] [SCORE] [RUN_NAME]

    The output file [output_filepath] starts with a
    `<SYSDESC>[sysdesc]</SYSDESC>` line, followed by the lines of
    [input_filepath]. Lines whose second space-separated field is not `Q0`
    (including blank lines) are copied unchanged.
    """
    with open(input_filepath) as fr, open(output_filepath, "w") as fw:
        fw.write('<SYSDESC>{}</SYSDESC>\n'.format(sysdesc))
        for line in fr:
            # Only the second field is inspected, so split at the first two
            # separators and keep the remainder of the line verbatim.
            fields = line.split(" ", 2)
            # The length guard keeps blank or single-field lines from
            # raising IndexError; such lines are written as they are.
            if len(fields) > 1 and fields[1] == 'Q0':
                fields[1] = '0'
                line = ' '.join(fields)
            fw.write(line)

    print()
    print("Transformed a TREC file '{}' to an NTCIR file '{}'.\n"
          .format(input_filepath, output_filepath))