-
Notifications
You must be signed in to change notification settings - Fork 1
/
run_feature_extraction.py
39 lines (29 loc) · 1.08 KB
/
run_feature_extraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#pylint: disable = C0330
'''Run feature extraction'''
import os
import sys
from functools import reduce
import qcrit.extract_features
from qcrit.textual_feature import setup_tokenizers
from download_corpus import download_corpus
from corpus_categories import composite_files, genre_to_files
def main():
    '''Download the corpus and run qcrit feature extraction over it.

    Command-line arguments (read from ``sys.argv``):
        sys.argv[1] (optional): forwarded to ``qcrit.extract_features.main``
            as ``output_file``; ``None`` is forwarded when it is absent.
        sys.argv[2] (optional): when equal to ``'-u'``,
            ``qcrit.features.universal_features`` is imported instead of
            ``qcrit.features.ancient_greek_features``.
    '''
    corpus_path = ('tesserae', 'texts', 'grc')
    download_corpus(corpus_path)

    # Terminal punctuation: 'FULL STOP', 'SEMICOLON', 'GREEK QUESTION MARK'.
    # The Greek question mark (U+037E) is spelled as an escape on purpose: it
    # looks identical to the ASCII semicolon (U+003B) and is canonically
    # equivalent to it, so editors or tools that normalize the file would
    # otherwise turn the tuple into ('.', ';', ';') without any visible change.
    setup_tokenizers(terminal_punctuation=('.', ';', '\u037e'))

    # Exactly one feature module is imported, chosen by the second argument.
    if len(sys.argv) > 2 and sys.argv[2] == '-u':
        import qcrit.features.universal_features #seemingly unused, but allows the recognition of features
    else:
        import qcrit.features.ancient_greek_features #seemingly unused, but allows the recognition of features

    # Feature extractions
    output_file = sys.argv[1] if len(sys.argv) > 1 else None
    qcrit.extract_features.main(
        os.path.join(*corpus_path),
        # NOTE(review): presumably maps a file extension to its parser - confirm against qcrit
        {'tess': qcrit.extract_features.parse_tess},
        # Composite files are excluded no matter what. No genre-based
        # filtering is applied here; only composite_files is passed.
        excluded_paths=composite_files,
        output_file=output_file,
    )
# Run the extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()