Skip to content

Commit

Permalink
add CLI
Browse files Browse the repository at this point in the history
  • Loading branch information
shovalsa committed Dec 28, 2019
1 parent 849b7e1 commit 2871aef
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 21 deletions.
29 changes: 17 additions & 12 deletions ud_to_spmrl.json
Original file line number Diff line number Diff line change
Expand Up @@ -74,18 +74,23 @@
"PUNCT": {
"default": "yyUNK",
"exceptions": [
[["FORM", ":"], "yyCLN"],
[["FORM", ","], "yyCM"],
[["FORM", "-"], "yyDASH"],
[["FORM", "."], "yyDOT"],
[["FORM", "("], "yyLRB"],
[["FORM", ")"], "yyRRB"],
[["FORM", ";"], "yySCLN"],
[["FORM", "\""], "yyQUOT"],
[["FORM", "!"], "yyEXCL"],
[["FORM", "?"], "yyQM"],
[["FORM", "..."], "yyELPS"]

[["FORM", ":"], "yyCLN"],
[["FORM", ","], "yyCM"],
[["FORM", "-"], "yyDASH"],
[["FORM", "."], "yyDOT"],
[["FORM", "("], "yyLRB"],
[["FORM", ")"], "yyRRB"],
[["FORM", ";"], "yySCLN"],
[["FORM", "\""], "yyQUOT"],
[["FORM", "'"], "yyQUOT"],
[["FORM", "!"], "yyEXCL"],
[["FORM", "?"], "yyQM"],
[["FORM", "..."], "yyELPS"],
[["FORM", "\\"], "yySLASH"],
[["FORM", "/"], "yySLASH"],
[["FORM", ""], "yyDASH"],
[["FORM", "-"], "yyDASH"],
[["FORM", "*"], "yySTAR"]
]},
"SYM": {
"default": "yySYM",
Expand Down
25 changes: 16 additions & 9 deletions ud_to_spmrl.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,16 @@
import re
import numpy as np
from tqdm import tqdm
import argparse

def add_token_numbers_to_file(filepath):

def add_token_numbers_to_file(filepath, tokenized_filepath):
"""based on yochai's code"""
# original format + original_token column
columns = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'TOKEN_NUMBER']

current_token = 0
num_of_broken_tokens = 0
tokenized_filepath = filepath.replace('.conllu', '_tokenized.conllu')
with open(filepath, 'r') as conll_file, open(tokenized_filepath, 'wt') as tokenized_file:
# create similar tsv format with the new columns
tokenized_file.write('\t'.join(columns) + '\n')
Expand Down Expand Up @@ -93,7 +94,8 @@ def convert_features(row, features):
try:
value = row_feature.split("=")[1]
except IndexError:
raise Exception([row["ID"], row["FEATS"], row["XPOS"]])
pass
# raise Exception([row["ID"], row["FEATS"], row["XPOS"]])
if key in features:
new_name = features[key]["spmrl_name"]
if value in features[key]["single_values"]:
Expand Down Expand Up @@ -157,16 +159,21 @@ def main(df, conversion_json, include_upos=False):


if __name__ == "__main__":
conll_path = "./data/new_datasets/academia_sep_1.conllu"
conversion_json_filepath = "./ud_to_spmrl.json"
conllu_for_yap = conll_path.replace(".conllu", "_converted.conllu")
parser = argparse.ArgumentParser(description="conversion from UD annotation to SPMRL (YAP readable format).")

parser.add_argument("--ud_filepath", help="Obligatory - path to file with manually tagged dataset (in UD)")
parser.add_argument("--conversion_rules", default="./ud_to_spmrl.json", help="path to json file with conversion rules.")

tokenized_filepath = add_token_numbers_to_file(conll_path)
# tokenized = create_tokenized(conll_path)
argv = parser.parse_args()

# conll_path = "./data/new_datasets/academia_sep_1.conllu"
# conversion_rules = "./ud_to_spmrl.json"
tokenized_filepath = argv.ud_filepath.replace('.conllu', '_tokenized.conllu')
add_token_numbers_to_file(argv.ud_filepath, tokenized_filepath)
conllu = pd.read_csv(tokenized_filepath, sep="\t", comment="#", skip_blank_lines=False)

conllu = main(conllu, conversion_json_filepath)
conllu = main(conllu, argv.conversion_rules)
conllu_for_yap = argv.ud_filepath.replace(".conllu", "_converted.conllu")
output_to_file(conllu, conllu_for_yap)
# conllu.to_csv(conllu_for_yap, sep="\t", index=False,
# header=False, quoting=csv.QUOTE_NONE, escapechar="\\")

0 comments on commit 2871aef

Please sign in to comment.