Skip to content

Commit

Permalink
Conll 09 or u goes in, u goes out
Browse files Browse the repository at this point in the history
  • Loading branch information
fginter committed Apr 3, 2015
1 parent 9467f3d commit 927375e
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 2 deletions.
38 changes: 38 additions & 0 deletions conv_u_09.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import sys
import argparse

# Column indices of the 10-column CoNLL-U token line.
ID, FORM, LEMMA, UCPOS, UPOS, UFEAT, UHEAD, UDEPREL, UDEPS, UMISC = range(10)
# Column indices of the first 12 columns of a CoNLL-09 token line.
# ID, FORM and LEMMA are rebound to the same values (0, 1, 2) as above,
# so both formats can be indexed with the shared names.
ID, FORM, LEMMA, PLEMMA, POS, PPOS, FEAT, PFEAT, HEAD, PHEAD, DEPREL, PDEPREL = range(12)


def convert_line(line, output_format=None):
    """Convert one input line between CoNLL-U and CoNLL-09.

    The input format is inferred from the number of tab-separated columns:
    exactly 10 columns is treated as CoNLL-U, 12 to 14 columns as CoNLL-09.

    Args:
        line: One line of input; surrounding whitespace (including the
            trailing newline) is stripped before processing.
        output_format: "u", "09" or None. When it names the format the line
            is already in, the line is passed through unchanged. Any other
            value (including None) converts to the opposite format.

    Returns:
        The output line without a trailing newline. Blank lines yield an
        empty string and comment lines (starting with '#') are returned
        as they are.

    Raises:
        ValueError: If a token line has neither 10 nor 12-14 columns.
    """
    line = line.strip()
    if not line or line.startswith('#'):
        # Sentence separators and comments are format-independent.
        return line

    cols = line.split('\t')
    if len(cols) == 10:
        # CoNLL-U in
        if output_format == "u":
            return '\t'.join(cols)
        # CoNLL-U -> CoNLL-09: every gold/predicted column pair gets the
        # same value, and the two trailing columns are left empty.
        return '\t'.join((
            cols[ID], cols[FORM],
            cols[LEMMA], cols[LEMMA],
            cols[UCPOS], cols[UCPOS],
            cols[UFEAT], cols[UFEAT],
            cols[UHEAD], cols[UHEAD],
            cols[UDEPREL], cols[UDEPREL],
            '_', '_',
        ))

    # CoNLL-09 in. Raise explicitly rather than assert, so the check also
    # runs when Python is started with -O.
    if len(cols) not in (12, 13, 14):
        raise ValueError(
            'Expected 10 (conllu) or 12-14 (conll09) columns, got %d: %r'
            % (len(cols), cols)
        )
    if output_format == "09":
        return '\t'.join(cols)
    # CoNLL-09 -> CoNLL-U: the predicted (P*) columns are the ones kept.
    return '\t'.join((
        cols[ID], cols[FORM], cols[PLEMMA], cols[PPOS], '_',
        cols[PFEAT], cols[PHEAD], cols[PDEPREL], '_', '_',
    ))


def main(argv=None, instream=None, outstream=None):
    """Run the converter as a line filter.

    Args:
        argv: Command-line arguments without the program name; defaults to
            sys.argv[1:].
        instream: Iterable of input lines; defaults to sys.stdin.
        outstream: Object with a write() method; defaults to sys.stdout.
    """
    parser = argparse.ArgumentParser(description='Convert conllu to conll09 and back. Infers the direction on its own if no arguments given.')
    parser.add_argument('--output-format', default=None, help='Output format can be "u" or "09". If the input is in this format already, the conversion is a no-op and simply passes data through.')
    args = parser.parse_args(argv)

    if instream is None:
        instream = sys.stdin
    if outstream is None:
        outstream = sys.stdout

    # write() instead of the print statement keeps the script runnable
    # under both Python 2 and Python 3.
    for line in instream:
        outstream.write(convert_line(line, args.output_format) + '\n')


if __name__ == "__main__":
    main()


4 changes: 2 additions & 2 deletions parse_conll.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ fi

#2) remove comments, tag the input, and fill in lemmas
# MAX_SEN_LEN and SEN_CHUNK are defined in init.sh
cat | $PYTHON limit_sentence_len.py -N $MAX_SEN_LEN -C $SEN_CHUNK | $PYTHON store_comments.py -d $TMPDIR/comments_parser.json | ./tag.sh > $TMPDIR/input_tagged.conll09
# Normalise the input to CoNLL-09 first (a no-op if it already is), using the
# same $PYTHON interpreter as the other stages of this pipeline.
cat | $PYTHON conv_u_09.py --output-format=09 | $PYTHON limit_sentence_len.py -N $MAX_SEN_LEN -C $SEN_CHUNK | $PYTHON store_comments.py -d $TMPDIR/comments_parser.json | ./tag.sh > $TMPDIR/input_tagged.conll09

if [[ $? -ne 0 ]]
then
Expand All @@ -27,7 +27,7 @@ cat $TMPDIR/input_tagged.conll09 | ./parse.sh > $TMPDIR/input_parsed.conll09

#4) add comments

cat $TMPDIR/input_parsed.conll09 | $PYTHON add_comments.py -d $TMPDIR/comments_parser.json | $PYTHON limit_sentence_len.py --reverse
# Convert the CoNLL-09 parser output to CoNLL-U as the last stage, using the
# same $PYTHON interpreter as the other stages of this pipeline.
cat $TMPDIR/input_parsed.conll09 | $PYTHON add_comments.py -d $TMPDIR/comments_parser.json | $PYTHON limit_sentence_len.py --reverse | $PYTHON conv_u_09.py --output-format=u

#5) delete the temporary directory

Expand Down

0 comments on commit 927375e

Please sign in to comment.