Making everything click

fginter committed Apr 3, 2015
1 parent 0f7b502 commit 4934a3c
Showing 6 changed files with 161 additions and 55 deletions.
4 changes: 4 additions & 0 deletions abspath.py
@@ -0,0 +1,4 @@
import os
import sys

print os.path.abspath(sys.argv[1])
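
abspath.py is a tiny helper that prints the absolute form of its single argument; tag.sh below uses it as TMPDIR_ABS=$($PYTHON abspath.py $TMPDIR) so that morpho-sd2ud/run.sh can be called from its own directory with absolute paths. For example (the printed path is illustrative):

python abspath.py work
/home/user/parser/work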
188 changes: 140 additions & 48 deletions marmot-tag.py
@@ -10,6 +10,38 @@
import traceback
import omorfi_pos as omor

def load_readings(m_readings):
words={} #{wordform -> set of (lemma,pos,feat)}
with codecs.open(m_readings,"r","utf-8") as f:
for line in f:
line=line.rstrip(u'\n')
if not line:
continue
form, lemma, pos, feat=line.split(u'\t')
words.setdefault(form,set()).add((lemma,pos,feat))
return words

def score(ppos,pfeat,pos,feat):
s=0
if ppos==pos:
s+=5
pfeat_set=set(pfeat.split(u"|"))
feat_set=set(feat.split(u"|"))
s+=len(pfeat_set & feat_set)
return s

def best_reading(plemma,ppos,pfeat,readings):
if not readings:
return plemma,ppos,pfeat

best=max(((lemma,pos,feat,(score(ppos,pfeat,pos,feat),-lemma.count(u"#"))) for lemma,pos,feat in readings),key=lambda k:k[3])
if options.hard:
return best[0],best[1],best[2]
elif options.hardpos and ppos in (x[1] for x in readings): ###Uncomment to improve your LAS by 2pp :)
return best[0],best[1],best[2]
else:
return best[0],ppos,pfeat

if __name__=="__main__":
log = logging.getLogger("omorfi")
from optparse import OptionParser
@@ -18,25 +50,49 @@
parser.add_option("--tempdir", dest="tempdir",action="store",default=".", help="Where temporary files should be kept. Default to current dir.")
parser.add_option("-m", "--model", dest="model",action="store", default=None,help="Fill PLEMMA/PPOS/PFEAT using this marmot model",metavar="MODELFILE")
parser.add_option("--marmot", dest="marmotbin",action="store", default=None,help="marmot .jar file")
parser.add_option("--mreadings",action="store", default=None,help="File with the morphological readings")
parser.add_option("--ud",action="store_true", default=False,help="UD")
parser.add_option("--hard",action="store_true", default=False,help="Use OMorFi hard constraint.")
parser.add_option("--hardpos",action="store_true", default=False,help="Use OMorFi hard constraint if pos matches.")
(options, args) = parser.parse_args()

if options.mreadings:
readings=load_readings(options.mreadings)
else:
readings=None

if options.train:
for line in sys.stdin:
line=unicode(line,"utf-8").strip()
if line.startswith(u"#"):
continue
if not line:
print
continue
cols=line.split(u"\t")
assert len(cols) in (13,14,15)
idx,token,pos,feat=int(cols[0]),cols[1],cols[4],cols[6]
if len(cols)==10: #UD
idx,token,pos,feat=int(cols[0]),cols[1],cols[3],cols[5]
else:
assert len(cols) in (13,14,15)
idx,token,pos,feat=int(cols[0]),cols[1],cols[4],cols[6]
tagList=[None for x in range(17)]
#tagList[0]=pos
if feat!=u"_":
for cat_tag in feat.split(u"|"):
cat,tag=cat_tag.split(u"_",1)
tagList[omor.cat2idx[cat]]=tag
s=omor.hun_taglist2tagstring(tagList)
pos_set=set(omor.hun_possiblepos(token))
if options.ud:
pos_set=set(x[1] for x in readings.get(token,[]))
s=feat
else:
if feat!=u"_":
for cat_tag in feat.split(u"|"):
if u"=" in cat_tag:
cat,tag=cat_tag.split(u"=",1)
else:
cat,tag=cat_tag.split(u"_",1)
if cat not in omor.cat2idx:
print >> sys.stderr, "Unknown cat:", cat
else:
tagList[omor.cat2idx[cat]]=tag
s=omor.hun_taglist2tagstring(tagList)
pos_set=set(omor.hun_possiblepos(token))
#pos_set.add(pos)
marmot_feats=u"#".join(u"POS_"+x for x in sorted(pos_set))
if not marmot_feats:
@@ -49,14 +105,19 @@
lines=[]
for line in sys.stdin:
line=unicode(line,"utf-8").strip()
if line.startswith(u"#"):
continue
lines.append(line)
cols=line.split(u"\t")
if len(cols)==1:
print >> f
continue
else:
assert len(cols) in (13,14,15)
pos_set=set(omor.hun_possiblepos(cols[1]))
else:
assert len(cols) in (10,13,14,15)
if options.ud:
pos_set=set(x[1] for x in readings.get(cols[1],[]))
else:
pos_set=set(omor.hun_possiblepos(cols[1]))
marmot_feats=u"#".join(u"POS_"+x for x in sorted(pos_set))
if not marmot_feats:
marmot_feats=u"_"
@@ -83,48 +144,79 @@
log.error("""Did not succeed in launching 'LIBS/%s'. The most common reason for this is that you forgot to run './install.sh'. \n\nGiving up, because the parser cannot run without a tagger."""%(" ".join(args)))
sys.exit(1)


while predictions[-1]==[u''] or not predictions[-1]:
predictions.pop(-1)
while lines[-1]==u'':
lines.pop(-1)

newSent=True
assert len(lines)==len(predictions), (len(lines),len(predictions))
for inLine,pred in zip(lines,predictions):
inCols=inLine.split(u"\t")
if len(inCols)==1:
assert inCols[0]==u""
assert pred==[u""]
print
newSent=True
continue #New sentence starts
assert inCols[1]==pred[1] #Tokens must match
txt=inCols[1]
if omor.is_punct(txt):
plemma,ppos,pfeat=txt,u"Punct",u"_"
elif omor.is_num(txt):
plemma,ppos,pfeat=txt,u"Num",u"_"
else:
tl=u"POS_"+pred[5]
if pred[7]!=u"_":
tl+=u"|"+pred[7]
plemma,ptaglist=omor.hun_tag2omorfi(pred[1],tl) #Find the most plausible reading
omor.fill_ortho(txt,ptaglist)
if txt==u"*null*":
ptaglist[omor.cat2idx[u"OTHER"]]=None
ppos=ptaglist[0]
#Guess proper nouns
# if ppos==u"N" and not newSent and ptaglist[omor.cat2idx[u"CASECHANGE"]]==u"Up" and ptaglist[omor.cat2idx[u"OTHER"]]==u"UNK":
# ptaglist[omor.cat2idx[u"SUBCAT"]]=u"Prop"
pfeat=[]
for cat,tag in zip(omor.cat_list[1:],ptaglist[1:]):
if tag!=None:
pfeat.append(cat+u"_"+tag)
if not pfeat:
pfeat=u"_"
if options.ud:

for inLine,pred in zip(lines,predictions):
inCols=inLine.split(u"\t")
if len(inCols)==1:
assert inCols[0]==u""
assert pred==[u""]
print
newSent=True
continue #New sentence starts
assert inCols[1]==pred[1] #Tokens must match
txt=inCols[1]
ppos=pred[5]
pfeat=pred[7]
plemma,ppos,pfeat=best_reading(txt,ppos,pfeat,readings.get(txt,[]))
if len(inCols)==10:
inCols[2],inCols[3],inCols[5]=plemma,ppos,pfeat
else:
inCols[3],inCols[5],inCols[7]=plemma,ppos,pfeat
print (u"\t".join(inCols)).encode("utf-8")
newSent=False

else:

for inLine,pred in zip(lines,predictions):
inCols=inLine.split(u"\t")
if len(inCols)==1:
assert inCols[0]==u""
assert pred==[u""]
print
newSent=True
continue #New sentence starts
assert inCols[1]==pred[1] #Tokens must match
txt=inCols[1]
# if omor.is_punct(txt):
# plemma,ppos,pfeat=txt,u"Punct",u"_"
# elif omor.is_num(txt):
# plemma,ppos,pfeat=txt,u"Num",u"_"
if True: #
tl=u"POS_"+pred[5]
if pred[7]!=u"_":
tl+=u"|"+pred[7]
plemma,ptaglist=omor.hun_tag2omorfi(pred[1],tl) #Find the most plausible reading
omor.fill_ortho(txt,ptaglist)
if txt==u"*null*":
ptaglist[omor.cat2idx[u"OTHER"]]=None
if options.hard:
ppos=ptaglist[0]
#Guess proper nouns
# if ppos==u"N" and not newSent and ptaglist[omor.cat2idx[u"CASECHANGE"]]==u"Up" and ptaglist[omor.cat2idx[u"OTHER"]]==u"UNK":
# ptaglist[omor.cat2idx[u"SUBCAT"]]=u"Prop"
pfeat=[]
for cat,tag in zip(omor.cat_list[1:],ptaglist[1:]):
if tag!=None:
pfeat.append(cat+u"="+tag)
if not pfeat:
pfeat=u"_"
else:
pfeat=u"|".join(pfeat)
else:
#soft
ppos=pred[5]
pfeat=re.sub(ur"((^|\|)[A-Z]+)_([a-zA-Z0-9])",ur"\1=\3",pred[7],flags=re.U)
if len(inCols)==10:
inCols[2],inCols[3],inCols[5]=plemma.replace(u"|",u"#"),ppos,pfeat
else:
pfeat=u"|".join(pfeat)
inCols[3],inCols[5],inCols[7]=plemma,ppos,pfeat
print (u"\t".join(inCols)).encode("utf-8")
newSent=False
inCols[3],inCols[5],inCols[7]=plemma.replace(u"|",u"#"),ppos,pfeat
print (u"\t".join(inCols)).encode("utf-8")
newSent=False
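
The new load_readings()/score()/best_reading() machinery added above reconciles MarMoT's predicted POS and features with the OMorFi readings available for each wordform: a reading scores +5 for a matching POS plus one point per shared feature value, and with --hard (or --hardpos when the predicted POS occurs among the readings) the best-scoring reading overrides the prediction; otherwise only its lemma is taken and MarMoT's POS and features are kept. A minimal self-contained sketch of that selection, with an invented wordform, readings and feature values:

# Sketch only: mirrors score()/best_reading() from this commit on toy data.
def score(ppos, pfeat, pos, feat):
    s = 0
    if ppos == pos:
        s += 5                                               # POS agreement
    s += len(set(pfeat.split(u"|")) & set(feat.split(u"|")))  # shared features
    return s

# Hypothetical OMorFi readings for one wordform: (lemma, pos, feat) tuples.
readings = [
    (u"istua", u"VERB", u"Mood=Ind|Tense=Past|VerbForm=Fin"),
    (u"istua", u"VERB", u"InfForm=2|VerbForm=Inf"),
    (u"istuin", u"NOUN", u"Case=Ade|Number=Sing"),
]
ppos, pfeat = u"VERB", u"Mood=Ind|Tense=Past|VerbForm=Fin"   # MarMoT's guess

best = max(((lemma, pos, feat, (score(ppos, pfeat, pos, feat), -lemma.count(u"#")))
            for lemma, pos, feat in readings), key=lambda k: k[3])
print(u"\t".join(best[:3]))  # istua  VERB  Mood=Ind|Tense=Past|VerbForm=Fin

Ties are broken by preferring lemmas with fewer compound boundaries (fewer "#" characters), exactly as in best_reading().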
Binary file modified model/parser.model
6 changes: 3 additions & 3 deletions morpho-sd2ud/run.sh
@@ -4,10 +4,10 @@ set -u
set -e
shopt -s failglob

INPUT=input.txt
OUTPUT=output.txt
INPUT=$2
OUTPUT=$3

WORKDIR=work
WORKDIR=$1

# clean up possible previous versions
rm -rf $OUTPUT
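
run.sh now takes its work directory, input file and output file as positional parameters ($1, $2, $3) instead of the hard-coded work, input.txt and output.txt. This lets tag.sh drive the SD-to-UD conversion of the readings with paths inside the parser's temp directory:

./run.sh $TMPDIR_ABS/morpho_conv_tmp $TMPDIR_ABS/all_readings.sd $TMPDIR_ABS/all_readings.ud

exactly as in the call added to tag.sh below.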
9 changes: 6 additions & 3 deletions omorfi_pos.py
@@ -171,7 +171,7 @@ def analyze_taglist(tags,retForm=RET_LIST):
return pos,tagString
elif retForm==RET_POS_FEAT_TDT:
pos=ct_dict[u"POS"]
tagString=u"|".join(cat+u"_"+ct_dict[cat] for cat in cat_list if cat!=u"POS" and cat in ct_dict)
tagString=u"|".join(cat+u"="+ct_dict[cat] for cat in cat_list if cat!=u"POS" and cat in ct_dict)
return pos,tagString

UNIQUE_NOLEMMA=0
@@ -468,13 +468,16 @@ def hun_tag2omorfi(token,tagString):
continue
readings=omorfi_lookup(token)
if not readings:
print (token+u"\t+?").encode(u"utf-8")
#print (token+u"\t+?").encode(u"utf-8")
continue
for r in readings:
if options.orig==False:
try:
lemma,taglist=analyze_reading(r)
pos,tags=analyze_taglist(taglist,retForm=RET_POS_FEAT_TDT)
print (token+u"\t"+lemma).encode(u"utf-8"),pos.encode(u"utf-8"),tags.encode(u"utf-8")
if not tags:
tags=u"_"
print (u"%s\t%s\t%s\t%s"%(token,lemma,pos,tags)).encode("utf-8")
except:
pass

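
Run as a filter over one wordform per line (as tag.sh now does), omorfi_pos.py emits one tab-separated line per OMorFi reading. After this change, features are written as CAT=value rather than CAT_value, tokens without readings are silently skipped instead of producing a "+?" line, and an empty feature string is normalised to "_". Illustrative output, with category names and values invented:

talossa   talo   N   NUM=Sg|CASE=Ine
ja        ja     C   _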
9 changes: 8 additions & 1 deletion tag.sh
@@ -5,7 +5,14 @@

source init.sh

$PYTHON marmot-tag.py --marmot $THIS/LIBS/marmot.jar --tempdir $TMPDIR -m model/fin_model.marmot > $TMPDIR/input_tagged_1.conll09
cat > $TMPDIR/tagger_input.conll09
cat $TMPDIR/tagger_input.conll09 | cut -f 2 | sort | uniq | python omorfi_pos.py > $TMPDIR/all_readings.sd
rm -rf $TMPDIR/morpho_conv_tmp
TMPDIR_ABS=$($PYTHON abspath.py $TMPDIR)
cd morpho-sd2ud
./run.sh $TMPDIR_ABS/morpho_conv_tmp $TMPDIR_ABS/all_readings.sd $TMPDIR_ABS/all_readings.ud
cd ..
cat $TMPDIR/tagger_input.conll09 | $PYTHON marmot-tag.py --marmot $THIS/LIBS/marmot.jar --tempdir $TMPDIR --ud --hardpos --mreadings $TMPDIR/all_readings.ud -m model/fin_model.marmot > $TMPDIR/input_tagged_1.conll09

if [[ $? -ne 0 ]]
then
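
tag.sh now captures its stdin in $TMPDIR/tagger_input.conll09 so the token column can be processed twice: once through omorfi_pos.py and morpho-sd2ud/run.sh to build $TMPDIR/all_readings.ud, and once through marmot-tag.py, which picks those readings up via --mreadings with the --ud and --hardpos constraints enabled. load_readings() in marmot-tag.py expects each line of the readings file to carry four tab-separated fields, form, lemma, POS and features, e.g. (values invented):

talossa   talo   NOUN   Case=Ine|Number=Sing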
