Making everything click

fginter committed Apr 3, 2015
1 parent 0f7b502 commit 4934a3c
Showing 6 changed files with 161 additions and 55 deletions.
4 changes: 4 additions & 0 deletions abspath.py
@@ -0,0 +1,4 @@
import os
import sys

print os.path.abspath(sys.argv[1])
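
abspath.py is a tiny helper that prints the absolute form of its single argument; tag.sh below uses it as TMPDIR_ABS=$($PYTHON abspath.py $TMPDIR) so that morpho-sd2ud/run.sh can be called from its own directory with absolute paths. For example (the printed path is illustrative):

python abspath.py work
/home/user/parser/work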
188 changes: 140 additions & 48 deletions marmot-tag.py
@@ -10,6 +10,38 @@
import traceback
import omorfi_pos as omor

def load_readings(m_readings):
words={} #{wordform -> set of (lemma,pos,feat)}
with codecs.open(m_readings,"r","utf-8") as f:
for line in f:
line=line.rstrip(u'\n')
if not line:
continue
form, lemma, pos, feat=line.split(u'\t')
words.setdefault(form,set()).add((lemma,pos,feat))
return words

def score(ppos,pfeat,pos,feat):
s=0
if ppos==pos:
s+=5
pfeat_set=set(pfeat.split(u"|"))
feat_set=set(feat.split(u"|"))
s+=len(pfeat_set & feat_set)
return s

def best_reading(plemma,ppos,pfeat,readings):
if not readings:
return plemma,ppos,pfeat

best=max(((lemma,pos,feat,(score(ppos,pfeat,pos,feat),-lemma.count(u"#"))) for lemma,pos,feat in readings),key=lambda k:k[3])
if options.hard:
return best[0],best[1],best[2]
elif options.hardpos and ppos in (x[1] for x in readings): ###Uncomment to improve your LAS by 2pp :)
return best[0],best[1],best[2]
else:
return best[0],ppos,pfeat

if __name__=="__main__":
log = logging.getLogger("omorfi")
from optparse import OptionParser
@@ -18,25 +50,49 @@
parser.add_option("--tempdir", dest="tempdir",action="store",default=".", help="Where temporary files should be kept. Default to current dir.")
parser.add_option("-m", "--model", dest="model",action="store", default=None,help="Fill PLEMMA/PPOS/PFEAT using this marmot model",metavar="MODELFILE")
parser.add_option("--marmot", dest="marmotbin",action="store", default=None,help="marmot .jar file")
parser.add_option("--mreadings",action="store", default=None,help="File with the morphological readings")
parser.add_option("--ud",action="store_true", default=False,help="UD")
parser.add_option("--hard",action="store_true", default=False,help="Use OMorFi hard constraint.")
parser.add_option("--hardpos",action="store_true", default=False,help="Use OMorFi hard constraint if pos matches.")
(options, args) = parser.parse_args()

if options.mreadings:
readings=load_readings(options.mreadings)
else:
readings=None

if options.train:
for line in sys.stdin:
line=unicode(line,"utf-8").strip()
if line.startswith(u"#"):
continue
if not line:
print
continue
cols=line.split(u"\t")
assert len(cols) in (13,14,15)
idx,token,pos,feat=int(cols[0]),cols[1],cols[4],cols[6]
if len(cols)==10: #UD
idx,token,pos,feat=int(cols[0]),cols[1],cols[3],cols[5]
else:
assert len(cols) in (13,14,15)
idx,token,pos,feat=int(cols[0]),cols[1],cols[4],cols[6]
tagList=[None for x in range(17)]
#tagList[0]=pos
if feat!=u"_":
for cat_tag in feat.split(u"|"):
cat,tag=cat_tag.split(u"_",1)
tagList[omor.cat2idx[cat]]=tag
s=omor.hun_taglist2tagstring(tagList)
pos_set=set(omor.hun_possiblepos(token))
if options.ud:
pos_set=set(x[1] for x in readings.get(token,[]))
s=feat
else:
if feat!=u"_":
for cat_tag in feat.split(u"|"):
if u"=" in cat_tag:
cat,tag=cat_tag.split(u"=",1)
else:
cat,tag=cat_tag.split(u"_",1)
if cat not in omor.cat2idx:
print >> sys.stderr, "Unknown cat:", cat
else:
tagList[omor.cat2idx[cat]]=tag
s=omor.hun_taglist2tagstring(tagList)
pos_set=set(omor.hun_possiblepos(token))
#pos_set.add(pos)
marmot_feats=u"#".join(u"POS_"+x for x in sorted(pos_set))
if not marmot_feats:
@@ -49,14 +105,19 @@
lines=[]
for line in sys.stdin:
line=unicode(line,"utf-8").strip()
if line.startswith(u"#"):
continue
lines.append(line)
cols=line.split(u"\t")
if len(cols)==1:
print >> f
continue
else:
assert len(cols) in (13,14,15)
pos_set=set(omor.hun_possiblepos(cols[1]))
else:
assert len(cols) in (10,13,14,15)
if options.ud:
pos_set=set(x[1] for x in readings.get(cols[1],[]))
else:
pos_set=set(omor.hun_possiblepos(cols[1]))
marmot_feats=u"#".join(u"POS_"+x for x in sorted(pos_set))
if not marmot_feats:
marmot_feats=u"_"
@@ -83,48 +144,79 @@
log.error("""Did not succeed in launching 'LIBS/%s'. The most common reason for this is that you forgot to run './install.sh'. \n\nGiving up, because the parser cannot run without a tagger."""%(" ".join(args)))
sys.exit(1)


while predictions[-1]==[u''] or not predictions[-1]:
predictions.pop(-1)
while lines[-1]==u'':
lines.pop(-1)

newSent=True
assert len(lines)==len(predictions), (len(lines),len(predictions))
for inLine,pred in zip(lines,predictions):
inCols=inLine.split(u"\t")
if len(inCols)==1:
assert inCols[0]==u""
assert pred==[u""]
print
newSent=True
continue #New sentence starts
assert inCols[1]==pred[1] #Tokens must match
txt=inCols[1]
if omor.is_punct(txt):
plemma,ppos,pfeat=txt,u"Punct",u"_"
elif omor.is_num(txt):
plemma,ppos,pfeat=txt,u"Num",u"_"
else:
tl=u"POS_"+pred[5]
if pred[7]!=u"_":
tl+=u"|"+pred[7]
plemma,ptaglist=omor.hun_tag2omorfi(pred[1],tl) #Find the most plausible reading
omor.fill_ortho(txt,ptaglist)
if txt==u"*null*":
ptaglist[omor.cat2idx[u"OTHER"]]=None
ppos=ptaglist[0]
#Guess proper nouns
# if ppos==u"N" and not newSent and ptaglist[omor.cat2idx[u"CASECHANGE"]]==u"Up" and ptaglist[omor.cat2idx[u"OTHER"]]==u"UNK":
# ptaglist[omor.cat2idx[u"SUBCAT"]]=u"Prop"
pfeat=[]
for cat,tag in zip(omor.cat_list[1:],ptaglist[1:]):
if tag!=None:
pfeat.append(cat+u"_"+tag)
if not pfeat:
pfeat=u"_"
if options.ud:

for inLine,pred in zip(lines,predictions):
inCols=inLine.split(u"\t")
if len(inCols)==1:
assert inCols[0]==u""
assert pred==[u""]
print
newSent=True
continue #New sentence starts
assert inCols[1]==pred[1] #Tokens must match
txt=inCols[1]
ppos=pred[5]
pfeat=pred[7]
plemma,ppos,pfeat=best_reading(txt,ppos,pfeat,readings.get(txt,[]))
if len(inCols)==10:
inCols[2],inCols[3],inCols[5]=plemma,ppos,pfeat
else:
inCols[3],inCols[5],inCols[7]=plemma,ppos,pfeat
print (u"\t".join(inCols)).encode("utf-8")
newSent=False

else:

for inLine,pred in zip(lines,predictions):
inCols=inLine.split(u"\t")
if len(inCols)==1:
assert inCols[0]==u""
assert pred==[u""]
print
newSent=True
continue #New sentence starts
assert inCols[1]==pred[1] #Tokens must match
txt=inCols[1]
# if omor.is_punct(txt):
# plemma,ppos,pfeat=txt,u"Punct",u"_"
# elif omor.is_num(txt):
# plemma,ppos,pfeat=txt,u"Num",u"_"
if True: #
tl=u"POS_"+pred[5]
if pred[7]!=u"_":
tl+=u"|"+pred[7]
plemma,ptaglist=omor.hun_tag2omorfi(pred[1],tl) #Find the most plausible reading
omor.fill_ortho(txt,ptaglist)
if txt==u"*null*":
ptaglist[omor.cat2idx[u"OTHER"]]=None
if options.hard:
ppos=ptaglist[0]
#Guess proper nouns
# if ppos==u"N" and not newSent and ptaglist[omor.cat2idx[u"CASECHANGE"]]==u"Up" and ptaglist[omor.cat2idx[u"OTHER"]]==u"UNK":
# ptaglist[omor.cat2idx[u"SUBCAT"]]=u"Prop"
pfeat=[]
for cat,tag in zip(omor.cat_list[1:],ptaglist[1:]):
if tag!=None:
pfeat.append(cat+u"="+tag)
if not pfeat:
pfeat=u"_"
else:
pfeat=u"|".join(pfeat)
else:
#soft
ppos=pred[5]
pfeat=re.sub(ur"((^|\|)[A-Z]+)_([a-zA-Z0-9])",ur"\1=\3",pred[7],flags=re.U)
if len(inCols)==10:
inCols[2],inCols[3],inCols[5]=plemma.replace(u"|",u"#"),ppos,pfeat
else:
pfeat=u"|".join(pfeat)
inCols[3],inCols[5],inCols[7]=plemma,ppos,pfeat
print (u"\t".join(inCols)).encode("utf-8")
newSent=False
inCols[3],inCols[5],inCols[7]=plemma.replace(u"|",u"#"),ppos,pfeat
print (u"\t".join(inCols)).encode("utf-8")
newSent=False
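
The new load_readings()/score()/best_reading() machinery added above reconciles MarMoT's predicted POS and features with the OMorFi readings available for each wordform: a reading scores +5 for a matching POS plus one point per shared feature value, and with --hard (or --hardpos when the predicted POS occurs among the readings) the best-scoring reading overrides the prediction; otherwise only its lemma is taken and MarMoT's POS and features are kept. A minimal self-contained sketch of that selection, with an invented wordform, readings and feature values:

# Sketch only: mirrors score()/best_reading() from this commit on toy data.
def score(ppos, pfeat, pos, feat):
    s = 0
    if ppos == pos:
        s += 5                                               # POS agreement
    s += len(set(pfeat.split(u"|")) & set(feat.split(u"|")))  # shared features
    return s

# Hypothetical OMorFi readings for one wordform: (lemma, pos, feat) tuples.
readings = [
    (u"istua", u"VERB", u"Mood=Ind|Tense=Past|VerbForm=Fin"),
    (u"istua", u"VERB", u"InfForm=2|VerbForm=Inf"),
    (u"istuin", u"NOUN", u"Case=Ade|Number=Sing"),
]
ppos, pfeat = u"VERB", u"Mood=Ind|Tense=Past|VerbForm=Fin"   # MarMoT's guess

best = max(((lemma, pos, feat, (score(ppos, pfeat, pos, feat), -lemma.count(u"#")))
            for lemma, pos, feat in readings), key=lambda k: k[3])
print(u"\t".join(best[:3]))  # istua  VERB  Mood=Ind|Tense=Past|VerbForm=Fin

Ties are broken by preferring lemmas with fewer compound boundaries (fewer "#" characters), exactly as in best_reading().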
Binary file modified model/parser.model
6 changes: 3 additions & 3 deletions morpho-sd2ud/run.sh
@@ -4,10 +4,10 @@ set -u
set -e
shopt -s failglob

INPUT=input.txt
OUTPUT=output.txt
INPUT=$2
OUTPUT=$3

WORKDIR=work
WORKDIR=$1

# clean up possible previous versions
rm -rf $OUTPUT
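
run.sh now takes its work directory, input file and output file as positional parameters ($1, $2, $3) instead of the hard-coded work, input.txt and output.txt. This lets tag.sh drive the SD-to-UD conversion of the readings with paths inside the parser's temp directory:

./run.sh $TMPDIR_ABS/morpho_conv_tmp $TMPDIR_ABS/all_readings.sd $TMPDIR_ABS/all_readings.ud

exactly as in the call added to tag.sh below.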
9 changes: 6 additions & 3 deletions omorfi_pos.py
@@ -171,7 +171,7 @@ def analyze_taglist(tags,retForm=RET_LIST):
return pos,tagString
elif retForm==RET_POS_FEAT_TDT:
pos=ct_dict[u"POS"]
tagString=u"|".join(cat+u"_"+ct_dict[cat] for cat in cat_list if cat!=u"POS" and cat in ct_dict)
tagString=u"|".join(cat+u"="+ct_dict[cat] for cat in cat_list if cat!=u"POS" and cat in ct_dict)
return pos,tagString

UNIQUE_NOLEMMA=0
@@ -468,13 +468,16 @@ def hun_tag2omorfi(token,tagString):
continue
readings=omorfi_lookup(token)
if not readings:
print (token+u"\t+?").encode(u"utf-8")
#print (token+u"\t+?").encode(u"utf-8")
continue
for r in readings:
if options.orig==False:
try:
lemma,taglist=analyze_reading(r)
pos,tags=analyze_taglist(taglist,retForm=RET_POS_FEAT_TDT)
print (token+u"\t"+lemma).encode(u"utf-8"),pos.encode(u"utf-8"),tags.encode(u"utf-8")
if not tags:
tags=u"_"
print (u"%s\t%s\t%s\t%s"%(token,lemma,pos,tags)).encode("utf-8")
except:
pass

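
Run as a filter over one wordform per line (as tag.sh now does), omorfi_pos.py emits one tab-separated line per OMorFi reading. After this change, features are written as CAT=value rather than CAT_value, tokens without readings are silently skipped instead of producing a "+?" line, and an empty feature string is normalised to "_". Illustrative output, with category names and values invented:

talossa   talo   N   NUM=Sg|CASE=Ine
ja        ja     C   _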
9 changes: 8 additions & 1 deletion tag.sh
@@ -5,7 +5,14 @@

source init.sh

$PYTHON marmot-tag.py --marmot $THIS/LIBS/marmot.jar --tempdir $TMPDIR -m model/fin_model.marmot > $TMPDIR/input_tagged_1.conll09
cat > $TMPDIR/tagger_input.conll09
cat $TMPDIR/tagger_input.conll09 | cut -f 2 | sort | uniq | python omorfi_pos.py > $TMPDIR/all_readings.sd
rm -rf $TMPDIR/morpho_conv_tmp
TMPDIR_ABS=$($PYTHON abspath.py $TMPDIR)
cd morpho-sd2ud
./run.sh $TMPDIR_ABS/morpho_conv_tmp $TMPDIR_ABS/all_readings.sd $TMPDIR_ABS/all_readings.ud
cd ..
cat $TMPDIR/tagger_input.conll09 | $PYTHON marmot-tag.py --marmot $THIS/LIBS/marmot.jar --tempdir $TMPDIR --ud --hardpos --mreadings $TMPDIR/all_readings.ud -m model/fin_model.marmot > $TMPDIR/input_tagged_1.conll09

if [[ $? -ne 0 ]]
then
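
tag.sh now captures its stdin in $TMPDIR/tagger_input.conll09 so the token column can be processed twice: once through omorfi_pos.py and morpho-sd2ud/run.sh to build $TMPDIR/all_readings.ud, and once through marmot-tag.py, which picks those readings up via --mreadings with the --ud and --hardpos constraints enabled. load_readings() in marmot-tag.py expects each line of the readings file to carry four tab-separated fields, form, lemma, POS and features, e.g. (values invented):

talossa   talo   NOUN   Case=Ine|Number=Sing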
