diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..a3f979dbf --- /dev/null +++ b/.gitignore @@ -0,0 +1,221 @@ +example_extff/ff_example.lo +example_extff/libff_example.la +mteval/meteor_jar.cc +training/utils/grammar_convert +*.a +*.trs +*.aux +*.bbl +*.blg +*.dvi +*.idx +*.log +*.o +*.pdf +*.ps +*.pyc +*.so +*.toc +*swp +*~ +.* +./cdec/ +Makefile +Makefile.in +aclocal.m4 +autom4te.cache/ +config.guess +config.h +config.h.in +config.h.in~ +config.log +config.status +config.sub +configure +decoder/Makefile +decoder/Makefile.in +decoder/bin/ +decoder/cdec +decoder/dict_test +decoder/sv_test +decoder/ff_test +decoder/grammar_test +decoder/hg_test +decoder/logval_test +decoder/parser_test +decoder/rule_lexer.cc +decoder/small_vector_test +decoder/trule_test +decoder/weights_test +depcomp +dist +dpmert/Makefile +dpmert/Makefile.in +dpmert/fast_score +dpmert/lo_test +dpmert/mr_dpmert_generate_mapper_input +dpmert/mr_dpmert_map +dpmert/mr_dpmert_reduce +dpmert/scorer_test +dpmert/sentclient +dpmert/sentserver +dpmert/union_forests +dtrain/dtrain +extools/build_lexical_translation +extools/extractor +extools/extractor_monolingual +extools/featurize_grammar +extools/filter_grammar +extools/filter_score_grammar +extools/mr_stripe_rule_reduce +extools/score_grammar +extools/sg_lexer.cc +extractor/*_test +extractor/compile +extractor/extract +extractor/run_extractor +gi/clda/src/clda +gi/markov_al/ml +gi/pf/align-lexonly +gi/pf/align-lexonly-pyp +gi/pf/align-tl +gi/pf/bayes_lattice_score +gi/pf/brat +gi/pf/cbgi +gi/pf/condnaive +gi/pf/dpnaive +gi/pf/itg +gi/pf/learn_cfg +gi/pf/nuisance_test +gi/pf/pf_test +gi/pf/pfbrat +gi/pf/pfdist +gi/pf/pfnaive +gi/pf/pyp_lm +gi/posterior-regularisation/prjava/build/ +gi/posterior-regularisation/prjava/lib/*.jar +gi/posterior-regularisation/prjava/lib/prjava-20100713.jar +gi/posterior-regularisation/prjava/lib/prjava-20100715.jar +gi/posterior-regularisation/prjava/prjava.jar +gi/pyp-topics/src/contexts_lexer.cc +gi/pyp-topics/src/pyp-contexts-train +gi/pyp-topics/src/pyp-topics-train +install-sh +jam-files/bjam +jam-files/engine/bin.* +jam-files/engine/bootstrap/ +klm/lm/bin/ +klm/lm/builder/builder +klm/lm/builder/lmplz +klm/lm/build_binary +klm/lm/ngram_query +klm/lm/query +klm/util/bin/ +libtool +ltmain.sh +m4/libtool.m4 +m4/ltoptions.m4 +m4/ltsugar.m4 +m4/ltversion.m4 +m4/lt~obsolete.m4 +minrisk/minrisk_optimize +mira/kbest_mira +missing +mteval/bin/ +mteval/fast_score +mteval/mbr_kbest +mteval/scorer_test +phrasinator/gibbs_train_plm +phrasinator/gibbs_train_plm_notables +previous.sh +pro-train/mr_pro_map +pro-train/mr_pro_reduce +python/build +python/setup.py +rampion/rampion_cccp +rst_parser/mst_train +rst_parser/random_tree +rst_parser/rst_parse +rst_parser/rst_train +sa-extract/calignment.c +sa-extract/cdat.c +sa-extract/cfloatlist.c +sa-extract/cintlist.c +sa-extract/clex.c +sa-extract/cstrmap.c +sa-extract/csuf.c +sa-extract/cveb.c +sa-extract/lcp.c +sa-extract/precomputation.c +sa-extract/rule.c +sa-extract/rulefactory.c +sa-extract/sym.c +stamp-h1 +tests/system_tests/hmm/foo.src +training/Makefile +training/Makefile.in +training/atools +training/augment_grammar +training/cllh_filter_grammar +training/collapse_weights +training/grammar_convert +training/lbfgs_test +training/lbl_model +training/liblbfgs/bin/ +training/liblbfgs/ll_test +training/model1 +training/mpi_batch_optimize +training/mpi_adagrad_optimize +training/mpi_compute_cllh +training/mpi_em_optimize +training/mpi_extract_features +training/mpi_extract_reachable 
+training/mpi_flex_optimize +training/mpi_online_optimize +training/mr_em_adapted_reduce +training/mr_em_map_adapter +training/mr_optimize_reduce +training/mr_reduce_to_weights +training/optimize_test +training/plftools +training/test_ngram +utils/atools +utils/bin/ +utils/crp_test +utils/dict_test +utils/logval_test +utils/m_test +utils/mfcr_test +utils/phmt +utils/reconstruct_weights +utils/small_vector_test +utils/sv_test +utils/ts +utils/weights_test +training/crf/mpi_adagrad_optimize +training/crf/mpi_batch_optimize +training/crf/mpi_baum_welch +training/crf/mpi_compute_cllh +training/crf/mpi_extract_features +training/crf/mpi_extract_reachable +training/crf/mpi_flex_optimize +training/crf/mpi_online_optimize +training/dpmert/lo_test +training/dpmert/mr_dpmert_generate_mapper_input +training/dpmert/mr_dpmert_map +training/dpmert/mr_dpmert_reduce +training/dpmert/sentclient +training/dpmert/sentserver +training/dtrain/dtrain +training/latent_svm/latent_svm +training/minrisk/minrisk_optimize +training/mira/kbest_mira +training/mira/kbest_cut_mira +training/pro/mr_pro_map +training/pro/mr_pro_reduce +training/rampion/rampion_cccp +training/utils/lbfgs_test +training/utils/optimize_test +training/utils/sentclient +training/utils/sentserver +word-aligner/fast_align diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 000000000..1f0f2eeef --- /dev/null +++ b/.travis.yml @@ -0,0 +1,23 @@ +language: python +python: + - "2.7" +before_script: + - sudo apt-get install libboost-filesystem1.48-dev + - sudo apt-get install libboost-program-options1.48-dev + - sudo apt-get install libboost-serialization1.48-dev + - sudo apt-get install libboost-regex1.48-dev + - sudo apt-get install libboost-test1.48-dev + - sudo apt-get install libboost-system1.48-dev + - sudo apt-get install libboost-thread1.48-dev + - sudo apt-get install flex + - autoreconf -ifv + - ./configure +script: + - make + - cd python + - python setup.py install + - cd .. +after_script: + - make check + - ./tests/run-system-tests.pl + - nosetests python/tests diff --git a/BUILDING b/BUILDING new file mode 100644 index 000000000..d5a086e89 --- /dev/null +++ b/BUILDING @@ -0,0 +1,40 @@ +To build cdec, you'll need: + + * boost headers & boost program_options (you may need to install a package + like boost-devel) + + +Instructions for building +----------------------------------- + + 1) Use automake / autoconf to generate the configure script. + I'm not an expert at using these tools, but this should be sufficient: + + autoreconf -ifv + + 2) Configure and build. Your command will look something like this. + + ./configure + make + + If you get errors during configure about missing BOOST macros, then your + Boost installation was not found, and you need to keep working at it. If you + get errors during the build, it's probably a problem with your Boost setup or + possibly with some compiler version idiosyncrasies (generally, I assume you + have a relatively new version of g++). + + If you're building on cygwin, their libtool is buggy; this make command + works for now: + + make LIBS+="-lz -lboost_program_options" \ + CFLAGS+="-Wno-sign-compare" + + 3) Test + + ./tests/run-system-tests.pl + + Everything should pass. + + + 4) Enjoy!
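A quick way to confirm that the freshly built decoder actually runs is to feed it a line on standard input with one of the configuration files shipped in this tree. This is only a sketch: it assumes the default build layout (the binary lands in decoder/cdec, as listed above) and that the German model files referenced by compound-split/cdec-de.ini are present locally; the input word is purely illustrative.

    cd compound-split
    echo "autobahnraststaette" | ../decoder/cdec -c cdec-de.ini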
+ diff --git a/LICENSE.cctbx.txt b/LICENSE.cctbx.txt new file mode 100644 index 000000000..a8d9a4943 --- /dev/null +++ b/LICENSE.cctbx.txt @@ -0,0 +1,45 @@ +*** License agreement *** + +cctbx Copyright (c) 2006, The Regents of the University of +California, through Lawrence Berkeley National Laboratory (subject to +receipt of any required approvals from the U.S. Dept. of Energy). All +rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +(1) Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +(2) Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +(3) Neither the name of the University of California, Lawrence Berkeley +National Laboratory, U.S. Dept. of Energy nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +You are under no obligation whatsoever to provide any bug fixes, +patches, or upgrades to the features, functionality or performance of +the source code ("Enhancements") to anyone; however, if you choose to +make your Enhancements available either publicly, or directly to +Lawrence Berkeley National Laboratory, without imposing a separate +written license agreement for such Enhancements, then you hereby grant +the following license: a non-exclusive, royalty-free perpetual license +to install, use, modify, prepare derivative works, incorporate into +other computer software, distribute, and sublicense such enhancements or +derivative works thereof, in binary and source code form. + diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 000000000..a390938bc --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,213 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ +---------------------------------------------- + +L-BFGS CODE FROM COMPUTATIONAL CRYSTALLOGRAPHY TOOLBOX (CCTBX) + +This package includes source code (training/lbfgs.h) based on source +code distributed as part of the Computational Crystallography Toolbox +(CCTBX), which has separate copyright notices and license terms. Use of +this source code is subject to the terms and conditions of the license +contained in the file LICENSE.cctbx . + diff --git a/Makefile.am b/Makefile.am new file mode 100644 index 000000000..88327477d --- /dev/null +++ b/Makefile.am @@ -0,0 +1,24 @@ +# warning - the subdirectories in the following list should +# be kept in topologically sorted order. Also, DO NOT introduce +# cyclic dependencies between these directories! +SUBDIRS = \ + utils \ + klm/util/double-conversion \ + klm/util \ + klm/util/stream \ + klm/lm \ + klm/lm/builder \ + klm/search \ + mteval \ + decoder \ + training \ + word-aligner \ + extractor \ + example_extff + + +EXTRA_DIST = corpus tests python/cdec python/tests python/examples compound-split environment +AUTOMAKE_OPTIONS = foreign +ACLOCAL_AMFLAGS = -I m4 +AM_CPPFLAGS = -D_GLIBCXX_PARALLEL -march=native -mtune=native -O2 -pipe -fomit-frame-pointer -Wall + diff --git a/README.md b/README.md new file mode 100644 index 000000000..3cbc62c36 --- /dev/null +++ b/README.md @@ -0,0 +1,46 @@ +`cdec` is a research platform for machine translation and similar structured prediction problems. + +[![Build Status](https://travis-ci.org/redpony/cdec.svg?branch=master)](https://travis-ci.org/redpony/cdec) + +## System requirements + +- A Linux or Mac OS X system +- A C++ compiler implementing the [C++-11 standard](http://www.stroustrup.com/C++11FAQ.html) (NEW) + - Unfortunately, many systems have compilers that predate C++-11 support. + - You may need to build your own C++ compiler or upgrade your operating system's. +- [Boost C++ libraries (version 1.44 or later)](http://www.boost.org/) + - If you build your own boost, you _must install it_ using `bjam install`. + - Older versions of Boost _may_ work, but problems have been reported with command line option parsing on some platforms with older versions. +- [GNU Flex](http://flex.sourceforge.net/) + +## Building from a downloaded archive + +If your system contains the required tools and libraries in the usual places, you should be able to build as simply as: + + ./configure + make -j4 + ./tests/run-system-tests.pl + +## Building from a git clone + +In addition to the standard `cdec` third-party software requirements, you will also need the following software to work with the `cdec` source code directly from git: + +- [Autoconf / Automake / Libtool](http://www.gnu.org/software/autoconf/) + - Older versions of GNU autotools may not work properly. + +Instructions: + + autoreconf -ifv + ./configure + make -j4 + ./tests/run-system-tests.pl + +## Further information + +[For more information, refer to the `cdec` documentation](http://www.cdec-decoder.org) + +## Citation + +If you make use of cdec, please cite: + +C. Dyer, A. Lopez, J. Ganitkevitch, J. Weese, F. Ture, P. Blunsom, H. Setiawan, V. Eidelman, and P. Resnik. cdec: A Decoder, Alignment, and Learning Framework for Finite-State and Context-Free Translation Models. In *Proceedings of ACL*, July, 2010.
[[bibtex](http://www.cdec-decoder.org/cdec.bibtex.txt)] [[pdf](http://www.aclweb.org/anthology/P/P10/P10-4002.pdf)] diff --git a/compound-split/README.md b/compound-split/README.md new file mode 100644 index 000000000..b7491007a --- /dev/null +++ b/compound-split/README.md @@ -0,0 +1,51 @@ +Instructions for running the compound splitter, which is a reimplementation +and extension (more features, larger non-word list) of the model described in + + C. Dyer. (2009) Using a maximum entropy model to build segmentation + lattices for MT. In Proceedings of NAACL HLT 2009, + Boulder, Colorado, June 2009 + +If you use this software, please cite this paper. + + +GENERATING 1-BEST SEGMENTATIONS AND LATTICES +------------------------------------------------------------------------------ + +Here are some sample invocations: + + ./compound-split.pl --output 1best < infile.txt > out.1best.txt + Segment infile.txt according to the 1-best segmentation. + + ./compound-split.pl --output plf < infile.txt > out.plf + + ./compound-split.pl --output plf --beam 3.5 < infile.txt > out.plf + This generates denser lattices than usual (the default beam threshold + is 2.1; higher numbers do less pruning) + + +MODEL TRAINING (only for the adventuresome) +------------------------------------------------------------------------------ + +I've included some training data for training a German language lattice +segmentation model, and if you want to explore, you can add to or change the data. +If you're especially adventuresome, you can add features to cdec (the current +feature functions are found in ff_csplit.cc). The training/references are +in the file: + + dev.in-ref + +The format is the unsegmented form on the right and the reference lattice on +the left, separated by a triple pipe ( ||| ). Note that the segmentation +model inserts a # as the first word, so your segmentation references must +include this. + +To retrain the model (using MAP estimation of a conditional model), do the +following: + + cd de + ./TRAIN + +Note: the optimization objective is supposed to be non-convex, but I haven't +found much of an effect from where I initialize things. But I haven't looked +very hard; this might be something to explore.
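For reference, the PLF written with `--output plf` is a nested-tuple lattice, one sentence per line: each input token contributes a group of edges of the form ('surface form', cost, distance). A word that is passed through unsplit becomes a single edge spanning one position, exactly as the wrapper script compound-split.pl (below) constructs it; the German words in this line only illustrate the syntax and are not actual decoder output:

    ((('zum',0,1),),(('beispiel',0,1),),)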
+ diff --git a/compound-split/cdec-de.ini b/compound-split/cdec-de.ini new file mode 100644 index 000000000..1573dd522 --- /dev/null +++ b/compound-split/cdec-de.ini @@ -0,0 +1,6 @@ +formalism=csplit +intersection_strategy=full +weights=de/weights.trained +#weights=de/weights.noun-only-1best-only +feature_function=CSplit_BasicFeatures de/large_dict.de.gz de/badlist.de.gz de/wordlist.de +feature_function=CSplit_ReverseCharLM de/charlm.rev.5gm.de.lm.gz diff --git a/compound-split/compound-split.pl b/compound-split/compound-split.pl new file mode 100755 index 000000000..93ac3b201 --- /dev/null +++ b/compound-split/compound-split.pl @@ -0,0 +1,177 @@ +#!/usr/bin/perl -w + +use strict; +my $script_dir; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; } +use Getopt::Long; +use IPC::Open2; + +my $CDEC = "$script_dir/../decoder/cdec"; +my $LANG = 'de'; + +my $BEAM = 2.1; +my $OUTPUT = 'plf'; +my $HELP; +my $VERBOSE; +my $PRESERVE_CASE; + +GetOptions("decoder=s" => \$CDEC, + "language=s" => \$LANG, + "beam=f" => \$BEAM, + "output=s" => \$OUTPUT, + "verbose" => \$VERBOSE, + "preserve_case" => \$PRESERVE_CASE, + "help" => \$HELP + ) or usage(); + +usage() if $HELP; + +chdir $script_dir; + +if ($VERBOSE) { $VERBOSE = ""; } else { $VERBOSE = " 2> /dev/null"; } +$LANG = lc $LANG; +die "Can't find $CDEC\n" unless -f $CDEC; +die "Can't execute $CDEC\n" unless -x $CDEC; +die "Don't know about language: $LANG\n" unless -d "./$LANG"; +my $CONFIG="cdec-$LANG.ini"; +die "Can't find $CONFIG" unless -f $CONFIG; +die "--output must be '1best' or 'plf'\n" unless ($OUTPUT =~ /^(plf|1best)$/); +check_dependencies($CONFIG, $LANG); +print STDERR "(Run with --help for options)\n"; +print STDERR "LANGUAGE: $LANG\n"; +print STDERR " OUTPUT: $OUTPUT\n"; + +my $CMD = "$CDEC -c $CONFIG"; +my $IS_PLF; +if ($OUTPUT eq 'plf') { + $IS_PLF = 1; + $CMD .= " --csplit_preserve_full_word --csplit_output_plf --beam_prune $BEAM"; +} +$CMD .= $VERBOSE; + +print STDERR "Executing: $CMD\n"; + +open2(\*OUT, \*IN, $CMD) or die "Couldn't fork: $!"; +binmode(STDIN,":utf8"); +binmode(STDOUT,":utf8"); +binmode(IN,":utf8"); +binmode(OUT,":utf8"); + +while() { + chomp; + s/^\s+//; + s/\s+$//; + my @words = split /\s+/; + my @res = (); + my @todo = (); + my @casings = (); + for (my $i=0; $i < scalar @words; $i++) { + my $word = lc $words[$i]; + if (length($word)<6 || $word =~ /^[,\-0-9\.]+$/ || $word =~ /[@.\-\/:]/) { + push @casings, 0; + if ($IS_PLF) { + push @res, "(('" . escape($word) . 
"',0,1),),"; + } else { + if ($PRESERVE_CASE) { + push @res, $words[$i]; + } else { + push @res, $word; + } + } + } else { + push @casings, guess_casing($words[$i]); + push @res, undef; + push @todo, $word; + } + } + if (scalar @todo > 0) { + # print STDERR "TODO: @todo\n"; + my $tasks = join "\n", @todo; + print IN "$tasks\n"; + for (my $i = 0; $i < scalar @res; $i++) { + if (!defined $res[$i]) { + my $seg = ; + chomp $seg; + unless ($IS_PLF) { + $seg =~ s/^# //o; + } + if ($PRESERVE_CASE && $casings[$i]) { $seg = recase_words($seg); } + $res[$i] = $seg; + } + } + } + if ($IS_PLF) { + print '('; + print join '', @res; + print ")\n"; + } else { + print "@res\n"; + } +} + +close IN; +close OUT; + +sub recase_words { + my $word = shift; + $word =~ s/\b(\w)/\u$1/g; + return $word; +} + +sub escape { + $_ = shift; + s/\\/\\\\/g; + s/'/\\'/g; + return $_; +} + +sub guess_casing { + my $word = shift @_; + if (lc($word) eq $word) { return 0; } else { return 1; } +} + +sub usage { + print <){ + chomp; + my @x = split /\s+/; + for my $f (@x) { + push @files, $f if ($f =~ /\.gz$/); + } + } + close F; + my $c = 0; + for my $file (@files) { + $c++ if -f $file; + } + if ($c != scalar @files) { + print STDERR <) { + chomp; + s/[\–":“„!=+*.@«#%&,»\?\/{}\$\(\)\[\];\-0-9]+/ /g; + $_ = lc $_; + my @words = split /\s+/; + for my $w (@words) { + next if length($w) == 0; + $d{$w}++; + $z++; + } +} +my $lz = log($z); +for my $w (sort {$d{$b} <=> $d{$a}} keys %d) { + my $c = $lz-log($d{$w}); + print "$w $c\n"; +} + diff --git a/configure.ac b/configure.ac new file mode 100644 index 000000000..6b1287683 --- /dev/null +++ b/configure.ac @@ -0,0 +1,234 @@ +AC_CONFIG_MACRO_DIR([m4]) +AC_INIT([cdec],[2014-01-28]) +AC_CONFIG_SRCDIR([decoder/cdec.cc]) +AM_INIT_AUTOMAKE +AC_CONFIG_HEADERS(config.h) +AC_PROG_LIBTOOL +AC_PROG_LEX +case $LEX in +:) AC_MSG_ERROR([No lex (Flex, lex, etc.) 
program found]);; +esac +OLD_CXXFLAGS=$CXXFLAGS +AC_PROG_CC +AC_PROG_CXX +CXXFLAGS=$OLD_CXXFLAGS +AX_CXX_COMPILE_STDCXX_11([],[mandatory]) +AC_LANG_CPLUSPLUS +AC_OPENMP +BOOST_REQUIRE([1.44]) +BOOST_FILESYSTEM +BOOST_PROGRAM_OPTIONS +BOOST_SYSTEM +BOOST_SERIALIZATION +BOOST_TEST +BOOST_THREADS +AM_PATH_PYTHON +AC_CHECK_HEADER(dlfcn.h,AC_DEFINE(HAVE_DLFCN_H)) +AC_CHECK_LIB(rt, clock_gettime) +AC_CHECK_LIB(dl, dlopen) +AC_CHECK_HEADERS(zlib.h, + AC_CHECK_LIB(z, gzread,[ + AC_DEFINE(HAVE_ZLIB,[],[Do we have zlib]) + ZLIBS="$ZLIBS -lz" + ])) + +AC_CHECK_HEADERS(bzlib.h, + AC_CHECK_LIB(bz2, BZ2_bzReadOpen,[ + AC_DEFINE(HAVE_BZLIB,[],[Do we have bzlib]) + ZLIBS="$ZLIBS -lbz2" + ])) + +AC_CHECK_HEADERS(lzma.h, + AC_CHECK_LIB(lzma, lzma_code,[ + AC_DEFINE(HAVE_XZLIB,[],[Do we have lzma]) + ZLIBS="$ZLIBS -llzma" + ])) + +AC_ARG_ENABLE(mpi, + [ --enable-mpi Build MPI binaries, assumes mpi.h is present ], + [ mpi=yes + ]) +AM_CONDITIONAL([MPI], [test "x$mpi" = xyes]) + +if test "x$mpi" = xyes +then + AC_DEFINE([HAVE_MPI], [1], [flag for MPI]) + LIBS="$LIBS -lboost_mpi" +fi + +AM_CONDITIONAL([HAVE_METEOR], false) +AC_ARG_WITH(meteor, + [AC_HELP_STRING([--with-meteor=PATH], [(optional) path to METEOR jar])], + [with_meteor=$withval], + [with_meteor=no] + ) + +if test "x$with_meteor" != 'xno' +then + AC_CHECK_FILE([$with_meteor], + [AC_DEFINE([HAVE_METEOR], [1], [flag for METEOR jar library])], + [AC_MSG_ERROR([Cannot find METEOR jar!])]) + AC_SUBST(METEOR_JAR,"${with_meteor}") + AM_CONDITIONAL([HAVE_METEOR], true) +fi + +AM_CONDITIONAL([HAVE_CMPH], false) +AC_ARG_WITH(cmph, + [AC_HELP_STRING([--with-cmph=PATH], [(optional) path to cmph perfect hashing library])], + [with_cmph=$withval], + [with_cmph=no] + ) + +if test "x$with_cmph" != 'xno' +then + SAVE_CPPFLAGS="$CPPFLAGS" + CPPFLAGS="$CPPFLAGS -I${with_cmph}/include" + + AC_CHECK_HEADER(cmph.h, + [AC_DEFINE([HAVE_CMPH], [1], [flag for cmph perfect hashing library])], + [AC_MSG_ERROR([Cannot find cmph library!])]) + + LDFLAGS="$LDFLAGS -L${with_cmph}/lib" + AC_CHECK_LIB(cmph, cmph_search) + AM_CONDITIONAL([HAVE_CMPH], true) +fi + +AM_CONDITIONAL([HAVE_GTEST], false) +AC_ARG_WITH(gtest, + [AC_HELP_STRING([--with-gtest=DIR], [(optional) path to Google Test library])], + [with_gtest=$withval], + [with_gtest=no] + ) + +AM_CONDITIONAL([HAVE_GMOCK], false) +AC_ARG_WITH(gmock, + [AC_HELP_STRING([--with-gmock=DIR], [(optional) path to Google Mock library])], + [with_gmock=$withval], + [with_gmock=no] + ) + +if test "x$with_gtest" != 'xno' +then + gtest_CPPFLAGS="-I${with_gtest}/include" + gtest_LDFLAGS="-L${with_gtest} -L${with_gtest}/lib" + gtest_LIBS="-lgtest_main -lgtest -lpthread" + + SAVECPP_FLAGS="$CPPFLAGS" + CPPFLAGS="$CPPFLAGS $gtest_CPPFLAGS" + AC_CHECK_HEADER(${with_gtest}/include/gtest/gtest.h, + [AC_DEFINE([HAVE_GTEST], [1], [flag for Google Test header])], + [AC_MSG_ERROR([Cannot find Google Test headers!])] + ) + + SAVE_LDFLAGS="$LDFLAGS" + LDFLAGS="$LDFLAGS $gtest_LDFLAGS" + SAVE_LIBS="$LIBS" + # Google Test needs pthreads. 
+ AC_CHECK_LIB([pthread], + [pthread_mutex_init], + [], + [AC_MSG_ERROR([Cannot find pthread library])] + ) + AX_CXX_CHECK_LIB([gtest], + [testing::TestInfo::name() const], + [], + [AC_MSG_ERROR([Cannot find Google Test library libgtest])] + ) + AC_CHECK_LIB([gtest_main], + [main], + [], + [AC_MSG_ERROR([Cannot find Google Test library libgtest_main])] + ) + + AC_SUBST(AS_TR_CPP([GTEST_CPPFLAGS]), ["$gtest_CPPFLAGS"]) + AC_SUBST(AS_TR_CPP([GTEST_LDFLAGS]), ["$gtest_LDFLAGS"]) + AC_SUBST(AS_TR_CPP([GTEST_LIBS]), ["$gtest_LIBS"]) + + + if test "x$with_gmock" != 'xno' + then + gmock_CPPFLAGS="-I${with_gmock}/include" + gmock_LDFLAGS="-L${with_gmock} -L${with_gmock}/lib" + gmock_LIBS="-lgmock" + + CPPFLAGS="$CPPFLAGS $gmock_CPPFLAGS" + AC_CHECK_HEADER(${with_gmock}/include/gmock/gmock.h, + [AC_DEFINE([HAVE_GMOCK], [1], [flag for Google Mock header])], + [AC_MSG_ERROR([Cannot find Google Mock headers!])] + ) + + LDFLAGS="$LDFLAGS $gmock_LDFLAGS" + AX_CXX_CHECK_LIB([gmock], + [testing::Expectation], + [], + [AC_MSG_ERROR([Cannot find Google Mock library libgmock])] + ) + + AC_SUBST(AS_TR_CPP([GMOCK_CPPFLAGS]), ["$gmock_CPPFLAGS"]) + AC_SUBST(AS_TR_CPP([GMOCK_LDFLAGS]), ["$gmock_LDFLAGS"]) + AC_SUBST(AS_TR_CPP([GMOCK_LIBS]), ["$gmock_LIBS"]) + AM_CONDITIONAL([HAVE_GMOCK], true) + fi + + CPPFLAGS="$SAVE_CPPFLAGS" + LDFLAGS="$SAVE_LDFLAGS" + LIBS="$SAVE_LIBS" + AM_CONDITIONAL([HAVE_GTEST], true) +fi + +#BOOST_THREADS +CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS" +LDFLAGS="$LDFLAGS $BOOST_PROGRAM_OPTIONS_LDFLAGS $BOOST_SERIALIZATION_LDFLAGS $BOOST_SYSTEM_LDFLAGS $BOOST_FILESYSTEM_LDFLAGS" +# $BOOST_THREAD_LDFLAGS" +LIBS="$LIBS $BOOST_PROGRAM_OPTIONS_LIBS $BOOST_SERIALIZATION_LIBS $BOOST_SYSTEM_LIBS $BOOST_FILESYSTEM_LIBS $ZLIBS" +# $BOOST_THREAD_LIBS" + +AC_CHECK_HEADER(google/dense_hash_map, + [AC_DEFINE([HAVE_SPARSEHASH], [1], [flag for google::dense_hash_map])]) + +AC_PROG_INSTALL + +CPPFLAGS="-DPIC $CPPFLAGS -DHAVE_CONFIG_H -DKENLM_MAX_ORDER=6" +CXXFLAGS="$CXX11_SWITCH $CXXFLAGS -fPIC -g -O3" +CFLAGS="$CFLAGS -fPIC -g -O3" + +if test "x$HAVE_CXX11" = "x0"; then + CPPFLAGS="$CPPFLAGS -DHAVE_OLD_CPP" +fi + +# core cdec stuff +AC_CONFIG_FILES([Makefile]) +AC_CONFIG_FILES([utils/Makefile]) +AC_CONFIG_FILES([mteval/Makefile]) +AC_CONFIG_FILES([mteval/meteor_jar.cc]) +AC_CONFIG_FILES([decoder/Makefile]) +AC_CONFIG_FILES([python/setup.py]) +AC_CONFIG_FILES([extractor/Makefile]) +AC_CONFIG_FILES([word-aligner/Makefile]) + +# KenLM stuff +AC_CONFIG_FILES([klm/util/double-conversion/Makefile]) +AC_CONFIG_FILES([klm/util/stream/Makefile]) +AC_CONFIG_FILES([klm/util/Makefile]) +AC_CONFIG_FILES([klm/lm/Makefile]) +AC_CONFIG_FILES([klm/search/Makefile]) +AC_CONFIG_FILES([klm/lm/builder/Makefile]) + +# training stuff +AC_CONFIG_FILES([training/Makefile]) +AC_CONFIG_FILES([training/utils/Makefile]) +AC_CONFIG_FILES([training/liblbfgs/Makefile]) +AC_CONFIG_FILES([training/crf/Makefile]) +AC_CONFIG_FILES([training/dpmert/Makefile]) +AC_CONFIG_FILES([training/pro/Makefile]) +AC_CONFIG_FILES([training/rampion/Makefile]) +AC_CONFIG_FILES([training/minrisk/Makefile]) +AC_CONFIG_FILES([training/mira/Makefile]) +AC_CONFIG_FILES([training/latent_svm/Makefile]) +AC_CONFIG_FILES([training/dtrain/Makefile]) + +# external feature function example code +AC_CONFIG_FILES([example_extff/Makefile]) + +AC_OUTPUT + diff --git a/corpus/README.md b/corpus/README.md new file mode 100644 index 000000000..adc35b849 --- /dev/null +++ b/corpus/README.md @@ -0,0 +1,37 @@ +This directory contains a number of useful scripts that are helpful for 
preprocessing parallel and monolingual corpora. They are provided for convenience and may be very useful, but their functionality will often be supplanted by other, more specialized tools. + +Many of these scripts assume that the input is [UTF-8 encoded](http://en.wikipedia.org/wiki/UTF-8). + +## Paste parallel files together + +This script reads one line at a time from a set of files and concatenates them with a triple pipe separator (`|||`) in the output. This is useful for generating parallel corpus files for training or evaluation: + + ./paste-files.pl file.a file.b file.c [...] + +## Punctuation Normalization and Tokenization + +This script tokenizes text in any language (well, it does a good job in most languages, and in some it will completely go crazy): + + ./tokenize-anything.sh < input.txt > output.txt + +It also normalizes a lot of unicode symbols and even corrects some common encoding errors. It can be applied to monolingual and parallel corpora directly. + +## Text lowercasing + +This script also does what it says, provided your input is in UTF-8: + + ./lowercase.pl < input.txt > output.txt + +## Length ratio filtering (for parallel corpora) + +This script computes statistics about sentence length ratios in a parallel corpus and removes sentences that are statistical outliers. This tends to remove extremely poorly aligned sentence pairs or sentence pairs that would otherwise be difficult to align: + + ./filter-length.pl input.src-trg > output.src-trg + +## Add infrequent self-translations to a parallel corpus + +This script identifies rare words (those that occur fewer than five times in the corpus) that have the same orthographic form in both the source and target language. Several copies of these words are then inserted at the end of the corpus that is written, which improves alignment quality. + + ./add-self-translations.pl input.src-trg > output.src-trg + + diff --git a/corpus/add-self-translations.pl b/corpus/add-self-translations.pl new file mode 100755 index 000000000..d707ce29c --- /dev/null +++ b/corpus/add-self-translations.pl @@ -0,0 +1,29 @@ +#!/usr/bin/perl -w +use strict; + +# ADDS SELF-TRANSLATIONS OF POORLY ATTESTED WORDS TO THE PARALLEL DATA + +my %df; +my %def; +while(<>) { +# print; + chomp; + my ($sf, $se) = split / \|\|\| /; + die "Format error: $_\n" unless defined $sf && defined $se; + my @fs = split /\s+/, $sf; + my @es = split /\s+/, $se; + for my $f (@fs) { + $df{$f}++; + for my $e (@es) { + if ($f eq $e) { $def{$f}++; } + } + } +} + +for my $k (sort keys %def) { + next if $df{$k} > 4; + print "$k ||| $k\n"; + print "$k ||| $k\n"; + print "$k ||| $k\n"; +} + diff --git a/corpus/add-sos-eos.pl b/corpus/add-sos-eos.pl new file mode 100755 index 000000000..d7608c5ec --- /dev/null +++ b/corpus/add-sos-eos.pl @@ -0,0 +1,63 @@ +#!/usr/bin/perl -w +use strict; + +die "Usage: $0 corpus.fr[-en1-en2-...]
[corpus.al out-corpus.al]\n" unless (scalar @ARGV == 1 || scalar @ARGV == 3); +my $filec = shift @ARGV; +my $filea = shift @ARGV; +my $ofilea = shift @ARGV; +open C, "<$filec" or die "Can't read $filec: $!"; +if ($filea) { + open A, "<$filea" or die "Can't read $filea: $!"; + open OA, ">$ofilea" or die "Can't write $ofilea: $!"; +} +binmode(C, ":utf8"); +binmode(STDOUT, ":utf8"); +print STDERR "Adding and markers to input...\n"; +print STDERR " Reading corpus: $filec\n"; +print STDERR " Writing corpus: STDOUT\n"; +print STDERR "Reading alignments: $filea\n" if $filea; +print STDERR "Writing alignments: $ofilea\n" if $filea; + +my $lines = 0; +while() { + $lines++; + die "ERROR. Input line $filec:$lines should not contain SGML markup" if /; + die "ERROR. Mismatched number of lines between $filec and $filea\n" unless $aa; + chomp $aa; + my ($ff, $ee) = @fields; + die "ERROR in $filec:$lines: expected 'source ||| target'" unless defined $ee; + my @fs = split /\s+/, $ff; + my @es = split /\s+/, $ee; + my @as = split /\s+/, $aa; + my @oas = (); + push @oas, '0-0'; + my $flen = scalar @fs; + my $elen = scalar @es; + for my $ap (@as) { + my ($a, $b) = split /-/, $ap; + die "ERROR. Bad format in: @as" unless defined $a && defined $b; + push @oas, ($a + 1) . '-' . ($b + 1); + } + push @oas, ($flen + 1) . '-' . ($elen + 1); + print OA "@oas\n"; + } + print "$o\n"; +} +if ($filea) { + close OA; + my $aa = ; + die "ERROR. Alignment input file $filea contains more lines than corpus file!\n" if $aa; +} +print STDERR "\nSUCCESS. Processed $lines lines.\n"; + diff --git a/corpus/cut-corpus.pl b/corpus/cut-corpus.pl new file mode 100755 index 000000000..0af3b23ca --- /dev/null +++ b/corpus/cut-corpus.pl @@ -0,0 +1,35 @@ +#!/usr/bin/perl -w +use strict; +die "Usage: $0 N\nSplits a corpus separated by ||| symbols and returns the Nth field\n" unless scalar @ARGV > 0; + +my $x = shift @ARGV; +my @ind = split /,/, $x; +my @o = (); +for my $ff (@ind) { + if ($ff =~ /^\d+$/) { + push @o, $ff - 1; + } elsif ($ff =~ /^(\d+)-(\d+)$/) { + my $a = $1; + my $b = $2; + die "$a-$b is a bad range in input: $x\n" unless $b > $a; + for (my $i=$a; $i <= $b; $i++) { + push @o, $i - 1; + } + } else { + die "Bad input: $x\n"; + } +} + +while(<>) { + chomp; + my @fields = split /\s*\|\|\|\s*/; + my @sf; + for my $i (@o) { + my $y = $fields[$i]; + if (!defined $y) { $y= ''; } + push @sf, $y; + } + print join(' ||| ', @sf) . "\n"; +} + + diff --git a/corpus/filter-length.pl b/corpus/filter-length.pl new file mode 100755 index 000000000..2e257cdac --- /dev/null +++ b/corpus/filter-length.pl @@ -0,0 +1,150 @@ +#!/usr/bin/perl -w +use strict; +use utf8; + +##### EDIT THESE SETTINGS #################################################### +my $AUTOMATIC_INCLUDE_IF_SHORTER_THAN = 7; # if both are shorter, include +my $MAX_ZSCORE = 1.8; # how far from the mean can the (log)ratio be? 
+############################################################################## + +die "Usage: $0 [-NNN] corpus.fr-en\n\n Filter sentence pairs containing sentences longer than NNN words (where NNN\n is 150 by default) or whose log length ratios are $MAX_ZSCORE stddevs away from the\n mean log ratio.\n\n" unless scalar @ARGV == 1 || scalar @ARGV == 2; +binmode(STDOUT,":utf8"); +binmode(STDERR,":utf8"); + +my $MAX_LENGTH = 150; # discard a sentence if it is longer than this +if (scalar @ARGV == 2) { + my $fp = shift @ARGV; + die "Expected -NNN for first parameter, but got $fp\n" unless $fp =~ /^-(\d+)$/; + $MAX_LENGTH=$1; +} + +my $corpus = shift @ARGV; + +die "Cannot read from STDIN\n" if $corpus eq '-'; +my $ff = "<$corpus"; +$ff = "gunzip -c $corpus|" if $ff =~ /\.gz$/; + +print STDERR "Max line length (monolingual): $MAX_LENGTH\n"; +print STDERR " Parallel corpus: $corpus\n"; + +open F,$ff or die "Can't read $corpus: $!"; +binmode(F,":utf8"); + +my $rat_max = log(9); +my $lrm = 0; +my $zerof = 0; +my $zeroe = 0; +my $bad_format = 0; +my $absbadrat = 0; +my $overlene = 0; +my $overlenf = 0; +my $lines = 0; +my @lograts = (); +while(<F>) { + $lines++; + if ($lines % 100000 == 0) { print STDERR " [$lines]\n"; } + elsif ($lines % 2500 == 0) { print STDERR "."; } + my ($sf, $se, @d) = split /\s*\|\|\|\s*/; + if (scalar @d != 0 or !defined $se) { + $bad_format++; + if ($bad_format > 100 && ($bad_format / $lines) > 0.02) { + die "$bad_format / $lines : Corpus appears to be incorrectly formatted, example: $_"; + } + next; + } + my @fs = split /\s+/, $sf; + my @es = split /\s+/, $se; + my $flen = scalar @fs; + my $elen = scalar @es; + if ($flen == 0) { + $zerof++; + next; + } + if ($elen == 0) { + $zeroe++; + next; + } + if ($flen > $MAX_LENGTH) { + $overlenf++; + next; + } + if ($elen > $MAX_LENGTH) { + $overlene++; + next; + } + if ($elen >= $AUTOMATIC_INCLUDE_IF_SHORTER_THAN || + $flen >= $AUTOMATIC_INCLUDE_IF_SHORTER_THAN) { + my $lograt = log($flen) - log($elen); + if (abs($lograt) > $rat_max) { + $absbadrat++; + next; + } + $lrm += $lograt; + push @lograts, $lograt; + } +} +close F; + +print STDERR "\nComputing statistics...\n"; +my $lmean = $lrm / scalar @lograts; + +my $lsd = 0; +for my $lr (@lograts) { + $lsd += ($lr - $lmean)**2; +} +$lsd = sqrt($lsd / scalar @lograts); +@lograts = (); + +my $pass1_discard = $zerof + $zeroe + $absbadrat + $overlene + $overlenf + $bad_format; +my $discard_rate = int(10000 * $pass1_discard / $lines) / 100; +print STDERR " Total lines: $lines\n"; +print STDERR " Already discarded: $pass1_discard\t(discard rate = $discard_rate%)\n"; +print STDERR " Mean F:E ratio: " . exp($lmean) . "\n"; +print STDERR " StdDev F:E ratio: " . exp($lsd) .
"\n"; +print STDERR "Writing...\n"; +open F,$ff or die "Can't reread $corpus: $!"; +binmode(F,":utf8"); +my $to = 0; +my $zviol = 0; +my $worstz = -1; +my $worst = "\n"; +$lines = 0; +while(<F>) { + $lines++; + if ($lines % 100000 == 0) { print STDERR " [$lines]\n"; } + elsif ($lines % 2500 == 0) { print STDERR "."; } + my ($sf, $se, @d) = split / \|\|\| /; + if (scalar @d != 0 or !defined $se) { next; } + my @fs = split /\s+/, $sf; + my @es = split /\s+/, $se; + my $flen = scalar @fs; + my $elen = scalar @es; + next if ($flen == 0); + next if ($elen == 0); + next if ($flen > $MAX_LENGTH); + next if ($elen > $MAX_LENGTH); + if ($elen >= $AUTOMATIC_INCLUDE_IF_SHORTER_THAN || + $flen >= $AUTOMATIC_INCLUDE_IF_SHORTER_THAN) { + my $lograt = log($flen) - log($elen); + if (abs($lograt) > $rat_max) { + $absbadrat++; + next; + } + my $zscore = abs($lograt - $lmean) / $lsd; + if ($elen > $AUTOMATIC_INCLUDE_IF_SHORTER_THAN && + $flen > $AUTOMATIC_INCLUDE_IF_SHORTER_THAN && $zscore > $worstz) { $worstz = $zscore; $worst = $_; } + if ($zscore > $MAX_ZSCORE) { + $zviol++; + next; + } + print; + } else { + print; + } + $to++; +} +my $discard_rate2 = int(10000 * $zviol / ($lines - $pass1_discard)) / 100; +print STDERR "\n Lines printed: $to\n Ratio violations: $zviol\t(discard rate = $discard_rate2%)\n"; +print STDERR " Worst z-score: $worstz\n sentence: $worst"; +exit 0; + diff --git a/corpus/lowercase.pl b/corpus/lowercase.pl new file mode 100755 index 000000000..9fd91dac2 --- /dev/null +++ b/corpus/lowercase.pl @@ -0,0 +1,9 @@ +#!/usr/bin/perl -w +use strict; +binmode(STDIN,":utf8"); +binmode(STDOUT,":utf8"); +while(<STDIN>) { + $_ = lc $_; + print; +} + diff --git a/corpus/moses-scfg-to-cdec.pl b/corpus/moses-scfg-to-cdec.pl new file mode 100755 index 000000000..9b8e36179 --- /dev/null +++ b/corpus/moses-scfg-to-cdec.pl @@ -0,0 +1,69 @@ +#!/usr/bin/perl -w +use strict; + +while(<>) { + my ($src, $trg, $feats, $al) = split / \|\|\| /; + # [X][NP] von [X][NP] [X] ||| [X][NP] 's [X][NP] [S] ||| 0.00110169 0.0073223 2.84566e-06 0.0027702 0.0121867 2.718 0.606531 ||| 0-0 1-1 2-2 ||| 635 245838 2 + + my @srcs = split /\s+/, $src; + my @trgs = split /\s+/, $trg; + my $lhs = pop @trgs; + $lhs =~ s/&apos;/'/g; + $lhs =~ s/'/'/g; + $lhs =~ s/,/COMMA/g; + my $ntc = 0; + my $sc = 0; + my @of = (); + my $x = pop @srcs; + my %d = (); # src index to nonterminal count + die "Expected [X]" unless $x eq '[X]'; + my %amap = (); + my @als = split / /, $al; + for my $st (@als) { + my ($s, $t) = split /-/, $st; + $amap{$t} = $s; + } + for my $f (@srcs) { + if ($f =~ /^\[X\]\[([^]]+)\]$/) { + $ntc++; + my $nt = $1; + $nt =~ s/&apos;/'/g; + $nt =~ s/'/'/g; + $nt =~ s/,/COMMA/g; + push @of, "[$nt]"; + $d{$sc} = $ntc; + } elsif ($f =~ /^\[[^]]+\]$/) { + die "Unexpected $f"; + } else { + push @of, $f; + } + $sc++; + } + my @oe = (); + my $ind = 0; + for my $e (@trgs) { + if ($e =~ /^\[X\]\[([^]]+)\]$/) { + my $imap = $d{$amap{$ind}}; + push @oe, "[$imap]"; + } else { + push @oe, $e; + } + $ind++; + } + my ($fe, $ef, $j, $lfe, $lef, $dummy, $of) = split / /, $feats; + next if $lef eq '0'; + next if $lfe eq '0'; + next if $ef eq '0'; + next if $fe eq '0'; + next if $j eq '0'; + next if $of eq '0'; + $ef = sprintf('%.6g', log($ef)); + $fe = sprintf('%.6g',log($fe)); + $j = sprintf('%.6g',log($j)); + $lef = sprintf('%.6g',log($lef)); + $lfe = sprintf('%.6g',log($lfe)); + $of = sprintf('%.6g',log($of)); + print "$lhs ||| @of ||| @oe ||| RuleCount=1 FgivenE=$fe EgivenF=$ef Joint=$j LexEgivenF=$lef LexFgivenE=$lfe Other=$of\n"; +} + +#
[X][ADVP] angestiegen [X] ||| rose [X][ADVP] [VP] ||| 0.0538131 0.0097508 0.00744224 0.0249653 0.000698602 2.718 0.606531 ||| 0-1 1-0 ||| 13 94 2 diff --git a/corpus/paste-files.pl b/corpus/paste-files.pl new file mode 100755 index 000000000..ef2cd9370 --- /dev/null +++ b/corpus/paste-files.pl @@ -0,0 +1,61 @@ +#!/usr/bin/perl -w +use strict; + +die "Usage: $0 file1.txt file2.txt [file3.txt ...]\n\n Performs a per-line concatenation of all files using the ||| separator.\n\n" unless scalar @ARGV > 1; + +my @fhs = (); +for my $file (@ARGV) { + my $fh; + if ($file =~ /\.gz$/) { + open $fh, "gunzip -c $file|" or die "Can't fork gunzip -c $file: $!"; + } else { + open $fh, "<$file" or die "Can't read $file: $!"; + } + binmode($fh,":utf8"); + push @fhs, $fh; +} +binmode(STDOUT,":utf8"); +binmode(STDERR,":utf8"); + +my $bad = 0; +my $lc = 0; +my $done = 0; +my $fl = 0; +while(1) { + my @line; + $lc++; + if ($lc % 100000 == 0) { print STDERR " [$lc]\n"; $fl = 0; } + elsif ($lc % 2500 == 0) { print STDERR "."; $fl = 1; } + my $anum = 0; + for my $fh (@fhs) { + my $r = <$fh>; + if (!defined $r) { + die "Mismatched number of lines.\n" if scalar @line > 0; + $done = 1; + last; + } + $r =~ s/\r//g; + chomp $r; + if ($r =~ /\|\|\|/) { + $r = ''; + $bad++; + } + warn "$ARGV[$anum]:$lc contains a ||| symbol - please remove.\n" if $r =~ /\|\|\|/; + $r =~ s/\|\|\|/ /g; + $r =~ s/\s+/ /g; + $r =~ s/^ +//; + $r =~ s/ +$//; + $anum++; + push @line, $r; + } + last if $done; + print STDOUT join(' ||| ', @line) . "\n"; +} +print STDERR "\n" if $fl; +for (my $i = 1; $i < scalar @fhs; $i++) { + my $fh = $fhs[$i]; + my $r = <$fh>; + die "Mismatched number of lines.\n" if defined $r; +} +print STDERR "Number of lines containing ||| was: $bad\n" if $bad > 0; + diff --git a/corpus/support/README b/corpus/support/README new file mode 100644 index 000000000..fdbd523e7 --- /dev/null +++ b/corpus/support/README @@ -0,0 +1,2 @@ +Run ./tokenize.sh to tokenize text +Edit token_patterns and token_list to add rules for things not to segment diff --git a/corpus/support/fix-contract.pl b/corpus/support/fix-contract.pl new file mode 100755 index 000000000..49e889812 --- /dev/null +++ b/corpus/support/fix-contract.pl @@ -0,0 +1,12 @@ +#!/usr/bin/perl -w +$|++; + +use strict; +while(<>) { + #s/ (pre|anti|re|pro|inter|intra|multi|e|x|neo) - / $1- /ig; + #s/ - (year) - (old)/ -$1-$2/ig; + s/ ' (s|m|ll|re|d|ve) / '$1 /ig; + s/n ' t / n't /ig; + print; +} + diff --git a/corpus/support/fix-eos.pl b/corpus/support/fix-eos.pl new file mode 100755 index 000000000..fe03727b2 --- /dev/null +++ b/corpus/support/fix-eos.pl @@ -0,0 +1,12 @@ +#!/usr/bin/perl -w +$|++; + +use strict; +use utf8; + +binmode(STDIN, ":utf8"); +binmode(STDOUT, ":utf8"); +while(<STDIN>) { + s/(\p{Devanagari}{2}[A-Za-z0-9!
,.\@\p{Devanagari}]+?)\s+(\.)(\s*$|\s+\|\|\|)/$1 \x{0964}$3/s; + print; +} diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl new file mode 100755 index 000000000..3eee06669 --- /dev/null +++ b/corpus/support/quote-norm.pl @@ -0,0 +1,193 @@ +#!/usr/bin/perl -w +$|++; +use strict; +use utf8; +binmode(STDIN,"utf8"); +binmode(STDOUT,"utf8"); +while() { + chomp; + $_ = " $_ "; + + # Delete control characters: + s/[\x{00}-\x{1f}]//g; + + # PTB --> normal + s/-LRB-/(/g; + s/-RRB-/)/g; + s/-LSB-/[/g; + s/-RSB-/]/g; + s/-LCB-/{/g; + s/-RCB-/}/g; + s/ gon na / gonna /g; + + # Regularize named HTML/XML escapes: + s/&\s*lt\s*;//gi; # HTML closing angle bracket + s/&\s*squot\s*;/'/gi; # HTML single quote + s/&\s*quot\s*;/"/gi; # HTML double quote + s/&\s*nbsp\s*;/ /gi; # HTML non-breaking space + s/'/\'/g; # HTML apostrophe + s/&\s*amp\s*;/&/gi; # HTML ampersand (last) + + # Regularize known HTML numeric codes: + s/&\s*#\s*160\s*;/ /gi; # no-break space + s/&\s*#45\s*;\s*&\s*#45\s*;/--/g; # hyphen-minus hyphen-minus + s/&\s*#45\s*;/--/g; # hyphen-minus + + # Convert arbitrary hex or decimal HTML entities to actual characters: + s/&\#x([0-9A-Fa-f]+);/pack("U", hex($1))/ge; + s/&\#([0-9]+);/pack("U", $1)/ge; + + # Regularlize spaces: + s/\x{ad}//g; # soft hyphen + s/\x{200C}//g; # zero-width non-joiner + s/\x{a0}/ /g; # non-breaking space + s/\x{2009}/ /g; # thin space + s/\x{2028}/ /g; # "line separator" + s/\x{2029}/ /g; # "paragraph separator" + s/\x{202a}/ /g; # "left-to-right embedding" + s/\x{202b}/ /g; # "right-to-left embedding" + s/\x{202c}/ /g; # "pop directional formatting" + s/\x{202d}/ /g; # "left-to-right override" + s/\x{202e}/ /g; # "right-to-left override" + s/\x{85}/ /g; # "next line" + s/\x{fffd}/ /g; # "replacement character" + s/\x{feff}/ /g; # byte-order mark + s/\x{fdd3}/ /g; # "unicode non-character" + + # Convert other Windows 1252 characters to UTF-8 + s/\x{80}/\x{20ac}/g; # euro sign + s/\x{95}/\x{2022}/g; # bullet + s/\x{99}/\x{2122}/g; # trademark sign + + # Currency and measure conversions: + s/ (\d\d): (\d\d)/ $1:$2/g; + s/[\x{20a0}]\x{20ac}]/ EUR /g; + s/[\x{00A3}]/ GBP /g; + s/(\W)([A-Z]+\$?)(\d*\.\d+|\d+)/$1$2 $3/g; + s/(\W)(euro?)(\d*\.\d+|\d+)/$1EUR $3/gi; + + # Ridiculous double conversions, UTF8 -> Windows 1252 -> UTF8: + s/�c/--/g; # long dash + s/\x{e2}\x{20ac}oe/\"/g; # opening double quote + s/\x{e2}\x{20ac}\x{9c}/\"/g; # opening double quote + s/\x{e2}\x{20ac}\x{9d}/\"/g; # closing double quote + s/\x{e2}\x{20ac}\x{2122}/\'/g; # apostrophe + s/\x{e2}\x{20ac}\x{201c}/ -- /g; # en dash? + s/\x{e2}\x{20ac}\x{201d}/ -- /g; # em dash? + s/â(\x{80}\x{99}|\x{80}\x{98})/'/g; # single quote? + s/â(\x{80}\x{9c}|\x{80}\x{9d})/"/g; # double quote? 
+ s/\x{c3}\x{9f}/\x{df}/g; # esset + s/\x{c3}\x{0178}/\x{df}/g; # esset + s/\x{c3}\x{a4}/\x{e4}/g; # a umlaut + s/\x{c3}\x{b6}/\x{f6}/g; # o umlaut + s/\x{c3}\x{bc}/\x{fc}/g; # u umlaut + s/\x{c3}\x{84}/\x{c4}/g; # A umlaut: create no C4s after this + s/\x{c3}\x{201e}/\x{c4}/g; # A umlaut: create no C4s after this + s/\x{c3}\x{96}/\x{d6}/g; # O umlaut + s/\x{c3}\x{2013}/\x{d6}/g; # O umlaut + s/\x{c3}\x{bc}/\x{dc}/g; # U umlaut + s/\x{80}/\x{20ac}/g; # euro sign + s/\x{95}/\x{2022}/g; # bullet + s/\x{99}/\x{2122}/g; # trademark sign + + # Regularize quotes: + s/ˇ/'/g; # caron + s/´/'/g; # acute accent + s/`/'/g; # grave accent + s/ˉ/'/g; # modified letter macron + s/ ,,/ "/g; # ghetto low-99 quote + s/``/"/g; # latex-style left quote + s/''/"/g; # latex-style right quote + s/\x{300c}/"/g; # left corner bracket + s/\x{300d}/"/g; # right corner bracket + s/\x{3003}/"/g; # ditto mark + s/\x{00a8}/"/g; # diaeresis + s/\x{92}/\'/g; # curly apostrophe + s/\x{2019}/\'/g; # curly apostrophe + s/\x{f03d}/\'/g; # curly apostrophe + s/\x{b4}/\'/g; # curly apostrophe + s/\x{2018}/\'/g; # curly single open quote + s/\x{201a}/\'/g; # low-9 quote + s/\x{93}/\"/g; # curly left quote + s/\x{201c}/\"/g; # curly left quote + s/\x{94}/\"/g; # curly right quote + s/\x{201d}/\"/g; # curly right quote + s/\x{2033}/\"/g; # curly right quote + s/\x{201e}/\"/g; # low-99 quote + s/\x{84}/\"/g; # low-99 quote (bad enc) + s/\x{201f}/\"/g; # high-rev-99 quote + s/\x{ab}/\"/g; # opening guillemet + s/\x{bb}/\"/g; # closing guillemet + s/\x{0301}/'/g; # combining acute accent + s/\x{203a}/\"/g; # angle quotation mark + s/\x{2039}/\"/g; # angle quotation mark + + # Space inverted punctuation: + s/¡/ ¡ /g; + s/¿/ ¿ /g; + + # Russian abbreviations: + s/ п. п. / п.п. /g; + s/ ст. л. / ст.л. /g; + s/ т. е. / т.е. /g; + s/ т. к. / т.к. /g; + s/ т. ч. / т.ч. /g; + s/ т. д. / т.д. /g; + s/ т. п. / т.п. /g; + s/ и. о. / и.о. /g; + s/ с. г. / с.г. /g; + s/ г. р. / г.р. /g; + s/ т. н. / т.н. /g; + s/ т. ч. / т.ч. /g; + s/ н. э. / н.э. /g; + + # Convert foreign numerals into Arabic numerals + tr/०-९/0-9/; # devangari + tr/౦-౯/0-9/; # telugu + tr/೦-೯/0-9/; # kannada + tr/೦-௯/0-9/; # tamil + tr/൦-൯/0-9/; # malayalam + + # Random punctuation: + tr/!-~/!-~/; + s/、/,/g; + # s/。/./g; + s/\x{85}/.../g; + s/…/.../g; + s/―/--/g; + s/–/--/g; + s/─/--/g; + s/—/--/g; + s/\x{97}/--/g; + s/•/ * /g; + s/\*/ * /g; + s/،/,/g; + s/؟/?/g; + s/ـ/ /g; + s/à ̄/i/g; + s/’/'/g; + s/â€"/"/g; + s/؛/;/g; + + # Regularize ligatures: + s/\x{9c}/oe/g; # "oe" ligature + s/\x{0153}/oe/g; # "oe" ligature + s/\x{8c}/Oe/g; # "OE" ligature + s/\x{0152}/Oe/g; # "OE" ligature + s/\x{fb00}/ff/g; # "ff" ligature + s/\x{fb01}/fi/g; # "fi" ligature + s/\x{fb02}/fl/g; # "fl" ligature + s/\x{fb03}/ffi/g; # "ffi" ligature + s/\x{fb04}/ffi/g; # "ffl" ligature + + s/β/ß/g; # WMT 2010 error + + # Strip extra spaces: + s/\s+/ /g; + s/^\s+//; + s/\s+$//; + + print "$_\n"; +} + diff --git a/corpus/support/token_list b/corpus/support/token_list new file mode 100644 index 000000000..d38638cfd --- /dev/null +++ b/corpus/support/token_list @@ -0,0 +1,509 @@ +##################### hyphenated words added by Fei since 3/7/05 +##X-ray + +# hindi abbreviation patterns +जन. +फर. +अग. +सित. +अक्टू. +अक्तू. +नव. +दिस. +डी.एल. +डी.टी.ओ. +डी.ए. +ए.एस.आई. +डी.टी.ओ. +एम.एस.आर.टी.सी. +बी.बी.एम.बी. +डी.एस.पी. +सी.आर.पी. +एस.डी.एम. +सी.डी.पी.ओ. +बी.डी.ओ. +एस.डी.ओ. +एम.पी.पी. +पी.एच.ई. +एस.एच.ओ. +ए.सी.पी. +यू.पी. +पी.एम. +आर.बी.डी. +वी.पी. +सी.ए.डी.पी. +ए. +बी. +सी. +डी. +ई. +एफ. +जी. +एच. +आई. +जे. 
+के. +एल. +एम. +एन. +ओ. +पी. +क़यू. +आर. +एस. +टी. +यू. +वी. +डबल्यू. +एक्स. +वाई. +ज़ेड. +ज़ी. + +##################### words made of punct only +:- +:-) +:-( ++= +-= +.= +*= +>= +<= +== +&& +|| +=> +-> +<- +:) +:( +;) + +#################### abbr added by Fei +oz. +fl. +tel. +1. +2. +3. +4. +5. +6. +7. +8. +9. +10. + +##################### abbreviation: words that contain period. +EE.UU. +ee.uu. +U.A.E +Ala. +Ph.D. +min. +max. +z.B. +d.h. +ggf. +ca. +bzw. +bzgl. +Eng. +i.e. +a.m. +am. +A.M. +Apr. +Ariz. +Ark. +Aug. +B.A.T. +B.A.T +Calif. +Co. +Conn. +Corp. +Cos. +D.C. +Dec. +Dept. +Dr. +Drs. +Feb. +Fla. +Fri. +Ga. +Gen. +gen. +GEN. +Gov. +Govt. +Ill. +Inc. +Jan. +Jr. +Jul. +Jun. +Kan. +L.A. +Lieut. +Lt. +Ltd. +Ma. +Mar. +Mass. +Md. +Mfg. +Mgr. +Mio. +Mrd. +Bio. +Minn. +Mo. +Mon. +Mr. +Mrs. +Ms. +Mt. +N.D. +Neb. +Nev. +No. +Nos. +Nov. +Oct. +Okla. +Op. +Ore. +Pa. +p.m +p.m. +I.B.C. +N.T.V +Pres. +Prof. +Prop. +Rd. +Rev. +R.J. +C.L +Rs. +Rte. +Sat. +W.T +Sen. +Sep. +Sept. +Sgt. +Sr. +SR. +St. +Ste. +Sun. +Tenn. +Tex. +Thu. +Tue. +Univ. +Va. +Vt. +Wed. +approx. +dept. +e.g. +E.G. +eg. +est. +etc. +ex. +ext. +ft. +hon. +hr. +hrs. +lab. +lb. +lbs. +mass. +misc. +no. +nos. +nt. +para. +paras. +pct. +prod. +rec. +ref. +rel. +rep. +sq. +st. +stg. +vol. +vs. +U.S. +J.S. +U.N. +u.n. +A. +B. +C. +D. +E. +F. +G. +H. +I. +J. +K. +L. +M. +N. +O. +P. +Q. +R. +S. +T. +U. +V. +W. +X. +Y. +Z. +А. +Б. +В. +Г. +Д. +Е. +Ё. +Ж. +З. +И. +Й. +К. +Л. +М. +Н. +О. +П. +Р. +С. +Т. +У. +Ф. +Х. +Ц. +Ч. +Ш. +Щ. +Ъ. +Ы. +Ь. +Э. +Ю. +Я. +л. +г. +обл. +гг. +в. +вв. +мин. +ч. +тыс. +млн. +млрд. +трлн. +кв. +куб. +руб. +коп. +долл. +Прим. +прим. +чел. +грн. +мин. +им. +проф. +акад. +ред. +авт. +корр. +соб. +спец. +см. +тж. +др. +пр. +букв. +# Two-letter abbreviations - can be written with space +п.п. +ст.л. +т.е. +т.к. +т.ч. +т.д. +т.п. +и.о. +с.г. +г.р. +т.н. +т.ч. +н.э. +# Swahili +A.D. +Afr. +A.G. +agh. +A.H. +A.M. +a.s. +B.A. +B.C. +Bi. +B.J. +B.K. +B.O.M. +Brig. +Bro. +bt. +bw. +Bw. +Cap. +C.C. +cCM. +C.I.A. +cit. +C.M.S. +Co. +Corp. +C.S.Sp. +C.W. +D.C. +Dk. +Dkt. +Dk.B. +Dr. +E.C. +e.g. +E.M. +E.n. +etc. +Feb. +F.F.U. +F.M. +Fr. +F.W. +I.C.O. +i.e. +I.L.C. +Inc. +Jan. +J.F. +Jr. +J.S. +J.V.W.A. +K.A.R. +K.A.U. +K.C.M.C. +K.k. +K.K. +k.m. +km. +K.m. +K.N.C.U. +K.O. +K.S. +Ksh. +kt. +kumb. +k.v. +kv. +L.G. +ltd. +Ltd. +M.A. +M.D. +mf. +Mh. +Mhe. +mil. +m.m. +M.m. +Mm. +M.M. +Mr. +Mrs. +M.S. +Mt. +Mw. +M.W. +Mwl. +na. +Na. +N.F. +N.J. +n.k. +nk. +n.k.w. +N.N. +Nov. +O.C.D. +op. +P.C. +Phd. +Ph.D. +P.J. +P.o. +P.O. +P.O.P. +P.P.F. +Prof. +P.s. +P.S. +Q.C. +Rd. +s.a.w. +S.A.W. +S.D. +Sept. +sh. +Sh. +SH. +shs. +Shs. +S.J. +S.L. +S.L.P. +S.s. +S.S. +St. +s.w. +s.w.T. +taz. +Taz. +T.C. +T.E.C. +T.L.P. +T.O.H.S. +Tsh. +T.V. +tz. +uk. +Uk. +U.M.C.A. +U.N. +U.S. +Ush. +U.W.T. +Viii. +Vol. +V.T.C. +W.H. +yamb. +Y.M.C.A. 
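Every non-blank line of token_list is an entry the tokenizer must leave intact: tokenizer.pl lowercases each input token and looks it up in this list before attempting any split, so abbreviations such as "U.S." or "т.е." keep their internal and trailing periods. A small Python illustration of the intended effect; the set below is a tiny hand-picked subset, not the real list:

protected = {"u.s.", "e.g.", "т.е.", "ph.d."}   # hand-picked subset of token_list entries

def keep_whole(token):
    # mirrors the dictionary check in proc_token(): lowercase, then consult the list
    return token.lower() in protected

print(keep_whole("U.S."))   # True  -> "U.S." is returned unchanged
print(keep_whole("ran."))   # False -> the trailing period is split off by deep_proc_token()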
diff --git a/corpus/support/token_patterns b/corpus/support/token_patterns new file mode 100644 index 000000000..de64fb2a1 --- /dev/null +++ b/corpus/support/token_patterns @@ -0,0 +1,5 @@ +/^(al|el|ul|e)\-[a-z]+$/ +/^((а|А)(ль|ш)|уль)-\p{Cyrillic}+$/ +/^\p{Cyrillic}\.\p{Cyrillic}\.$/ +/^(\d|\d\d|\d\d\d)\.$/ + diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl new file mode 100755 index 000000000..f57bc87a9 --- /dev/null +++ b/corpus/support/tokenizer.pl @@ -0,0 +1,709 @@ +#!/usr/bin/env perl +$|++; + +my $script_dir; +BEGIN {$^W = 1; use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; } + +use strict; +use utf8; + +binmode STDIN, ":utf8"; +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; + +my $debug = 0; + + +############ options: +### for all options: +### 0 means no split on that symbol +### 1 means split on that symbol in all cases. +### 2 means do not split in condition 1. +### n means do not split in any of the conditions in the set {1, 2, ..., n-1}. + + +### prefix +## for "#": #90 +my $Split_On_SharpSign = 2; # 2: do not split on Num, e.g., "#90" + + +############## "infix" +my $Split_On_Tilde = 2; # 2: do not split on Num, e.g., "12~13". + +my $Split_On_Circ = 2; # 2: do not split on Num, e.g, "2^3" + +## for "&" +my $Split_On_AndSign = 2; # 2: do not split on short Name, e.g., "AT&T". + +## for hyphen: 1990-1992 +my $Split_On_Dash = 2; ## 2: do not split on number, e.g., "22-23". +my $Split_On_Underscore = 0; ## 0: do not split by underline + +## for ":": 5:4 +my $Split_On_Semicolon = 2; ## 2: don't split for num, e.g., "5:4" + +########### suffix +## for percent sign: 5% +my $Split_On_PercentSign = 1; ## 2: don't split num, e.g., 5% + +############# others +## for slash: 1/4 +my $Split_On_Slash = 2; ## 2: don't split on number, e.g., 1/4. +my $Split_On_BackSlash = 0; ## 0: do not split on "\", e.g., \t + +### for "$": US$120 +my $Split_On_DollarSign = 2; ### 2: US$120 => "US$ 120" + ### 1: US$120 => "US $ 120" +## for 's etc. +my $Split_NAposT = 1; ## n't +my $Split_AposS = 1; ## 's +my $Split_AposM = 1; ## 'm +my $Split_AposRE = 1; ## 're +my $Split_AposVE = 1; ## 've +my $Split_AposLL = 1; ## 'll +my $Split_AposD = 1; ## 'd + + +### some patterns +my $common_right_punc = '\x{0964}|\.|\,|\;|\!|:|\?|\"|\)|\]|\}|\>|\-'; + +#### step 1: read files + +my $workdir = $script_dir; +my $dict_file = "$workdir/token_list"; +my $word_patt_file = "$workdir/token_patterns"; + +open(my $dict_fp, "$dict_file") or die; +binmode($dict_fp, ":utf8"); + +# read in the list of words that should not be segmented, +## e.g.,"I.B.M.", co-operation. 
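The Split_On_* switches defined above follow a small numeric convention: 0 means never split on the symbol, 1 means always split, and 2 means split except in the safe context named in the comment (digits around "~", ":" or "-", a short name around "&", and so on). A rough Python sketch of what value 2 means for the hyphen case; this is a simplified, hypothetical stand-in for the corresponding branch of deep_proc_token() further down:

import re

def split_dashes(token, split_on_dash=2):
    # 0: never split; 1: always split; 2: keep digit-digit hyphens ("1990-1992") intact
    if split_on_dash == 0 or "-" not in token:
        return token
    token = re.sub(r"(-{2,})", r" \1 ", token)         # runs of dashes become their own token
    if split_on_dash >= 2:
        token = re.sub(r"(\D)(-+)", r"\1 \2 ", token)  # split only where a non-digit touches the dash
        token = re.sub(r"(-+)(\D)", r" \1 \2", token)
    else:
        token = re.sub(r"(-+)", r" \1 ", token)
    return re.sub(r"\s+", " ", token).strip()

print(split_dashes("1990-1992"))   # 1990-1992
print(split_dashes("well-known"))  # well - known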
+my %dict_hash = (); +my $dict_entry = 0; +while(<$dict_fp>){ + chomp; + next if /^\s*$/; + s/^\s+//; + s/\s+$//; + tr/A-Z/a-z/; + $dict_hash{$_} = 1; + $dict_entry ++; +} + +open(my $patt_fp, "$word_patt_file") or die; +binmode($patt_fp, ":utf8"); +my @word_patts = (); +my $word_patt_num = 0; +while(<$patt_fp>){ + chomp; + next if /^\s*$/; + s/^\s+//; + s/\s+$//; + s/^\/(.+)\/$/$1/; # remove / / around the pattern + push(@word_patts, $_); + $word_patt_num ++; +} + + +###### step 2: process the input file +my $orig_token_total = 0; +my $deep_proc_token_total = 0; +my $new_token_total = 0; + +while(){ + chomp(); + s/\x{0970}/./g; # dev abbreviation character + if(/^(\[b\s+|\]b|\]f|\[f\s+)/ || (/^\[[bf]$/) || (/^\s*$/) || /^//; + $new_line =~ s/\s*<\s+(p|hl)\s+>/<$1>/; + $new_line =~ s/\s*<\s+\/\s+(p|hl|DOC)\s+>/<\/$1>/; + $new_line =~ s/<\s+\/\s+seg\s+>/<\/seg>/; + if ($new_line =~ /^\s*<\s+DOC\s+/) { + $new_line =~ s/\s+//g; + $new_line =~ s/DOC/DOC /; + $new_line =~ s/sys/ sys/; + } + if ($new_line =~ /^\s*<\s+(refset|srcset)\s+/) { + $new_line =~ s/\s+//g; + $new_line =~ s/(set|src|tgt|trg)/ $1/g; + } + + chomp $new_line; + print STDOUT "$new_line\n"; +} + +######################################################################## + +### tokenize a line. +sub proc_line { + my @params = @_; + my $param_num = scalar @params; + + if(($param_num < 1) || ($param_num > 3)){ + die "wrong number of params for proc_line: $param_num\n"; + } + + my $orig_line = $params[0]; + + $orig_line =~ s/^\s+//; + $orig_line =~ s/\s+$//; + + my @parts = split(/\s+/, $orig_line); + + if($param_num >= 2){ + my $orig_num_ptr = $params[1]; + $$orig_num_ptr = scalar @parts; + } + + my $new_line = ""; + + my $deep_proc_token = 0; + foreach my $part (@parts){ + my $flag = -1; + $new_line .= proc_token($part, \$flag) . " "; + $deep_proc_token += $flag; + } + + if($param_num == 3){ + my $deep_num_ptr = $params[2]; + $$deep_num_ptr = $deep_proc_token; + } + + return $new_line; +} + + + +## Tokenize a str that does not contain " ", return the new string +## The function handles the cases that the token needs not be segmented. 
+## for other cases, it calls deep_proc_token() +sub proc_token { + my @params = @_; + my $param_num = scalar @params; + if($param_num > 2){ + die "proc_token: wrong number of params: $param_num\n"; + } + + my $token = $params[0]; + + if(!defined($token)){ + return ""; + } + + my $deep_proc_flag; + + if($param_num == 2){ + $deep_proc_flag = $params[1]; + $$deep_proc_flag = 0; + } + + if($debug){ + print STDERR "pro_token:+$token+\n"; + } + + ### step 0: it has only one char + if(($token eq "") || ($token=~ /^.$/)){ + ## print STDERR "see +$token+\n"; + return $token; + } + + ## step 1: check the most common case + if($token =~ /^[a-z0-9\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}\p{Devanagari}]+$/i){ + #if($token =~ /^[a-z0-9\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}]+$/i){ + ### most common cases + return $token; + } + + ## step 2: check whether it is some NE entity + ### 1.2.4.6 + if($token =~ /^\d+(.\d+)+$/){ + return $token; + } + + if($token =~ /^\d+(.\d+)+(亿|百万|万|千)?$/){ + return $token; + } + + ## 1,234,345.34 + if($token =~ /^\d+(\.\d{3})*,\d+$/){ + ## number + return $token; + } + if($token =~ /^\d+(,\d{3})*\.\d+$/){ + ## number + return $token; + } + if($token =~ /^(@|#)[A-Za-z0-9_\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}\p{Devanagari}]+.*$/){ + ## twitter hashtag or address + return proc_rightpunc($token); + } + + if($token =~ /^[a-z0-9\_\-]+\@[a-z\d\_\-]+(\.[a-z\d\_\-]+)*(.*)$/i){ + ### email address: xxx@yy.zz + return proc_rightpunc($token); + } + + if($token =~ /^(mailto|http|https|ftp|gopher|telnet|file)\:\/{0,2}([^\.]+)(\.(.+))*$/i){ + ### URL: http://xx.yy.zz + return proc_rightpunc($token); + } + + if($token =~ /^(www)(\.(.+))+$/i){ + ### www.yy.dd/land/ + return proc_rightpunc($token); + } + + if($token =~ /^(\w+\.)+(com|co|edu|org|gov|ly|cz|ru|eu)(\.[a-z]{2,3})?\:{0,2}(\/\S*)?$/i){ + ### URL: upenn.edu/~xx + return proc_rightpunc($token); + } + + if($token =~ /^\(\d{3}\)\d{3}(\-\d{4})($common_right_punc)*$/){ + ## only handle American phone numbers: e.g., (914)244-4567 + return proc_rightpunc($token); + } + + #my $t1 = '[\x{0600}-\x{06ff}a-z\d\_\.\-]'; + my $t1 = '[a-z\d\_\-\.\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}\p{Devanagari}]'; + if($token =~ /^\/(($t1)+\/)+($t1)+\/?$/i){ + ### /nls/p/.... + return $token; + } + + if($token =~ /^\\(($t1)+\\)+($t1)+\\?$/i){ + ### \nls\p\.... + return $token; + } + + ## step 3: check the dictionary + my $token_lc = $token; + $token_lc =~ tr/A-Z/a-z/; + + if(defined($dict_hash{$token_lc})){ + return $token; + } + + ## step 4: check word_patterns + my $i=1; + foreach my $patt (@word_patts){ + if($token_lc =~ /$patt/){ + if($debug){ + print STDERR "+$token+ match pattern $i: +$patt+\n"; + } + return $token; + }else{ + $i++; + } + } + + ## step 5: call deep tokenization + if($param_num == 2){ + $$deep_proc_flag = 1; + } + return deep_proc_token($token); +} + + +### remove punct on the right side +### e.g., xxx@yy.zz, => xxx@yy.zz , +sub proc_rightpunc { + my ($token) = @_; + + $token =~ s/(($common_right_punc)+)$/ $1 /; + if($token =~ /\s/){ + return proc_line($token); + }else{ + return $token; + } +} + + + +####################################### +### return the new token: +### types of punct: +## T1 (2): the punct is always a token by itself no matter where it +### appears: " ; +## T2 (15): the punct that can be a part of words made of puncts only. +## ` ! @ + = [ ] ( ) { } | < > ? +## T3 (15): the punct can be part of a word that contains [a-z\d] +## T3: ~ ^ & : , # * % - _ \ / . 
$ ' +## infix: ~ (12~13), ^ (2^3), & (AT&T), : , +## prefix: # (#9), * (*3), +## suffix: % (10%), +## infix+prefix: - (-5), _ (_foo), +## more than one position: \ / . $ +## Appos: 'm n't ... + +## 1. separate by puncts in T1 +## 2. separate by puncts in T2 +## 3. deal with punct T3 one by one according to options +## 4. if the token remains unchanged after step 1-3, return the token + +## $line contains at least 2 chars, and no space. +sub deep_proc_token { + my ($line) = @_; + if($debug){ + print STDERR "deep_proc_token: +$line+\n"; + } + + ##### step 0: if it mades up of all puncts, remove one punct at a time. + if($line !~ /[\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}\p{Devanagari}a-zA-Z\d]/){ + if($line =~ /^(\!+|\@+|\++|\=+|\*+|\<+|\>+|\|+|\?+|\x{0964}+|\.+|\-+|\_+|\&+)$/){ + ## ++ @@@@ !!! .... + return $line; + } + + if($line =~ /^(.)(.+)$/){ + my $t1 = $1; + my $t2 = $2; + return $t1 . " " . proc_token($t2); + }else{ + ### one char only + print STDERR "deep_proc_token: this should not happen: +$line+\n"; + return $line; + } + } + + ##### step 1: separate by punct T2 on the boundary + my $t2 = '\`|\!|\@|\+|\=|\[|\]|\<|\>|\||\(|\)|\{|\}|\?|\"|;'; + if($line =~ s/^(($t2)+)/$1 /){ + return proc_line($line); + } + + if($line =~ s/(($t2)+)$/ $1/){ + return proc_line($line); + } + + ## step 2: separate by punct T2 in any position + if($line =~ s/(($t2)+)/ $1 /g){ + return proc_line($line); + } + + ##### step 3: deal with special puncts in T3. + if($line =~ /^(\,+)(.+)$/){ + my $t1 = $1; + my $t2 = $2; + return proc_token($t1) . " " . proc_token($t2); + } + + if($line =~ /^(.*[^\,]+)(\,+)$/){ + ## 19.3,,, => 19.3 ,,, + my $t1 = $1; + my $t2 = $2; + return proc_token($t1) . " " . proc_token($t2); + } + + ## remove the ending periods that follow number etc. + if($line =~ /^(.*(\d|\~|\^|\&|\:|\,|\#|\*|\%|\-|\_|\/|\\|\$|\'))(\.+)$/){ + ## 12~13. => 12~13 . + my $t1 = $1; + my $t3 = $3; + return proc_token($t1) . " " . proc_token($t3); + } + + ### deal with "$" + if(($line =~ /\$/) && ($Split_On_DollarSign > 0)){ + my $suc = 0; + if($Split_On_DollarSign == 1){ + ## split on all occasation + $suc = ($line =~ s/(\$+)/ $1 /g); + }else{ + ## split only between $ and number + $suc = ($line =~ s/(\$+)(\d)/$1 $2/g); + } + + if($suc){ + return proc_line($line); + } + } + + ## deal with "#" + if(($line =~ /\#/) && ($Split_On_SharpSign > 0)){ + my $suc = 0; + if($Split_On_SharpSign >= 2){ + ### keep #50 as a token + $suc = ($line =~ s/(\#+)(\D)/ $1 $2/gi); + }else{ + $suc = ($line =~ s/(\#+)/ $1 /gi); + } + + if($suc){ + return proc_line($line); + } + } + + ## deal with ' + if($line =~ /\'/){ + my $suc = ($line =~ s/([^\'])([\']+)$/$1 $2/g); ## xxx'' => xxx '' + + ### deal with ': e.g., 's, 't, 'm, 'll, 're, 've, n't + + ## 'there => ' there '98 => the same + $suc += ($line =~ s/^(\'+)([a-z\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}\p{Devanagari}]+)/ $1 $2/gi); + + ## note that \' and \. could interact: e.g., U.S.'s; 're. + if($Split_NAposT && ($line =~ /^(.*[a-z]+)(n\'t)([\.]*)$/i)){ + ## doesn't => does n't + my $t1 = $1; + my $t2 = $2; + my $t3 = $3; + return proc_token($t1) . " " . $t2 . " " . proc_token($t3); + } + + ## 's, 't, 'm, 'll, 're, 've: they've => they 've + ## 1950's => 1950 's Co.'s => Co. 's + if($Split_AposS && ($line =~ /^(.+)(\'s)(\W*)$/i)){ + my $t1 = $1; + my $t2 = $2; + my $t3 = $3; + return proc_token($t1) . " " . $t2 . " " . 
proc_token($t3); + } + + if($Split_AposM && ($line =~ /^(.*[a-z]+)(\'m)(\.*)$/i)){ + my $t1 = $1; + my $t2 = $2; + my $t3 = $3; + return proc_token($t1) . " " . $t2 . " " . proc_token($t3); + } + + + if($Split_AposRE && ($line =~ /^(.*[a-z]+)(\'re)(\.*)$/i)){ + my $t1 = $1; + my $t2 = $2; + my $t3 = $3; + return proc_token($t1) . " " . $t2 . " " . proc_token($t3); + } + + if($Split_AposVE && ($line =~ /^(.*[a-z]+)(\'ve)(\.*)$/i)){ + my $t1 = $1; + my $t2 = $2; + my $t3 = $3; + return proc_token($t1) . " " . $t2 . " " . proc_token($t3); + } + + if($Split_AposLL && ($line =~ /^(.*[a-z]+)(\'ll)(\.*)$/i)){ + my $t1 = $1; + my $t2 = $2; + my $t3 = $3; + return proc_token($t1) . " " . $t2 . " " . proc_token($t3); + } + + if($Split_AposD && ($line =~ /^(.*[a-z]+)(\'d)(\.*)$/i)){ + my $t1 = $1; + my $t2 = $2; + my $t3 = $3; + return proc_token($t1) . " " . $t2 . " " . proc_token($t3); + } + + if($suc){ + return proc_line($line); + } + } + + + ## deal with "~" + if(($line =~ /\~/) && ($Split_On_Tilde > 0)){ + my $suc = 0; + if($Split_On_Tilde >= 2){ + ## keep 12~13 as one token + $suc += ($line =~ s/(\D)(\~+)/$1 $2 /g); + $suc += ($line =~ s/(\~+)(\D)/ $1 $2/g); + $suc += ($line =~ s/^(\~+)(\d)/$1 $2/g); + $suc += ($line =~ s/(\d)(\~+)$/$1 $2/g); + }else{ + $suc += ($line =~ s/(\~+)/ $1 /g); + } + if($suc){ + return proc_line($line); + } + } + + ## deal with "^" + if(($line =~ /\^/) && ($Split_On_Circ > 0)){ + my $suc = 0; + if($Split_On_Circ >= 2){ + ## keep 12~13 as one token + $suc += ($line =~ s/(\D)(\^+)/$1 $2 /g); + $suc += ($line =~ s/(\^+)(\D)/ $1 $2/g); + }else{ + $suc = ($line =~ s/(\^+)/ $1 /g); + } + if($suc){ + return proc_line($line); + } + } + + ## deal with ":" + if(($line =~ /\:/) && ($Split_On_Semicolon > 0)){ + ## 2: => 2 : + my $suc = ($line =~ s/^(\:+)/$1 /); + $suc += ($line =~ s/(\:+)$/ $1/); + if($Split_On_Semicolon >= 2){ + ## keep 5:4 as one token + $suc += ($line =~ s/(\D)(\:+)/$1 $2 /g); + $suc += ($line =~ s/(\:+)(\D)/ $1 $2/g); + }else{ + $suc += ($line =~ s/(\:+)/ $1 /g); + } + + if($suc){ + return proc_line($line); + } + } + + ### deal with hyphen: 1992-1993. 
21st-24th + if(($line =~ /\-/) && ($Split_On_Dash > 0)){ + my $suc = ($line =~ s/(\-{2,})/ $1 /g); + if($Split_On_Dash >= 2){ + ## keep 1992-1993 as one token + $suc += ($line =~ s/(\D)(\-+)/$1 $2 /g); + $suc += ($line =~ s/(\-+)(\D)/ $1 $2/g); + }else{ + ### always split on "-" + $suc += ($line =~ s/([\-]+)/ $1 /g); + } + + if($suc){ + return proc_line($line); + } + } + + ## deal with "_" + if(($line =~ /\_/) && ($Split_On_Underscore > 0)){ + ### always split on "-" + if($line =~ s/([\_]+)/ $1 /g){ + return proc_line($line); + } + } + + + + ## deal with "%" + if(($line =~ /\%/) && ($Split_On_PercentSign > 0)){ + my $suc = 0; + if($Split_On_PercentSign >= 2){ + $suc += ($line =~ s/(\D)(\%+)/$1 $2/g); + }else{ + $suc += ($line =~ s/(\%+)/ $1 /g); + } + + if($suc){ + return proc_line($line); + } + } + + + ### deal with "/": 4/5 + if(($line =~ /\//) && ($Split_On_Slash > 0)){ + my $suc = 0; + if($Split_On_Slash >= 2){ + $suc += ($line =~ s/(\D)(\/+)/$1 $2 /g); + $suc += ($line =~ s/(\/+)(\D)/ $1 $2/g); + }else{ + $suc += ($line =~ s/(\/+)/ $1 /g); + } + + if($suc){ + return proc_line($line); + } + } + + + ### deal with comma: 123,456 + if($line =~ /\,/){ + my $suc = 0; + $suc += ($line =~ s/([^\d]),/$1 , /g); ## xxx, 1923 => xxx , 1923 + $suc += ($line =~ s/\,\s*([^\d])/ , $1/g); ## 1923, xxx => 1923 , xxx + + $suc += ($line =~ s/,([\d]{1,2}[^\d])/ , $1/g); ## 1,23 => 1 , 23 + $suc += ($line =~ s/,([\d]{4,}[^\d])/ , $1/g); ## 1,2345 => 1 , 2345 + + $suc += ($line =~ s/,([\d]{1,2})$/ , $1/g); ## 1,23 => 1 , 23 + $suc += ($line =~ s/,([\d]{4,})$/ , $1/g); ## 1,2345 => 1 , 2345 + + if($suc){ + return proc_line($line); + } + } + + + ## deal with "&" + if(($line =~ /\&/) && ($Split_On_AndSign > 0)){ + my $suc = 0; + if($Split_On_AndSign >= 2){ + $suc += ($line =~ s/([a-z]{3,})(\&+)/$1 $2 /gi); + $suc += ($line =~ s/(\&+)([a-z]{3,})/ $1 $2/gi); + }else{ + $suc += ($line =~ s/(\&+)/ $1 /g); + } + + if($suc){ + return proc_line($line); + } + } + + ## deal with period + if($line =~ /\./){ + if($line =~ /^(([\+|\-])*(\d+\,)*\d*\.\d+\%*)$/){ + ### numbers: 3.5 + return $line; + } + + if ($line =~ /^(([a-z]|ए|बी|सी|डी|ई|एफ|जी|एच|आई|जे|के|एल|एम|एन|ओ|पी|क़यू|आर|एस|टी|यू|वी|डबल्यू|एक्स|वाई|ज़ेड|ज़ी)(\.([a-z]|ए|बी|सी|डी|ई|एफ|जी|एच|आई|जे|के|एल|एम|एन|ओ|पी|क़यू|आर|एस|टी|यू|वी|डबल्यू|एक्स|वाई|ज़ेड|ज़ी))+)(\.?)(\.*)$/i){ + ## I.B.M. + my $t1 = $1 . $5; + my $t3 = $6; + return $t1 . " ". proc_token($t3); + } + + ## Feb.. => Feb. . + if($line =~ /^(.*[^\.])(\.)(\.*)$/){ + my $p1 = $1; + my $p2 = $2; + my $p3 = $3; + + my $p1_lc = $p1; + $p1_lc =~ tr/A-Z/a-z/; + + if(defined($dict_hash{$p1_lc . $p2})){ + ## Dec.. => Dec. . + return $p1 . $p2 . " " . proc_token($p3); + }elsif(defined($dict_hash{$p1_lc})){ + return $p1 . " " . proc_token($p2 . $p3); + }else{ + ## this. => this . + return proc_token($p1) . " " . proc_token($p2 . 
$p3); + } + } + + if($line =~ s/(\.+)(.+)/$1 $2/g){ + return proc_line($line); + } + } + + + ## no pattern applies + return $line; +} + diff --git a/corpus/support/utf8-normalize-batch.pl b/corpus/support/utf8-normalize-batch.pl new file mode 100755 index 000000000..e574f861a --- /dev/null +++ b/corpus/support/utf8-normalize-batch.pl @@ -0,0 +1,28 @@ +#!/usr/bin/env perl + +use IPC::Open2; + +$|++; + +if (scalar(@ARGV) != 1) { + print STDERR "usage: $0 \"CMD\"\n"; + exit(2); +} + +$CMD = $ARGV[0]; + +while () { + s/\r\n*/\n/g; + $PID = open2(*SOUT, *SIN, $CMD); + print SIN "$_\n"; + close(SIN); + $_ = ; + close(SOUT); + waitpid($PID, 0); + chomp; + s/[\x00-\x1F]+/ /g; + s/ +/ /g; + s/^ //; + s/ $//; + print "$_\n"; +} diff --git a/corpus/support/utf8-normalize.sh b/corpus/support/utf8-normalize.sh new file mode 100755 index 000000000..af9895ba0 --- /dev/null +++ b/corpus/support/utf8-normalize.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +# this is the location on malbec, if you want to run on another machine +# ICU may be installed in /usr or /usr/local +ICU_DIR=/usr0/tools/icu +UCONV_BIN=$ICU_DIR/bin/uconv +UCONV_LIB=$ICU_DIR/lib + +if [ -e $UCONV_BIN ] && [ -d $UCONV_LIB ] +then + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$UCONV_LIB + if [ ! -x $UCONV_BIN ] + then + echo "$0: Cannot execute $UCONV_BIN! Please fix." 1>&2 + exit + fi + CMD="$UCONV_BIN -f utf8 -t utf8 -x Any-NFKC --callback skip" +else + if which uconv > /dev/null + then + CMD="uconv -f utf8 -t utf8 -x Any-NFKC --callback skip" + else + echo "$0: Cannot find ICU uconv (http://site.icu-project.org/) ... falling back to iconv. Quality may suffer." 1>&2 + CMD="iconv -f utf8 -t utf8 -c" + fi +fi + +if [[ $# == 1 && $1 == "--batchline" ]]; then + perl $(dirname $0)/utf8-normalize-batch.pl "$CMD" +else + perl -e '$|++; while(<>){s/\r\n*/\n/g; print;}' \ + |$CMD \ + |/usr/bin/perl -w -e ' + $|++; + while (<>) { + chomp; + s/[\x00-\x1F]+/ /g; + s/ +/ /g; + s/^ //; + s/ $//; + print "$_\n"; + }' +fi diff --git a/corpus/tokenize-anything.sh b/corpus/tokenize-anything.sh new file mode 100755 index 000000000..bca954d15 --- /dev/null +++ b/corpus/tokenize-anything.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +ROOTDIR=`dirname $0` +SUPPORT=$ROOTDIR/support + +if [[ $# == 1 && $1 == '-u' ]] ; then + NORMARGS="--batchline" + SEDFLAGS="-u" +else + NORMARGS="" + SEDFLAGS="" +fi + +$SUPPORT/utf8-normalize.sh $NORMARGS | + $SUPPORT/quote-norm.pl | + $SUPPORT/tokenizer.pl | + $SUPPORT/fix-eos.pl | + sed $SEDFLAGS -e 's/ al - / al-/g' | + $SUPPORT/fix-contract.pl | + sed $SEDFLAGS -e 's/^ //' | sed $SEDFLAGS -e 's/ $//' | + perl -e '$|++; while(<>){s/(\d+)(\.+)$/$1 ./; s/(\d+)(\.+) \|\|\|/$1 . 
|||/; print;}' + diff --git a/corpus/untok.pl b/corpus/untok.pl new file mode 100755 index 000000000..723e78cbe --- /dev/null +++ b/corpus/untok.pl @@ -0,0 +1,63 @@ +#!/usr/bin/perl -w + +use IO::Handle; +STDOUT->autoflush(1); + +while (<>) { + $output = ""; + @tokens = split; + $lspace = 0; + $qflag = 0; + for ($i=0; $i<=$#tokens; $i++) { + $token = $tokens[$i]; + $prev = $next = ""; + $rspace = 1; + if ($i > 0) { + $prev = $tokens[$i-1]; + } + if ($i < $#tokens) { + $next = $tokens[$i+1]; + } + + # possessives join to the left + if ($token =~ /^(n't|'(s|m|re|ll|ve|d))$/) { + $lspace = 0; + } elsif ($token eq "'" && $prev =~ /s$/) { + $lspace = 0; + + # hyphen only when a hyphen, not a dash + } elsif ($token eq "-" && $prev =~ /[A-Za-z0-9]$/ && $next =~ /^[A-Za-z0-9]/) { + $lspace = $rspace = 0; + + # quote marks alternate + } elsif ($token eq '"') { + if ($qflag) { + $lspace = 0; + } else { + $rspace = 0; + } + $qflag = !$qflag; + + # period joins on both sides when a decimal point + } elsif ($token eq "." && $prev =~ /\d$/ && $next =~ /\d$/) { + $lspace = $rspace = 0; + + # Left joiners + } elsif ($token =~ /^[.,:;?!%)\]]$/) { + $lspace = 0; + # Right joiners + } elsif ($token =~ /^[$(\[]$/) { + $rspace = 0; + # Joiners on both sides + } elsif ($token =~ /^[\/]$/) { + $lspace = $rspace = 0; + } + + if ($lspace) { + $output .= " "; + } + $output .= $token; + $lspace = $rspace; + } + print "$output\n"; +} diff --git a/corpus/utf8-normalize.sh b/corpus/utf8-normalize.sh new file mode 100755 index 000000000..dcf8bc59d --- /dev/null +++ b/corpus/utf8-normalize.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# This script uses ICU uconv (http://site.icu-project.org/), if it's available +# to normalize UTF8 text into a standard form. For information about this +# process, refer to http://en.wikipedia.org/wiki/Unicode_equivalence#Normalization +# Escape characters between 0x00-0x1F are removed + +if which uconv > /dev/null +then + CMD="uconv -f utf8 -t utf8 -x Any-NFKC --callback skip" +else + echo "Cannot find ICU uconv (http://site.icu-project.org/) ... falling back to iconv. Normalization NOT taking place." 1>&2 + CMD="iconv -f utf8 -t utf8 -c" +fi + +$CMD | /usr/bin/perl -w -e ' + while (<>) { + chomp; + s/[\x00-\x1F]+/ /g; + s/ +/ /g; + s/^ //; + s/ $//; + print "$_\n"; + }' + diff --git a/corpus/xml-tok.py b/corpus/xml-tok.py new file mode 100755 index 000000000..4357ced63 --- /dev/null +++ b/corpus/xml-tok.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python + +import os +import re +import subprocess +import sys + +# Tokenize XML files with tokenize-anything.sh +# in: The earnings on its 10-year bonds are 28.45%. +# out: The earnings on its 10 - year bonds are 28.45 % . 
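corpus/untok.pl above reverses this tokenization for display: clitics such as n't or 's and closing punctuation attach to the token on their left, opening brackets and "$" attach to the right, quotation marks alternate sides, and decimal points and intra-word hyphens are rejoined. A minimal Python sketch of the left/right-joiner part only; the alternating-quote, hyphen, and decimal-point rules are left out:

import re

LEFT_JOIN  = re.compile(r"^(n't|'(s|m|re|ll|ve|d)|[.,:;?!%)\]])$")  # glue to the previous token
RIGHT_JOIN = re.compile(r"^[$(\[]$")                                # glue to the following token

def untok(tokens):
    out, allow_space = "", False
    for tok in tokens:
        if out and allow_space and not LEFT_JOIN.match(tok):
            out += " "
        out += tok
        allow_space = not RIGHT_JOIN.match(tok)
    return out

print(untok("The cost ( in US$ ) was n't 28.45 % .".split()))
# The cost (in US$) wasn't 28.45%.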
+ +def escape(s): + return s.replace('&', '&').replace('>', '>').replace('<', '<').replace('"', '"').replace('\'', ''') + +def unescape(s): + return s.replace('>', '>').replace('<', '<').replace('"', '"').replace(''', '\'').replace('&', '&') + +def main(): + tok = subprocess.Popen([os.path.join(os.path.dirname(__file__), 'tokenize-anything.sh'), '-u'], stdin=subprocess.PIPE, stdout=subprocess.PIPE) + while True: + line = sys.stdin.readline() + if not line: + break + line = line.strip() + pieces = [] + eol = len(line) + pos = 0 + while pos < eol: + next = line.find('<', pos) + if next == -1: + next = eol + tok.stdin.write('{}\n'.format(unescape(line[pos:next]))) + pieces.append(escape(tok.stdout.readline().strip())) + if next == eol: + break + pos = line.find('>', next + 1) + if pos == -1: + pos = eol + else: + pos += 1 + pieces.append(line[next:pos]) + sys.stdout.write('{}\n'.format(' '.join(pieces).strip())) + tok.stdin.close() + tok.wait() + +if __name__ == '__main__': + main() diff --git a/decoder/JSON_parser.c b/decoder/JSON_parser.c new file mode 100644 index 000000000..5e392bc6c --- /dev/null +++ b/decoder/JSON_parser.c @@ -0,0 +1,1012 @@ +/* JSON_parser.c */ + +/* 2007-08-24 */ + +/* +Copyright (c) 2005 JSON.org + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +The Software shall be used for Good, not Evil. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +/* + Callbacks, comments, Unicode handling by Jean Gressmann (jean@0x42.de), 2007-2009. + + For the added features the license above applies also. + + Changelog: + 2009-05-17 + Incorporated benrudiak@googlemail.com fix for UTF16 decoding. + + 2009-05-14 + Fixed float parsing bug related to a locale being set that didn't + use '.' as decimal point character (charles@transmissionbt.com). + + 2008-10-14 + Renamed states.IN to states.IT to avoid name clash which IN macro + defined in windef.h (alexey.pelykh@gmail.com) + + 2008-07-19 + Removed some duplicate code & debugging variable (charles@transmissionbt.com) + + 2008-05-28 + Made JSON_value structure ansi C compliant. This bug was report by + trisk@acm.jhu.edu + + 2008-05-20 + Fixed bug reported by charles@transmissionbt.com where the switching + from static to dynamic parse buffer did not copy the static parse + buffer's content. 
+*/ + + + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "JSON_parser.h" + +#ifdef _MSC_VER +# if _MSC_VER >= 1400 /* Visual Studio 2005 and up */ +# pragma warning(disable:4996) // unsecure sscanf +# endif +#endif + + +#define true 1 +#define false 0 +#define __ -1 /* the universal error code */ + +/* values chosen so that the object size is approx equal to one page (4K) */ +#ifndef JSON_PARSER_STACK_SIZE +# define JSON_PARSER_STACK_SIZE 128 +#endif + +#ifndef JSON_PARSER_PARSE_BUFFER_SIZE +# define JSON_PARSER_PARSE_BUFFER_SIZE 3500 +#endif + +typedef unsigned short UTF16; + +struct JSON_parser_struct { + JSON_parser_callback callback; + void* ctx; + signed char state, before_comment_state, type, escaped, comment, allow_comments, handle_floats_manually; + UTF16 utf16_high_surrogate; + long depth; + long top; + signed char* stack; + long stack_capacity; + char decimal_point; + char* parse_buffer; + size_t parse_buffer_capacity; + size_t parse_buffer_count; + size_t comment_begin_offset; + signed char static_stack[JSON_PARSER_STACK_SIZE]; + char static_parse_buffer[JSON_PARSER_PARSE_BUFFER_SIZE]; +}; + +#define COUNTOF(x) (sizeof(x)/sizeof(x[0])) + +/* + Characters are mapped into these character classes. This allows for + a significant reduction in the size of the state transition table. +*/ + + + +enum classes { + C_SPACE, /* space */ + C_WHITE, /* other whitespace */ + C_LCURB, /* { */ + C_RCURB, /* } */ + C_LSQRB, /* [ */ + C_RSQRB, /* ] */ + C_COLON, /* : */ + C_COMMA, /* , */ + C_QUOTE, /* " */ + C_BACKS, /* \ */ + C_SLASH, /* / */ + C_PLUS, /* + */ + C_MINUS, /* - */ + C_POINT, /* . */ + C_ZERO , /* 0 */ + C_DIGIT, /* 123456789 */ + C_LOW_A, /* a */ + C_LOW_B, /* b */ + C_LOW_C, /* c */ + C_LOW_D, /* d */ + C_LOW_E, /* e */ + C_LOW_F, /* f */ + C_LOW_L, /* l */ + C_LOW_N, /* n */ + C_LOW_R, /* r */ + C_LOW_S, /* s */ + C_LOW_T, /* t */ + C_LOW_U, /* u */ + C_ABCDF, /* ABCDF */ + C_E, /* E */ + C_ETC, /* everything else */ + C_STAR, /* * */ + NR_CLASSES +}; + +static int ascii_class[128] = { +/* + This array maps the 128 ASCII characters into character classes. + The remaining Unicode characters should be mapped to C_ETC. + Non-whitespace control characters are errors. +*/ + __, __, __, __, __, __, __, __, + __, C_WHITE, C_WHITE, __, __, C_WHITE, __, __, + __, __, __, __, __, __, __, __, + __, __, __, __, __, __, __, __, + + C_SPACE, C_ETC, C_QUOTE, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, + C_ETC, C_ETC, C_STAR, C_PLUS, C_COMMA, C_MINUS, C_POINT, C_SLASH, + C_ZERO, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, + C_DIGIT, C_DIGIT, C_COLON, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, + + C_ETC, C_ABCDF, C_ABCDF, C_ABCDF, C_ABCDF, C_E, C_ABCDF, C_ETC, + C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, + C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, + C_ETC, C_ETC, C_ETC, C_LSQRB, C_BACKS, C_RSQRB, C_ETC, C_ETC, + + C_ETC, C_LOW_A, C_LOW_B, C_LOW_C, C_LOW_D, C_LOW_E, C_LOW_F, C_ETC, + C_ETC, C_ETC, C_ETC, C_ETC, C_LOW_L, C_ETC, C_LOW_N, C_ETC, + C_ETC, C_ETC, C_LOW_R, C_LOW_S, C_LOW_T, C_LOW_U, C_ETC, C_ETC, + C_ETC, C_ETC, C_ETC, C_LCURB, C_ETC, C_RCURB, C_ETC, C_ETC +}; + + +/* + The state codes. 
+*/ +enum states { + GO, /* start */ + OK, /* ok */ + OB, /* object */ + KE, /* key */ + CO, /* colon */ + VA, /* value */ + AR, /* array */ + ST, /* string */ + ES, /* escape */ + U1, /* u1 */ + U2, /* u2 */ + U3, /* u3 */ + U4, /* u4 */ + MI, /* minus */ + ZE, /* zero */ + IT, /* integer */ + FR, /* fraction */ + E1, /* e */ + E2, /* ex */ + E3, /* exp */ + T1, /* tr */ + T2, /* tru */ + T3, /* true */ + F1, /* fa */ + F2, /* fal */ + F3, /* fals */ + F4, /* false */ + N1, /* nu */ + N2, /* nul */ + N3, /* null */ + C1, /* / */ + C2, /* / * */ + C3, /* * */ + FX, /* *.* *eE* */ + D1, /* second UTF-16 character decoding started by \ */ + D2, /* second UTF-16 character proceeded by u */ + NR_STATES +}; + +enum actions +{ + CB = -10, /* comment begin */ + CE = -11, /* comment end */ + FA = -12, /* false */ + TR = -13, /* false */ + NU = -14, /* null */ + DE = -15, /* double detected by exponent e E */ + DF = -16, /* double detected by fraction . */ + SB = -17, /* string begin */ + MX = -18, /* integer detected by minus */ + ZX = -19, /* integer detected by zero */ + IX = -20, /* integer detected by 1-9 */ + EX = -21, /* next char is escaped */ + UC = -22 /* Unicode character read */ +}; + + +static int state_transition_table[NR_STATES][NR_CLASSES] = { +/* + The state transition table takes the current state and the current symbol, + and returns either a new state or an action. An action is represented as a + negative number. A JSON text is accepted if at the end of the text the + state is OK and if the mode is MODE_DONE. + + white 1-9 ABCDF etc + space | { } [ ] : , " \ / + - . 0 | a b c d e f l n r s t u | E | * */ +/*start GO*/ {GO,GO,-6,__,-5,__,__,__,__,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*ok OK*/ {OK,OK,__,-8,__,-7,__,-3,__,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*object OB*/ {OB,OB,__,-9,__,__,__,__,SB,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*key KE*/ {KE,KE,__,__,__,__,__,__,SB,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*colon CO*/ {CO,CO,__,__,__,__,-2,__,__,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*value VA*/ {VA,VA,-6,__,-5,__,__,__,SB,__,CB,__,MX,__,ZX,IX,__,__,__,__,__,FA,__,NU,__,__,TR,__,__,__,__,__}, +/*array AR*/ {AR,AR,-6,__,-5,-7,__,__,SB,__,CB,__,MX,__,ZX,IX,__,__,__,__,__,FA,__,NU,__,__,TR,__,__,__,__,__}, +/*string ST*/ {ST,__,ST,ST,ST,ST,ST,ST,-4,EX,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST}, +/*escape ES*/ {__,__,__,__,__,__,__,__,ST,ST,ST,__,__,__,__,__,__,ST,__,__,__,ST,__,ST,ST,__,ST,U1,__,__,__,__}, +/*u1 U1*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,U2,U2,U2,U2,U2,U2,U2,U2,__,__,__,__,__,__,U2,U2,__,__}, +/*u2 U2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,U3,U3,U3,U3,U3,U3,U3,U3,__,__,__,__,__,__,U3,U3,__,__}, +/*u3 U3*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,U4,U4,U4,U4,U4,U4,U4,U4,__,__,__,__,__,__,U4,U4,__,__}, +/*u4 U4*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,UC,UC,UC,UC,UC,UC,UC,UC,__,__,__,__,__,__,UC,UC,__,__}, +/*minus MI*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,ZE,IT,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*zero ZE*/ {OK,OK,__,-8,__,-7,__,-3,__,__,CB,__,__,DF,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*int IT*/ {OK,OK,__,-8,__,-7,__,-3,__,__,CB,__,__,DF,IT,IT,__,__,__,__,DE,__,__,__,__,__,__,__,__,DE,__,__}, +/*frac FR*/ {OK,OK,__,-8,__,-7,__,-3,__,__,CB,__,__,__,FR,FR,__,__,__,__,E1,__,__,__,__,__,__,__,__,E1,__,__}, +/*e 
E1*/ {__,__,__,__,__,__,__,__,__,__,__,E2,E2,__,E3,E3,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*ex E2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,E3,E3,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*exp E3*/ {OK,OK,__,-8,__,-7,__,-3,__,__,__,__,__,__,E3,E3,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*tr T1*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,T2,__,__,__,__,__,__,__}, +/*tru T2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,T3,__,__,__,__}, +/*true T3*/ {__,__,__,__,__,__,__,__,__,__,CB,__,__,__,__,__,__,__,__,__,OK,__,__,__,__,__,__,__,__,__,__,__}, +/*fa F1*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,F2,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*fal F2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,F3,__,__,__,__,__,__,__,__,__}, +/*fals F3*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,F4,__,__,__,__,__,__}, +/*false F4*/ {__,__,__,__,__,__,__,__,__,__,CB,__,__,__,__,__,__,__,__,__,OK,__,__,__,__,__,__,__,__,__,__,__}, +/*nu N1*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,N2,__,__,__,__}, +/*nul N2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,N3,__,__,__,__,__,__,__,__,__}, +/*null N3*/ {__,__,__,__,__,__,__,__,__,__,CB,__,__,__,__,__,__,__,__,__,__,__,OK,__,__,__,__,__,__,__,__,__}, +/*/ C1*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,C2}, +/*/* C2*/ {C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C3}, +/** C3*/ {C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,CE,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C3}, +/*_. FX*/ {OK,OK,__,-8,__,-7,__,-3,__,__,__,__,__,__,FR,FR,__,__,__,__,E1,__,__,__,__,__,__,__,__,E1,__,__}, +/*\ D1*/ {__,__,__,__,__,__,__,__,__,D2,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*\ D2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,U1,__,__,__,__}, +}; + + +/* + These modes can be pushed on the stack. +*/ +enum modes { + MODE_ARRAY = 1, + MODE_DONE = 2, + MODE_KEY = 3, + MODE_OBJECT = 4 +}; + +static int +push(JSON_parser jc, int mode) +{ +/* + Push a mode onto the stack. Return false if there is overflow. +*/ + jc->top += 1; + if (jc->depth < 0) { + if (jc->top >= jc->stack_capacity) { + size_t bytes_to_allocate; + jc->stack_capacity *= 2; + bytes_to_allocate = jc->stack_capacity * sizeof(jc->static_stack[0]); + if (jc->stack == &jc->static_stack[0]) { + jc->stack = (signed char*)malloc(bytes_to_allocate); + memcpy(jc->stack, jc->static_stack, sizeof(jc->static_stack)); + } else { + jc->stack = (signed char*)realloc(jc->stack, bytes_to_allocate); + } + } + } else { + if (jc->top >= jc->depth) { + return false; + } + } + + jc->stack[jc->top] = mode; + return true; +} + + +static int +pop(JSON_parser jc, int mode) +{ +/* + Pop the stack, assuring that the current mode matches the expectation. + Return false if there is underflow or if the modes mismatch. 
+*/ + if (jc->top < 0 || jc->stack[jc->top] != mode) { + return false; + } + jc->top -= 1; + return true; +} + + +#define parse_buffer_clear(jc) \ + do {\ + jc->parse_buffer_count = 0;\ + jc->parse_buffer[0] = 0;\ + } while (0) + +#define parse_buffer_pop_back_char(jc)\ + do {\ + assert(jc->parse_buffer_count >= 1);\ + --jc->parse_buffer_count;\ + jc->parse_buffer[jc->parse_buffer_count] = 0;\ + } while (0) + +void delete_JSON_parser(JSON_parser jc) +{ + if (jc) { + if (jc->stack != &jc->static_stack[0]) { + free((void*)jc->stack); + } + if (jc->parse_buffer != &jc->static_parse_buffer[0]) { + free((void*)jc->parse_buffer); + } + free((void*)jc); + } +} + + +JSON_parser +new_JSON_parser(JSON_config* config) +{ +/* + new_JSON_parser starts the checking process by constructing a JSON_parser + object. It takes a depth parameter that restricts the level of maximum + nesting. + + To continue the process, call JSON_parser_char for each character in the + JSON text, and then call JSON_parser_done to obtain the final result. + These functions are fully reentrant. +*/ + + int depth = 0; + JSON_config default_config; + + JSON_parser jc = (JSON_parser)malloc(sizeof(struct JSON_parser_struct)); + + memset(jc, 0, sizeof(*jc)); + + + /* initialize configuration */ + init_JSON_config(&default_config); + + /* set to default configuration if none was provided */ + if (config == NULL) { + config = &default_config; + } + + depth = config->depth; + + /* We need to be able to push at least one object */ + if (depth == 0) { + depth = 1; + } + + jc->state = GO; + jc->top = -1; + + /* Do we want non-bound stack? */ + if (depth > 0) { + jc->stack_capacity = depth; + jc->depth = depth; + if (depth <= (int)COUNTOF(jc->static_stack)) { + jc->stack = &jc->static_stack[0]; + } else { + jc->stack = (signed char*)malloc(jc->stack_capacity * sizeof(jc->static_stack[0])); + } + } else { + jc->stack_capacity = COUNTOF(jc->static_stack); + jc->depth = -1; + jc->stack = &jc->static_stack[0]; + } + + /* set parser to start */ + push(jc, MODE_DONE); + + /* set up the parse buffer */ + jc->parse_buffer = &jc->static_parse_buffer[0]; + jc->parse_buffer_capacity = COUNTOF(jc->static_parse_buffer); + parse_buffer_clear(jc); + + /* set up callback, comment & float handling */ + jc->callback = config->callback; + jc->ctx = config->callback_ctx; + jc->allow_comments = config->allow_comments != 0; + jc->handle_floats_manually = config->handle_floats_manually != 0; + + /* set up decimal point */ + jc->decimal_point = *localeconv()->decimal_point; + + return jc; +} + +static void grow_parse_buffer(JSON_parser jc) +{ + size_t bytes_to_allocate; + jc->parse_buffer_capacity *= 2; + bytes_to_allocate = jc->parse_buffer_capacity * sizeof(jc->parse_buffer[0]); + if (jc->parse_buffer == &jc->static_parse_buffer[0]) { + jc->parse_buffer = (char*)malloc(bytes_to_allocate); + memcpy(jc->parse_buffer, jc->static_parse_buffer, jc->parse_buffer_count); + } else { + jc->parse_buffer = (char*)realloc(jc->parse_buffer, bytes_to_allocate); + } +} + +#define parse_buffer_push_back_char(jc, c)\ + do {\ + if (jc->parse_buffer_count + 1 >= jc->parse_buffer_capacity) grow_parse_buffer(jc);\ + jc->parse_buffer[jc->parse_buffer_count++] = c;\ + jc->parse_buffer[jc->parse_buffer_count] = 0;\ + } while (0) + +#define assert_is_non_container_type(jc) \ + assert( \ + jc->type == JSON_T_NULL || \ + jc->type == JSON_T_FALSE || \ + jc->type == JSON_T_TRUE || \ + jc->type == JSON_T_FLOAT || \ + jc->type == JSON_T_INTEGER || \ + jc->type == JSON_T_STRING) + + +static 
int parse_parse_buffer(JSON_parser jc) +{ + if (jc->callback) { + JSON_value value, *arg = NULL; + + if (jc->type != JSON_T_NONE) { + assert_is_non_container_type(jc); + + switch(jc->type) { + case JSON_T_FLOAT: + arg = &value; + if (jc->handle_floats_manually) { + value.vu.str.value = jc->parse_buffer; + value.vu.str.length = jc->parse_buffer_count; + } else { + /*sscanf(jc->parse_buffer, "%Lf", &value.vu.float_value);*/ + + /* not checking with end pointer b/c there may be trailing ws */ + value.vu.float_value = strtod(jc->parse_buffer, NULL); + } + break; + case JSON_T_INTEGER: + arg = &value; + sscanf(jc->parse_buffer, JSON_PARSER_INTEGER_SSCANF_TOKEN, &value.vu.integer_value); + break; + case JSON_T_STRING: + arg = &value; + value.vu.str.value = jc->parse_buffer; + value.vu.str.length = jc->parse_buffer_count; + break; + } + + if (!(*jc->callback)(jc->ctx, jc->type, arg)) { + return false; + } + } + } + + parse_buffer_clear(jc); + + return true; +} + +#define IS_HIGH_SURROGATE(uc) (((uc) & 0xFC00) == 0xD800) +#define IS_LOW_SURROGATE(uc) (((uc) & 0xFC00) == 0xDC00) +#define DECODE_SURROGATE_PAIR(hi,lo) ((((hi) & 0x3FF) << 10) + ((lo) & 0x3FF) + 0x10000) +static unsigned char utf8_lead_bits[4] = { 0x00, 0xC0, 0xE0, 0xF0 }; + +static int decode_unicode_char(JSON_parser jc) +{ + int i; + unsigned uc = 0; + char* p; + int trail_bytes; + + assert(jc->parse_buffer_count >= 6); + + p = &jc->parse_buffer[jc->parse_buffer_count - 4]; + + for (i = 12; i >= 0; i -= 4, ++p) { + unsigned x = *p; + + if (x >= 'a') { + x -= ('a' - 10); + } else if (x >= 'A') { + x -= ('A' - 10); + } else { + x &= ~0x30u; + } + + assert(x < 16); + + uc |= x << i; + } + + /* clear UTF-16 char from buffer */ + jc->parse_buffer_count -= 6; + jc->parse_buffer[jc->parse_buffer_count] = 0; + + /* attempt decoding ... 
*/ + if (jc->utf16_high_surrogate) { + if (IS_LOW_SURROGATE(uc)) { + uc = DECODE_SURROGATE_PAIR(jc->utf16_high_surrogate, uc); + trail_bytes = 3; + jc->utf16_high_surrogate = 0; + } else { + /* high surrogate without a following low surrogate */ + return false; + } + } else { + if (uc < 0x80) { + trail_bytes = 0; + } else if (uc < 0x800) { + trail_bytes = 1; + } else if (IS_HIGH_SURROGATE(uc)) { + /* save the high surrogate and wait for the low surrogate */ + jc->utf16_high_surrogate = uc; + return true; + } else if (IS_LOW_SURROGATE(uc)) { + /* low surrogate without a preceding high surrogate */ + return false; + } else { + trail_bytes = 2; + } + } + + jc->parse_buffer[jc->parse_buffer_count++] = (char) ((uc >> (trail_bytes * 6)) | utf8_lead_bits[trail_bytes]); + + for (i = trail_bytes * 6 - 6; i >= 0; i -= 6) { + jc->parse_buffer[jc->parse_buffer_count++] = (char) (((uc >> i) & 0x3F) | 0x80); + } + + jc->parse_buffer[jc->parse_buffer_count] = 0; + + return true; +} + +static int add_escaped_char_to_parse_buffer(JSON_parser jc, int next_char) +{ + jc->escaped = 0; + /* remove the backslash */ + parse_buffer_pop_back_char(jc); + switch(next_char) { + case 'b': + parse_buffer_push_back_char(jc, '\b'); + break; + case 'f': + parse_buffer_push_back_char(jc, '\f'); + break; + case 'n': + parse_buffer_push_back_char(jc, '\n'); + break; + case 'r': + parse_buffer_push_back_char(jc, '\r'); + break; + case 't': + parse_buffer_push_back_char(jc, '\t'); + break; + case '"': + parse_buffer_push_back_char(jc, '"'); + break; + case '\\': + parse_buffer_push_back_char(jc, '\\'); + break; + case '/': + parse_buffer_push_back_char(jc, '/'); + break; + case 'u': + parse_buffer_push_back_char(jc, '\\'); + parse_buffer_push_back_char(jc, 'u'); + break; + default: + return false; + } + + return true; +} + +#define add_char_to_parse_buffer(jc, next_char, next_class) \ + do { \ + if (jc->escaped) { \ + if (!add_escaped_char_to_parse_buffer(jc, next_char)) \ + return false; \ + } else if (!jc->comment) { \ + if ((jc->type != JSON_T_NONE) | !((next_class == C_SPACE) | (next_class == C_WHITE)) /* non-white-space */) { \ + parse_buffer_push_back_char(jc, (char)next_char); \ + } \ + } \ + } while (0) + + +#define assert_type_isnt_string_null_or_bool(jc) \ + assert(jc->type != JSON_T_FALSE); \ + assert(jc->type != JSON_T_TRUE); \ + assert(jc->type != JSON_T_NULL); \ + assert(jc->type != JSON_T_STRING) + + +int +JSON_parser_char(JSON_parser jc, int next_char) +{ +/* + After calling new_JSON_parser, call this function for each character (or + partial character) in your JSON text. It can accept UTF-8, UTF-16, or + UTF-32. It returns true if things are looking ok so far. If it rejects the + text, it returns false. +*/ + int next_class, next_state; + +/* + Determine the character's class. +*/ + if (next_char < 0) { + return false; + } + if (next_char >= 128) { + next_class = C_ETC; + } else { + next_class = ascii_class[next_char]; + if (next_class <= __) { + return false; + } + } + + add_char_to_parse_buffer(jc, next_char, next_class); + +/* + Get the next state from the state transition table. +*/ + next_state = state_transition_table[jc->state][next_class]; + if (next_state >= 0) { +/* + Change the state. +*/ + jc->state = next_state; + } else { +/* + Or perform one of the actions. 
+*/ + switch (next_state) { +/* Unicode character */ + case UC: + if(!decode_unicode_char(jc)) { + return false; + } + /* check if we need to read a second UTF-16 char */ + if (jc->utf16_high_surrogate) { + jc->state = D1; + } else { + jc->state = ST; + } + break; +/* escaped char */ + case EX: + jc->escaped = 1; + jc->state = ES; + break; +/* integer detected by minus */ + case MX: + jc->type = JSON_T_INTEGER; + jc->state = MI; + break; +/* integer detected by zero */ + case ZX: + jc->type = JSON_T_INTEGER; + jc->state = ZE; + break; +/* integer detected by 1-9 */ + case IX: + jc->type = JSON_T_INTEGER; + jc->state = IT; + break; + +/* floating point number detected by exponent*/ + case DE: + assert_type_isnt_string_null_or_bool(jc); + jc->type = JSON_T_FLOAT; + jc->state = E1; + break; + +/* floating point number detected by fraction */ + case DF: + assert_type_isnt_string_null_or_bool(jc); + if (!jc->handle_floats_manually) { +/* + Some versions of strtod (which underlies sscanf) don't support converting + C-locale formated floating point values. +*/ + assert(jc->parse_buffer[jc->parse_buffer_count-1] == '.'); + jc->parse_buffer[jc->parse_buffer_count-1] = jc->decimal_point; + } + jc->type = JSON_T_FLOAT; + jc->state = FX; + break; +/* string begin " */ + case SB: + parse_buffer_clear(jc); + assert(jc->type == JSON_T_NONE); + jc->type = JSON_T_STRING; + jc->state = ST; + break; + +/* n */ + case NU: + assert(jc->type == JSON_T_NONE); + jc->type = JSON_T_NULL; + jc->state = N1; + break; +/* f */ + case FA: + assert(jc->type == JSON_T_NONE); + jc->type = JSON_T_FALSE; + jc->state = F1; + break; +/* t */ + case TR: + assert(jc->type == JSON_T_NONE); + jc->type = JSON_T_TRUE; + jc->state = T1; + break; + +/* closing comment */ + case CE: + jc->comment = 0; + assert(jc->parse_buffer_count == 0); + assert(jc->type == JSON_T_NONE); + jc->state = jc->before_comment_state; + break; + +/* opening comment */ + case CB: + if (!jc->allow_comments) { + return false; + } + parse_buffer_pop_back_char(jc); + if (!parse_parse_buffer(jc)) { + return false; + } + assert(jc->parse_buffer_count == 0); + assert(jc->type != JSON_T_STRING); + switch (jc->stack[jc->top]) { + case MODE_ARRAY: + case MODE_OBJECT: + switch(jc->state) { + case VA: + case AR: + jc->before_comment_state = jc->state; + break; + default: + jc->before_comment_state = OK; + break; + } + break; + default: + jc->before_comment_state = jc->state; + break; + } + jc->type = JSON_T_NONE; + jc->state = C1; + jc->comment = 1; + break; +/* empty } */ + case -9: + parse_buffer_clear(jc); + if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_OBJECT_END, NULL)) { + return false; + } + if (!pop(jc, MODE_KEY)) { + return false; + } + jc->state = OK; + break; + +/* } */ case -8: + parse_buffer_pop_back_char(jc); + if (!parse_parse_buffer(jc)) { + return false; + } + if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_OBJECT_END, NULL)) { + return false; + } + if (!pop(jc, MODE_OBJECT)) { + return false; + } + jc->type = JSON_T_NONE; + jc->state = OK; + break; + +/* ] */ case -7: + parse_buffer_pop_back_char(jc); + if (!parse_parse_buffer(jc)) { + return false; + } + if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_ARRAY_END, NULL)) { + return false; + } + if (!pop(jc, MODE_ARRAY)) { + return false; + } + + jc->type = JSON_T_NONE; + jc->state = OK; + break; + +/* { */ case -6: + parse_buffer_pop_back_char(jc); + if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_OBJECT_BEGIN, NULL)) { + return false; + } + if (!push(jc, MODE_KEY)) { + return 
false; + } + assert(jc->type == JSON_T_NONE); + jc->state = OB; + break; + +/* [ */ case -5: + parse_buffer_pop_back_char(jc); + if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_ARRAY_BEGIN, NULL)) { + return false; + } + if (!push(jc, MODE_ARRAY)) { + return false; + } + assert(jc->type == JSON_T_NONE); + jc->state = AR; + break; + +/* string end " */ case -4: + parse_buffer_pop_back_char(jc); + switch (jc->stack[jc->top]) { + case MODE_KEY: + assert(jc->type == JSON_T_STRING); + jc->type = JSON_T_NONE; + jc->state = CO; + + if (jc->callback) { + JSON_value value; + value.vu.str.value = jc->parse_buffer; + value.vu.str.length = jc->parse_buffer_count; + if (!(*jc->callback)(jc->ctx, JSON_T_KEY, &value)) { + return false; + } + } + parse_buffer_clear(jc); + break; + case MODE_ARRAY: + case MODE_OBJECT: + assert(jc->type == JSON_T_STRING); + if (!parse_parse_buffer(jc)) { + return false; + } + jc->type = JSON_T_NONE; + jc->state = OK; + break; + default: + return false; + } + break; + +/* , */ case -3: + parse_buffer_pop_back_char(jc); + if (!parse_parse_buffer(jc)) { + return false; + } + switch (jc->stack[jc->top]) { + case MODE_OBJECT: +/* + A comma causes a flip from object mode to key mode. +*/ + if (!pop(jc, MODE_OBJECT) || !push(jc, MODE_KEY)) { + return false; + } + assert(jc->type != JSON_T_STRING); + jc->type = JSON_T_NONE; + jc->state = KE; + break; + case MODE_ARRAY: + assert(jc->type != JSON_T_STRING); + jc->type = JSON_T_NONE; + jc->state = VA; + break; + default: + return false; + } + break; + +/* : */ case -2: +/* + A colon causes a flip from key mode to object mode. +*/ + parse_buffer_pop_back_char(jc); + if (!pop(jc, MODE_KEY) || !push(jc, MODE_OBJECT)) { + return false; + } + assert(jc->type == JSON_T_NONE); + jc->state = VA; + break; +/* + Bad action. 
+*/ + default: + return false; + } + } + return true; +} + + +int +JSON_parser_done(JSON_parser jc) +{ + const int result = jc->state == OK && pop(jc, MODE_DONE); + + return result; +} + + +int JSON_parser_is_legal_white_space_string(const char* s) +{ + int c, char_class; + + if (s == NULL) { + return false; + } + + for (; *s; ++s) { + c = *s; + + if (c < 0 || c >= 128) { + return false; + } + + char_class = ascii_class[c]; + + if (char_class != C_SPACE && char_class != C_WHITE) { + return false; + } + } + + return true; +} + + + +void init_JSON_config(JSON_config* config) +{ + if (config) { + memset(config, 0, sizeof(*config)); + + config->depth = JSON_PARSER_STACK_SIZE - 1; + } +} diff --git a/decoder/JSON_parser.h b/decoder/JSON_parser.h new file mode 100644 index 000000000..de9800721 --- /dev/null +++ b/decoder/JSON_parser.h @@ -0,0 +1,152 @@ +#ifndef JSON_PARSER_H +#define JSON_PARSER_H + +/* JSON_parser.h */ + + +#include + +/* Windows DLL stuff */ +#ifdef _WIN32 +# ifdef JSON_PARSER_DLL_EXPORTS +# define JSON_PARSER_DLL_API __declspec(dllexport) +# else +# define JSON_PARSER_DLL_API __declspec(dllimport) +# endif +#else +# define JSON_PARSER_DLL_API +#endif + +/* Determine the integer type use to parse non-floating point numbers */ +#if __STDC_VERSION__ >= 199901L || HAVE_LONG_LONG == 1 +typedef long long JSON_int_t; +#define JSON_PARSER_INTEGER_SSCANF_TOKEN "%lld" +#define JSON_PARSER_INTEGER_SPRINTF_TOKEN "%lld" +#else +typedef long JSON_int_t; +#define JSON_PARSER_INTEGER_SSCANF_TOKEN "%ld" +#define JSON_PARSER_INTEGER_SPRINTF_TOKEN "%ld" +#endif + + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum +{ + JSON_T_NONE = 0, + JSON_T_ARRAY_BEGIN, // 1 + JSON_T_ARRAY_END, // 2 + JSON_T_OBJECT_BEGIN, // 3 + JSON_T_OBJECT_END, // 4 + JSON_T_INTEGER, // 5 + JSON_T_FLOAT, // 6 + JSON_T_NULL, // 7 + JSON_T_TRUE, // 8 + JSON_T_FALSE, // 9 + JSON_T_STRING, // 10 + JSON_T_KEY, // 11 + JSON_T_MAX // 12 +} JSON_type; + +typedef struct JSON_value_struct { + union { + JSON_int_t integer_value; + + double float_value; + + struct { + const char* value; + size_t length; + } str; + } vu; +} JSON_value; + +typedef struct JSON_parser_struct* JSON_parser; + +/*! \brief JSON parser callback + + \param ctx The pointer passed to new_JSON_parser. + \param type An element of JSON_type but not JSON_T_NONE. + \param value A representation of the parsed value. This parameter is NULL for + JSON_T_ARRAY_BEGIN, JSON_T_ARRAY_END, JSON_T_OBJECT_BEGIN, JSON_T_OBJECT_END, + JSON_T_NULL, JSON_T_TRUE, and SON_T_FALSE. String values are always returned + as zero-terminated C strings. + + \return Non-zero if parsing should continue, else zero. +*/ +typedef int (*JSON_parser_callback)(void* ctx, int type, const struct JSON_value_struct* value); + + +/*! \brief The structure used to configure a JSON parser object + + \param depth If negative, the parser can parse arbitrary levels of JSON, otherwise + the depth is the limit + \param Pointer to a callback. This parameter may be NULL. In this case the input is merely checked for validity. + \param Callback context. This parameter may be NULL. + \param depth. Specifies the levels of nested JSON to allow. Negative numbers yield unlimited nesting. + \param allowComments. To allow C style comments in JSON, set to non-zero. + \param handleFloatsManually. To decode floating point numbers manually set this parameter to non-zero. + + \return The parser object. 
+*/ +typedef struct { + JSON_parser_callback callback; + void* callback_ctx; + int depth; + int allow_comments; + int handle_floats_manually; +} JSON_config; + + +/*! \brief Initializes the JSON parser configuration structure to default values. + + The default configuration is + - 127 levels of nested JSON (depends on JSON_PARSER_STACK_SIZE, see json_parser.c) + - no parsing, just checking for JSON syntax + - no comments + + \param config. Used to configure the parser. +*/ +JSON_PARSER_DLL_API void init_JSON_config(JSON_config* config); + +/*! \brief Create a JSON parser object + + \param config. Used to configure the parser. Set to NULL to use the default configuration. + See init_JSON_config + + \return The parser object. +*/ +JSON_PARSER_DLL_API extern JSON_parser new_JSON_parser(JSON_config* config); + +/*! \brief Destroy a previously created JSON parser object. */ +JSON_PARSER_DLL_API extern void delete_JSON_parser(JSON_parser jc); + +/*! \brief Parse a character. + + \return Non-zero, if all characters passed to this function are part of are valid JSON. +*/ +JSON_PARSER_DLL_API extern int JSON_parser_char(JSON_parser jc, int next_char); + +/*! \brief Finalize parsing. + + Call this method once after all input characters have been consumed. + + \return Non-zero, if all parsed characters are valid JSON, zero otherwise. +*/ +JSON_PARSER_DLL_API extern int JSON_parser_done(JSON_parser jc); + +/*! \brief Determine if a given string is valid JSON white space + + \return Non-zero if the string is valid, zero otherwise. +*/ +JSON_PARSER_DLL_API extern int JSON_parser_is_legal_white_space_string(const char* s); + + +#ifdef __cplusplus +} +#endif + + +#endif /* JSON_PARSER_H */ diff --git a/decoder/Makefile.am b/decoder/Makefile.am new file mode 100644 index 000000000..c85f17ed5 --- /dev/null +++ b/decoder/Makefile.am @@ -0,0 +1,160 @@ +bin_PROGRAMS = cdec + +noinst_PROGRAMS = \ + trule_test \ + hg_test \ + parser_test \ + t2s_test \ + grammar_test + +TESTS = trule_test parser_test grammar_test hg_test +t2s_test_SOURCES = t2s_test.cc +t2s_test_LDADD = $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a +parser_test_SOURCES = parser_test.cc +parser_test_LDADD = $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a +grammar_test_SOURCES = grammar_test.cc +grammar_test_LDADD = $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a +hg_test_SOURCES = hg_test.cc +hg_test_LDADD = $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a +trule_test_SOURCES = trule_test.cc +trule_test_LDADD = $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a + +cdec_SOURCES = cdec.cc +cdec_LDFLAGS= -rdynamic +cdec_LDADD = libcdec.a ../mteval/libmteval.a ../utils/libutils.a ../klm/search/libksearch.a ../klm/lm/libklm.a ../klm/util/libklm_util.a ../klm/util/double-conversion/libklm_util_double.a + +AM_CPPFLAGS = -DTEST_DATA=\"$(top_srcdir)/decoder/test_data\" -DBOOST_TEST_DYN_LINK -W -Wno-sign-compare -I$(top_srcdir) -I$(top_srcdir)/mteval -I$(top_srcdir)/utils -I$(top_srcdir)/klm + +rule_lexer.cc: rule_lexer.ll + $(LEX) -s -CF -8 -o$@ $< + +noinst_LIBRARIES = libcdec.a + +EXTRA_DIST = test_data rule_lexer.ll + +libcdec_a_SOURCES = \ + JSON_parser.h \ + aligner.h \ + 
apply_models.h \ + bottom_up_parser.h \ + csplit.h \ + decoder.h \ + earley_composer.h \ + factored_lexicon_helper.h \ + ff.h \ + ff_basic.h \ + ff_bleu.h \ + ff_charset.h \ + ff_context.h \ + ff_csplit.h \ + ff_external.h \ + ff_factory.h \ + ff_klm.h \ + ff_lm.h \ + ff_ngrams.h \ + ff_parse_match.h \ + ff_register.h \ + ff_rules.h \ + ff_ruleshape.h \ + ff_sample_fsa.h \ + ff_soft_syntax.h \ + ff_soft_syntax_mindist.h \ + ff_source_path.h \ + ff_source_syntax.h \ + ff_source_syntax2.h \ + ff_spans.h \ + ff_tagger.h \ + ff_wordalign.h \ + ff_wordset.h \ + ffset.h \ + forest_writer.h \ + freqdict.h \ + grammar.h \ + hg.h \ + hg_intersect.h \ + hg_io.h \ + hg_remove_eps.h \ + hg_sampler.h \ + hg_test.h \ + hg_union.h \ + incremental.h \ + inside_outside.h \ + json_parse.h \ + kbest.h \ + lattice.h \ + lexalign.h \ + lextrans.h \ + nt_span.h \ + oracle_bleu.h \ + phrasebased_translator.h \ + phrasetable_fst.h \ + program_options.h \ + rule_lexer.h \ + sentence_metadata.h \ + sentences.h \ + tagger.h \ + translator.h \ + trule.h \ + viterbi.h \ + aligner.cc \ + apply_models.cc \ + bottom_up_parser.cc \ + cdec.cc \ + cdec_ff.cc \ + csplit.cc \ + decoder.cc \ + earley_composer.cc \ + factored_lexicon_helper.cc \ + ff.cc \ + ff_basic.cc \ + ff_bleu.cc \ + ff_charset.cc \ + ff_context.cc \ + ff_csplit.cc \ + ff_external.cc \ + ff_factory.cc \ + ff_klm.cc \ + ff_lm.cc \ + ff_ngrams.cc \ + ff_parse_match.cc \ + ff_rules.cc \ + ff_ruleshape.cc \ + ff_soft_syntax.cc \ + ff_soft_syntax_mindist.cc \ + ff_source_path.cc \ + ff_source_syntax.cc \ + ff_source_syntax2.cc \ + ff_spans.cc \ + ff_tagger.cc \ + ff_wordalign.cc \ + ff_wordset.cc \ + ffset.cc \ + forest_writer.cc \ + fst_translator.cc \ + tree2string_translator.cc \ + grammar.cc \ + hg.cc \ + hg_intersect.cc \ + hg_io.cc \ + hg_remove_eps.cc \ + hg_sampler.cc \ + hg_union.cc \ + incremental.cc \ + json_parse.cc \ + lattice.cc \ + lexalign.cc \ + lextrans.cc \ + node_state_hash.h \ + tree_fragment.cc \ + tree_fragment.h \ + maxtrans_blunsom.cc \ + phrasebased_translator.cc \ + phrasetable_fst.cc \ + rescore_translator.cc \ + rule_lexer.cc \ + scfg_translator.cc \ + tagger.cc \ + translator.cc \ + trule.cc \ + viterbi.cc \ + JSON_parser.c diff --git a/decoder/aligner.cc b/decoder/aligner.cc new file mode 100644 index 000000000..232e022ad --- /dev/null +++ b/decoder/aligner.cc @@ -0,0 +1,306 @@ +#include "aligner.h" + +#include +#include + +#include + +#include "array2d.h" +#include "hg.h" +#include "kbest.h" +#include "sentence_metadata.h" +#include "inside_outside.h" +#include "viterbi.h" +#include "alignment_io.h" + +using namespace std; + +// used with lexical models since they may not fully generate the +// source string +void SourceEdgeCoveragesUsingParseIndices(const Hypergraph& g, + vector >* src_cov) { + src_cov->clear(); + src_cov->resize(g.edges_.size()); + + for (int i = 0; i < g.edges_.size(); ++i) { + const Hypergraph::Edge& edge = g.edges_[i]; + set& cov = (*src_cov)[i]; + // no words + if (edge.rule_->EWords() == 0 || edge.rule_->FWords() == 0) + continue; + // aligned to NULL (crf ibm variant only) + if (edge.prev_i_ == -1 || edge.i_ == -1) { + cov.insert(-1); + continue; + } + assert(edge.j_ >= 0); + assert(edge.prev_j_ >= 0); + if (edge.Arity() == 0) { + for (int k = edge.prev_i_; k < edge.prev_j_; ++k) + cov.insert(k); + } else { + // note: this code, which handles mixed NT and terminal + // rules assumes that nodes uniquely define a src and trg + // span. 
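      // Illustration (hypothetical rule): if the inverted e() side is
      // [ w1, NT, w2 ] with edge.prev_i_ = 3 and the NT's representative edge
      // covering 2 source words, then w1 covers source index 3, the NT
      // advances k from 4 to 6, and w2 covers index 6.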
+ int k = edge.prev_i_; + int j = 0; + const vector& f = edge.rule_->e(); // rules are inverted + while (k < edge.prev_j_) { + if (f[j] > 0) { + cov.insert(k); + // cerr << "src: " << k << endl; + ++k; + ++j; + } else { + const Hypergraph::Node& tailnode = g.nodes_[edge.tail_nodes_[-f[j]]]; + assert(tailnode.in_edges_.size() > 0); + // any edge will do: + const Hypergraph::Edge& rep_edge = g.edges_[tailnode.in_edges_.front()]; + //cerr << "skip " << (rep_edge.prev_j_ - rep_edge.prev_i_) << endl; // src span + k += (rep_edge.prev_j_ - rep_edge.prev_i_); // src span + ++j; + } + } + } + } +} + +int SourceEdgeCoveragesUsingTree(const Hypergraph& g, + int node_id, + int span_start, + vector* spans, + vector >* src_cov) { + const Hypergraph::Node& node = g.nodes_[node_id]; + int k = -1; + for (int i = 0; i < node.in_edges_.size(); ++i) { + const int edge_id = node.in_edges_[i]; + const Hypergraph::Edge& edge = g.edges_[edge_id]; + set& cov = (*src_cov)[edge_id]; + const vector& f = edge.rule_->e(); // rules are inverted + int j = 0; + k = span_start; + while (j < f.size()) { + if (f[j] > 0) { + cov.insert(k); + ++k; + ++j; + } else { + const int tail_node_id = edge.tail_nodes_[-f[j]]; + int &right_edge = (*spans)[tail_node_id]; + if (right_edge < 0) + right_edge = SourceEdgeCoveragesUsingTree(g, tail_node_id, k, spans, src_cov); + k = right_edge; + ++j; + } + } + } + return k; +} + +void SourceEdgeCoveragesUsingTree(const Hypergraph& g, + vector >* src_cov) { + src_cov->clear(); + src_cov->resize(g.edges_.size()); + vector span_sizes(g.nodes_.size(), -1); + SourceEdgeCoveragesUsingTree(g, g.nodes_.size() - 1, 0, &span_sizes, src_cov); +} + +int TargetEdgeCoveragesUsingTree(const Hypergraph& g, + int node_id, + int span_start, + vector* spans, + vector >* trg_cov) { + const Hypergraph::Node& node = g.nodes_[node_id]; + int k = -1; + for (int i = 0; i < node.in_edges_.size(); ++i) { + const int edge_id = node.in_edges_[i]; + const Hypergraph::Edge& edge = g.edges_[edge_id]; + set& cov = (*trg_cov)[edge_id]; + int ntc = 0; + const vector& e = edge.rule_->f(); // rules are inverted + int j = 0; + k = span_start; + while (j < e.size()) { + if (e[j] > 0) { + cov.insert(k); + ++k; + ++j; + } else { + const int tail_node_id = edge.tail_nodes_[ntc]; + ++ntc; + int &right_edge = (*spans)[tail_node_id]; + if (right_edge < 0) + right_edge = TargetEdgeCoveragesUsingTree(g, tail_node_id, k, spans, trg_cov); + k = right_edge; + ++j; + } + } + // cerr << "node=" << node_id << ": k=" << k << endl; + } + return k; +} + +void TargetEdgeCoveragesUsingTree(const Hypergraph& g, + vector >* trg_cov) { + trg_cov->clear(); + trg_cov->resize(g.edges_.size()); + vector span_sizes(g.nodes_.size(), -1); + TargetEdgeCoveragesUsingTree(g, g.nodes_.size() - 1, 0, &span_sizes, trg_cov); +} + +struct TransitionEventWeightFunction { + typedef SparseVector Result; + inline SparseVector operator()(const Hypergraph::Edge& e) const { + SparseVector result; + result.set_value(e.id_, e.edge_prob_); + return result; + } +}; + +inline void WriteProbGrid(const Array2D& m, ostream* pos) { + ostream& os = *pos; + char b[1024]; + for (int i=0; i* edges) { + bool fix_up_src_spans = false; + if (k_best > 1 && edges) { + cerr << "ERROR: cannot request multiple best alignments and provide an edge set!\n"; + abort(); + } + if (map_instead_of_viterbi) { + if (k_best != 0) { + cerr << "WARNING: K-best alignment extraction not available for MAP, use --aligner_use_viterbi\n"; + } + k_best = 1; + } else { + if (k_best == 0) k_best = 1; + } + const 
Hypergraph* g = &in_g; + HypergraphP new_hg; + if (!src_lattice.IsSentence() || + !trg_lattice.IsSentence()) { + if (map_instead_of_viterbi) { + cerr << " Lattice alignment: using Viterbi instead of MAP alignment\n"; + } + map_instead_of_viterbi = false; + fix_up_src_spans = !src_lattice.IsSentence(); + } + + KBest::KBestDerivations, ViterbiPathTraversal> kbest(in_g, k_best); + boost::scoped_ptr > kbest_edges; + + for (int best = 0; best < k_best; ++best) { + const KBest::KBestDerivations, ViterbiPathTraversal>::Derivation* d = NULL; + if (!map_instead_of_viterbi) { + d = kbest.LazyKthBest(in_g.nodes_.size() - 1, best); + if (!d) break; // there are fewer than k_best derivations! + const vector& yield = d->yield; + kbest_edges.reset(new vector(in_g.edges_.size(), false)); + for (int i = 0; i < yield.size(); ++i) { + assert(yield[i]->id_ < kbest_edges->size()); + (*kbest_edges)[yield[i]->id_] = true; + } + } + if (!map_instead_of_viterbi || edges) { + if (kbest_edges) edges = kbest_edges.get(); + new_hg = in_g.CreateViterbiHypergraph(edges); + for (int i = 0; i < new_hg->edges_.size(); ++i) + new_hg->edges_[i].edge_prob_ = prob_t::One(); + g = new_hg.get(); + } + + vector edge_posteriors(g->edges_.size(), prob_t::Zero()); + vector trg_sent; + vector src_sent; + if (fix_up_src_spans) { + ViterbiESentence(*g, &src_sent); + } else { + src_sent.resize(src_lattice.size()); + for (int i = 0; i < src_sent.size(); ++i) + src_sent[i] = src_lattice[i][0].label; + } + + ViterbiFSentence(*g, &trg_sent); + + if (edges || !map_instead_of_viterbi) { + for (int i = 0; i < edge_posteriors.size(); ++i) + edge_posteriors[i] = prob_t::One(); + } else { + SparseVector posts; + const prob_t z = InsideOutside, TransitionEventWeightFunction>(*g, &posts); + for (int i = 0; i < edge_posteriors.size(); ++i) + edge_posteriors[i] = posts.value(i) / z; + } + vector > src_cov(g->edges_.size()); + vector > trg_cov(g->edges_.size()); + TargetEdgeCoveragesUsingTree(*g, &trg_cov); + + if (fix_up_src_spans) + SourceEdgeCoveragesUsingTree(*g, &src_cov); + else + SourceEdgeCoveragesUsingParseIndices(*g, &src_cov); + + // figure out the src and reference size; + int src_size = src_sent.size(); + int ref_size = trg_sent.size(); + Array2D align(src_size + 1, ref_size, prob_t::Zero()); + for (int c = 0; c < g->edges_.size(); ++c) { + const prob_t& p = edge_posteriors[c]; + const set& srcs = src_cov[c]; + const set& trgs = trg_cov[c]; + for (set::const_iterator si = srcs.begin(); + si != srcs.end(); ++si) { + for (set::const_iterator ti = trgs.begin(); + ti != trgs.end(); ++ti) { + align(*si + 1, *ti) += p; + } + } + } + new_hg.reset(); + //if (g != &in_g) { g.reset(); } + + prob_t threshold(0.9); + const bool use_soft_threshold = true; // TODO configure + + Array2D grid(src_size, ref_size, false); + for (int j = 0; j < ref_size; ++j) { + if (use_soft_threshold) { + threshold = prob_t::Zero(); + for (int i = 0; i <= src_size; ++i) + if (align(i, j) > threshold) threshold = align(i, j); + //threshold *= prob_t(0.99); + } + for (int i = 0; i < src_size; ++i) + grid(i, j) = align(i+1, j) >= threshold; + } + if (out == &cout && k_best < 2) { + // TODO need to do some sort of verbose flag + WriteProbGrid(align, &cerr); + cerr << grid << endl; + } + (*out) << TD::GetString(src_sent) << " ||| " << TD::GetString(trg_sent) << " ||| "; + AlignmentIO::SerializePharaohFormat(grid, out); + } +}; + diff --git a/decoder/aligner.h b/decoder/aligner.h new file mode 100644 index 000000000..a34795c91 --- /dev/null +++ b/decoder/aligner.h @@ -0,0 +1,26 
@@ +#ifndef _ALIGNER_H_ + +#include +#include +#include +#include "array2d.h" +#include "lattice.h" + +class Hypergraph; +class SentenceMetadata; + +struct AlignerTools { + + // assumption: g contains derivations of input/ref and + // ONLY input/ref. + // if edges is non-NULL, the alignment corresponding to the edge rules will be written + static void WriteAlignment(const Lattice& src, + const Lattice& ref, + const Hypergraph& g, + std::ostream* out, + bool map_instead_of_viterbi = true, + int k_best = 0, + const std::vector* edges = NULL); +}; + +#endif diff --git a/decoder/apply_models.cc b/decoder/apply_models.cc new file mode 100644 index 000000000..9f8bbeade --- /dev/null +++ b/decoder/apply_models.cc @@ -0,0 +1,631 @@ +////TODO: keep model state in forest? + +//TODO: (for many nonterminals, or multi-rescoring pass) either global +//best-first, or group by (NT,span) - use prev forest outside as a (admissable, +//if models are a subset and weights are same) heuristic + +#include "apply_models.h" + +#include +#include +#ifndef HAVE_OLD_CPP +# include +# include +#else +# include +# include +namespace std { using std::tr1::unordered_map; using std::tr1::unordered_set; } +#endif + +#include + +#include "node_state_hash.h" +#include "verbose.h" +#include "hg.h" +#include "ff.h" +#include "ffset.h" + +#define NORMAL_CP 1 +#define FAST_CP 2 +#define FAST_CP_2 3 + +using namespace std; + +struct Candidate; +typedef SmallVectorInt JVector; +typedef vector CandidateHeap; +typedef vector CandidateList; + +// default vector size (* sizeof string is memory used) +static const size_t kRESERVE_NUM_NODES = 500000ul; + +// life cycle: candidates are created, placed on the heap +// and retrieved by their estimated cost, when they're +// retrieved, they're incorporated into the +LM hypergraph +// where they also know the head node index they are +// attached to. After they are added to the +LM hypergraph +// vit_prob_ and est_prob_ fields may be updated as better +// derivations are found (this happens since the successor's +// of derivation d may have a better score- they are +// explored lazily). However, the updates don't happen +// when a candidate is in the heap so maintaining the heap +// property is not an issue. 
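// Illustration: for an in-edge with two tail nodes, j_ = <0,2> selects the
// top-ranked candidate of the first tail node and the third-ranked candidate
// of the second; vit_prob_ is then edge_prob_ times the product of those
// candidates' vit_prob_ values, and est_prob_ further multiplies in the
// models' per-edge estimate.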
+struct Candidate { + int node_index_; // -1 until incorporated + // into the +LM forest + const Hypergraph::Edge* in_edge_; // in -LM forest + Hypergraph::Edge out_edge_; + FFState state_; + const JVector j_; + prob_t vit_prob_; // these are fixed until the cand + // is popped, then they may be updated + prob_t est_prob_; + + Candidate(const Hypergraph::Edge& e, + const JVector& j, + const Hypergraph& out_hg, + const vector& D, + const FFStates& node_states, + const SentenceMetadata& smeta, + const ModelSet& models, + bool is_goal) : + node_index_(-1), + in_edge_(&e), + j_(j) { + InitializeCandidate(out_hg, smeta, D, node_states, models, is_goal); + } + + // used to query uniqueness + Candidate(const Hypergraph::Edge& e, + const JVector& j) : in_edge_(&e), j_(j) {} + + bool IsIncorporatedIntoHypergraph() const { + return node_index_ >= 0; + } + + void InitializeCandidate(const Hypergraph& out_hg, + const SentenceMetadata& smeta, + const vector >& D, + const FFStates& node_states, + const ModelSet& models, + const bool is_goal) { + const Hypergraph::Edge& in_edge = *in_edge_; + out_edge_.rule_ = in_edge.rule_; + out_edge_.feature_values_ = in_edge.feature_values_; + out_edge_.i_ = in_edge.i_; + out_edge_.j_ = in_edge.j_; + out_edge_.prev_i_ = in_edge.prev_i_; + out_edge_.prev_j_ = in_edge.prev_j_; + Hypergraph::TailNodeVector& tail = out_edge_.tail_nodes_; + tail.resize(j_.size()); + prob_t p = prob_t::One(); + // cerr << "\nEstimating application of " << in_edge.rule_->AsString() << endl; + for (int i = 0; i < tail.size(); ++i) { + const Candidate& ant = *D[in_edge.tail_nodes_[i]][j_[i]]; + assert(ant.IsIncorporatedIntoHypergraph()); + tail[i] = ant.node_index_; + p *= ant.vit_prob_; + } + prob_t edge_estimate = prob_t::One(); + if (is_goal) { + assert(tail.size() == 1); + const FFState& ant_state = node_states[tail.front()]; + models.AddFinalFeatures(ant_state, &out_edge_, smeta); + } else { + models.AddFeaturesToEdge(smeta, out_hg, node_states, &out_edge_, &state_, &edge_estimate); + } + vit_prob_ = out_edge_.edge_prob_ * p; + est_prob_ = vit_prob_ * edge_estimate; + } +}; + +ostream& operator<<(ostream& os, const Candidate& cand) { + os << "CAND["; + if (!cand.IsIncorporatedIntoHypergraph()) { os << "PENDING "; } + else { os << "+LM_node=" << cand.node_index_; } + os << " edge=" << cand.in_edge_->id_; + os << " j=<"; + for (int i = 0; i < cand.j_.size(); ++i) + os << (i==0 ? "" : " ") << cand.j_[i]; + os << "> vit=" << log(cand.vit_prob_); + os << " est=" << log(cand.est_prob_); + return os << ']'; +} + +struct HeapCandCompare { + bool operator()(const Candidate* l, const Candidate* r) const { + return l->est_prob_ < r->est_prob_; + } +}; + +struct EstProbSorter { + bool operator()(const Candidate* l, const Candidate* r) const { + return l->est_prob_ > r->est_prob_; + } +}; + +// the same candidate can be added multiple times if +// j is multidimensional (if you're going NW in Manhattan, you +// can first go north, then west, or you can go west then north) +// this is a hash function on the relevant variables from +// Candidate to enforce this. 
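// Illustration: starting from j = <0,0>, the successor chains
// <1,0> -> <1,1> and <0,1> -> <1,1> both propose candidate <1,1> for the same
// in-edge; hashing and comparing on (in_edge_, j_) lets the
// UniqueCandidateSet recognize the duplicate so it is only pushed once.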
+struct CandidateUniquenessHash { + size_t operator()(const Candidate* c) const { + size_t x = 5381; + x = ((x << 5) + x) ^ c->in_edge_->id_; + for (int i = 0; i < c->j_.size(); ++i) + x = ((x << 5) + x) ^ c->j_[i]; + return x; + } +}; + +struct CandidateUniquenessEquals { + bool operator()(const Candidate* a, const Candidate* b) const { + return (a->in_edge_ == b->in_edge_) && (a->j_ == b->j_); + } +}; + +typedef unordered_set UniqueCandidateSet; +typedef unordered_map > State2Node; + +class CubePruningRescorer { + +public: + CubePruningRescorer(const ModelSet& m, + const SentenceMetadata& sm, + const Hypergraph& i, + int pop_limit, + Hypergraph* o, + int s = NORMAL_CP ) : + models(m), + smeta(sm), + in(i), + out(*o), + D(in.nodes_.size()), + pop_limit_(pop_limit), + strategy_(s){ + if (!SILENT) cerr << " Applying feature functions (cube pruning, pop_limit = " << pop_limit_ << ')' << endl; + node_states_.reserve(kRESERVE_NUM_NODES); + } + + void Apply() { + int num_nodes = in.nodes_.size(); + assert(num_nodes >= 2); + int goal_id = num_nodes - 1; + int pregoal = goal_id - 1; + assert(in.nodes_[pregoal].out_edges_.size() == 1); + if (!SILENT) cerr << " "; + int has = 0; + for (int i = 0; i < in.nodes_.size(); ++i) { + if (!SILENT) { + int needs = (50 * i / in.nodes_.size()); + while (has < needs) { cerr << '.'; ++has; } + } + if (strategy_==NORMAL_CP){ + KBest(i, i == goal_id); + } + if (strategy_==FAST_CP){ + KBestFast(i, i == goal_id); + } + if (strategy_==FAST_CP_2){ + KBestFast2(i, i == goal_id); + } + } + if (!SILENT) { + cerr << endl; + cerr << " Best path: " << log(D[goal_id].front()->vit_prob_) + << "\t" << log(D[goal_id].front()->est_prob_) << endl; + } + out.PruneUnreachable(D[goal_id].front()->node_index_); + FreeAll(); + } + + private: + void FreeAll() { + for (int i = 0; i < D.size(); ++i) { + CandidateList& D_i = D[i]; + for (int j = 0; j < D_i.size(); ++j) + delete D_i[j]; + } + D.clear(); + } + + void IncorporateIntoPlusLMForest(size_t head_node_hash, Candidate* item, State2Node* s2n, CandidateList* freelist) { + Hypergraph::Edge* new_edge = out.AddEdge(item->out_edge_); + new_edge->edge_prob_ = item->out_edge_.edge_prob_; + Candidate*& o_item = (*s2n)[item->state_]; + if (!o_item) o_item = item; + + int& node_id = o_item->node_index_; + if (node_id < 0) { + Hypergraph::Node* new_node = out.AddNode(in.nodes_[item->in_edge_->head_node_].cat_); + new_node->node_hash = cdec::HashNode(head_node_hash, item->state_); // ID is combination of existing state + residual state + node_states_.push_back(item->state_); + node_id = new_node->id_; + } +#if 0 + Hypergraph::Node* node = &out.nodes_[node_id]; + out.ConnectEdgeToHeadNode(new_edge, node); +#else + out.ConnectEdgeToHeadNode(new_edge, node_id); +#endif + // update candidate if we have a better derivation + // note: the difference between the vit score and the estimated + // score is the same for all items with a common residual DP + // state + if (item->vit_prob_ > o_item->vit_prob_) { + assert(o_item->state_ == item->state_); // sanity check! 
+ o_item->est_prob_ = item->est_prob_; + o_item->vit_prob_ = item->vit_prob_; + } + if (item != o_item) freelist->push_back(item); + } + + void KBest(const int vert_index, const bool is_goal) { + // cerr << "KBest(" << vert_index << ")\n"; + CandidateList& D_v = D[vert_index]; + assert(D_v.empty()); + const Hypergraph::Node& v = in.nodes_[vert_index]; + // cerr << " has " << v.in_edges_.size() << " in-coming edges\n"; + const vector& in_edges = v.in_edges_; + CandidateHeap cand; + CandidateList freelist; + cand.reserve(in_edges.size()); + UniqueCandidateSet unique_cands; + for (int i = 0; i < in_edges.size(); ++i) { + const Hypergraph::Edge& edge = in.edges_[in_edges[i]]; + const JVector j(edge.tail_nodes_.size(), 0); + cand.push_back(new Candidate(edge, j, out, D, node_states_, smeta, models, is_goal)); + bool is_new = unique_cands.insert(cand.back()).second; + assert(is_new); // these should all be unique! + } +// cerr << " making heap of " << cand.size() << " candidates\n"; + make_heap(cand.begin(), cand.end(), HeapCandCompare()); + State2Node state2node; // "buf" in Figure 2 + int pops = 0; + while(!cand.empty() && pops < pop_limit_) { + pop_heap(cand.begin(), cand.end(), HeapCandCompare()); + Candidate* item = cand.back(); + cand.pop_back(); + // cerr << "POPPED: " << *item << endl; + PushSucc(*item, is_goal, &cand, &unique_cands); + IncorporateIntoPlusLMForest(v.node_hash, item, &state2node, &freelist); + ++pops; + } + D_v.resize(state2node.size()); + int c = 0; + for (State2Node::iterator i = state2node.begin(); i != state2node.end(); ++i) + D_v[c++] = i->second; + sort(D_v.begin(), D_v.end(), EstProbSorter()); + // cerr << " expanded to " << D_v.size() << " nodes\n"; + + for (int i = 0; i < cand.size(); ++i) + delete cand[i]; + // freelist is necessary since even after an item merged, it still stays in + // the unique set so it can't be deleted til now + for (int i = 0; i < freelist.size(); ++i) + delete freelist[i]; + } + + void KBestFast(const int vert_index, const bool is_goal) { + // cerr << "KBest(" << vert_index << ")\n"; + CandidateList& D_v = D[vert_index]; + assert(D_v.empty()); + const Hypergraph::Node& v = in.nodes_[vert_index]; + // cerr << " has " << v.in_edges_.size() << " in-coming edges\n"; + const vector& in_edges = v.in_edges_; + CandidateHeap cand; + CandidateList freelist; + cand.reserve(in_edges.size()); + //init with j<0,0> for all rules-edges that lead to node-(NT-span) + for (int i = 0; i < in_edges.size(); ++i) { + const Hypergraph::Edge& edge = in.edges_[in_edges[i]]; + const JVector j(edge.tail_nodes_.size(), 0); + cand.push_back(new Candidate(edge, j, out, D, node_states_, smeta, models, is_goal)); + } + // cerr << " making heap of " << cand.size() << " candidates\n"; + make_heap(cand.begin(), cand.end(), HeapCandCompare()); + State2Node state2node; // "buf" in Figure 2 + int pops = 0; + while(!cand.empty() && pops < pop_limit_) { + pop_heap(cand.begin(), cand.end(), HeapCandCompare()); + Candidate* item = cand.back(); + cand.pop_back(); + // cerr << "POPPED: " << *item << endl; + + PushSuccFast(*item, is_goal, &cand); + IncorporateIntoPlusLMForest(v.node_hash, item, &state2node, &freelist); + ++pops; + } + D_v.resize(state2node.size()); + int c = 0; + for (auto& i : state2node) { + D_v[c++] = i.second; + // cerr << "MERGED: " << *i.second << endl; + } + //cerr <<"Node id: "<< vert_index<< endl; + //#ifdef MEASURE_CA + // cerr << "countInProcess (pop/tot): node id: " << vert_index << " (" << count_in_process_pop << "/" << count_in_process_tot << ")"<& 
in_edges = v.in_edges_; + CandidateHeap cand; + CandidateList freelist; + cand.reserve(in_edges.size()); + UniqueCandidateSet unique_accepted; + //init with j<0,0> for all rules-edges that lead to node-(NT-span) + for (int i = 0; i < in_edges.size(); ++i) { + const Hypergraph::Edge& edge = in.edges_[in_edges[i]]; + const JVector j(edge.tail_nodes_.size(), 0); + cand.push_back(new Candidate(edge, j, out, D, node_states_, smeta, models, is_goal)); + } + // cerr << " making heap of " << cand.size() << " candidates\n"; + make_heap(cand.begin(), cand.end(), HeapCandCompare()); + State2Node state2node; // "buf" in Figure 2 + int pops = 0; + while(!cand.empty() && pops < pop_limit_) { + pop_heap(cand.begin(), cand.end(), HeapCandCompare()); + Candidate* item = cand.back(); + cand.pop_back(); + bool is_new = unique_accepted.insert(item).second; + assert(is_new); // these should all be unique! + // cerr << "POPPED: " << *item << endl; + + PushSuccFast2(*item, is_goal, &cand, &unique_accepted); + IncorporateIntoPlusLMForest(v.node_hash, item, &state2node, &freelist); + ++pops; + } + D_v.resize(state2node.size()); + int c = 0; + for (State2Node::iterator i = state2node.begin(); i != state2node.end(); ++i){ + D_v[c++] = i->second; + // cerr << "MERGED: " << *i->second << endl; + } + //cerr <<"Node id: "<< vert_index<< endl; + //#ifdef MEASURE_CA + // cerr << "countInProcess (pop/tot): node id: " << vert_index << " (" << count_in_process_pop << "/" << count_in_process_tot << ")"<tail_nodes_[i]].size()) { + Candidate query_unique(*item.in_edge_, j); + if (cs->count(&query_unique) == 0) { + Candidate* new_cand = new Candidate(*item.in_edge_, j, out, D, node_states_, smeta, models, is_goal); + cand.push_back(new_cand); + push_heap(cand.begin(), cand.end(), HeapCandCompare()); + bool is_new = cs->insert(new_cand).second; + assert(is_new); // insert into uniqueness set, sanity check + } + } + } + } + + //PushSucc following unique ancestor generation function + void PushSuccFast(const Candidate& item, const bool is_goal, CandidateHeap* pcand){ + CandidateHeap& cand = *pcand; + for (int i = 0; i < item.j_.size(); ++i) { + JVector j = item.j_; + ++j[i]; + if (j[i] < D[item.in_edge_->tail_nodes_[i]].size()) { + Candidate* new_cand = new Candidate(*item.in_edge_, j, out, D, node_states_, smeta, models, is_goal); + cand.push_back(new_cand); + push_heap(cand.begin(), cand.end(), HeapCandCompare()); + } + if(item.j_[i]!=0){ + return; + } + } + } + + //PushSucc only if all ancest Cand are added + void PushSuccFast2(const Candidate& item, const bool is_goal, CandidateHeap* pcand, UniqueCandidateSet* ps){ + CandidateHeap& cand = *pcand; + for (int i = 0; i < item.j_.size(); ++i) { + JVector j = item.j_; + ++j[i]; + if (j[i] < D[item.in_edge_->tail_nodes_[i]].size()) { + Candidate query_unique(*item.in_edge_, j); + if (HasAllAncestors(&query_unique,ps)) { + Candidate* new_cand = new Candidate(*item.in_edge_, j, out, D, node_states_, smeta, models, is_goal); + cand.push_back(new_cand); + push_heap(cand.begin(), cand.end(), HeapCandCompare()); + } + } + } + } + + bool HasAllAncestors(const Candidate* item, UniqueCandidateSet* cs){ + for (int i = 0; i < item->j_.size(); ++i) { + JVector j = item->j_; + --j[i]; + if (j[i] >=0) { + Candidate query_unique(*item->in_edge_, j); + if (cs->count(&query_unique) == 0) { + return false; + } + } + } + return true; + } + + const ModelSet& models; + const SentenceMetadata& smeta; + const Hypergraph& in; + Hypergraph& out; + + vector D; // maps nodes in in-HG to the + // equivalent 
nodes (many due to state + // splits) in the out-HG. + FFStates node_states_; // for each node in the out-HG what is + // its q function value? + const int pop_limit_; + const int strategy_; //switch Cube Pruning strategy: 1 normal, 2 fast (alg 2), 3 fast_2 (alg 3). (see: Gesmundo A., Henderson J,. Faster Cube Pruning, IWSLT 2010) +}; + +struct NoPruningRescorer { + NoPruningRescorer(const ModelSet& m, const SentenceMetadata &sm, const Hypergraph& i, Hypergraph* o) : + models(m), + smeta(sm), + in(i), + out(*o), + nodemap(i.nodes_.size()) { + if (!SILENT) cerr << " Rescoring forest (full intersection)\n"; + node_states_.reserve(kRESERVE_NUM_NODES); + } + + typedef unordered_map > State2NodeIndex; + + void ExpandEdge(const Hypergraph::Edge& in_edge, bool is_goal, size_t head_node_hash, State2NodeIndex* state2node) { + const int arity = in_edge.Arity(); + Hypergraph::TailNodeVector ends(arity); + for (int i = 0; i < arity; ++i) + ends[i] = nodemap[in_edge.tail_nodes_[i]].size(); + + Hypergraph::TailNodeVector tail_iter(arity, 0); + bool done = false; + while (!done) { + Hypergraph::TailNodeVector tail(arity); + for (int i = 0; i < arity; ++i) + tail[i] = nodemap[in_edge.tail_nodes_[i]][tail_iter[i]]; + Hypergraph::Edge* new_edge = out.AddEdge(in_edge, tail); + FFState head_state; + if (is_goal) { + assert(tail.size() == 1); + const FFState& ant_state = node_states_[tail.front()]; + models.AddFinalFeatures(ant_state, new_edge,smeta); + } else { + prob_t edge_estimate; // this is a full intersection, so we disregard this + models.AddFeaturesToEdge(smeta, out, node_states_, new_edge, &head_state, &edge_estimate); + } + int& head_plus1 = (*state2node)[head_state]; + if (!head_plus1) { + HG::Node* new_node = out.AddNode(in_edge.rule_->GetLHS()); + new_node->node_hash = cdec::HashNode(head_node_hash, head_state); // ID is combination of existing state + residual state + head_plus1 = new_node->id_ + 1; + node_states_.push_back(head_state); + nodemap[in_edge.head_node_].push_back(head_plus1 - 1); + } + const int head_index = head_plus1 - 1; + out.ConnectEdgeToHeadNode(new_edge->id_, head_index); + + int ii = 0; + for (; ii < arity; ++ii) { + ++tail_iter[ii]; + if (tail_iter[ii] < ends[ii]) break; + tail_iter[ii] = 0; + } + done = (ii == arity); + } + } + + void ProcessOneNode(const int node_num, const bool is_goal) { + State2NodeIndex state2node; + const Hypergraph::Node& node = in.nodes_[node_num]; + for (int i = 0; i < node.in_edges_.size(); ++i) { + const Hypergraph::Edge& edge = in.edges_[node.in_edges_[i]]; + ExpandEdge(edge, is_goal, node.node_hash, &state2node); + } + } + + void Apply() { + int num_nodes = in.nodes_.size(); + int goal_id = num_nodes - 1; + int pregoal = goal_id - 1; + assert(in.nodes_[pregoal].out_edges_.size() == 1); + if (!SILENT) cerr << " "; + int has = 0; + for (int i = 0; i < in.nodes_.size(); ++i) { + if (!SILENT) { + int needs = (50 * i / in.nodes_.size()); + while (has < needs) { cerr << '.'; ++has; } + } + ProcessOneNode(i, i == goal_id); + } + if (!SILENT) cerr << endl; + } + + private: + const ModelSet& models; + const SentenceMetadata& smeta; + const Hypergraph& in; + Hypergraph& out; + + vector > nodemap; + FFStates node_states_; // for each node in the out-HG what is + // its q function value? 
+}; + +// each node in the graph has one of these, it keeps track of +void ApplyModelSet(const Hypergraph& in, + const SentenceMetadata& smeta, + const ModelSet& models, + const IntersectionConfiguration& config, + Hypergraph* out) { + //force exhaustive if there's no state req. for model + if (models.stateless() || config.algorithm == IntersectionConfiguration::FULL) { + NoPruningRescorer ma(models, smeta, in, out); // avoid overhead of best-first when no state + ma.Apply(); + } else if (config.algorithm == IntersectionConfiguration::CUBE + || config.algorithm == IntersectionConfiguration::FAST_CUBE_PRUNING + || config.algorithm == IntersectionConfiguration::FAST_CUBE_PRUNING_2) { + int pl = config.pop_limit; + const int max_pl_for_large=50; + if (pl > max_pl_for_large && in.nodes_.size() > 80000) { + pl = max_pl_for_large; + cerr << " Note: reducing pop_limit to " << pl << " for very large forest\n"; + } + if (config.algorithm == IntersectionConfiguration::CUBE) { + CubePruningRescorer ma(models, smeta, in, pl, out); + ma.Apply(); + } + else if (config.algorithm == IntersectionConfiguration::FAST_CUBE_PRUNING){ + CubePruningRescorer ma(models, smeta, in, pl, out, FAST_CP); + ma.Apply(); + } + else if (config.algorithm == IntersectionConfiguration::FAST_CUBE_PRUNING_2){ + CubePruningRescorer ma(models, smeta, in, pl, out, FAST_CP_2); + ma.Apply(); + } + + } else { + cerr << "Don't understand intersection algorithm " << config.algorithm << endl; + exit(1); + } + out->is_linear_chain_ = in.is_linear_chain_; // TODO remove when this is computed + // automatically +} + diff --git a/decoder/apply_models.h b/decoder/apply_models.h new file mode 100644 index 000000000..19a4c7be2 --- /dev/null +++ b/decoder/apply_models.h @@ -0,0 +1,43 @@ +#ifndef _APPLY_MODELS_H_ +#define _APPLY_MODELS_H_ + +#include + +struct ModelSet; +struct Hypergraph; +struct SentenceMetadata; + +struct exhaustive_t {}; + +struct IntersectionConfiguration { +enum { + FULL, + CUBE, + FAST_CUBE_PRUNING, + FAST_CUBE_PRUNING_2, + N_ALGORITHMS +}; + + const int algorithm; // 0 = full intersection, 1 = cube pruning + const int pop_limit; // max number of pops off the heap at each node + IntersectionConfiguration(int alg, int k) : algorithm(alg), pop_limit(k) {} + IntersectionConfiguration(exhaustive_t /* t */) : algorithm(0), pop_limit() {} +}; + +inline std::ostream& operator<<(std::ostream& os, const IntersectionConfiguration& c) { + if (c.algorithm == 0) { os << "FULL"; } + else if (c.algorithm == 1) { os << "CUBE:k=" << c.pop_limit; } + else if (c.algorithm == 2) { os << "FAST_CUBE_PRUNING"; } + else if (c.algorithm == 3) { os << "FAST_CUBE_PRUNING_2"; } + else if (c.algorithm == 4) { os << "N_ALGORITHMS"; } + else os << "OTHER"; + return os; +} + +void ApplyModelSet(const Hypergraph& in, + const SentenceMetadata& smeta, + const ModelSet& models, + const IntersectionConfiguration& config, + Hypergraph* out); + +#endif diff --git a/decoder/bottom_up_parser.cc b/decoder/bottom_up_parser.cc new file mode 100644 index 000000000..b30f1ec69 --- /dev/null +++ b/decoder/bottom_up_parser.cc @@ -0,0 +1,370 @@ +//TODO: when using many nonterminals, group passive edges for a span (treat all as a single X for the active items). 
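// Rough flow of this file: PassiveChart::Parse() runs a CKY-style loop over
// span lengths; for each span it advances ActiveChart items (dotted grammar
// rules) over lattice terminals/epsilons and over completed passive nodes,
// applies any completed rules, then applies the topologically sorted unary
// rules, and finally attaches the goal rule over the full input span.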
+ +//TODO: figure out what cdyer was talking about when he said that having unary rules A->B and B->A, doesn't make cycles appear in result provided rules are sorted in some way (that they typically are) + +#include "bottom_up_parser.h" + +#include +#include + +#include "node_state_hash.h" +#include "nt_span.h" +#include "hg.h" +#include "array2d.h" +#include "tdict.h" +#include "verbose.h" + +using namespace std; + +static WordID kEPS = 0; + +class ActiveChart; +class PassiveChart { + public: + PassiveChart(const string& goal, + const vector& grammars, + const Lattice& input, + Hypergraph* forest); + ~PassiveChart(); + + inline const vector& operator()(int i, int j) const { return chart_(i,j); } + bool Parse(); + inline int size() const { return chart_.width(); } + inline bool GoalFound() const { return goal_idx_ >= 0; } + inline int GetGoalIndex() const { return goal_idx_; } + + private: + void ApplyRules(const int i, + const int j, + const RuleBin* rules, + const Hypergraph::TailNodeVector& tail, + const float lattice_cost); + + void ApplyRule(const int i, + const int j, + const TRulePtr& r, + const Hypergraph::TailNodeVector& ant_nodes, + const float lattice_cost); + + void ApplyUnaryRules(const int i, const int j); + void TopoSortUnaries(); + + const vector& grammars_; + const Lattice& input_; + Hypergraph* forest_; + Array2D > chart_; // chart_(i,j) is the list of nodes derived spanning i,j + typedef map Cat2NodeMap; + Array2D nodemap_; + vector act_chart_; + const WordID goal_cat_; // category that is being searched for at [0,n] + TRulePtr goal_rule_; + int goal_idx_; // index of goal node, if found + const int lc_fid_; + vector unaries_; // topologically sorted list of unary rules from all grammars + + static WordID kGOAL; // [Goal] +}; + +WordID PassiveChart::kGOAL = 0; + +class ActiveChart { + public: + ActiveChart(const Hypergraph* hg, const PassiveChart& psv_chart) : + hg_(hg), + act_chart_(psv_chart.size(), psv_chart.size()), psv_chart_(psv_chart) {} + + struct ActiveItem { + ActiveItem(const GrammarIter* g, const Hypergraph::TailNodeVector& a, float lcost) : + gptr_(g), ant_nodes_(a), lattice_cost(lcost) {} + explicit ActiveItem(const GrammarIter* g) : + gptr_(g), ant_nodes_(), lattice_cost(0.0) {} + + void ExtendTerminal(int symbol, float src_cost, vector* out_cell) const { + if (symbol == kEPS) { + out_cell->push_back(ActiveItem(gptr_, ant_nodes_, lattice_cost + src_cost)); + } else { + const GrammarIter* ni = gptr_->Extend(symbol); + if (ni) + out_cell->push_back(ActiveItem(ni, ant_nodes_, lattice_cost + src_cost)); + } + } + void ExtendNonTerminal(const Hypergraph* hg, int node_index, vector* out_cell) const { + int symbol = hg->nodes_[node_index].cat_; + const GrammarIter* ni = gptr_->Extend(symbol); + if (!ni) return; + Hypergraph::TailNodeVector na(ant_nodes_.size() + 1); + for (unsigned i = 0; i < ant_nodes_.size(); ++i) + na[i] = ant_nodes_[i]; + na[ant_nodes_.size()] = node_index; + out_cell->push_back(ActiveItem(ni, na, lattice_cost)); + } + + const GrammarIter* gptr_; + Hypergraph::TailNodeVector ant_nodes_; + float lattice_cost; // TODO? 
use SparseVector + }; + + inline const vector& operator()(int i, int j) const { return act_chart_(i,j); } + void SeedActiveChart(const Grammar& g) { + int size = act_chart_.width(); + for (int i = 0; i < size; ++i) + if (g.HasRuleForSpan(i,i,0)) + act_chart_(i,i).push_back(ActiveItem(g.GetRoot())); + } + + void ExtendActiveItems(int i, int k, int j) { + //cerr << " LOOK(" << i << "," << k << ") for completed items in (" << k << "," << j << ")\n"; + vector& cell = act_chart_(i,j); + const vector& icell = act_chart_(i,k); + const vector& idxs = psv_chart_(k, j); + //if (!idxs.empty()) { cerr << "FOUND IN (" << k << "," << j << ")\n"; } + for (vector::const_iterator di = icell.begin(); di != icell.end(); ++di) { + for (vector::const_iterator ni = idxs.begin(); ni != idxs.end(); ++ni) { + di->ExtendNonTerminal(hg_, *ni, &cell); + } + } + } + + void AdvanceDotsForAllItemsInCell(int i, int j, const vector >& input) { + //cerr << "ADVANCE(" << i << "," << j << ")\n"; + for (int k=i+1; k < j; ++k) + ExtendActiveItems(i, k, j); + + const vector& out_arcs = input[j-1]; + for (vector::const_iterator ai = out_arcs.begin(); + ai != out_arcs.end(); ++ai) { + const WordID& f = ai->label; + const double& c = ai->cost; + const int& len = ai->dist2next; + //cerr << "F: " << TD::Convert(f) << " dest=" << i << "," << (j+len-1) << endl; + const vector& ec = act_chart_(i, j-1); + //cerr << " SRC=" << i << "," << (j-1) << " [ec=" << ec.size() << "]" << endl; + //if (ec.size() > 0) { cerr << " LC=" << ec[0].lattice_cost << endl; } + for (vector::const_iterator di = ec.begin(); di != ec.end(); ++di) + di->ExtendTerminal(f, c, &act_chart_(i, j + len - 1)); + } + } + + private: + const Hypergraph* hg_; + Array2D > act_chart_; + const PassiveChart& psv_chart_; +}; + +PassiveChart::PassiveChart(const string& goal, + const vector& grammars, + const Lattice& input, + Hypergraph* forest) : + grammars_(grammars), + input_(input), + forest_(forest), + chart_(input.size()+1, input.size()+1), + nodemap_(input.size()+1, input.size()+1), + goal_cat_(TD::Convert(goal) * -1), + goal_rule_(new TRule("[Goal] ||| [" + goal + "] ||| [1]")), + goal_idx_(-1), + lc_fid_(FD::Convert("LatticeCost")), + unaries_() { + act_chart_.resize(grammars_.size()); + for (unsigned i = 0; i < grammars_.size(); ++i) { + act_chart_[i] = new ActiveChart(forest, *this); + const vector& u = grammars_[i]->GetAllUnaryRules(); + for (unsigned j = 0; j < u.size(); ++j) + unaries_.push_back(u[j]); + } + TopoSortUnaries(); + if (!kGOAL) kGOAL = TD::Convert("Goal") * -1; + if (!SILENT) cerr << " Goal category: [" << goal << ']' << endl; +} + +static bool TopoSortVisit(int node, vector& u, const map >& g, map& mark) { + if (mark[node] == 1) { + cerr << "[ERROR] Unary rule cycle detected involving [" << TD::Convert(-node) << "]\n"; + return false; // cycle detected + } else if (mark[node] == 2) { + return true; // already been + } + mark[node] = 1; + const map >::const_iterator nit = g.find(node); + if (nit != g.end()) { + const vector& edges = nit->second; + vector okay(edges.size(), true); + for (unsigned i = 0; i < edges.size(); ++i) { + okay[i] = TopoSortVisit(edges[i]->lhs_, u, g, mark); + if (!okay[i]) { + cerr << "[ERROR] Unary rule cycle detected, removing: " << edges[i]->AsString() << endl; + } + } + for (unsigned i = 0; i < edges.size(); ++i) { + if (okay[i]) u.push_back(edges[i]); + //if (okay[i]) cerr << "UNARY: " << edges[i]->AsString() << endl; + } + } + mark[node] = 2; + return true; +} + +void PassiveChart::TopoSortUnaries() { + vector 
u(unaries_.size()); u.clear(); + map > g; + map mark; + //cerr << "GOAL=" << TD::Convert(-goal_cat_) << endl; + mark[goal_cat_] = 2; + for (unsigned i = 0; i < unaries_.size(); ++i) { + //cerr << "Adding: " << unaries_[i]->AsString() << endl; + g[unaries_[i]->f()[0]].push_back(unaries_[i]); + } + //m[unaries_[i]->lhs_].push_back(unaries_[i]); + for (map >::iterator it = g.begin(); it != g.end(); ++it) { + //cerr << "PROC: " << TD::Convert(-it->first) << endl; + if (mark[it->first] > 0) { + //cerr << "Already saw [" << TD::Convert(-it->first) << "]\n"; + } else { + TopoSortVisit(it->first, u, g, mark); + } + } + unaries_.clear(); + for (int i = u.size() - 1; i >= 0; --i) + unaries_.push_back(u[i]); +} + +void PassiveChart::ApplyRule(const int i, + const int j, + const TRulePtr& r, + const Hypergraph::TailNodeVector& ant_nodes, + const float lattice_cost) { + Hypergraph::Edge* new_edge = forest_->AddEdge(r, ant_nodes); + // cerr << i << " " << j << ": APPLYING RULE: " << r->AsString() << endl; + new_edge->prev_i_ = r->prev_i; + new_edge->prev_j_ = r->prev_j; + new_edge->i_ = i; + new_edge->j_ = j; + new_edge->feature_values_ = r->GetFeatureValues(); + if (lattice_cost && lc_fid_) + new_edge->feature_values_.set_value(lc_fid_, lattice_cost); + Cat2NodeMap& c2n = nodemap_(i,j); + const bool is_goal = (r->GetLHS() == kGOAL); + const Cat2NodeMap::iterator ni = c2n.find(r->GetLHS()); + Hypergraph::Node* node = NULL; + if (ni == c2n.end()) { + node = forest_->AddNode(r->GetLHS()); + c2n[r->GetLHS()] = node->id_; + if (is_goal) { + assert(goal_idx_ == -1); + goal_idx_ = node->id_; + } else { + chart_(i,j).push_back(node->id_); + } + } else { + node = &forest_->nodes_[ni->second]; + } + forest_->ConnectEdgeToHeadNode(new_edge, node); +} + +void PassiveChart::ApplyRules(const int i, + const int j, + const RuleBin* rules, + const Hypergraph::TailNodeVector& tail, + const float lattice_cost) { + const int n = rules->GetNumRules(); + //cerr << i << " " << j << ": NUM RULES: " << n << endl; + for (int k = 0; k < n; ++k) { + //cerr << i << " " << j << ": R=" << rules->GetIthRule(k)->AsString() << endl; + ApplyRule(i, j, rules->GetIthRule(k), tail, lattice_cost); + } +} + +void PassiveChart::ApplyUnaryRules(const int i, const int j) { + const vector& nodes = chart_(i,j); // reference is important! 
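  // (The reference matters: ApplyRule below may append newly proved nodes to
  // chart_(i,j), and since nodes.size() is re-read on every iteration those
  // new nodes also get unary rules applied in this same pass.)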
+ for (unsigned di = 0; di < nodes.size(); ++di) { + const WordID& cat = forest_->nodes_[nodes[di]].cat_; + for (unsigned ri = 0; ri < unaries_.size(); ++ri) { + //cerr << "At (" << i << "," << j << "): applying " << unaries_[ri]->AsString() << endl; + if (unaries_[ri]->f()[0] == cat) { + //cerr << " --MATCH\n"; + const Hypergraph::TailNodeVector ant(1, nodes[di]); + ApplyRule(i, j, unaries_[ri], ant, 0); // may update nodes + } + } + } +} + +bool PassiveChart::Parse() { + size_t in_size_2 = input_.size() * input_.size(); + forest_->nodes_.reserve(in_size_2 * 2); + size_t res = min(static_cast(2000000), static_cast(in_size_2 * 1000)); + forest_->edges_.reserve(res); + goal_idx_ = -1; + for (unsigned gi = 0; gi < grammars_.size(); ++gi) + act_chart_[gi]->SeedActiveChart(*grammars_[gi]); + + if (!SILENT) cerr << " "; + for (unsigned l=1; lAdvanceDotsForAllItemsInCell(i, j, input_); + + const vector& cell = (*act_chart_[gi])(i,j); + for (vector::const_iterator ai = cell.begin(); + ai != cell.end(); ++ai) { + const RuleBin* rules = (ai->gptr_->GetRules()); + if (!rules) continue; + ApplyRules(i, j, rules, ai->ant_nodes_, ai->lattice_cost); + } + } + } + ApplyUnaryRules(i,j); + + for (unsigned gi = 0; gi < grammars_.size(); ++gi) { + const Grammar& g = *grammars_[gi]; + // deal with non-terminals that were just proved + if (g.HasRuleForSpan(i, j, input_.Distance(i,j))) + act_chart_[gi]->ExtendActiveItems(i, i, j); + } + } + const vector& dh = chart_(0, input_.size()); + for (unsigned di = 0; di < dh.size(); ++di) { + const Hypergraph::Node& node = forest_->nodes_[dh[di]]; + if (node.cat_ == goal_cat_) { + Hypergraph::TailNodeVector ant(1, node.id_); + ApplyRule(0, input_.size(), goal_rule_, ant, 0); + } + } + } + if (!SILENT) cerr << endl; + + if (GoalFound()) + forest_->PruneUnreachable(forest_->nodes_.size() - 1); + return GoalFound(); +} + +PassiveChart::~PassiveChart() { + for (unsigned i = 0; i < act_chart_.size(); ++i) + delete act_chart_[i]; +} + +ExhaustiveBottomUpParser::ExhaustiveBottomUpParser( + const string& goal_sym, + const vector& grammars) : + goal_sym_(goal_sym), + grammars_(grammars) {} + +bool ExhaustiveBottomUpParser::Parse(const Lattice& input, + Hypergraph* forest) const { + kEPS = TD::Convert("*EPS*"); + PassiveChart chart(goal_sym_, grammars_, input, forest); + const bool result = chart.Parse(); + + if (result) { + for (auto& node : forest->nodes_) { + Span prev; + const Span s = forest->NodeSpan(node.id_, &prev); + node.node_hash = cdec::HashNode(node.cat_, s.l, s.r, prev.l, prev.r); + } + } + return result; +} diff --git a/decoder/bottom_up_parser.h b/decoder/bottom_up_parser.h new file mode 100644 index 000000000..546bfb54a --- /dev/null +++ b/decoder/bottom_up_parser.h @@ -0,0 +1,27 @@ +#ifndef _BOTTOM_UP_PARSER_H_ +#define _BOTTOM_UP_PARSER_H_ + +#include +#include + +#include "lattice.h" +#include "grammar.h" + +class Hypergraph; + +class ExhaustiveBottomUpParser { + public: + ExhaustiveBottomUpParser(const std::string& goal_sym, + const std::vector& grammars); + + // returns true if goal reached spanning the full input + // forest contains the full (i.e., unpruned) parse forest + bool Parse(const Lattice& input, + Hypergraph* forest) const; + + private: + const std::string goal_sym_; + const std::vector grammars_; +}; + +#endif diff --git a/decoder/cdec.cc b/decoder/cdec.cc new file mode 100644 index 000000000..cc3fcff11 --- /dev/null +++ b/decoder/cdec.cc @@ -0,0 +1,47 @@ +#include + +#include "filelib.h" +#include "decoder.h" +#include "ff_register.h" +#include 
"verbose.h" +#include "timing_stats.h" +#include "util/usage.hh" + +using namespace std; + +int main(int argc, char** argv) { + register_feature_functions(); + Decoder decoder(argc, argv); + + const string input = decoder.GetConf()["input"].as(); + const bool show_feature_dictionary = decoder.GetConf().count("show_feature_dictionary"); + if (!SILENT) cerr << "Reading input from " << ((input == "-") ? "STDIN" : input.c_str()) << endl; + ReadFile in_read(input); + istream *in = in_read.stream(); + assert(*in); + + string buf; +#ifdef CP_TIME + clock_t time_cp(0);//, end_cp; +#endif + while(*in) { + getline(*in, buf); + if (buf.empty()) continue; + decoder.Decode(buf); + } + Timer::Summarize(); +#ifdef CP_TIME + cerr << "Time required for Cube Pruning execution: " + << CpTime::Get() + << " seconds." << "\n\n"; +#endif + if (show_feature_dictionary) { + int num = FD::NumFeats(); + for (int i = 1; i < num; ++i) { + cout << FD::Convert(i) << endl; + } + } + util::PrintUsage(std::cerr); + return 0; +} + diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc new file mode 100644 index 000000000..0411908f2 --- /dev/null +++ b/decoder/cdec_ff.cc @@ -0,0 +1,81 @@ +#include + +#include "ff.h" +#include "ff_basic.h" +#include "ff_context.h" +#include "ff_spans.h" +#include "ff_lm.h" +#include "ff_klm.h" +#include "ff_ngrams.h" +#include "ff_csplit.h" +#include "ff_wordalign.h" +#include "ff_tagger.h" +#include "ff_factory.h" +#include "ff_rules.h" +#include "ff_ruleshape.h" +#include "ff_bleu.h" +#include "ff_soft_syntax.h" +#include "ff_soft_syntax_mindist.h" +#include "ff_source_path.h" +#include "ff_parse_match.h" +#include "ff_source_syntax.h" +#include "ff_source_syntax2.h" +#include "ff_register.h" +#include "ff_charset.h" +#include "ff_wordset.h" +#include "ff_external.h" + + +void register_feature_functions() { + static bool registered = false; + if (registered) { + assert(!"register_feature_functions() called twice!"); + } + registered = true; + + RegisterFF(); + + RegisterFF(); + RegisterFF(); + RegisterFF(); + RegisterFF(); + + //TODO: use for all features the new Register which requires static FF::usage(false,false) give name + ff_registry.Register("SpanFeatures", new FFFactory()); + ff_registry.Register("NgramFeatures", new FFFactory()); + ff_registry.Register("RuleContextFeatures", new FFFactory()); + ff_registry.Register("RuleIdentityFeatures", new FFFactory()); + ff_registry.Register("RuleWordAlignmentFeatures", new FFFactory()); + ff_registry.Register("ParseMatchFeatures", new FFFactory); + ff_registry.Register("SoftSyntaxFeatures", new FFFactory); + ff_registry.Register("SoftSyntaxFeaturesMindist", new FFFactory); + ff_registry.Register("SourceSyntaxFeatures", new FFFactory); + ff_registry.Register("SourceSpanSizeFeatures", new FFFactory); + ff_registry.Register("SourceSyntaxFeatures2", new FFFactory); + ff_registry.Register("CMR2008ReorderingFeatures", new FFFactory()); + ff_registry.Register("RuleSourceBigramFeatures", new FFFactory()); + ff_registry.Register("RuleTargetBigramFeatures", new FFFactory()); + ff_registry.Register("KLanguageModel", new KLanguageModelFactory()); + ff_registry.Register("NonLatinCount", new FFFactory); + ff_registry.Register("RuleShape", new FFFactory); + ff_registry.Register("RuleShape2", new FFFactory); + ff_registry.Register("RelativeSentencePosition", new FFFactory); + ff_registry.Register("LexNullJump", new FFFactory); + ff_registry.Register("NewJump", new FFFactory); + ff_registry.Register("SourceBigram", new FFFactory); + 
ff_registry.Register("Fertility", new FFFactory); + ff_registry.Register("BlunsomSynchronousParseHack", new FFFactory); + ff_registry.Register("CSplit_BasicFeatures", new FFFactory); + ff_registry.Register("CSplit_ReverseCharLM", new FFFactory); + ff_registry.Register("Tagger_BigramIndicator", new FFFactory); + ff_registry.Register("LexicalPairIndicator", new FFFactory); + ff_registry.Register("OutputIndicator", new FFFactory); + ff_registry.Register("IdentityCycleDetector", new FFFactory); + ff_registry.Register("InputIndicator", new FFFactory); + ff_registry.Register("LexicalTranslationTrigger", new FFFactory); + ff_registry.Register("WordPairFeatures", new FFFactory); + ff_registry.Register("SourcePathFeatures", new FFFactory); + ff_registry.Register("WordSet", new FFFactory); + ff_registry.Register("External", new FFFactory); +} + diff --git a/decoder/csplit.cc b/decoder/csplit.cc new file mode 100644 index 000000000..4a723822b --- /dev/null +++ b/decoder/csplit.cc @@ -0,0 +1,175 @@ +#include "csplit.h" + +#include + +#include "filelib.h" +#include "stringlib.h" +#include "hg.h" +#include "tdict.h" +#include "grammar.h" +#include "sentence_metadata.h" + +using namespace std; + +struct CompoundSplitImpl { + CompoundSplitImpl(const boost::program_options::variables_map& conf) : + fugen_elements_(true), + min_size_(3), + kXCAT(TD::Convert("X")*-1), + kWORDBREAK_RULE(new TRule("[X] ||| # ||| #")), + kTEMPLATE_RULE(new TRule("[X] ||| [X,1] ? ||| [1] ?")), + kGOAL_RULE(new TRule("[Goal] ||| [X,1] ||| [1]")), + kFUGEN_S(FD::Convert("FugS")), + kFUGEN_N(FD::Convert("FugN")) { + // TODO: use conf to turn fugenelements on and off + } + + void PasteTogetherStrings(const vector& chars, + const int i, + const int j, + string* yield) { + int size = 0; + for (int k=i; kresize(size); + int cur = 0; + for (int k=i; k& chars, + Hypergraph* forest) { + vector nodes(chars.size()+1, -1); + nodes[0] = forest->AddNode(kXCAT)->id_; // source + const int left_rule = forest->AddEdge(kWORDBREAK_RULE, Hypergraph::TailNodeVector())->id_; + forest->ConnectEdgeToHeadNode(left_rule, nodes[0]); + + const int max_split_ = max(static_cast(chars.size()) - min_size_ + 1, 1); + // cerr << "max: " << max_split_ << " " << " min: " << min_size_ << endl; + for (int i = min_size_; i < max_split_; ++i) + nodes[i] = forest->AddNode(kXCAT)->id_; + assert(nodes.back() == -1); + nodes.back() = forest->AddNode(kXCAT)->id_; // sink + + for (int i = 0; i < max_split_; ++i) { + if (nodes[i] < 0) continue; + const int start = min(i + min_size_, static_cast(chars.size())); + for (int j = start; j <= chars.size(); ++j) { + if (nodes[j] < 0) continue; + string yield; + PasteTogetherStrings(chars, i, j, &yield); + // cerr << "[" << i << "," << j << "] " << yield << endl; + TRulePtr rule = TRulePtr(new TRule(*kTEMPLATE_RULE)); + rule->e_[1] = rule->f_[1] = TD::Convert(yield); + // cerr << rule->AsString() << endl; + int edge = forest->AddEdge( + rule, + Hypergraph::TailNodeVector(1, nodes[i]))->id_; + forest->ConnectEdgeToHeadNode(edge, nodes[j]); + forest->edges_[edge].i_ = i; + forest->edges_[edge].j_ = j; + + // handle "fugenelemente" here + // don't delete "fugenelemente" at the end of words + if (fugen_elements_ && j != chars.size()) { + const int len = yield.size(); + string alt; + int fid = 0; + if (len > (min_size_ + 2) && yield[len-1] == 's' && yield[len-2] == 'e') { + alt = yield.substr(0, len - 2); + fid = kFUGEN_S; + } else if (len > (min_size_ + 1) && yield[len-1] == 's') { + alt = yield.substr(0, len - 1); + fid = kFUGEN_S; + } 
else if (len > (min_size_ + 2) && yield[len-2] == 'e' && yield[len-1] == 'n') { + alt = yield.substr(0, len - 1); + fid = kFUGEN_N; + } + if (alt.size()) { + TRulePtr altrule = TRulePtr(new TRule(*rule)); + altrule->e_[1] = TD::Convert(alt); + // cerr << altrule->AsString() << endl; + int edge = forest->AddEdge( + altrule, + Hypergraph::TailNodeVector(1, nodes[i]))->id_; + forest->ConnectEdgeToHeadNode(edge, nodes[j]); + forest->edges_[edge].feature_values_.set_value(fid, 1.0); + forest->edges_[edge].i_ = i; + forest->edges_[edge].j_ = j; + } + } + } + } + + // add goal rule + Hypergraph::TailNodeVector tail(1, forest->nodes_.size() - 1); + Hypergraph::Node* goal = forest->AddNode(TD::Convert("Goal")*-1); + Hypergraph::Edge* hg_edge = forest->AddEdge(kGOAL_RULE, tail); + forest->ConnectEdgeToHeadNode(hg_edge, goal); + } + private: + const bool fugen_elements_; + const int min_size_; + const WordID kXCAT; + const TRulePtr kWORDBREAK_RULE; + const TRulePtr kTEMPLATE_RULE; + const TRulePtr kGOAL_RULE; + const int kFUGEN_S; + const int kFUGEN_N; +}; + +CompoundSplit::CompoundSplit(const boost::program_options::variables_map& conf) : + pimpl_(new CompoundSplitImpl(conf)) {} + +static void SplitUTF8String(const string& in, vector* out) { + out->resize(in.size()); + int i = 0; + int c = 0; + while (i < in.size()) { + const int len = UTF8Len(in[i]); + assert(len); + (*out)[c] = in.substr(i, len); + ++c; + i += len; + } + out->resize(c); +} + +bool CompoundSplit::TranslateImpl(const string& input, + SentenceMetadata* smeta, + const vector& weights, + Hypergraph* forest) { + if (input.find(" ") != string::npos) { + cerr << " BAD INPUT: " << input << "\n CompoundSplit expects single words\n"; + abort(); + } + vector in; + SplitUTF8String(input, &in); + smeta->SetSourceLength(in.size()); // TODO do utf8 or somethign + for (int i = 0; i < in.size(); ++i) + smeta->src_lattice_.push_back(vector(1, LatticeArc(TD::Convert(in[i]), 0.0, 1))); + pimpl_->BuildTrellis(in, forest); + forest->Reweight(weights); + return true; +} + +int CompoundSplit::GetFullWordEdgeIndex(const Hypergraph& forest) { + assert(forest.nodes_.size() > 0); + const vector out_edges = forest.nodes_[0].out_edges_; + int max_edge = -1; + int max_j = -1; + for (int i = 0; i < out_edges.size(); ++i) { + const int j = forest.edges_[out_edges[i]].j_; + if (j > max_j) { + max_j = j; + max_edge = out_edges[i]; + } + } + assert(max_edge >= 0); + assert(max_edge < forest.edges_.size()); + return max_edge; +} + diff --git a/decoder/csplit.h b/decoder/csplit.h new file mode 100644 index 000000000..82ed23fc8 --- /dev/null +++ b/decoder/csplit.h @@ -0,0 +1,30 @@ +#ifndef _CSPLIT_H_ +#define _CSPLIT_H_ + +#include "translator.h" +#include "lattice.h" + +// this "translator" takes single words (with NO SPACES) and segments +// them using the approach described in: +// +// C. Dyer. (2009) Using a maximum entropy model to build segmentation +// lattices for MT. In Proceedings of NAACL HLT 2009. +// note, an extra word space marker # is inserted at the left edge of +// the forest! 
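// Illustrative example (not taken from the code): for the German input word
// "tonbandaufnahme" the segmentation lattice encodes analyses such as
// "# tonbandaufnahme", "# tonband aufnahme", and "# ton band aufnahme"; the
// decoder's features and weights then choose among them.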
+struct CompoundSplitImpl; +struct CompoundSplit : public Translator { + CompoundSplit(const boost::program_options::variables_map& conf); + bool TranslateImpl(const std::string& input, + SentenceMetadata* smeta, + const std::vector& weights, + Hypergraph* forest); + + // given a forest generated by CompoundSplit::Translate, + // find the edge representing the unsegmented form + static int GetFullWordEdgeIndex(const Hypergraph& forest); + + private: + boost::shared_ptr pimpl_; +}; + +#endif diff --git a/decoder/decoder.cc b/decoder/decoder.cc new file mode 100644 index 000000000..6783cad0d --- /dev/null +++ b/decoder/decoder.cc @@ -0,0 +1,1100 @@ +#include "decoder.h" + +#ifndef HAVE_OLD_CPP +# include +#else +# include +namespace std { using std::tr1::unordered_map; } +#endif +#include +#include +#include +#include + +#include "stringlib.h" +#include "weights.h" +#include "filelib.h" +#include "fdict.h" +#include "timing_stats.h" +#include "verbose.h" + +#include "translator.h" +#include "phrasebased_translator.h" +#include "tagger.h" +#include "lextrans.h" +#include "lexalign.h" +#include "csplit.h" + +#include "lattice.h" +#include "hg.h" +#include "sentence_metadata.h" +#include "hg_intersect.h" +#include "hg_union.h" + +#include "oracle_bleu.h" +#include "apply_models.h" +#include "ff.h" +#include "ffset.h" +#include "ff_factory.h" +#include "viterbi.h" +#include "kbest.h" +#include "inside_outside.h" +#include "exp_semiring.h" +#include "sentence_metadata.h" +#include "sampler.h" + +#include "forest_writer.h" // TODO this section should probably be handled by an Observer +#include "incremental.h" +#include "hg_io.h" +#include "aligner.h" + +#ifdef CP_TIME + clock_t CpTime::time_; + void CpTime::Add(clock_t x){time_+=x;} + void CpTime::Sub(clock_t x){time_-=x;} + double CpTime::Get(){return (double)(time_)/CLOCKS_PER_SEC;} +#endif + +static const double kMINUS_EPSILON = -1e-6; // don't be too strict + +using namespace std; +namespace po = boost::program_options; + +static bool verbose_feature_functions=true; + +namespace Hack { void MaxTrans(const Hypergraph& in, int beam_size); } +namespace NgramCache { void Clear(); } + +DecoderObserver::~DecoderObserver() {} +void DecoderObserver::NotifyDecodingStart(const SentenceMetadata&) {} +void DecoderObserver::NotifySourceParseFailure(const SentenceMetadata&) {} +void DecoderObserver::NotifyTranslationForest(const SentenceMetadata&, Hypergraph*) {} +void DecoderObserver::NotifyAlignmentFailure(const SentenceMetadata&) {} +void DecoderObserver::NotifyAlignmentForest(const SentenceMetadata&, Hypergraph*) {} +void DecoderObserver::NotifyDecodingComplete(const SentenceMetadata&) {} + +enum SummaryFeature { + kNODE_RISK = 1, + kEDGE_RISK, + kEDGE_PROB +}; + + +struct ELengthWeightFunction { + double operator()(const Hypergraph::Edge& e) const { + return e.rule_->ELength() - e.rule_->Arity(); + } +}; +inline void ShowBanner() { + cerr << "cdec (c) 2009--2014 by Chris Dyer\n"; +} + +inline string str(char const* name,po::variables_map const& conf) { + return conf[name].as(); +} + + +// print just the --long_opt names suitable for bash compgen +inline void print_options(std::ostream &out,po::options_description const& opts) { + typedef std::vector< boost::shared_ptr > Ds; + Ds const& ds=opts.options(); + out << '"'; + for (unsigned i=0;ilong_name(); + } + out << '"'; +} + +template +inline bool store_conf(po::variables_map const& conf,std::string const& name,V *v) { + if (conf.count(name)) { + *v=conf[name].as(); + return true; + } + return false; 
+} + +inline boost::shared_ptr make_ff(string const& ffp,bool verbose_feature_functions,char const* pre="") { + string ff, param; + SplitCommandAndParam(ffp, &ff, ¶m); + if (verbose_feature_functions && !SILENT) + cerr << pre << "feature: " << ff; + if (!SILENT) { + if (param.size() > 0) cerr << " (with config parameters '" << param << "')\n"; + else cerr << " (no config parameters)\n"; + } + boost::shared_ptr pf = ff_registry.Create(ff, param); + if (!pf) exit(1); + int nbyte=pf->StateSize(); + if (verbose_feature_functions && !SILENT) + cerr<<"State is "< models; + boost::shared_ptr inter_conf; + vector ffs; + boost::shared_ptr > weight_vector; + int fid_summary; // 0 == no summary feature + double density_prune; // 0 == don't density prune + double beam_prune; // 0 == don't beam prune +}; + +ostream& operator<<(ostream& os, const RescoringPass& rp) { + os << "[num_fn=" << rp.ffs.size(); + if (rp.inter_conf) { os << " int_alg=" << *rp.inter_conf; } + //if (rp.weight_vector.size() > 0) os << " new_weights"; + if (rp.fid_summary) os << " summary_feature=" << FD::Convert(rp.fid_summary); + if (rp.density_prune) os << " density_prune=" << rp.density_prune; + if (rp.beam_prune) os << " beam_prune=" << rp.beam_prune; + os << ']'; + return os; +} + +struct DecoderImpl { + DecoderImpl(po::variables_map& conf, int argc, char** argv, istream* cfg); + ~DecoderImpl(); + bool Decode(const string& input, DecoderObserver*); + vector& CurrentWeightVector() { + return (rescoring_passes.empty() ? *init_weights : *rescoring_passes.back().weight_vector); + } + void SetId(int next_sent_id) { sent_id = next_sent_id - 1; } + + void forest_stats(Hypergraph &forest,string name,bool show_tree,bool show_deriv=false, bool extract_rules=false, boost::shared_ptr extract_file = boost::make_shared()) { + cerr << viterbi_stats(forest,name,true,show_tree,show_deriv,extract_rules, extract_file); + cerr << endl; + } + + bool beam_param(po::variables_map const& conf,string const& name,double *val,bool scale_srclen=false,double srclen=1) { + if (conf.count(name)) { + *val=conf[name].as()*(scale_srclen?srclen:1); + return true; + } + return false; + } + + void maybe_prune(Hypergraph &forest,po::variables_map const& conf,string nbeam,string ndensity,string forestname,double srclen) { + double beam_prune=0,density_prune=0; + bool use_beam_prune=beam_param(conf,nbeam,&beam_prune,conf.count("scale_prune_srclen"),srclen); + bool use_density_prune=beam_param(conf,ndensity,&density_prune); + if (use_beam_prune || use_density_prune) { + double presize=forest.edges_.size(); + vector preserve_mask,*pm=0; + if (conf.count("csplit_preserve_full_word")) { + preserve_mask.resize(forest.edges_.size()); + preserve_mask[CompoundSplit::GetFullWordEdgeIndex(forest)] = true; + pm=&preserve_mask; + } + forest.PruneInsideOutside(beam_prune,density_prune,pm,false,1); + if (!forestname.empty()) forestname=" "+forestname; + if (!SILENT) { + forest_stats(forest," Pruned "+forestname+" forest",false,false); + cerr << " Pruned "< >& ss, int n, vector* out) { + const SampleSet& s = ss[n]; + int i = rng->SelectSample(s); + const Hypergraph::Edge& edge = hg.edges_[hg.nodes_[n].in_edges_[i]]; + vector > ants(edge.tail_nodes_.size()); + for (int j = 0; j < ants.size(); ++j) + SampleRecurse(hg, ss, edge.tail_nodes_[j], &ants[j]); + + vector*> pants(ants.size()); + for (int j = 0; j < ants.size(); ++j) pants[j] = &ants[j]; + edge.rule_->ESubstitute(pants, out); + } + + struct SampleSort { + bool operator()(const pair& a, const pair& b) const { + return a.first 
> b.first; + } + }; + + // TODO this should be handled by an Observer + void MaxTranslationSample(Hypergraph* hg, const int samples, const int k) { + unordered_map > m; + hg->PushWeightsToGoal(); + const int num_nodes = hg->nodes_.size(); + vector > ss(num_nodes); + for (int i = 0; i < num_nodes; ++i) { + SampleSet& s = ss[i]; + const vector& in_edges = hg->nodes_[i].in_edges_; + for (int j = 0; j < in_edges.size(); ++j) { + s.add(hg->edges_[in_edges[j]].edge_prob_); + } + } + for (int i = 0; i < samples; ++i) { + vector yield; + SampleRecurse(*hg, ss, hg->nodes_.size() - 1, &yield); + const string trans = TD::GetString(yield); + ++m[trans]; + } + vector > dist; + for (unordered_map >::iterator i = m.begin(); + i != m.end(); ++i) { + dist.push_back(make_pair(i->second, i->first)); + } + sort(dist.begin(), dist.end(), SampleSort()); + if (k) { + for (int i = 0; i < k; ++i) + cout << dist[i].first << " ||| " << dist[i].second << endl; + } else { + cout << dist[0].second << endl; + } + } + + void ParseTranslatorInputLattice(const string& line, string* input, Lattice* ref) { + string sref; + ParseTranslatorInput(line, input, &sref); + if (sref.size() > 0) { + assert(ref); + LatticeTools::ConvertTextOrPLF(sref, ref); + } + } + + // used to construct the suffix string to get the name of arguments for multiple passes + // e.g., the "2" in --weights2 + static string StringSuffixForRescoringPass(int pass) { + if (pass == 0) return ""; + string ps = "1"; + assert(pass < 9); + ps[0] += pass; + return ps; + } + + vector rescoring_passes; + + po::variables_map& conf; + OracleBleu oracle; + string formalism; + boost::shared_ptr translator; + boost::shared_ptr > init_weights; // weights used with initial parse + vector > pffs; + boost::shared_ptr > rng; + int sample_max_trans; + bool aligner_mode; + bool graphviz; + bool joshua_viz; + bool encode_b64; + bool kbest; + bool unique_kbest; + bool get_oracle_forest; + boost::shared_ptr extract_file; + int combine_size; + int sent_id; + SparseVector acc_vec; // accumulate gradient + double acc_obj; // accumulate objective + int g_count; // number of gradient pieces computed + bool csplit_output_plf; + bool write_gradient; // TODO Observer + bool feature_expectations; // TODO Observer + bool output_training_vector; // TODO Observer + bool remove_intersected_rule_annotations; + boost::scoped_ptr incremental; + + + static void ConvertSV(const SparseVector& src, SparseVector* trg) { + for (SparseVector::const_iterator it = src.begin(); it != src.end(); ++it) + trg->set_value(it->first, it->second.as_float()); + } +}; + +DecoderImpl::~DecoderImpl() { + if (output_training_vector && !acc_vec.empty()) { + if (encode_b64) { + cout << "0\t"; + SparseVector dav; ConvertSV(acc_vec, &dav); + B64::Encode(acc_obj, dav, &cout); + cout << endl << flush; + } else { + cout << "0\t**OBJ**=" << acc_obj << ';' << acc_vec << endl << flush; + } + } +} + +DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream* cfg) : conf(conf) { + if (cfg) { if (argc || argv) { cerr << "DecoderImpl() can only take a file or command line options, not both\n"; exit(1); } } + bool show_config; + bool show_weights; + vector cfg_files; + + po::options_description opts("Configuration options"); + opts.add_options() + ("formalism,f",po::value(),"Decoding formalism; values include SCFG, FST, PB, LexTrans (lexical translation model, also disc training), CSplit (compound splitting), Tagger (sequence labeling), LexAlign (alignment only, or EM training)") + 
("input,i",po::value()->default_value("-"),"Source file") + ("grammar,g",po::value >()->composing(),"Either SCFG grammar file(s) or phrase tables file(s)") + ("per_sentence_grammar_file", po::value(), "Optional (and possibly not implemented) per sentence grammar file enables all per sentence grammars to be stored in a single large file and accessed by offset") + ("list_feature_functions,L","List available feature functions") +#ifdef HAVE_CMPH + ("cmph_perfect_feature_hash,h", po::value(), "Load perfect hash function for features") +#endif + + ("weights,w",po::value(),"Feature weights file (initial forest / pass 1)") + ("feature_function,F",po::value >()->composing(), "Pass 1 additional feature function(s) (-L for list)") + ("intersection_strategy,I",po::value()->default_value("cube_pruning"), "Pass 1 intersection strategy for incorporating finite-state features; values include Cube_pruning, Full, Fast_cube_pruning, Fast_cube_pruning_2") + ("cubepruning_pop_limit,K",po::value()->default_value(200), "Max number of pops from the candidate heap at each node") + ("summary_feature", po::value(), "Compute a 'summary feature' at the end of the pass (before any pruning) with name=arg and value=inside-outside/Z") + ("summary_feature_type", po::value()->default_value("node_risk"), "Summary feature types: node_risk, edge_risk, edge_prob") + ("density_prune", po::value(), "Pass 1 pruning: keep no more than this many times the number of edges used in the best derivation tree (>=1.0)") + ("beam_prune", po::value(), "Pass 1 pruning: Prune paths from scored forest, keep paths within exp(alpha>=0)") + + ("weights2",po::value(),"Optional pass 2") + ("feature_function2",po::value >()->composing(), "Optional pass 2") + ("intersection_strategy2",po::value()->default_value("cube_pruning"), "Optional pass 2") + ("cubepruning_pop_limit2",po::value()->default_value(200), "Optional pass 2") + ("summary_feature2", po::value(), "Optional pass 2") + ("density_prune2", po::value(), "Optional pass 2") + ("beam_prune2", po::value(), "Optional pass 2") + + ("weights3",po::value(),"Optional pass 3") + ("feature_function3",po::value >()->composing(), "Optional pass 3") + ("intersection_strategy3",po::value()->default_value("cube_pruning"), "Optional pass 3") + ("cubepruning_pop_limit3",po::value()->default_value(200), "Optional pass 3") + ("summary_feature3", po::value(), "Optional pass 3") + ("density_prune3", po::value(), "Optional pass 3") + ("beam_prune3", po::value(), "Optional pass 3") + + ("add_pass_through_rules,P","Add rules to translate OOV words as themselves") + ("k_best,k",po::value(),"Extract the k best derivations") + ("unique_k_best,r", "Unique k-best translation list") + ("aligner,a", "Run as a word/phrase aligner (src & ref required)") + ("aligner_use_viterbi", "If run in alignment mode, compute the Viterbi (rather than MAP) alignment") + ("goal",po::value()->default_value("S"),"Goal symbol (SCFG & FST)") + ("freeze_feature_set,Z", "Freeze feature set after reading feature weights file") + ("warn_0_weight","Warn about any feature id that has a 0 weight (this is perfectly safe if you intend 0 weight, though)") + ("scfg_extra_glue_grammar", po::value(), "Extra glue grammar file (Glue grammars apply when i=0 but have no other span restrictions)") + ("scfg_no_hiero_glue_grammar,n", "No Hiero glue grammar (nb. 
by default the SCFG decoder adds Hiero glue rules)") + ("scfg_default_nt,d",po::value()->default_value("X"),"Default non-terminal symbol in SCFG") + ("scfg_max_span_limit,S",po::value()->default_value(10),"Maximum non-terminal span limit (except \"glue\" grammar)") + ("quiet", "Disable verbose output") + ("show_config", po::bool_switch(&show_config), "show contents of loaded -c config files.") + ("show_weights", po::bool_switch(&show_weights), "show effective feature weights") + ("show_feature_dictionary", "After decoding the last input, write the contents of the feature dictionary") + ("show_joshua_visualization,J", "Produce output compatible with the Joshua visualization tools") + ("show_tree_structure", "Show the Viterbi derivation structure") + ("show_expected_length", "Show the expected translation length under the model") + ("show_partition,z", "Compute and show the partition (inside score)") + ("show_conditional_prob", "Output the conditional log prob to STDOUT instead of a translation") + ("show_cfg_search_space", "Show the search space as a CFG") + ("show_cfg_alignment_space", "Show the alignment hypergraph as a CFG") + ("show_target_graph", po::value(), "Directory to write the target hypergraphs to") + ("incremental_search", po::value(), "Run lazy search with this language model file") + ("coarse_to_fine_beam_prune", po::value(), "Prune paths from coarse parse forest before fine parse, keeping paths within exp(alpha>=0)") + ("ctf_beam_widen", po::value()->default_value(2.0), "Expand coarse pass beam by this factor if no fine parse is found") + ("ctf_num_widenings", po::value()->default_value(2), "Widen coarse beam this many times before backing off to full parse") + ("ctf_no_exhaustive", "Do not fall back to exhaustive parse if coarse-to-fine parsing fails") + ("scale_prune_srclen", "scale beams by the input length (in # of tokens; may not be what you want for lattices") + ("lextrans_dynasearch", "'DynaSearch' neighborhood instead of usual partition, as defined by Smith & Eisner (2005)") + ("lextrans_use_null", "Support source-side null words in lexical translation") + ("lextrans_align_only", "Only used in alignment mode. Limit target words generated by reference") + ("tagger_tagset,t", po::value(), "(Tagger) file containing tag set") + ("csplit_output_plf", "(Compound splitter) Output lattice in PLF format") + ("csplit_preserve_full_word", "(Compound splitter) Always include the unsegmented form in the output lattice") + ("extract_rules", po::value(), "Extract the rules used in translation (not de-duped!) 
to a file in this directory") + ("show_derivations", po::value(), "Directory to print the derivation structures to") + ("graphviz","Show (constrained) translation forest in GraphViz format") + ("max_translation_beam,x", po::value(), "Beam approximation to get max translation from the chart") + ("max_translation_sample,X", po::value(), "Sample the max translation from the chart") + ("pb_max_distortion,D", po::value()->default_value(4), "Phrase-based decoder: maximum distortion") + ("cll_gradient,G","Compute conditional log-likelihood gradient and write to STDOUT (src & ref required)") + ("get_oracle_forest,o", "Calculate rescored hypergraph using approximate BLEU scoring of rules") + ("feature_expectations","Write feature expectations for all features in chart (**OBJ** will be the partition)") + ("vector_format",po::value()->default_value("b64"), "Sparse vector serialization format for feature expectations or gradients, includes (text or b64)") + ("combine_size,C",po::value()->default_value(1), "When option -G is used, process this many sentence pairs before writing the gradient (1=emit after every sentence pair)") + ("forest_output,O",po::value(),"Directory to write forests to") + ("remove_intersected_rule_annotations", "After forced decoding is completed, remove nonterminal annotations (i.e., the source side spans)"); + + // ob.AddOptions(&opts); + po::options_description clo("Command line options"); + clo.add_options() + ("config,c", po::value >(&cfg_files), "Configuration file(s) - latest has priority") + ("help,?", "Print this help message and exit") + ("usage,u", po::value(), "Describe a feature function type") + ("compgen", "Print just option names suitable for bash command line completion builtin 'compgen'") + ; + + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + + dcmdline_options.add(dconfig_options).add(clo); + if (argc) { + po::store(parse_command_line(argc, argv, dcmdline_options), conf); + if (conf.count("compgen")) { + print_options(cout,dcmdline_options); + cout << endl; + exit(0); + } + if (conf.count("quiet")) + SetSilent(true); + if (!SILENT) ShowBanner(); + } + if (conf.count("show_config")) // special handling needed because we only want to notify() once. 
+ show_config=true; + if (conf.count("config") && !cfg) { + typedef vector Cs; + Cs cs=conf["config"].as(); + for (int i=0;i() << " ...\n"; + FD::EnableHash(conf["cmph_perfect_feature_hash"].as()); + cerr << " " << FD::NumFeats() << " features in map\n"; + } + + // load initial feature weights (and possibly freeze feature set) + init_weights.reset(new vector); + if (conf.count("weights")) + Weights::InitFromFile(str("weights",conf), init_weights.get()); + + if (conf.count("extract_rules")) { + if (!DirectoryExists(conf["extract_rules"].as())) + MkDirP(conf["extract_rules"].as()); + } + + // determine the number of rescoring/pruning/weighting passes configured + const int MAX_PASSES = 3; + for (int pass = 0; pass < MAX_PASSES; ++pass) { + string ws = "weights" + StringSuffixForRescoringPass(pass); + string ff = "feature_function" + StringSuffixForRescoringPass(pass); + string sf = "summary_feature" + StringSuffixForRescoringPass(pass); + string bp = "beam_prune" + StringSuffixForRescoringPass(pass); + string dp = "density_prune" + StringSuffixForRescoringPass(pass); + bool first_pass_condition = ((pass == 0) && (conf.count(ff) || conf.count(bp) || conf.count(dp))); + bool nth_pass_condition = ((pass > 0) && (conf.count(ws) || conf.count(ff) || conf.count(bp) || conf.count(dp))); + if (first_pass_condition || nth_pass_condition) { + rescoring_passes.push_back(RescoringPass()); + RescoringPass& rp = rescoring_passes.back(); + // only configure new weights if pass > 0, otherwise we reuse the initial chart weights + if (nth_pass_condition && conf.count(ws)) { + rp.weight_vector.reset(new vector()); + Weights::InitFromFile(str(ws.c_str(), conf), rp.weight_vector.get()); + } + bool has_stateful = false; + if (conf.count(ff)) { + vector add_ffs; + store_conf(conf,ff,&add_ffs); + for (int i = 0; i < add_ffs.size(); ++i) { + pffs.push_back(make_ff(add_ffs[i],verbose_feature_functions)); + FeatureFunction const* p=pffs.back().get(); + rp.ffs.push_back(p); + if (p->IsStateful()) { has_stateful = true; } + } + } + if (conf.count(sf)) { + rp.fid_summary = FD::Convert(conf[sf].as()); + assert(rp.fid_summary > 0); + // TODO assert that weights for this pass have coef(fid_summary) == 0.0? + } + if (conf.count(bp)) { rp.beam_prune = conf[bp].as(); } + if (conf.count(dp)) { rp.density_prune = conf[dp].as(); } + int palg = (has_stateful ? 1 : 0); // if there are no stateful featueres, default to FULL + string isn = "intersection_strategy" + StringSuffixForRescoringPass(pass); + string spl = "cubepruning_pop_limit" + StringSuffixForRescoringPass(pass); + unsigned pop_limit = 200; + if (conf.count(spl)) { pop_limit = conf[spl].as(); } + if (LowercaseString(str(isn.c_str(),conf)) == "full") { + palg = 0; + } + if (LowercaseString(conf["intersection_strategy"].as()) == "fast_cube_pruning") { + palg = 2; + cerr << "Using Fast Cube Pruning intersection (see Algorithm 2 described in: Gesmundo A., Henderson J,. Faster Cube Pruning, IWSLT 2010).\n"; + } + if (LowercaseString(conf["intersection_strategy"].as()) == "fast_cube_pruning_2") { + palg = 3; + cerr << "Using Fast Cube Pruning 2 intersection (see Algorithm 3 described in: Gesmundo A., Henderson J,. 
Faster Cube Pruning, IWSLT 2010).\n"; + } + rp.inter_conf.reset(new IntersectionConfiguration(palg, pop_limit)); + } else { + break; // TODO alert user if there are any future configurations + } + } + + // set up weight vectors since later phases may reuse weights from earlier phases + boost::shared_ptr > prev_weights = init_weights; + for (int pass = 0; pass < rescoring_passes.size(); ++pass) { + RescoringPass& rp = rescoring_passes[pass]; + if (!rp.weight_vector) { + rp.weight_vector = prev_weights; + } else { + prev_weights = rp.weight_vector; + } + rp.models.reset(new ModelSet(*rp.weight_vector, rp.ffs)); + } + + // show configuration of rescoring passes + if (!SILENT) { + int num = rescoring_passes.size(); + cerr << "Configured " << num << " rescoring pass" << (num == 1 ? "" : "es") << endl; + for (int pass = 0; pass < num; ++pass) + cerr << " " << rescoring_passes[pass] << endl; + } + + bool warn0=conf.count("warn_0_weight"); + bool freeze=conf.count("freeze_feature_set"); + bool early_freeze=freeze && !warn0; + bool late_freeze=freeze && warn0; + if (early_freeze) { + cerr << "Freezing feature set" << endl; + FD::Freeze(); // this means we can't see the feature names of not-weighted features + } + + // set up translation back end + if (formalism == "scfg") + translator.reset(new SCFGTranslator(conf)); + else if (formalism == "t2s") + translator.reset(new Tree2StringTranslator(conf, false)); + else if (formalism == "t2t") + translator.reset(new Tree2StringTranslator(conf, true)); + else if (formalism == "fst") + translator.reset(new FSTTranslator(conf)); + else if (formalism == "pb") + translator.reset(new PhraseBasedTranslator(conf)); + else if (formalism == "csplit") + translator.reset(new CompoundSplit(conf)); + else if (formalism == "lextrans") + translator.reset(new LexicalTrans(conf)); + else if (formalism == "lexalign") + translator.reset(new LexicalAlign(conf)); + else if (formalism == "rescore") + translator.reset(new RescoreTranslator(conf)); + else if (formalism == "tagger") + translator.reset(new Tagger(conf)); + else + assert(!"error"); + + if (late_freeze) { + cerr << "Late freezing feature set (use --no_freeze_feature_set to prevent)." << endl; + FD::Freeze(); // this means we can't see the feature names of not-weighted features + } + + sample_max_trans = conf.count("max_translation_sample") ? 
+ conf["max_translation_sample"].as() : 0; + if (sample_max_trans) + rng.reset(new RandomNumberGenerator); + aligner_mode = conf.count("aligner"); + graphviz = conf.count("graphviz"); + joshua_viz = conf.count("show_joshua_visualization"); + encode_b64 = str("vector_format",conf) == "b64"; + kbest = conf.count("k_best"); + unique_kbest = conf.count("unique_k_best"); + get_oracle_forest = conf.count("get_oracle_forest"); + oracle.show_derivation=conf.count("show_derivations"); + remove_intersected_rule_annotations = conf.count("remove_intersected_rule_annotations"); + + combine_size = conf["combine_size"].as(); + if (combine_size < 1) combine_size = 1; + sent_id = -1; + acc_obj = 0; // accumulate objective + g_count = 0; // number of gradient pieces computed + + if (conf.count("incremental_search")) { + incremental.reset(IncrementalBase::Load(conf["incremental_search"].as().c_str(), CurrentWeightVector())); + } +} + +Decoder::Decoder(istream* cfg) { pimpl_.reset(new DecoderImpl(conf,0,0,cfg)); } +Decoder::Decoder(int argc, char** argv) { pimpl_.reset(new DecoderImpl(conf,argc, argv, 0)); } +Decoder::~Decoder() {} +void Decoder::SetId(int next_sent_id) { pimpl_->SetId(next_sent_id); } +bool Decoder::Decode(const string& input, DecoderObserver* o) { + bool del = false; + if (!o) { o = new DecoderObserver; del = true; } + const bool res = pimpl_->Decode(input, o); + if (del) delete o; + return res; +} +vector& Decoder::CurrentWeightVector() { return pimpl_->CurrentWeightVector(); } +const vector& Decoder::CurrentWeightVector() const { return pimpl_->CurrentWeightVector(); } +void Decoder::AddSupplementalGrammar(GrammarPtr gp) { + static_cast(*pimpl_->translator).AddSupplementalGrammar(gp); +} +void Decoder::AddSupplementalGrammarFromString(const std::string& grammar_string) { + assert(pimpl_->translator->GetDecoderType() == "SCFG"); + static_cast(*pimpl_->translator).AddSupplementalGrammarFromString(grammar_string); +} + +bool DecoderImpl::Decode(const string& input, DecoderObserver* o) { + string buf = input; + NgramCache::Clear(); // clear ngram cache for remote LM (if used) + Timer::Summarize(); + ++sent_id; + map sgml; + ProcessAndStripSGML(&buf, &sgml); + if (sgml.find("id") != sgml.end()) + sent_id = atoi(sgml["id"].c_str()); + + if (!SILENT) { + cerr << "\nINPUT: "; + if (buf.size() < 100) + cerr << buf << endl; + else { + size_t x = buf.rfind(" ", 100); + if (x == string::npos) x = 100; + cerr << buf.substr(0, x) << " ..." << endl; + } + cerr << " id = " << sent_id << endl; + } + if (conf.count("extract_rules")) { + stringstream ss; + ss << sent_id << ".gz"; + extract_file.reset(new WriteFile(str("extract_rules",conf)+"/"+ss.str())); + } + string to_translate; + Lattice ref; + ParseTranslatorInputLattice(buf, &to_translate, &ref); + const unsigned srclen=NTokens(to_translate,' '); +//FIXME: should get the avg. or max source length of the input lattice (like Lattice::dist_(start,end)); but this is only used to scale beam parameters (optionally) anyway so fidelity isn't important. 
+ const bool has_ref = ref.size() > 0; + SentenceMetadata smeta(sent_id, ref); + smeta.sgml_.swap(sgml); + o->NotifyDecodingStart(smeta); + Hypergraph forest; // -LM forest + translator->ProcessMarkupHints(smeta.sgml_); + Timer t("Translation"); + const bool translation_successful = + translator->Translate(to_translate, &smeta, *init_weights, &forest); + translator->SentenceComplete(); + + if (!translation_successful) { + if (!SILENT) { cerr << " NO PARSE FOUND.\n"; } + o->NotifySourceParseFailure(smeta); + o->NotifyDecodingComplete(smeta); + if (conf.count("show_conditional_prob")) { + cout << "-Inf" << endl << flush; + } else if (!SILENT) { + cout << endl; + } + return false; + } + + // this is mainly used for debugging, eventually this will be an assertion + if (!forest.AreNodesUniquelyIdentified()) { + if (!SILENT) cerr << " *** NODES NOT UNIQUELY IDENTIFIED ***\n"; + } + + if (!forest.ArePreGoalEdgesArity1()) { + cerr << "Pre-goal edges are not arity-1. The decoder requires this.\n"; + abort(); + } + + const bool show_tree_structure=conf.count("show_tree_structure"); + if (!SILENT) forest_stats(forest," Init. forest",show_tree_structure,oracle.show_derivation); + if (conf.count("show_expected_length")) { + const PRPair res = + Inside, + PRWeightFunction >(forest); + cerr << " Expected length (words): " << (res.r / res.p).as_float() << "\t" << res << endl; + } + + if (conf.count("show_partition")) { + const prob_t z = Inside(forest); + cerr << " Partition log(Z): " << log(z) << endl; + } + + SummaryFeature summary_feature_type = kNODE_RISK; + if (conf["summary_feature_type"].as() == "edge_risk") + summary_feature_type = kEDGE_RISK; + else if (conf["summary_feature_type"].as() == "node_risk") + summary_feature_type = kNODE_RISK; + else if (conf["summary_feature_type"].as() == "edge_prob") + summary_feature_type = kEDGE_PROB; + else { + cerr << "Bad summary_feature_type: " << conf["summary_feature_type"].as() << endl; + abort(); + } + + if (conf.count("show_target_graph")) { + HypergraphIO::WriteTarget(conf["show_target_graph"].as(), sent_id, forest); + } + if (conf.count("incremental_search")) { + incremental->Search(conf["cubepruning_pop_limit"].as(), forest); + } + if (conf.count("show_target_graph") || conf.count("incremental_search")) { + o->NotifyDecodingComplete(smeta); + return true; + } + + for (int pass = 0; pass < rescoring_passes.size(); ++pass) { + const RescoringPass& rp = rescoring_passes[pass]; + const vector& cur_weights = *rp.weight_vector; + if (!SILENT) cerr << endl << " RESCORING PASS #" << (pass+1) << " " << rp << endl; + + string passtr = "Pass1"; passtr[4] += pass; + forest.Reweight(cur_weights); + const bool has_rescoring_models = !rp.models->empty(); + if (has_rescoring_models) { + Timer t("Forest rescoring:"); + rp.models->PrepareForInput(smeta); + Hypergraph rescored_forest; +#ifdef CP_TIME + CpTime::Sub(clock()); +#endif + ApplyModelSet(forest, + smeta, + *rp.models, + *rp.inter_conf, + &rescored_forest); +#ifdef CP_TIME + CpTime::Add(clock()); +#endif + forest.swap(rescored_forest); + forest.Reweight(cur_weights); + if (!SILENT) forest_stats(forest," " + passtr +" forest",show_tree_structure,oracle.show_derivation, conf.count("extract_rules"), extract_file); + // this is mainly used for debugging, eventually this will be an assertion + if (!forest.AreNodesUniquelyIdentified()) { + if (!SILENT) cerr << " *** NODES NOT UNIQUELY IDENTIFIED ***\n"; + } + } + + if (conf.count("show_partition")) { + const prob_t z = Inside(forest); + cerr << " " << passtr << " 
partition log(Z): " << log(z) << endl; + } + + if (rp.fid_summary) { + if (summary_feature_type == kEDGE_PROB) { + const prob_t z = forest.PushWeightsToGoal(1.0); + if (!std::isfinite(log(z)) || std::isnan(log(z))) { + cerr << " " << passtr << " !!! Invalid partition detected, abandoning.\n"; + } else { + for (int i = 0; i < forest.edges_.size(); ++i) { + const double log_prob_transition = log(forest.edges_[i].edge_prob_); // locally normalized by the edge + // head node by forest.PushWeightsToGoal + if (!std::isfinite(log_prob_transition) || std::isnan(log_prob_transition)) { + cerr << "Edge: i=" << i << " got bad inside prob: " << *forest.edges_[i].rule_ << endl; + abort(); + } + + forest.edges_[i].feature_values_.set_value(rp.fid_summary, log_prob_transition); + } + forest.Reweight(cur_weights); // reset weights + } + } else if (summary_feature_type == kNODE_RISK) { + Hypergraph::EdgeProbs posts; + const prob_t z = forest.ComputeEdgePosteriors(1.0, &posts); + if (!std::isfinite(log(z)) || std::isnan(log(z))) { + cerr << " " << passtr << " !!! Invalid partition detected, abandoning.\n"; + } else { + for (int i = 0; i < forest.nodes_.size(); ++i) { + const Hypergraph::EdgesVector& in_edges = forest.nodes_[i].in_edges_; + prob_t node_post = prob_t(0); + for (int j = 0; j < in_edges.size(); ++j) + node_post += (posts[in_edges[j]] / z); + const double log_np = log(node_post); + if (!std::isfinite(log_np) || std::isnan(log_np)) { + cerr << "got bad posterior prob for node " << i << endl; + abort(); + } + for (int j = 0; j < in_edges.size(); ++j) + forest.edges_[in_edges[j]].feature_values_.set_value(rp.fid_summary, exp(log_np)); +// Hypergraph::Edge& example_edge = forest.edges_[in_edges[0]]; +// string n = "NONE"; +// if (forest.nodes_[i].cat_) n = TD::Convert(-forest.nodes_[i].cat_); +// cerr << "[" << n << "," << example_edge.i_ << "," << example_edge.j_ << "] = " << exp(log_np) << endl; + } + } + } else if (summary_feature_type == kEDGE_RISK) { + Hypergraph::EdgeProbs posts; + const prob_t z = forest.ComputeEdgePosteriors(1.0, &posts); + if (!std::isfinite(log(z)) || std::isnan(log(z))) { + cerr << " " << passtr << " !!! Invalid partition detected, abandoning.\n"; + } else { + assert(posts.size() == forest.edges_.size()); + for (int i = 0; i < posts.size(); ++i) { + const double log_np = log(posts[i] / z); + if (!std::isfinite(log_np) || std::isnan(log_np)) { + cerr << "got bad posterior prob for node " << i << endl; + abort(); + } + forest.edges_[i].feature_values_.set_value(rp.fid_summary, exp(log_np)); + } + } + } else { + assert(!"shouldn't happen"); + } + } + + string fullbp = "beam_prune" + StringSuffixForRescoringPass(pass); + string fulldp = "density_prune" + StringSuffixForRescoringPass(pass); + maybe_prune(forest,conf,fullbp.c_str(),fulldp.c_str(),passtr,srclen); + } + + const vector& last_weights = (rescoring_passes.empty() ? 
*init_weights : *rescoring_passes.back().weight_vector); + + // Oracle Rescoring + if(get_oracle_forest) { + assert(!"this is broken"); SparseVector dummy; // = last_weights + Oracle oc=oracle.ComputeOracle(smeta,&forest,dummy,10,conf["forest_output"].as()); + if (!SILENT) cerr << " +Oracle BLEU forest (nodes/edges): " << forest.nodes_.size() << '/' << forest.edges_.size() << endl; + if (!SILENT) cerr << " +Oracle BLEU (paths): " << forest.NumberOfPaths() << endl; + oc.hope.Print(cerr," +Oracle BLEU"); + oc.fear.Print(cerr," -Oracle BLEU"); + //Add 1-best translation (trans) to psuedo-doc vectors + if (!SILENT) oracle.IncludeLastScore(&cerr); + } + o->NotifyTranslationForest(smeta, &forest); + + // TODO I think this should probably be handled by an Observer + if (conf.count("forest_output") && !has_ref) { + ForestWriter writer(str("forest_output",conf), sent_id); + if (FileExists(writer.fname_)) { + if (!SILENT) cerr << " Unioning...\n"; + Hypergraph new_hg; + { + ReadFile rf(writer.fname_); + bool succeeded = HypergraphIO::ReadFromJSON(rf.stream(), &new_hg); + if (!succeeded) abort(); + } + HG::Union(forest, &new_hg); + bool succeeded = writer.Write(new_hg, false); + if (!succeeded) abort(); + } else { + bool succeeded = writer.Write(forest, false); + if (!succeeded) abort(); + } + } + + // TODO I think this should probably be handled by an Observer + if (sample_max_trans) { + MaxTranslationSample(&forest, sample_max_trans, conf.count("k_best") ? conf["k_best"].as() : 0); + } else { + if (kbest && !has_ref) { + //TODO: does this work properly? + const string deriv_fname = conf.count("show_derivations") ? str("show_derivations",conf) : "-"; + oracle.DumpKBest(sent_id, forest, conf["k_best"].as(), unique_kbest,"-", deriv_fname); + } else if (csplit_output_plf) { + cout << HypergraphIO::AsPLF(forest, false) << endl; + } else { + if (!graphviz && !has_ref && !joshua_viz && !SILENT) { + vector trans; + ViterbiESentence(forest, &trans); + cout << TD::GetString(trans) << endl << flush; + } + if (joshua_viz) { + cout << sent_id << " ||| " << JoshuaVisualizationString(forest) << " ||| 1.0 ||| " << -1.0 << endl << flush; + } + } + } + + prob_t first_z; + if (conf.count("show_conditional_prob")) { + first_z = Inside(forest); + } + + // TODO this should be handled by an Observer + const int max_trans_beam_size = conf.count("max_translation_beam") ? + conf["max_translation_beam"].as() : 0; + if (max_trans_beam_size) { + Hack::MaxTrans(forest, max_trans_beam_size); + return true; + } + + // TODO this should be handled by an Observer + if (graphviz && !has_ref) forest.PrintGraphviz(); + + // the following are only used if write_gradient is true! 
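+  // Informal sketch of what the gradient block below computes: with Z the
+  // partition of the unconstrained forest and Z_ref the partition of the
+  // forest intersected with the reference,
+  //   full_exp = E_{d ~ p(d|x)}     [f(d)]   (expectations over all derivations)
+  //   ref_exp  = E_{d ~ p(d|x,ref)} [f(d)]   (expectations over reference derivations)
+  //   acc_vec += ref_exp - full_exp           (gradient of log p(ref|x) w.r.t. the weights)
+  //   acc_obj += log Z - log Z_ref            (= -log p(ref|x))
+  // note acc_obj is the *negative* of the quantity whose gradient is acc_vec;
+  // both are emitted together (optionally B64-encoded) every combine_size sentences.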
+ SparseVector full_exp, ref_exp, gradient; + double log_z = 0, log_ref_z = 0; + if (write_gradient) { + const prob_t z = InsideOutside, EdgeFeaturesAndProbWeightFunction>(forest, &full_exp); + log_z = log(z); + full_exp /= z; + } + if (conf.count("show_cfg_search_space")) + HypergraphIO::WriteAsCFG(forest); + if (has_ref) { + if (HG::Intersect(ref, &forest)) { +// if (crf_uniform_empirical) { +// if (!SILENT) cerr << " USING UNIFORM WEIGHTS\n"; +// for (int i = 0; i < forest.edges_.size(); ++i) +// forest.edges_[i].edge_prob_=prob_t::One(); } + if (remove_intersected_rule_annotations) { + for (unsigned i = 0; i < forest.edges_.size(); ++i) + if (forest.edges_[i].rule_ && + forest.edges_[i].rule_->parent_rule_) + forest.edges_[i].rule_ = forest.edges_[i].rule_->parent_rule_; + } + forest.Reweight(last_weights); + // this is mainly used for debugging, eventually this will be an assertion + if (!forest.AreNodesUniquelyIdentified()) { + if (!SILENT) cerr << " *** NODES NOT UNIQUELY IDENTIFIED ***\n"; + } + if (!SILENT) forest_stats(forest," Constr. forest",show_tree_structure,oracle.show_derivation); + if (!SILENT) cerr << " Constr. VitTree: " << ViterbiFTree(forest) << endl; + if (conf.count("show_partition")) { + const prob_t z = Inside(forest); + cerr << " Contst. partition log(Z): " << log(z) << endl; + } + o->NotifyAlignmentForest(smeta, &forest); + if (conf.count("show_cfg_alignment_space")) + HypergraphIO::WriteAsCFG(forest); + if (conf.count("forest_output")) { + ForestWriter writer(str("forest_output",conf), sent_id); + if (FileExists(writer.fname_)) { + if (!SILENT) cerr << " Unioning...\n"; + Hypergraph new_hg; + { + ReadFile rf(writer.fname_); + bool succeeded = HypergraphIO::ReadFromJSON(rf.stream(), &new_hg); + if (!succeeded) abort(); + } + HG::Union(forest, &new_hg); + bool succeeded = writer.Write(new_hg, false); + if (!succeeded) abort(); + } else { + bool succeeded = writer.Write(forest, false); + if (!succeeded) abort(); + } + } + if (aligner_mode && !output_training_vector) + AlignerTools::WriteAlignment(smeta.GetSourceLattice(), smeta.GetReference(), forest, &cout, 0 == conf.count("aligner_use_viterbi"), kbest ? conf["k_best"].as() : 0); + if (write_gradient) { + const prob_t ref_z = InsideOutside, EdgeFeaturesAndProbWeightFunction>(forest, &ref_exp); + ref_exp /= ref_z; +// if (crf_uniform_empirical) +// log_ref_z = ref_exp.dot(last_weights); + log_ref_z = log(ref_z); + //cerr << " MODEL LOG Z: " << log_z << endl; + //cerr << " EMPIRICAL LOG Z: " << log_ref_z << endl; + if ((log_z - log_ref_z) < kMINUS_EPSILON) { + cerr << "DIFF. ERR! log_z < log_ref_z: " << log_z << " " << log_ref_z << endl; + exit(1); + } + assert(!std::isnan(log_ref_z)); + ref_exp -= full_exp; + acc_vec += ref_exp; + acc_obj += (log_z - log_ref_z); + } + if (feature_expectations) { + const prob_t z = + InsideOutside, EdgeFeaturesAndProbWeightFunction>(forest, &ref_exp); + ref_exp /= z; + acc_obj += log(z); + acc_vec += ref_exp; + } + + if (output_training_vector) { + acc_vec.erase(0); + ++g_count; + if (g_count % combine_size == 0) { + if (encode_b64) { + cout << "0\t"; + SparseVector dav; ConvertSV(acc_vec, &dav); + B64::Encode(acc_obj, dav, &cout); + cout << endl << flush; + } else { + cout << "0\t**OBJ**=" << acc_obj << ';' << acc_vec << endl << flush; + } + acc_vec.clear(); + acc_obj = 0; + } + } + if (conf.count("graphviz")) forest.PrintGraphviz(); + if (kbest) { + const string deriv_fname = conf.count("show_derivations") ? 
str("show_derivations",conf) : "-"; + oracle.DumpKBest(sent_id, forest, conf["k_best"].as(), unique_kbest,"-", deriv_fname); + } + if (conf.count("show_conditional_prob")) { + const prob_t ref_z = Inside(forest); + cout << (log(ref_z) - log(first_z)) << endl << flush; + } + } else { + o->NotifyAlignmentFailure(smeta); + if (!SILENT) cerr << " REFERENCE UNREACHABLE.\n"; + if (write_gradient) { + cout << endl << flush; + } + if (conf.count("show_conditional_prob")) { + cout << "-Inf" << endl << flush; + } + } + } + o->NotifyDecodingComplete(smeta); + return true; +} + diff --git a/decoder/decoder.h b/decoder/decoder.h new file mode 100644 index 000000000..8039a42b6 --- /dev/null +++ b/decoder/decoder.h @@ -0,0 +1,69 @@ +#ifndef _DECODER_H_ +#define _DECODER_H_ + +#include +#include +#include +#include +#include + +#include "weights.h" // weight_t + +#undef CP_TIME +//#define CP_TIME +#ifdef CP_TIME +#include +struct CpTime{ +public: + static void Add(clock_t x); + static void Sub(clock_t x); + static double Get(); +private: + static clock_t time_; +}; +#endif + +class SentenceMetadata; +class Hypergraph; +class DecoderImpl; + +class DecoderObserver { + public: + virtual ~DecoderObserver(); + virtual void NotifyDecodingStart(const SentenceMetadata& smeta); + virtual void NotifySourceParseFailure(const SentenceMetadata& smeta); + virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg); + virtual void NotifyAlignmentFailure(const SentenceMetadata& semta); + virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg); + virtual void NotifyDecodingComplete(const SentenceMetadata& smeta); +}; + +class Grammar; // TODO once the decoder interface is cleaned up, + // this should be somewhere else +class Decoder { + public: + Decoder(int argc, char** argv); + Decoder(std::istream* config_file); + bool Decode(const std::string& input, DecoderObserver* observer = NULL); + + // access this to either *read* or *write* to the decoder's last + // weight vector (i.e., the weights of the finest past) + std::vector& CurrentWeightVector(); + const std::vector& CurrentWeightVector() const; + + // this sets the current sentence ID + void SetId(int id); + ~Decoder(); + const boost::program_options::variables_map& GetConf() const { return conf; } + + // add grammar rules (currently only supported by SCFG decoders) + // that will be used on subsequent calls to Decode. rules should be in standard + // text format. This function does NOT read from a file. 
+ void AddSupplementalGrammar(boost::shared_ptr gp); + void AddSupplementalGrammarFromString(const std::string& grammar_string); + private: + boost::program_options::variables_map conf; + boost::shared_ptr pimpl_; +}; + +#endif diff --git a/decoder/earley_composer.cc b/decoder/earley_composer.cc new file mode 100644 index 000000000..d47a69699 --- /dev/null +++ b/decoder/earley_composer.cc @@ -0,0 +1,761 @@ +#include "earley_composer.h" + +#include +#include +#include +#include +#ifndef HAVE_OLD_CPP +# include +# include +#else +# include +# include +namespace std { using std::tr1::unordered_map; using std::tr1::unordered_multiset; using std::tr1::unordered_set; } +#endif + +#include +#include +#include +#include "fast_lexical_cast.hpp" + +#include "phrasetable_fst.h" +#include "sparse_vector.h" +#include "tdict.h" +#include "hg.h" +#include "hg_remove_eps.h" + +using namespace std; + +// Define the following macro if you want to see lots of debugging output +// when you run the chart parser +#undef DEBUG_CHART_PARSER + +// A few constants used by the chart parser /////////////// +static const int kMAX_NODES = 2000000; +static const string kPHRASE_STRING = "X"; +static bool constants_need_init = true; +static WordID kUNIQUE_START; +static WordID kPHRASE; +static TRulePtr kX1X2; +static TRulePtr kX1; +static WordID kEPS; +static TRulePtr kEPSRule; + +static void InitializeConstants() { + if (constants_need_init) { + kPHRASE = TD::Convert(kPHRASE_STRING) * -1; + kUNIQUE_START = TD::Convert("S") * -1; + kX1X2.reset(new TRule("[X] ||| [X,1] [X,2] ||| [X,1] [X,2]")); + kX1.reset(new TRule("[X] ||| [X,1] ||| [X,1]")); + kEPSRule.reset(new TRule("[X] ||| ||| ")); + kEPS = TD::Convert(""); + constants_need_init = false; + } +} +//////////////////////////////////////////////////////////// + +TRulePtr CreateBinaryRule(int lhs, int rhs1, int rhs2) { + TRule* r = new TRule(*kX1X2); + r->lhs_ = lhs; + r->f_[0] = rhs1; + r->f_[1] = rhs2; + return TRulePtr(r); +} + +TRulePtr CreateUnaryRule(int lhs, int rhs1) { + TRule* r = new TRule(*kX1); + r->lhs_ = lhs; + r->f_[0] = rhs1; + return TRulePtr(r); +} + +TRulePtr CreateEpsilonRule(int lhs) { + TRule* r = new TRule(*kEPSRule); + r->lhs_ = lhs; + return TRulePtr(r); +} + +class EGrammarNode { + friend bool EarleyComposer::Compose(const Hypergraph& src_forest, Hypergraph* trg_forest); + friend void AddGrammarRule(const string& r, map* g); + public: +#ifdef DEBUG_CHART_PARSER + string hint; +#endif + EGrammarNode() : is_some_rule_complete(false), is_root(false) {} + const map& GetTerminals() const { return tptr; } + const map& GetNonTerminals() const { return ntptr; } + bool HasNonTerminals() const { return (!ntptr.empty()); } + bool HasTerminals() const { return (!tptr.empty()); } + bool RuleCompletes() const { + return (is_some_rule_complete || (ntptr.empty() && tptr.empty())); + } + bool GrammarContinues() const { + return !(ntptr.empty() && tptr.empty()); + } + bool IsRoot() const { + return is_root; + } + // these are the features associated with the rule from the start + // node up to this point. If you use these features, you must + // not Extend() this rule. 
+ const SparseVector& GetCFGProductionFeatures() const { + return input_features; + } + + const EGrammarNode* Extend(const WordID& t) const { + if (t < 0) { + map::const_iterator it = ntptr.find(t); + if (it == ntptr.end()) return NULL; + return &it->second; + } else { + map::const_iterator it = tptr.find(t); + if (it == tptr.end()) return NULL; + return &it->second; + } + } + + private: + map tptr; + map ntptr; + SparseVector input_features; + bool is_some_rule_complete; + bool is_root; +}; +typedef map EGrammar; // indexed by the rule LHS + +// edges are immutable once created +struct Edge { +#ifdef DEBUG_CHART_PARSER + static int id_count; + const int id; +#endif + const WordID cat; // lhs side of rule proved/being proved + const EGrammarNode* const dot; // dot position + const FSTNode* const q; // start of span + const FSTNode* const r; // end of span + const Edge* const active_parent; // back pointer, NULL for PREDICT items + const Edge* const passive_parent; // back pointer, NULL for SCAN and PREDICT items + const TargetPhraseSet* const tps; // translations + boost::shared_ptr > features; // features from CFG rule + + bool IsPassive() const { + // when a rule is completed, this value will be set + return static_cast(features); + } + bool IsActive() const { return !IsPassive(); } + bool IsInitial() const { + return !(active_parent || passive_parent); + } + bool IsCreatedByScan() const { + return active_parent && !passive_parent && !dot->IsRoot(); + } + bool IsCreatedByPredict() const { + return dot->IsRoot(); + } + bool IsCreatedByComplete() const { + return active_parent && passive_parent; + } + + // constructor for PREDICT + Edge(WordID c, const EGrammarNode* d, const FSTNode* q_and_r) : +#ifdef DEBUG_CHART_PARSER + id(++id_count), +#endif + cat(c), dot(d), q(q_and_r), r(q_and_r), active_parent(NULL), passive_parent(NULL), tps(NULL) {} + Edge(WordID c, const EGrammarNode* d, const FSTNode* q_and_r, const Edge* act_parent) : +#ifdef DEBUG_CHART_PARSER + id(++id_count), +#endif + cat(c), dot(d), q(q_and_r), r(q_and_r), active_parent(act_parent), passive_parent(NULL), tps(NULL) {} + + // constructors for SCAN + Edge(WordID c, const EGrammarNode* d, const FSTNode* i, const FSTNode* j, + const Edge* act_par, const TargetPhraseSet* translations) : +#ifdef DEBUG_CHART_PARSER + id(++id_count), +#endif + cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(NULL), tps(translations) {} + + Edge(WordID c, const EGrammarNode* d, const FSTNode* i, const FSTNode* j, + const Edge* act_par, const TargetPhraseSet* translations, + const SparseVector& feats) : +#ifdef DEBUG_CHART_PARSER + id(++id_count), +#endif + cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(NULL), tps(translations), + features(new SparseVector(feats)) {} + + // constructors for COMPLETE + Edge(WordID c, const EGrammarNode* d, const FSTNode* i, const FSTNode* j, + const Edge* act_par, const Edge *pas_par) : +#ifdef DEBUG_CHART_PARSER + id(++id_count), +#endif + cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(pas_par), tps(NULL) { + assert(pas_par->IsPassive()); + assert(act_par->IsActive()); + } + + Edge(WordID c, const EGrammarNode* d, const FSTNode* i, const FSTNode* j, + const Edge* act_par, const Edge *pas_par, const SparseVector& feats) : +#ifdef DEBUG_CHART_PARSER + id(++id_count), +#endif + cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(pas_par), tps(NULL), + features(new SparseVector(feats)) { + assert(pas_par->IsPassive()); + 
assert(act_par->IsActive()); + } + + // constructor for COMPLETE query + Edge(const FSTNode* _r) : +#ifdef DEBUG_CHART_PARSER + id(0), +#endif + cat(0), dot(NULL), q(NULL), + r(_r), active_parent(NULL), passive_parent(NULL), tps(NULL) {} + // constructor for MERGE quere + Edge(const FSTNode* _q, int) : +#ifdef DEBUG_CHART_PARSER + id(0), +#endif + cat(0), dot(NULL), q(_q), + r(NULL), active_parent(NULL), passive_parent(NULL), tps(NULL) {} +}; +#ifdef DEBUG_CHART_PARSER +int Edge::id_count = 0; +#endif + +ostream& operator<<(ostream& os, const Edge& e) { + string type = "PREDICT"; + if (e.IsCreatedByScan()) + type = "SCAN"; + else if (e.IsCreatedByComplete()) + type = "COMPLETE"; + os << "[" +#ifdef DEBUG_CHART_PARSER + << '(' << e.id << ") " +#else + << '(' << &e << ") " +#endif + << "q=" << e.q << ", r=" << e.r + << ", cat="<< TD::Convert(e.cat*-1) << ", dot=" + << e.dot +#ifdef DEBUG_CHART_PARSER + << e.dot->hint +#endif + << (e.IsActive() ? ", Active" : ", Passive") + << ", " << type; +#ifdef DEBUG_CHART_PARSER + if (e.active_parent) { os << ", act.parent=(" << e.active_parent->id << ')'; } + if (e.passive_parent) { os << ", psv.parent=(" << e.passive_parent->id << ')'; } +#endif + if (e.tps) { os << ", tps=" << e.tps; } + return os << ']'; +} + +struct Traversal { + const Edge* const edge; // result from the active / passive combination + const Edge* const active; + const Edge* const passive; + Traversal(const Edge* me, const Edge* a, const Edge* p) : edge(me), active(a), passive(p) {} +}; + +struct UniqueTraversalHash { + size_t operator()(const Traversal* t) const { + size_t x = 5381; + x = ((x << 5) + x) ^ reinterpret_cast(t->active); + x = ((x << 5) + x) ^ reinterpret_cast(t->passive); + x = ((x << 5) + x) ^ t->edge->IsActive(); + return x; + } +}; + +struct UniqueTraversalEquals { + size_t operator()(const Traversal* a, const Traversal* b) const { + return (a->passive == b->passive && a->active == b->active && a->edge->IsActive() == b->edge->IsActive()); + } +}; + +struct UniqueEdgeHash { + size_t operator()(const Edge* e) const { + size_t x = 5381; + if (e->IsActive()) { + x = ((x << 5) + x) ^ reinterpret_cast(e->dot); + x = ((x << 5) + x) ^ reinterpret_cast(e->q); + x = ((x << 5) + x) ^ reinterpret_cast(e->r); + x = ((x << 5) + x) ^ static_cast(e->cat); + x += 13; + } else { // with passive edges, we don't care about the dot + x = ((x << 5) + x) ^ reinterpret_cast(e->q); + x = ((x << 5) + x) ^ reinterpret_cast(e->r); + x = ((x << 5) + x) ^ static_cast(e->cat); + } + return x; + } +}; + +struct UniqueEdgeEquals { + bool operator()(const Edge* a, const Edge* b) const { + if (a->IsActive() != b->IsActive()) return false; + if (a->IsActive()) { + return (a->cat == b->cat) && (a->dot == b->dot) && (a->q == b->q) && (a->r == b->r); + } else { + return (a->cat == b->cat) && (a->q == b->q) && (a->r == b->r); + } + } +}; + +struct REdgeHash { + size_t operator()(const Edge* e) const { + size_t x = 5381; + x = ((x << 5) + x) ^ reinterpret_cast(e->r); + return x; + } +}; + +struct REdgeEquals { + bool operator()(const Edge* a, const Edge* b) const { + return (a->r == b->r); + } +}; + +struct QEdgeHash { + size_t operator()(const Edge* e) const { + size_t x = 5381; + x = ((x << 5) + x) ^ reinterpret_cast(e->q); + return x; + } +}; + +struct QEdgeEquals { + bool operator()(const Edge* a, const Edge* b) const { + return (a->q == b->q); + } +}; + +struct EdgeQueue { + queue q; + EdgeQueue() {} + void clear() { while(!q.empty()) q.pop(); } + bool HasWork() const { return !q.empty(); } + const 
Edge* Next() { const Edge* res = q.front(); q.pop(); return res; } + void AddEdge(const Edge* s) { q.push(s); } +}; + +class EarleyComposerImpl { + public: + EarleyComposerImpl(WordID start_cat, const FSTNode& q_0) : start_cat_(start_cat), q_0_(&q_0) {} + + // returns false if the intersection is empty + bool Compose(const EGrammar& g, Hypergraph* forest) { + goal_node = NULL; + EGrammar::const_iterator sit = g.find(start_cat_); + forest->ReserveNodes(kMAX_NODES); + assert(sit != g.end()); + Edge* init = new Edge(start_cat_, &sit->second, q_0_); + if (!IncorporateNewEdge(init)) { + cerr << "Failed to create initial edge!\n"; + abort(); + } + while (exp_agenda.HasWork() || agenda.HasWork()) { + while(exp_agenda.HasWork()) { + const Edge* edge = exp_agenda.Next(); + FinishEdge(edge, forest); + } + if (agenda.HasWork()) { + const Edge* edge = agenda.Next(); +#ifdef DEBUG_CHART_PARSER + cerr << "processing (" << edge->id << ')' << endl; +#endif + if (edge->IsActive()) { + if (edge->dot->HasTerminals()) + DoScan(edge); + if (edge->dot->HasNonTerminals()) { + DoMergeWithPassives(edge); + DoPredict(edge, g); + } + } else { + DoComplete(edge); + } + } + } + if (goal_node) { + forest->PruneUnreachable(goal_node->id_); + RemoveEpsilons(forest, kEPS); + } + FreeAll(); + return goal_node; + } + + void FreeAll() { + for (int i = 0; i < free_list_.size(); ++i) + delete free_list_[i]; + free_list_.clear(); + for (int i = 0; i < traversal_free_list_.size(); ++i) + delete traversal_free_list_[i]; + traversal_free_list_.clear(); + all_traversals.clear(); + exp_agenda.clear(); + agenda.clear(); + tps2node.clear(); + edge2node.clear(); + all_edges.clear(); + passive_edges.clear(); + active_edges.clear(); + } + + ~EarleyComposerImpl() { + FreeAll(); + } + + // returns the total number of edges created during composition + int EdgesCreated() const { + return free_list_.size(); + } + + private: + void DoScan(const Edge* edge) { + // here, we assume that the FST will potentially have many more outgoing + // edges than the grammar, which will be just a couple. If you want to + // efficiently handle the case where both are relatively large, this code + // will need to change how the intersection is done. The best general + // solution would probably be the Baeza-Yates double binary search. + + const EGrammarNode* dot = edge->dot; + const FSTNode* r = edge->r; + const map& terms = dot->GetTerminals(); + for (map::const_iterator git = terms.begin(); + git != terms.end(); ++git) { + const FSTNode* next_r = r->Extend(git->first); + if (!next_r) continue; + const EGrammarNode* next_dot = &git->second; + const bool grammar_continues = next_dot->GrammarContinues(); + const bool rule_completes = next_dot->RuleCompletes(); + assert(grammar_continues || rule_completes); + const SparseVector& input_features = next_dot->GetCFGProductionFeatures(); + // create up to 4 new edges! + if (next_r->HasOutgoingNonEpsilonEdges()) { // are there further symbols in the FST? 
+ const TargetPhraseSet* translations = NULL; + if (rule_completes) + IncorporateNewEdge(new Edge(edge->cat, next_dot, edge->q, next_r, edge, translations, input_features)); + if (grammar_continues) + IncorporateNewEdge(new Edge(edge->cat, next_dot, edge->q, next_r, edge, translations)); + } + if (next_r->HasData()) { // indicates a loop back to q_0 in the FST + const TargetPhraseSet* translations = next_r->GetTranslations(); + if (rule_completes) + IncorporateNewEdge(new Edge(edge->cat, next_dot, edge->q, q_0_, edge, translations, input_features)); + if (grammar_continues) + IncorporateNewEdge(new Edge(edge->cat, next_dot, edge->q, q_0_, edge, translations)); + } + } + } + + void DoPredict(const Edge* edge, const EGrammar& g) { + const EGrammarNode* dot = edge->dot; + const map& non_terms = dot->GetNonTerminals(); + for (map::const_iterator git = non_terms.begin(); + git != non_terms.end(); ++git) { + const WordID nt_to_predict = git->first; + //cerr << edge->id << " -- " << TD::Convert(nt_to_predict*-1) << endl; + EGrammar::const_iterator egi = g.find(nt_to_predict); + if (egi == g.end()) { + cerr << "[ERROR] Can't find any grammar rules with a LHS of type " + << TD::Convert(-1*nt_to_predict) << '!' << endl; + continue; + } + assert(edge->IsActive()); + const EGrammarNode* new_dot = &egi->second; + Edge* new_edge = new Edge(nt_to_predict, new_dot, edge->r, edge); + IncorporateNewEdge(new_edge); + } + } + + void DoComplete(const Edge* passive) { +#ifdef DEBUG_CHART_PARSER + cerr << " complete: " << *passive << endl; +#endif + const WordID completed_nt = passive->cat; + const FSTNode* q = passive->q; + const FSTNode* next_r = passive->r; + const Edge query(q); + const pair::iterator, + unordered_multiset::iterator > p = + active_edges.equal_range(&query); + for (unordered_multiset::iterator it = p.first; + it != p.second; ++it) { + const Edge* active = *it; +#ifdef DEBUG_CHART_PARSER + cerr << " pos: " << *active << endl; +#endif + const EGrammarNode* next_dot = active->dot->Extend(completed_nt); + if (!next_dot) continue; + const SparseVector& input_features = next_dot->GetCFGProductionFeatures(); + // add up to 2 rules + if (next_dot->RuleCompletes()) + IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive, input_features)); + if (next_dot->GrammarContinues()) + IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive)); + } + } + + void DoMergeWithPassives(const Edge* active) { + // edge is active, has non-terminals, we need to find the passives that can extend it + assert(active->IsActive()); + assert(active->dot->HasNonTerminals()); +#ifdef DEBUG_CHART_PARSER + cerr << " merge active with passives: ACT=" << *active << endl; +#endif + const Edge query(active->r, 1); + const pair::iterator, + unordered_multiset::iterator > p = + passive_edges.equal_range(&query); + for (unordered_multiset::iterator it = p.first; + it != p.second; ++it) { + const Edge* passive = *it; + const EGrammarNode* next_dot = active->dot->Extend(passive->cat); + if (!next_dot) continue; + const FSTNode* next_r = passive->r; + const SparseVector& input_features = next_dot->GetCFGProductionFeatures(); + if (next_dot->RuleCompletes()) + IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive, input_features)); + if (next_dot->GrammarContinues()) + IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive)); + } + } + + // take ownership of edge memory, add to various indexes, etc + // returns true if 
this edge is new + bool IncorporateNewEdge(Edge* edge) { + free_list_.push_back(edge); + if (edge->passive_parent && edge->active_parent) { + Traversal* t = new Traversal(edge, edge->active_parent, edge->passive_parent); + traversal_free_list_.push_back(t); + if (all_traversals.find(t) != all_traversals.end()) { + return false; + } else { + all_traversals.insert(t); + } + } + exp_agenda.AddEdge(edge); + return true; + } + + bool FinishEdge(const Edge* edge, Hypergraph* hg) { + bool is_new = false; + if (all_edges.find(edge) == all_edges.end()) { +#ifdef DEBUG_CHART_PARSER + cerr << *edge << " is NEW\n"; +#endif + all_edges.insert(edge); + is_new = true; + if (edge->IsPassive()) passive_edges.insert(edge); + if (edge->IsActive()) active_edges.insert(edge); + agenda.AddEdge(edge); + } else { +#ifdef DEBUG_CHART_PARSER + cerr << *edge << " is NOT NEW.\n"; +#endif + } + AddEdgeToTranslationForest(edge, hg); + return is_new; + } + + // build the translation forest + void AddEdgeToTranslationForest(const Edge* edge, Hypergraph* hg) { + assert(hg->nodes_.size() < kMAX_NODES); + Hypergraph::Node* tps = NULL; + // first add any target language rules + if (edge->tps) { + Hypergraph::Node*& node = tps2node[(size_t)edge->tps]; + if (!node) { + // cerr << "Creating phrases for " << edge->tps << endl; + const vector& rules = edge->tps->GetRules(); + node = hg->AddNode(kPHRASE); + for (int i = 0; i < rules.size(); ++i) { + Hypergraph::Edge* hg_edge = hg->AddEdge(rules[i], Hypergraph::TailNodeVector()); + hg_edge->feature_values_ += rules[i]->GetFeatureValues(); + hg->ConnectEdgeToHeadNode(hg_edge, node); + } + } + tps = node; + } + Hypergraph::Node*& head_node = edge2node[edge]; + if (!head_node) + head_node = hg->AddNode(edge->cat); + if (edge->cat == start_cat_ && edge->q == q_0_ && edge->r == q_0_ && edge->IsPassive()) { + assert(goal_node == NULL || goal_node == head_node); + goal_node = head_node; + } + int rhs1 = 0; + int rhs2 = 0; + Hypergraph::TailNodeVector tail; + SparseVector extra; + if (edge->IsCreatedByPredict()) { + // extra.set_value(FD::Convert("predict"), 1); + } else if (edge->IsCreatedByScan()) { + tail.push_back(edge2node[edge->active_parent]->id_); + rhs1 = edge->active_parent->cat; + if (tps) { + tail.push_back(tps->id_); + rhs2 = kPHRASE; + } + //extra.set_value(FD::Convert("scan"), 1); + } else if (edge->IsCreatedByComplete()) { + tail.push_back(edge2node[edge->active_parent]->id_); + rhs1 = edge->active_parent->cat; + tail.push_back(edge2node[edge->passive_parent]->id_); + rhs2 = edge->passive_parent->cat; + //extra.set_value(FD::Convert("complete"), 1); + } else { + assert(!"unexpected edge type!"); + } + //cerr << head_node->id_ << "<--" << *edge << endl; + +#ifdef DEBUG_CHART_PARSER + for (int i = 0; i < tail.size(); ++i) + if (tail[i] == head_node->id_) { + cerr << "ERROR: " << *edge << "\n i=" << i << endl; + if (i == 1) { cerr << "\tP: " << *edge->passive_parent << endl; } + if (i == 0) { cerr << "\tA: " << *edge->active_parent << endl; } + assert(!"self-loop found!"); + } +#endif + Hypergraph::Edge* hg_edge = NULL; + if (tail.size() == 0) { + hg_edge = hg->AddEdge(CreateEpsilonRule(edge->cat), tail); + } else if (tail.size() == 1) { + hg_edge = hg->AddEdge(CreateUnaryRule(edge->cat, rhs1), tail); + } else if (tail.size() == 2) { + hg_edge = hg->AddEdge(CreateBinaryRule(edge->cat, rhs1, rhs2), tail); + } + if (edge->features) + hg_edge->feature_values_ += *edge->features; + hg_edge->feature_values_ += extra; + hg->ConnectEdgeToHeadNode(hg_edge, head_node); + } + + 
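+  // Informal summary of the construction above: each deduplicated chart edge
+  // maps to one hypergraph node; PREDICT items contribute epsilon rules,
+  // SCAN items contribute a unary rule over the active parent (or a binary
+  // rule whose second child is the phrase node holding the FST translations),
+  // and COMPLETE items contribute a binary rule over the active and passive
+  // parents. The epsilon rules are stripped by RemoveEpsilons() in Compose()
+  // once the goal node has been identified.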
Hypergraph::Node* goal_node; + EdgeQueue exp_agenda; + EdgeQueue agenda; + unordered_map tps2node; + unordered_map edge2node; + unordered_set all_traversals; + unordered_set all_edges; + unordered_multiset passive_edges; + unordered_multiset active_edges; + vector free_list_; + vector traversal_free_list_; + const WordID start_cat_; + const FSTNode* const q_0_; +}; + +#ifdef DEBUG_CHART_PARSER +static string TrimRule(const string& r) { + size_t start = r.find(" |||") + 5; + size_t end = r.rfind(" |||"); + return r.substr(start, end - start); +} +#endif + +void AddGrammarRule(const string& r, EGrammar* g) { + const size_t pos = r.find(" ||| "); + if (pos == string::npos || r[0] != '[') { + cerr << "Bad rule: " << r << endl; + return; + } + const size_t rpos = r.rfind(" ||| "); + string feats; + string rs = r; + if (rpos != pos) { + feats = r.substr(rpos + 5); + rs = r.substr(0, rpos); + } + string rhs = rs.substr(pos + 5); + string trule = rs + " ||| " + rhs + " ||| " + feats; + TRule tr(trule); +#ifdef DEBUG_CHART_PARSER + string hint_last_rule; +#endif + EGrammarNode* cur = &(*g)[tr.GetLHS()]; + cur->is_root = true; + for (int i = 0; i < tr.FLength(); ++i) { + WordID sym = tr.f()[i]; +#ifdef DEBUG_CHART_PARSER + hint_last_rule = TD::Convert(sym < 0 ? -sym : sym); + cur->hint += " <@@> (*" + hint_last_rule + ") " + TrimRule(tr.AsString()); +#endif + if (sym < 0) + cur = &cur->ntptr[sym]; + else + cur = &cur->tptr[sym]; + } +#ifdef DEBUG_CHART_PARSER + cur->hint += " <@@> (" + hint_last_rule + "*) " + TrimRule(tr.AsString()); +#endif + cur->is_some_rule_complete = true; + cur->input_features = tr.GetFeatureValues(); +} + +EarleyComposer::~EarleyComposer() { + delete pimpl_; +} + +EarleyComposer::EarleyComposer(const FSTNode* fst) { + InitializeConstants(); + pimpl_ = new EarleyComposerImpl(kUNIQUE_START, *fst); +} + +bool EarleyComposer::Compose(const Hypergraph& src_forest, Hypergraph* trg_forest) { + // first, convert the src forest into an EGrammar + EGrammar g; + const int nedges = src_forest.edges_.size(); + const int nnodes = src_forest.nodes_.size(); + vector cats(nnodes); + bool assign_cats = false; + for (int i = 0; i < nnodes; ++i) + if (assign_cats) { + cats[i] = TD::Convert("CAT_" + boost::lexical_cast(i)) * -1; + } else { + cats[i] = src_forest.nodes_[i].cat_; + } + // construct the grammar + for (int i = 0; i < nedges; ++i) { + const Hypergraph::Edge& edge = src_forest.edges_[i]; + const vector& src = edge.rule_->f(); + EGrammarNode* cur = &g[cats[edge.head_node_]]; + cur->is_root = true; + int ntc = 0; + for (int j = 0; j < src.size(); ++j) { + WordID sym = src[j]; + if (sym <= 0) { + sym = cats[edge.tail_nodes_[ntc]]; + ++ntc; + cur = &cur->ntptr[sym]; + } else { + cur = &cur->tptr[sym]; + } + } + cur->is_some_rule_complete = true; + cur->input_features = edge.feature_values_; + } + EGrammarNode& goal_rule = g[kUNIQUE_START]; + assert((goal_rule.ntptr.size() == 1 && goal_rule.tptr.size() == 0) || + (goal_rule.ntptr.size() == 0 && goal_rule.tptr.size() == 1)); + + return pimpl_->Compose(g, trg_forest); +} + +bool EarleyComposer::Compose(istream* in, Hypergraph* trg_forest) { + EGrammar g; + while(*in) { + string line; + getline(*in, line); + if (line.empty()) continue; + AddGrammarRule(line, &g); + } + + return pimpl_->Compose(g, trg_forest); +} diff --git a/decoder/earley_composer.h b/decoder/earley_composer.h new file mode 100644 index 000000000..9f786bf67 --- /dev/null +++ b/decoder/earley_composer.h @@ -0,0 +1,29 @@ +#ifndef _EARLEY_COMPOSER_H_ +#define 
_EARLEY_COMPOSER_H_ + +#include + +class EarleyComposerImpl; +class FSTNode; +class Hypergraph; + +class EarleyComposer { + public: + ~EarleyComposer(); + EarleyComposer(const FSTNode* phrasetable_root); + bool Compose(const Hypergraph& src_forest, Hypergraph* trg_forest); + + // reads the grammar from a file. There must be a single top-level + // S -> X rule. Anything else is possible. Format is: + // [S] ||| [SS,1] + // [SS] ||| [NP,1] [VP,2] ||| Feature1=0.2 Feature2=-2.3 + // [SS] ||| [VP,1] [NP,2] ||| Feature1=0.8 + // [NP] ||| [DET,1] [N,2] ||| Feature3=2 + // ... + bool Compose(std::istream* grammar_file, Hypergraph* trg_forest); + + private: + EarleyComposerImpl* pimpl_; +}; + +#endif diff --git a/decoder/factored_lexicon_helper.cc b/decoder/factored_lexicon_helper.cc new file mode 100644 index 000000000..e78992156 --- /dev/null +++ b/decoder/factored_lexicon_helper.cc @@ -0,0 +1,81 @@ +#include "factored_lexicon_helper.h" + +#include "filelib.h" +#include "stringlib.h" +#include "sentence_metadata.h" + +using namespace std; + +FactoredLexiconHelper::FactoredLexiconHelper() : + kNULL(TD::Convert("")), + has_src_(false), + has_trg_(false) { InitEscape(); } + +FactoredLexiconHelper::FactoredLexiconHelper(const std::string& srcfile, const std::string& trgmapfile) : + kNULL(TD::Convert("")), + has_src_(false), + has_trg_(false) { + if (srcfile.size() && srcfile != "*") { + ReadFile rf(srcfile); + has_src_ = true; + istream& in = *rf.stream(); + string line; + while(in) { + getline(in, line); + if (!in) continue; + vector v; + TD::ConvertSentence(line, &v); + src_.push_back(v); + } + } + if (trgmapfile.size() && trgmapfile != "*") { + ReadFile rf(trgmapfile); + has_trg_ = true; + istream& in = *rf.stream(); + string line; + vector v; + while(in) { + getline(in, line); + if (!in) continue; + SplitOnWhitespace(line, &v); + if (v.size() != 2) { + cerr << "Error reading line in map file: " << line << endl; + abort(); + } + WordID& to = trgmap_[TD::Convert(v[0])]; + if (to != 0) { + cerr << "Duplicate entry for word " << v[0] << endl; + abort(); + } + to = TD::Convert(v[1]); + } + } + InitEscape(); +} + +void FactoredLexiconHelper::InitEscape() { + escape_[TD::Convert("=")] = TD::Convert("__EQ"); + escape_[TD::Convert(";")] = TD::Convert("__SC"); + escape_[TD::Convert(",")] = TD::Convert("__CO"); +} + +void FactoredLexiconHelper::PrepareForInput(const SentenceMetadata& smeta) { + if (has_src_) { + const int id = smeta.GetSentenceID(); + assert(id < src_.size()); + cur_src_ = src_[id]; + } else { + cur_src_.resize(smeta.GetSourceLength()); + for (int i = 0; i < cur_src_.size(); ++i) { + const vector& arcs = smeta.GetSourceLattice()[i]; + assert(arcs.size() == 1); // only sentences supported for now + cur_src_[i] = arcs[0].label; + } + } + if (cur_src_.size() != smeta.GetSourceLength()) { + cerr << "Length mismatch between mapped source and real source in sentence id=" << smeta.GetSentenceID() << endl; + cerr << " mapped len=" << cur_src_.size() << endl; + cerr << " actual len=" << smeta.GetSourceLength() << endl; + } +} + diff --git a/decoder/factored_lexicon_helper.h b/decoder/factored_lexicon_helper.h new file mode 100644 index 000000000..7fedc5176 --- /dev/null +++ b/decoder/factored_lexicon_helper.h @@ -0,0 +1,67 @@ +#ifndef _FACTORED_LEXICON_HELPER_ +#define _FACTORED_LEXICON_HELPER_ + +#include +#include +#include +#include +#include "tdict.h" + +struct SentenceMetadata; + +// when computing features, it can be advantageous to: +// 1) back off to less specific forms (e.g., less highly 
inflected forms, POS tags, etc) +// 2) look at more specific forms (on the source ONLY) +// this class helps you do both by creating a "corpus" view +// should probably add a discussion of why the source can be "refined" by this class +// but not the target. basically, this is because the source is on the right side of +// the conditioning line in the model, and the target is on the left. the most specific +// form must always be generated, but the "source" can include arbitrarily large +// context. +// this currently only works for sentence input to maintain simplicity of the code and +// file formats, but there is no reason why it couldn't work with lattices / CFGs +class FactoredLexiconHelper { + public: + // default constructor does no mapping + FactoredLexiconHelper(); + // Either filename can be empty or * to indicate no mapping + FactoredLexiconHelper(const std::string& srcfile, const std::string& trgmapfile); + + void PrepareForInput(const SentenceMetadata& smeta); + + inline WordID SourceWordAtPosition(const int i) const { + if (i < 0) return kNULL; + assert(i < cur_src_.size()); + return Escape(cur_src_[i]); + } + + inline WordID CoarsenedTargetWordForTarget(const WordID surface_target) const { + if (has_trg_) { + const WordWordMap::const_iterator it = trgmap_.find(surface_target); + if (it == trgmap_.end()) return surface_target; + return Escape(it->second); + } else { + return Escape(surface_target); + } + } + + private: + inline WordID Escape(WordID word) const { + const std::map::const_iterator it = escape_.find(word); + if (it == escape_.end()) return word; + return it->second; + } + + void InitEscape(); + + const WordID kNULL; + bool has_src_; + bool has_trg_; + std::vector > src_; + typedef std::map WordWordMap; + WordWordMap trgmap_; + std::vector cur_src_; + std::map escape_; +}; + +#endif diff --git a/decoder/ff.cc b/decoder/ff.cc new file mode 100644 index 000000000..a6a035b5d --- /dev/null +++ b/decoder/ff.cc @@ -0,0 +1,38 @@ +#include "ff.h" + +#include "tdict.h" +#include "hg.h" + +using namespace std; + +FeatureFunction::~FeatureFunction() {} + +void FeatureFunction::PrepareForInput(const SentenceMetadata&) {} + +void FeatureFunction::FinalTraversalFeatures(const void* /* ant_state */, + SparseVector* /* features */) const {} + +string FeatureFunction::usage_helper(std::string const& name,std::string const& params,std::string const& details,bool sp,bool sd) { + string r=name; + if (sp) { + r+=": "; + r+=params; + } + if (sd) { + r+="\n"; + r+=details; + } + return r; +} + +void FeatureFunction::TraversalFeaturesImpl(const SentenceMetadata&, + const Hypergraph::Edge&, + const std::vector&, + SparseVector*, + SparseVector*, + void*) const { + cerr << "TraversalFeaturesImpl not implemented - override it or TraversalFeaturesLog\n"; + abort(); +} + + diff --git a/decoder/ff.h b/decoder/ff.h new file mode 100644 index 000000000..3280592e8 --- /dev/null +++ b/decoder/ff.h @@ -0,0 +1,82 @@ +#ifndef _FF_H_ +#define _FF_H_ + +#include +#include +#include "sparse_vector.h" + +namespace HG { struct Edge; struct Node; } +class Hypergraph; +class SentenceMetadata; + +// if you want to develop a new feature, inherit from this class and +// override TraversalFeaturesImpl(...). If it's a feature that returns / +// depends on context, you may also need to implement +// FinalTraversalFeatures(...) 
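As a rough, self-contained sketch of the inherit-and-override pattern described in the comment above (ToyFeatureFunction, FakeEdge, and ToyWordPenalty below are simplified stand-ins, not cdec classes), a stateless feature can be written as follows; the actual base class is declared next.

#include <iostream>
#include <map>
#include <string>

struct FakeEdge { int target_words; };  // stand-in for HG::Edge

class ToyFeatureFunction {
 public:
  explicit ToyFeatureFunction(int state_size) : state_size_(state_size) {}
  virtual ~ToyFeatureFunction() {}
  int StateSize() const { return state_size_; }
  // public entry point; forwards to the overridable implementation
  void TraversalFeatures(const FakeEdge& edge,
                         std::map<std::string, double>* feats,
                         void* out_state) const {
    TraversalFeaturesImpl(edge, feats, out_state);
  }
 protected:
  virtual void TraversalFeaturesImpl(const FakeEdge& edge,
                                     std::map<std::string, double>* feats,
                                     void* out_state) const = 0;
 private:
  int state_size_;
};

// A stateless feature: fires once per target word, in the spirit of the
// WordPenalty feature defined later in this patch.
class ToyWordPenalty : public ToyFeatureFunction {
 public:
  ToyWordPenalty() : ToyFeatureFunction(0) {}
 protected:
  void TraversalFeaturesImpl(const FakeEdge& edge,
                             std::map<std::string, double>* feats,
                             void* /*out_state*/) const override {
    (*feats)["WordPenalty"] += edge.target_words * -0.434294;  // -1 / ln(10)
  }
};

int main() {
  ToyWordPenalty wp;
  std::map<std::string, double> feats;
  FakeEdge e{3};
  wp.TraversalFeatures(e, &feats, /*out_state=*/0);
  std::cout << "WordPenalty = " << feats["WordPenalty"] << "\n";
  return 0;
}

A stateful feature would pass a nonzero state size to the base constructor and write its residual context into the state buffer, exactly as the documentation in the class below describes.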
+class FeatureFunction { + friend class ExternalFeature; + public: + std::string name_; // set by FF factory using usage() + FeatureFunction() : state_size_() {} + explicit FeatureFunction(int state_size) : state_size_(state_size) {} + virtual ~FeatureFunction(); + bool IsStateful() const { return state_size_ > 0; } + int StateSize() const { return state_size_; } + + // override this. not virtual because we want to expose this to the factory template for help before creating a FF + static std::string usage(bool show_params,bool show_details) { + return usage_helper("FIXME_feature_needs_name","[no parameters]","[no documentation yet]",show_params,show_details); + } + static std::string usage_helper(std::string const& name,std::string const& params,std::string const& details,bool show_params,bool show_details); + + // called once per input, before any calls to TraversalFeatures, etc.; + // used to initialize sentence-specific data structures + virtual void PrepareForInput(const SentenceMetadata& smeta); + + // Compute the feature values and (if applicable) the estimated feature + // values for when this edge is incorporated into a larger context + inline void TraversalFeatures(const SentenceMetadata& smeta, + const HG::Edge& edge, + const std::vector<const void*>& ant_contexts, + SparseVector<double>* features, + SparseVector<double>* estimated_features, + void* out_state) const { + TraversalFeaturesImpl(smeta, edge, ant_contexts, + features, estimated_features, out_state); + // TODO it's easy for careless feature function developers to overwrite + // the end of their state and clobber someone else's memory. These bugs + // will be horrendously painful to track down. There should be some + // optional strict mode, enforced here, that adds some kind of + // barrier between the blocks reserved for the residual contexts + } + + // if there's some state left when you transition to the goal state, score + // it here. For example, a language model would add the cost of adding + // <s> and </s>. + virtual void FinalTraversalFeatures(const void* residual_state, + SparseVector<double>* final_features) const; + + protected: + // context is a pointer to a buffer of size StateSize() that the + // feature function can write its state to. It's up to the feature function + // to determine how much space it needs and how to encode its + // residual contextual information, since it is OPAQUE to all clients outside + // of the particular FeatureFunction class. There is one exception: + // equality of the contents (i.e., memcmp) is required to determine whether + // two states can be combined. + virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, + const HG::Edge& edge, + const std::vector<const void*>& ant_contexts, + SparseVector<double>* features, + SparseVector<double>* estimated_features, + void* context) const; + + // !!! ONLY call this from subclass *CONSTRUCTORS* !!! 
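+ // (the per-edge state buffers are laid out from this size before decoding begins, so it must not change afterwards)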
+ void SetStateSize(size_t state_size) { + state_size_ = state_size; + } + private: + int state_size_; +}; + +#endif diff --git a/decoder/ff_basic.cc b/decoder/ff_basic.cc new file mode 100644 index 000000000..f9404d24d --- /dev/null +++ b/decoder/ff_basic.cc @@ -0,0 +1,80 @@ +#include "ff_basic.h" + +#include "fast_lexical_cast.hpp" +#include "hg.h" + +using namespace std; + +// Hiero and Joshua use log_10(e) as the value, so I do to +WordPenalty::WordPenalty(const string& param) : + fid_(FD::Convert("WordPenalty")), + value_(-1.0 / log(10)) { + if (!param.empty()) { + cerr << "Warning WordPenalty ignoring parameter: " << param << endl; + } +} + +void WordPenalty::TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const std::vector& ant_states, + SparseVector* features, + SparseVector* estimated_features, + void* state) const { + (void) smeta; + (void) ant_states; + (void) state; + (void) estimated_features; + features->set_value(fid_, edge.rule_->EWords() * value_); +} + + +SourceWordPenalty::SourceWordPenalty(const string& param) : + fid_(FD::Convert("SourceWordPenalty")), + value_(-1.0 / log(10)) { + if (!param.empty()) { + cerr << "Warning SourceWordPenalty ignoring parameter: " << param << endl; + } +} + +void SourceWordPenalty::TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const std::vector& ant_states, + SparseVector* features, + SparseVector* estimated_features, + void* state) const { + (void) smeta; + (void) ant_states; + (void) state; + (void) estimated_features; + features->set_value(fid_, edge.rule_->FWords() * value_); +} + + +ArityPenalty::ArityPenalty(const std::string& param) : + value_(-1.0 / log(10)) { + string fname = "Arity_"; + unsigned MAX=DEFAULT_MAX_ARITY; + using namespace boost; + if (!param.empty()) + MAX=lexical_cast(param); + for (unsigned i = 0; i <= MAX; ++i) { + WordID fid=FD::Convert(fname+lexical_cast(i)); + fids_.push_back(fid); + } + while (!fids_.empty() && fids_.back()==0) fids_.pop_back(); // pretty up features vector in case FD was frozen. doesn't change anything +} + +void ArityPenalty::TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const std::vector& ant_states, + SparseVector* features, + SparseVector* estimated_features, + void* state) const { + (void) smeta; + (void) ant_states; + (void) state; + (void) estimated_features; + unsigned a=edge.Arity(); + features->set_value(a& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* context) const; + private: + const int fid_; + const double value_; +}; + +class SourceWordPenalty : public FeatureFunction { + public: + SourceWordPenalty(const std::string& param); + static std::string usage(bool p,bool d) { + return usage_helper("SourceWordPenalty","","number of source words (local feature, and meaningless except when input has non-constant number of source words, e.g. 
segmentation/morphology/speech recognition lattice)",p,d); + } + protected: + virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, + const HG::Edge& edge, + const std::vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* context) const; + private: + const int fid_; + const double value_; +}; + +#define DEFAULT_MAX_ARITY 9 +#define DEFAULT_MAX_ARITY_STRINGIZE(x) #x +#define DEFAULT_MAX_ARITY_STRINGIZE_EVAL(x) DEFAULT_MAX_ARITY_STRINGIZE(x) +#define DEFAULT_MAX_ARITY_STR DEFAULT_MAX_ARITY_STRINGIZE_EVAL(DEFAULT_MAX_ARITY) + +class ArityPenalty : public FeatureFunction { + public: + ArityPenalty(const std::string& param); + static std::string usage(bool p,bool d) { + return usage_helper("ArityPenalty","[MaxArity(default " DEFAULT_MAX_ARITY_STR ")]","Indicator feature Arity_N=1 for rule of arity N (local feature). 0<=N<=MaxArity(default " DEFAULT_MAX_ARITY_STR ")",p,d); + } + + protected: + virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, + const HG::Edge& edge, + const std::vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* context) const; + private: + std::vector fids_; + const double value_; +}; + +#endif diff --git a/decoder/ff_bleu.cc b/decoder/ff_bleu.cc new file mode 100644 index 000000000..a842bba80 --- /dev/null +++ b/decoder/ff_bleu.cc @@ -0,0 +1,289 @@ +namespace { +char const* bleu_usage_name="BLEUModel"; +char const* bleu_usage_short="[-o 3|4]"; +char const* bleu_usage_verbose="Uses feature id 0! Make sure there are no other features whose weights aren't specified or there may be conflicts. Computes oracle with weighted combination of BLEU and model score (from previous model set, using weights on edges?). Performs ngram context expansion; expect reference translation info in sentence metadata; if document scorer is IBM_BLEU_3, then use order 3; otherwise use order 4."; +} + + +#include +#include +#include "fast_lexical_cast.hpp" + +#include + +#include "ff_bleu.h" +#include "tdict.h" +#include "hg.h" +#include "stringlib.h" +#include "sentence_metadata.h" +#include "scorer.h" + +using namespace std; + +class BLEUModelImpl { + public: + explicit BLEUModelImpl(int order) : + buffer_(), order_(order), state_size_(OrderToStateSize(order) - 1), + floor_(-100.0), + kSTART(TD::Convert("")), + kSTOP(TD::Convert("")), + kUNKNOWN(TD::Convert("")), + kNONE(-1), + kSTAR(TD::Convert("<{STAR}>")) {} + + virtual ~BLEUModelImpl() { + } + + inline int StateSize(const void* state) const { + return *(static_cast(state) + state_size_); + } + + inline void SetStateSize(int size, void* state) const { + *(static_cast(state) + state_size_) = size; + } + + void GetRefToNgram() + {} + + string DebugStateToString(const void* state) const { + int len = StateSize(state); + const int* astate = reinterpret_cast(state); + string res = "["; + for (int i = 0; i < len; ++i) { + res += " "; + res += TD::Convert(astate[i]); + } + res += " ]"; + return res; + } + + inline double ProbNoRemnant(int i, int len) { + int edge = len; + bool flag = true; + double sum = 0.0; + while (i >= 0) { + if (buffer_[i] == kSTAR) { + edge = i; + flag = false; + } else if (buffer_[i] <= 0) { + edge = i; + flag = true; + } else { + if ((edge-i >= order_) || (flag && !(i == (len-1) && buffer_[i] == kSTART))) + { //sum += LookupProbForBufferContents(i); + //cerr << "FT"; + CalcPhrase(buffer_[i], &buffer_[i+1]); + } + } + --i; + } + return sum; + } + + double FinalTraversalCost(const void* state) { + int slen = StateSize(state); + 
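// slen word ids are carried in the opaque state; two extra slots are reserved below for the kSTART / kSTOP sentinels, and one more for the kNONE terminator + 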
int len = slen + 2; + // cerr << "residual len: " << len << endl; + buffer_.resize(len + 1); + buffer_[len] = kNONE; + buffer_[len-1] = kSTART; + const int* astate = reinterpret_cast(state); + int i = len - 2; + for (int j = 0; j < slen; ++j,--i) + buffer_[i] = astate[j]; + buffer_[i] = kSTOP; + assert(i == 0); + return ProbNoRemnant(len - 1, len); + } + + vector CalcPhrase(int word, int* context) { + int i = order_; + vector vs; + int c = 1; + vs.push_back(word); + // while (i > 1 && *context > 0) { + while (*context > 0) { + --i; + vs.push_back(*context); + ++context; + ++c; + } + if(false){ cerr << "VS1( "; + vector::reverse_iterator rit; + for ( rit=vs.rbegin() ; rit != vs.rend(); ++rit ) + cerr << " " << TD::Convert(*rit); + cerr << ")\n";} + + return vs; + } + + + double LookupWords(const TRule& rule, const vector& ant_states, void* vstate, const SentenceMetadata& smeta) { + + int len = rule.ELength() - rule.Arity(); + + for (int i = 0; i < ant_states.size(); ++i) + len += StateSize(ant_states[i]); + buffer_.resize(len + 1); + buffer_[len] = kNONE; + int i = len - 1; + const vector& e = rule.e(); + + /*cerr << "RULE::" << rule.ELength() << " "; + for (vector::const_iterator i = e.begin(); i != e.end(); ++i) + { + const WordID& c = *i; + if(c > 0) cerr << TD::Convert(c) << "--"; + else cerr <<"N--"; + } + cerr << endl; + */ + + for (int j = 0; j < e.size(); ++j) { + if (e[j] < 1) { + const int* astate = reinterpret_cast(ant_states[-e[j]]); + int slen = StateSize(astate); + for (int k = 0; k < slen; ++k) + buffer_[i--] = astate[k]; + } else { + buffer_[i--] = e[j]; + } + } + + double approx_bleu = 0.0; + int* remnant = reinterpret_cast(vstate); + int j = 0; + i = len - 1; + int edge = len; + + + vector vs; + while (i >= 0) { + vs = CalcPhrase(buffer_[i],&buffer_[i+1]); + if (buffer_[i] == kSTAR) { + edge = i; + } else if (edge-i >= order_) { + + vs = CalcPhrase(buffer_[i],&buffer_[i+1]); + + } else if (edge == len && remnant) { + remnant[j++] = buffer_[i]; + } + --i; + } + + //calculate Bvector here + /* cerr << "VS1( "; + vector::reverse_iterator rit; + for ( rit=vs.rbegin() ; rit != vs.rend(); ++rit ) + cerr << " " << TD::Convert(*rit); + cerr << ")\n"; + */ + + ScoreP node_score_p = smeta.GetDocScorer()[smeta.GetSentenceID()]->ScoreCCandidate(vs); + Score *node_score=node_score_p.get(); + string details; + node_score->ScoreDetails(&details); + const Score *base_score= &smeta.GetScore(); + //cerr << "SWBASE : " << base_score->ComputeScore() << details << " "; + + int src_length = smeta.GetSourceLength(); + node_score->PlusPartialEquals(*base_score, rule.EWords(), rule.FWords(), src_length ); + float oracledoc_factor = (src_length + smeta.GetDocLen())/ src_length; + + //how it seems to be done in code + //TODO: might need to reverse the -1/+1 of the oracle/neg examples + //TO VLADIMIR: the polarity would be reversed if you switched error (1-BLEU) for BLEU. 
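+ // scale the candidate's sentence-level (BLEU-style) score by the rule's source length times the oracle-document factor computed above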
+ approx_bleu = ( rule.FWords() * oracledoc_factor ) * node_score->ComputeScore(); + //how I thought it was done from the paper + //approx_bleu = ( rule.FWords()+ smeta.GetDocLen() ) * node_score->ComputeScore(); + + if (!remnant){ return approx_bleu;} + + if (edge != len || len >= order_) { + remnant[j++] = kSTAR; + if (order_-1 < edge) edge = order_-1; + for (int i = edge-1; i >= 0; --i) + remnant[j++] = buffer_[i]; + } + + SetStateSize(j, vstate); + //cerr << "Return APPROX_BLEU: " << approx_bleu << " "<< DebugStateToString(vstate) << endl; + return approx_bleu; + } + + static int OrderToStateSize(int order) { + return ((order-1) * 2 + 1) * sizeof(WordID) + 1; + } + + protected: + vector buffer_; + const int order_; + const int state_size_; + const double floor_; + + public: + const WordID kSTART; + const WordID kSTOP; + const WordID kUNKNOWN; + const WordID kNONE; + const WordID kSTAR; +}; + +string BLEUModel::usage(bool param,bool verbose) { + return usage_helper(bleu_usage_name,bleu_usage_short,bleu_usage_verbose,param,verbose); +} + +BLEUModel::BLEUModel(const string& param) : + fid_(0) { //The partial BLEU score is kept in feature id=0 + vector argv; + int argc = SplitOnWhitespace(param, &argv); + int order = 3; + + //loop over argv and load all references into vector of NgramMaps + if (argc >= 1) { + if (argv[0] != "-o" || argc<2) { + cerr<(argv[1]); + } + + SetStateSize(BLEUModelImpl::OrderToStateSize(order)); + pimpl_ = new BLEUModelImpl(order); +} + +BLEUModel::~BLEUModel() { + delete pimpl_; +} + +string BLEUModel::DebugStateToString(const void* state) const{ + return pimpl_->DebugStateToString(state); +} + +void BLEUModel::TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const vector& ant_states, + SparseVector* features, + SparseVector* /* estimated_features */, + void* state) const { + + (void) smeta; + /*cerr << "In BM calling set " << endl; + const Score *s= &smeta.GetScore(); + const int dl = smeta.GetDocLen(); + cerr << "SCO " << s->ComputeScore() << endl; + const DocScorer *ds = &smeta.GetDocScorer(); + */ + +// cerr<< "ff_bleu loading sentence " << smeta.GetSentenceID() << endl; + //} + features->set_value(fid_, pimpl_->LookupWords(*edge.rule_, ant_states, state, smeta)); + //cerr << "FID" << fid_ << " " << DebugStateToString(state) << endl; +} + +void BLEUModel::FinalTraversalFeatures(const void* ant_state, + SparseVector* features) const { + + features->set_value(fid_, pimpl_->FinalTraversalCost(ant_state)); +} diff --git a/decoder/ff_bleu.h b/decoder/ff_bleu.h new file mode 100644 index 000000000..344dc788d --- /dev/null +++ b/decoder/ff_bleu.h @@ -0,0 +1,32 @@ +#ifndef _BLEU_FF_H_ +#define _BLEU_FF_H_ + +#include +#include + +#include "hg.h" +#include "ff.h" + +class BLEUModelImpl; + +class BLEUModel : public FeatureFunction { + public: + // param = "filename.lm [-o n]" + BLEUModel(const std::string& param); + ~BLEUModel(); + virtual void FinalTraversalFeatures(const void* context, + SparseVector* features) const; + std::string DebugStateToString(const void* state) const; + static std::string usage(bool param,bool verbose); + protected: + virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, + const HG::Edge& edge, + const std::vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* out_context) const; + private: + const int fid_; + mutable BLEUModelImpl* pimpl_; +}; +#endif diff --git a/decoder/ff_charset.cc b/decoder/ff_charset.cc new file mode 100644 index 000000000..6429088b6 
--- /dev/null +++ b/decoder/ff_charset.cc @@ -0,0 +1,44 @@ +#include "ff_charset.h" + +#include "tdict.h" +#include "hg.h" +#include "fdict.h" +#include "stringlib.h" + +using namespace std; + +NonLatinCount::NonLatinCount(const string& param) : FeatureFunction(), fid_(FD::Convert("NonLatinCount")) {} + +bool ContainsNonLatin(const string& word) { + unsigned cur = 0; + while(cur < word.size()) { + const int size = UTF8Len(word[cur]); + if (size > 1) return true; + cur += size; + } + return false; +} + +void NonLatinCount::TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const std::vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* context) const { + const vector& e = edge.rule_->e(); + int count = 0; + for (int i = 0; i < e.size(); ++i) { + if (e[i] > 0) { + map::iterator it = is_non_latin_.find(e[i]); + if (it == is_non_latin_.end()) { + if ((is_non_latin_[e[i]] = ContainsNonLatin(TD::Convert(e[i])))) + ++count; + } else { + if (it->second) + ++count; + } + } + } + if (count) features->set_value(fid_, count); +} + diff --git a/decoder/ff_charset.h b/decoder/ff_charset.h new file mode 100644 index 000000000..267ef65d0 --- /dev/null +++ b/decoder/ff_charset.h @@ -0,0 +1,26 @@ +#ifndef _FFCHARSET_H_ +#define _FFCHARSET_H_ + +#include +#include +#include "ff.h" +#include "hg.h" + +class SentenceMetadata; + +class NonLatinCount : public FeatureFunction { + public: + NonLatinCount(const std::string& param); + protected: + virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, + const HG::Edge& edge, + const std::vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* context) const; + private: + mutable std::map is_non_latin_; + const int fid_; +}; + +#endif diff --git a/decoder/ff_context.cc b/decoder/ff_context.cc new file mode 100644 index 000000000..e56f6f1f9 --- /dev/null +++ b/decoder/ff_context.cc @@ -0,0 +1,231 @@ +#include "ff_context.h" + +#include +#include +#include +#include + +#include "hg.h" +#include "filelib.h" +#include "stringlib.h" +#include "sentence_metadata.h" +#include "lattice.h" +#include "fdict.h" +#include "verbose.h" +#include "tdict.h" + +RuleContextFeatures::RuleContextFeatures(const string& param) { + // cerr << "initializing RuleContextFeatures with parameters: " << param; + kSOS = TD::Convert(""); + kEOS = TD::Convert(""); + macro_regex = sregex::compile("%([xy])\\[(-[1-9][0-9]*|0|[1-9][1-9]*)]"); + ParseArgs(param); +} + +string RuleContextFeatures::Escape(const string& x) const { + string y = x; + for (int i = 0; i < y.size(); ++i) { + if (y[i] == '=') y[i]='_'; + if (y[i] == ';') y[i]='_'; + } + return y; +} + +// replace %x[relative_location] or %y[relative_location] with actual_token +// within feature_instance +void RuleContextFeatures::ReplaceMacroWithString( + string& feature_instance, bool token_vs_label, int relative_location, + const string& actual_token) const { + + stringstream macro; + if (token_vs_label) { + macro << "%x["; + } else { + macro << "%y["; + } + macro << relative_location << "]"; + int macro_index = feature_instance.find(macro.str()); + if (macro_index == string::npos) { + cerr << "Can't find macro " << macro.str() << " in feature template " + << feature_instance; + abort(); + } + feature_instance.replace(macro_index, macro.str().size(), actual_token); +} + +void RuleContextFeatures::ReplaceTokenMacroWithString( + string& feature_instance, int relative_location, + const string& actual_token) const { + + 
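// %x[...] macros refer to tokens (as opposed to %y[...] labels), so delegate with token_vs_label = true + 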
ReplaceMacroWithString(feature_instance, true, relative_location, + actual_token); +} + +void RuleContextFeatures::ReplaceLabelMacroWithString( + string& feature_instance, int relative_location, + const string& actual_token) const { + + ReplaceMacroWithString(feature_instance, false, relative_location, + actual_token); +} + +void RuleContextFeatures::Error(const string& error_message) const { + cerr << "Error: " << error_message << "\n\n" + + << "RuleContextFeatures Usage: \n" + << " feature_function=RuleContextFeatures -t