diff --git a/.gitignore b/.gitignore
new file mode 100644
index 000000000..a3f979dbf
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,221 @@
+example_extff/ff_example.lo
+example_extff/libff_example.la
+mteval/meteor_jar.cc
+training/utils/grammar_convert
+*.a
+*.trs
+*.aux
+*.bbl
+*.blg
+*.dvi
+*.idx
+*.log
+*.o
+*.pdf
+*.ps
+*.pyc
+*.so
+*.toc
+*swp
+*~
+.*
+./cdec/
+Makefile
+Makefile.in
+aclocal.m4
+autom4te.cache/
+config.guess
+config.h
+config.h.in
+config.h.in~
+config.log
+config.status
+config.sub
+configure
+decoder/Makefile
+decoder/Makefile.in
+decoder/bin/
+decoder/cdec
+decoder/dict_test
+decoder/sv_test
+decoder/ff_test
+decoder/grammar_test
+decoder/hg_test
+decoder/logval_test
+decoder/parser_test
+decoder/rule_lexer.cc
+decoder/small_vector_test
+decoder/trule_test
+decoder/weights_test
+depcomp
+dist
+dpmert/Makefile
+dpmert/Makefile.in
+dpmert/fast_score
+dpmert/lo_test
+dpmert/mr_dpmert_generate_mapper_input
+dpmert/mr_dpmert_map
+dpmert/mr_dpmert_reduce
+dpmert/scorer_test
+dpmert/sentclient
+dpmert/sentserver
+dpmert/union_forests
+dtrain/dtrain
+extools/build_lexical_translation
+extools/extractor
+extools/extractor_monolingual
+extools/featurize_grammar
+extools/filter_grammar
+extools/filter_score_grammar
+extools/mr_stripe_rule_reduce
+extools/score_grammar
+extools/sg_lexer.cc
+extractor/*_test
+extractor/compile
+extractor/extract
+extractor/run_extractor
+gi/clda/src/clda
+gi/markov_al/ml
+gi/pf/align-lexonly
+gi/pf/align-lexonly-pyp
+gi/pf/align-tl
+gi/pf/bayes_lattice_score
+gi/pf/brat
+gi/pf/cbgi
+gi/pf/condnaive
+gi/pf/dpnaive
+gi/pf/itg
+gi/pf/learn_cfg
+gi/pf/nuisance_test
+gi/pf/pf_test
+gi/pf/pfbrat
+gi/pf/pfdist
+gi/pf/pfnaive
+gi/pf/pyp_lm
+gi/posterior-regularisation/prjava/build/
+gi/posterior-regularisation/prjava/lib/*.jar
+gi/posterior-regularisation/prjava/lib/prjava-20100713.jar
+gi/posterior-regularisation/prjava/lib/prjava-20100715.jar
+gi/posterior-regularisation/prjava/prjava.jar
+gi/pyp-topics/src/contexts_lexer.cc
+gi/pyp-topics/src/pyp-contexts-train
+gi/pyp-topics/src/pyp-topics-train
+install-sh
+jam-files/bjam
+jam-files/engine/bin.*
+jam-files/engine/bootstrap/
+klm/lm/bin/
+klm/lm/builder/builder
+klm/lm/builder/lmplz
+klm/lm/build_binary
+klm/lm/ngram_query
+klm/lm/query
+klm/util/bin/
+libtool
+ltmain.sh
+m4/libtool.m4
+m4/ltoptions.m4
+m4/ltsugar.m4
+m4/ltversion.m4
+m4/lt~obsolete.m4
+minrisk/minrisk_optimize
+mira/kbest_mira
+missing
+mteval/bin/
+mteval/fast_score
+mteval/mbr_kbest
+mteval/scorer_test
+phrasinator/gibbs_train_plm
+phrasinator/gibbs_train_plm_notables
+previous.sh
+pro-train/mr_pro_map
+pro-train/mr_pro_reduce
+python/build
+python/setup.py
+rampion/rampion_cccp
+rst_parser/mst_train
+rst_parser/random_tree
+rst_parser/rst_parse
+rst_parser/rst_train
+sa-extract/calignment.c
+sa-extract/cdat.c
+sa-extract/cfloatlist.c
+sa-extract/cintlist.c
+sa-extract/clex.c
+sa-extract/cstrmap.c
+sa-extract/csuf.c
+sa-extract/cveb.c
+sa-extract/lcp.c
+sa-extract/precomputation.c
+sa-extract/rule.c
+sa-extract/rulefactory.c
+sa-extract/sym.c
+stamp-h1
+tests/system_tests/hmm/foo.src
+training/Makefile
+training/Makefile.in
+training/atools
+training/augment_grammar
+training/cllh_filter_grammar
+training/collapse_weights
+training/grammar_convert
+training/lbfgs_test
+training/lbl_model
+training/liblbfgs/bin/
+training/liblbfgs/ll_test
+training/model1
+training/mpi_batch_optimize
+training/mpi_adagrad_optimize
+training/mpi_compute_cllh
+training/mpi_em_optimize
+training/mpi_extract_features
+training/mpi_extract_reachable
+training/mpi_flex_optimize
+training/mpi_online_optimize
+training/mr_em_adapted_reduce
+training/mr_em_map_adapter
+training/mr_optimize_reduce
+training/mr_reduce_to_weights
+training/optimize_test
+training/plftools
+training/test_ngram
+utils/atools
+utils/bin/
+utils/crp_test
+utils/dict_test
+utils/logval_test
+utils/m_test
+utils/mfcr_test
+utils/phmt
+utils/reconstruct_weights
+utils/small_vector_test
+utils/sv_test
+utils/ts
+utils/weights_test
+training/crf/mpi_adagrad_optimize
+training/crf/mpi_batch_optimize
+training/crf/mpi_baum_welch
+training/crf/mpi_compute_cllh
+training/crf/mpi_extract_features
+training/crf/mpi_extract_reachable
+training/crf/mpi_flex_optimize
+training/crf/mpi_online_optimize
+training/dpmert/lo_test
+training/dpmert/mr_dpmert_generate_mapper_input
+training/dpmert/mr_dpmert_map
+training/dpmert/mr_dpmert_reduce
+training/dpmert/sentclient
+training/dpmert/sentserver
+training/dtrain/dtrain
+training/latent_svm/latent_svm
+training/minrisk/minrisk_optimize
+training/mira/kbest_mira
+training/mira/kbest_cut_mira
+training/pro/mr_pro_map
+training/pro/mr_pro_reduce
+training/rampion/rampion_cccp
+training/utils/lbfgs_test
+training/utils/optimize_test
+training/utils/sentclient
+training/utils/sentserver
+word-aligner/fast_align
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 000000000..1f0f2eeef
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,23 @@
+language: python
+python:
+ - "2.7"
+before_script:
+ - sudo apt-get install libboost-filesystem1.48-dev
+ - sudo apt-get install libboost-program-options1.48-dev
+ - sudo apt-get install libboost-serialization1.48-dev
+ - sudo apt-get install libboost-regex1.48-dev
+ - sudo apt-get install libboost-test1.48-dev
+ - sudo apt-get install libboost-system1.48-dev
+ - sudo apt-get install libboost-thread1.48-dev
+ - sudo apt-get install flex
+ - autoreconf -ifv
+ - ./configure
+script:
+ - make
+ - cd python
+ - python setup.py install
+ - cd ..
+after_script:
+ - make check
+ - ./tests/run-system-tests.pl
+ - nosetests python/tests
diff --git a/BUILDING b/BUILDING
new file mode 100644
index 000000000..d5a086e89
--- /dev/null
+++ b/BUILDING
@@ -0,0 +1,40 @@
+To build cdec, you'll need:
+
+ * boost headers & boost program_options (you may need to install a package
+ like boost-devel)
+
+
+Instructions for building
+-----------------------------------
+
+ 1) Use automake / autoconf to generate the configure script.
+ I'm not an expert at using these tools, but this should be sufficient:
+
+ autoreconf -ifv
+
+ 2) Configure and build. Your command will look something like this.
+
+ ./configure
+ make
+
+ If you get errors during configure about missing BOOST macros, then step 1
+ failed, and you need to keep working at it. If you get errors during the
+ build, it's probably a problem with your Boost installation or possibly with
+ some compiler version idiosyncrasies (generally, I assume you have a
+ relatively new version of g++).
+
+ If you're building on cygwin, their libtool is buggy; this make command
+ works for now:
+
+ make LIBS+="-lz -lboost_program_options" \
+ CFLAGS+="-Wno-sign-compare"
+
+ 3) Test
+
+ ./tests/run-system-tests.pl
+
+ Everything should pass.
+
+
+ 4) Enjoy!
+
diff --git a/LICENSE.cctbx.txt b/LICENSE.cctbx.txt
new file mode 100644
index 000000000..a8d9a4943
--- /dev/null
+++ b/LICENSE.cctbx.txt
@@ -0,0 +1,45 @@
+*** License agreement ***
+
+cctbx Copyright (c) 2006, The Regents of the University of
+California, through Lawrence Berkeley National Laboratory (subject to
+receipt of any required approvals from the U.S. Dept. of Energy). All
+rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+(1) Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+(2) Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+(3) Neither the name of the University of California, Lawrence Berkeley
+National Laboratory, U.S. Dept. of Energy nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+You are under no obligation whatsoever to provide any bug fixes,
+patches, or upgrades to the features, functionality or performance of
+the source code ("Enhancements") to anyone; however, if you choose to
+make your Enhancements available either publicly, or directly to
+Lawrence Berkeley National Laboratory, without imposing a separate
+written license agreement for such Enhancements, then you hereby grant
+the following license: a non-exclusive, royalty-free perpetual license
+to install, use, modify, prepare derivative works, incorporate into
+other computer software, distribute, and sublicense such enhancements or
+derivative works thereof, in binary and source code form.
+
diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 000000000..a390938bc
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,213 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+----------------------------------------------
+
+L-BFGS CODE FROM COMPUTATIONAL CRYSTALLOGRAPHY TOOLBOX (CCTBX)
+
+This package includes source code (training/lbfgs.h) based on source
+code distributed as part of the Computational Crystallography Toolbox
+(CCTBX), which has separate copyright notices and license terms. Use of
+this source code is subject to the terms and conditions of the license
+contained in the file LICENSE.cctbx.txt.
+
diff --git a/Makefile.am b/Makefile.am
new file mode 100644
index 000000000..88327477d
--- /dev/null
+++ b/Makefile.am
@@ -0,0 +1,24 @@
+# warning - the subdirectories in the following list should
+# be kept in topologically sorted order. Also, DO NOT introduce
+# cyclic dependencies between these directories!
+SUBDIRS = \
+ utils \
+ klm/util/double-conversion \
+ klm/util \
+ klm/util/stream \
+ klm/lm \
+ klm/lm/builder \
+ klm/search \
+ mteval \
+ decoder \
+ training \
+ word-aligner \
+ extractor \
+ example_extff
+
+
+EXTRA_DIST = corpus tests python/cdec python/tests python/examples compound-split environment
+AUTOMAKE_OPTIONS = foreign
+ACLOCAL_AMFLAGS = -I m4
+AM_CPPFLAGS = -D_GLIBCXX_PARALLEL -march=native -mtune=native -O2 -pipe -fomit-frame-pointer -Wall
+
diff --git a/README.md b/README.md
new file mode 100644
index 000000000..3cbc62c36
--- /dev/null
+++ b/README.md
@@ -0,0 +1,46 @@
+`cdec` is a research platform for machine translation and similar structured prediction problems.
+
+[![Build Status](https://travis-ci.org/redpony/cdec.png?branch=master)](https://travis-ci.org/redpony/cdec)
+
+## System requirements
+
+- A Linux or Mac OS X system
+- A C++ compiler implementing the [C++11 standard](http://www.stroustrup.com/C++11FAQ.html) (NEW)
+ - Unfortunately, many systems have compilers that predate C++11 support.
+ - You may need to build your own C++ compiler or upgrade your operating system's.
+- [Boost C++ libraries (version 1.44 or later)](http://www.boost.org/)
+ - If you build your own boost, you _must install it_ using `bjam install`.
+ - Older versions of Boost _may_ work, but problems have been reported with command line option parsing on some platforms with older versions.
+- [GNU Flex](http://flex.sourceforge.net/)
+
+## Building from a downloaded archive
+
+If your system contains the required tools and libraries in the usual places, you should be able to build as simply as:
+
+ ./configure
+ make -j4
+ ./tests/run-system-tests.pl
+
+## Building from a git clone
+
+In addition to the standard `cdec` third-party software requirements, you will need the following software to work with the `cdec` source code directly from git:
+
+- [Autoconf / Automake / Libtool](http://www.gnu.org/software/autoconf/)
+ - Older versions of GNU autotools may not work properly.
+
+Instructions:
+
+ autoreconf -ifv
+ ./configure
+ make -j4
+ ./tests/run-system-tests.pl
+
+## Further information
+
+[For more information, refer to the `cdec` documentation](http://www.cdec-decoder.org)
+
+## Citation
+
+If you make use of cdec, please cite:
+
+C. Dyer, A. Lopez, J. Ganitkevitch, J. Weese, F. Ture, P. Blunsom, H. Setiawan, V. Eidelman, and P. Resnik. cdec: A Decoder, Alignment, and Learning Framework for Finite-State and Context-Free Translation Models. In *Proceedings of ACL*, July, 2010. [[bibtex](http://www.cdec-decoder.org/cdec.bibtex.txt)] [[pdf](http://www.aclweb.org/anthology/P/P10/P10-4002.pdf)]
diff --git a/compound-split/README.md b/compound-split/README.md
new file mode 100644
index 000000000..b7491007a
--- /dev/null
+++ b/compound-split/README.md
@@ -0,0 +1,51 @@
+Instructions for running the compound splitter, which is a reimplementation
+and extension (more features, larger non-word list) of the model described in
+
+ C. Dyer. (2009) Using a maximum entropy model to build segmentation
+ lattices for MT. In Proceedings of NAACL HLT 2009,
+ Boulder, Colorado, June 2009
+
+If you use this software, please cite this paper.
+
+
+GENERATING 1-BEST SEGMENTATIONS AND LATTICES
+------------------------------------------------------------------------------
+
+Here are some sample invocations:
+
+ ./compound-split.pl --output 1best < infile.txt > out.1best.txt
+ Segment infile.txt, writing the single best segmentation of each line.
+
+ ./compound-split.pl --output plf < infile.txt > out.plf
+ Segment infile.txt, writing a segmentation lattice for each line in
+ PLF (Python lattice format).
+
+ ./compound-split.pl --output plf --beam 3.5 < infile.txt > out.plf
+ This generates denser lattices than usual (the default beam threshold
+ is 2.1; higher numbers do less pruning).
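+
+To give a feel for the PLF output (an illustrative, made-up line, not actual
+system output), an input line like "im haus", where neither word gets
+segmented, comes out as:
+
+ ((('im',0,1),),(('haus',0,1),),)
+
+Each inner triple is a lattice edge ('word', cost, distance-to-next-node);
+words that the model does segment contribute multiple alternative paths.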
+
+
+MODEL TRAINING (only for the adventuresome)
+------------------------------------------------------------------------------
+
+I've included some training data for training a German language lattice
+segmentation model, and if you want to explore, you can add to or change the data.
+If you're especially adventuresome, you can add features to cdec (the current
+feature functions are found in ff_csplit.cc). The training/references are
+in the file:
+
+ dev.in-ref
+
+The format is the unsegmented form on the right and the reference lattice on
+the left, separated by a triple pipe ( ||| ). Note that the segmentation
+model inserts a # as the first word, so your segmentation references must
+include this.
+
+To retrain the model (using MAP estimation of a conditional model), do the
+following:
+
+ cd de
+ ./TRAIN
+
+Note: the optimization objective is, in principle, non-convex, but I haven't
+found much of an effect of where I initialize things. I haven't looked
+very hard, though; this might be something to explore.
+
diff --git a/compound-split/cdec-de.ini b/compound-split/cdec-de.ini
new file mode 100644
index 000000000..1573dd522
--- /dev/null
+++ b/compound-split/cdec-de.ini
@@ -0,0 +1,6 @@
+formalism=csplit
+intersection_strategy=full
+weights=de/weights.trained
+#weights=de/weights.noun-only-1best-only
+feature_function=CSplit_BasicFeatures de/large_dict.de.gz de/badlist.de.gz de/wordlist.de
+feature_function=CSplit_ReverseCharLM de/charlm.rev.5gm.de.lm.gz
diff --git a/compound-split/compound-split.pl b/compound-split/compound-split.pl
new file mode 100755
index 000000000..93ac3b201
--- /dev/null
+++ b/compound-split/compound-split.pl
@@ -0,0 +1,177 @@
+#!/usr/bin/perl -w
+
+use strict;
+my $script_dir; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; }
+use Getopt::Long;
+use IPC::Open2;
+
+my $CDEC = "$script_dir/../decoder/cdec";
+my $LANG = 'de';
+
+my $BEAM = 2.1;
+my $OUTPUT = 'plf';
+my $HELP;
+my $VERBOSE;
+my $PRESERVE_CASE;
+
+GetOptions("decoder=s" => \$CDEC,
+ "language=s" => \$LANG,
+ "beam=f" => \$BEAM,
+ "output=s" => \$OUTPUT,
+ "verbose" => \$VERBOSE,
+ "preserve_case" => \$PRESERVE_CASE,
+ "help" => \$HELP
+ ) or usage();
+
+usage() if $HELP;
+
+chdir $script_dir;
+
+if ($VERBOSE) { $VERBOSE = ""; } else { $VERBOSE = " 2> /dev/null"; }
+$LANG = lc $LANG;
+die "Can't find $CDEC\n" unless -f $CDEC;
+die "Can't execute $CDEC\n" unless -x $CDEC;
+die "Don't know about language: $LANG\n" unless -d "./$LANG";
+my $CONFIG="cdec-$LANG.ini";
+die "Can't find $CONFIG" unless -f $CONFIG;
+die "--output must be '1best' or 'plf'\n" unless ($OUTPUT =~ /^(plf|1best)$/);
+check_dependencies($CONFIG, $LANG);
+print STDERR "(Run with --help for options)\n";
+print STDERR "LANGUAGE: $LANG\n";
+print STDERR " OUTPUT: $OUTPUT\n";
+
+my $CMD = "$CDEC -c $CONFIG";
+my $IS_PLF;
+if ($OUTPUT eq 'plf') {
+ $IS_PLF = 1;
+ $CMD .= " --csplit_preserve_full_word --csplit_output_plf --beam_prune $BEAM";
+}
+$CMD .= $VERBOSE;
+
+print STDERR "Executing: $CMD\n";
+
+open2(\*OUT, \*IN, $CMD) or die "Couldn't fork: $!";
+binmode(STDIN,":utf8");
+binmode(STDOUT,":utf8");
+binmode(IN,":utf8");
+binmode(OUT,":utf8");
+
+while(<STDIN>) {
+ chomp;
+ s/^\s+//;
+ s/\s+$//;
+ my @words = split /\s+/;
+ my @res = ();
+ my @todo = ();
+ my @casings = ();
+ for (my $i=0; $i < scalar @words; $i++) {
+ my $word = lc $words[$i];
+ if (length($word)<6 || $word =~ /^[,\-0-9\.]+$/ || $word =~ /[@.\-\/:]/) {
+ push @casings, 0;
+ if ($IS_PLF) {
+ push @res, "(('" . escape($word) . "',0,1),),";
+ } else {
+ if ($PRESERVE_CASE) {
+ push @res, $words[$i];
+ } else {
+ push @res, $word;
+ }
+ }
+ } else {
+ push @casings, guess_casing($words[$i]);
+ push @res, undef;
+ push @todo, $word;
+ }
+ }
+ if (scalar @todo > 0) {
+ # print STDERR "TODO: @todo\n";
+ my $tasks = join "\n", @todo;
+ print IN "$tasks\n";
+ for (my $i = 0; $i < scalar @res; $i++) {
+ if (!defined $res[$i]) {
+ my $seg = <OUT>;
+ chomp $seg;
+ unless ($IS_PLF) {
+ $seg =~ s/^# //o;
+ }
+ if ($PRESERVE_CASE && $casings[$i]) { $seg = recase_words($seg); }
+ $res[$i] = $seg;
+ }
+ }
+ }
+ if ($IS_PLF) {
+ print '(';
+ print join '', @res;
+ print ")\n";
+ } else {
+ print "@res\n";
+ }
+}
+
+close IN;
+close OUT;
+
+sub recase_words {
+ my $word = shift;
+ $word =~ s/\b(\w)/\u$1/g;
+ return $word;
+}
+
+sub escape {
+ $_ = shift;
+ s/\\/\\\\/g;
+ s/'/\\'/g;
+ return $_;
+}
+
+sub guess_casing {
+ my $word = shift @_;
+ if (lc($word) eq $word) { return 0; } else { return 1; }
+}
+
+sub usage {
+ print <<EOT;
+Usage: $0 [--decoder PATH] [--language LANG] [--beam FLOAT]
+          [--output 1best|plf] [--preserve_case] [--verbose] < input.txt
+EOT
+ exit(1);
+}
+
+sub check_dependencies {
+ my ($conf, $lang) = @_;
+ my @files = ();
+ open F, "<$conf" or die "Can't read $conf: $!";
+ while(<F>){
+ chomp;
+ my @x = split /\s+/;
+ for my $f (@x) {
+ push @files, $f if ($f =~ /\.gz$/);
+ }
+ }
+ close F;
+ my $c = 0;
+ for my $file (@files) {
+ $c++ if -f $file;
+ }
+ if ($c != scalar @files) {
+ print STDERR <<EOT;
+Missing data dependencies for language '$lang'. The following gzipped
+files are referenced in $conf and must all exist:
+ @files
+EOT
+ exit(1);
+ }
+}
diff --git a/compound-split/make-dict.pl b/compound-split/make-dict.pl
new file mode 100755
--- /dev/null
+++ b/compound-split/make-dict.pl
@@ -0,0 +1,25 @@
+#!/usr/bin/perl -w
+# Build a frequency dictionary: print each word with its -log relative frequency.
+use strict;
+use utf8;
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+my %d;
+my $z = 0;
+while(<STDIN>) {
+ chomp;
+ s/[\–":“„!=+*.@«#%&,»\?\/{}\$\(\)\[\];\-0-9]+/ /g;
+ $_ = lc $_;
+ my @words = split /\s+/;
+ for my $w (@words) {
+ next if length($w) == 0;
+ $d{$w}++;
+ $z++;
+ }
+}
+my $lz = log($z);
+for my $w (sort {$d{$b} <=> $d{$a}} keys %d) {
+ my $c = $lz-log($d{$w});
+ print "$w $c\n";
+}
+
diff --git a/configure.ac b/configure.ac
new file mode 100644
index 000000000..6b1287683
--- /dev/null
+++ b/configure.ac
@@ -0,0 +1,234 @@
+AC_CONFIG_MACRO_DIR([m4])
+AC_INIT([cdec],[2014-01-28])
+AC_CONFIG_SRCDIR([decoder/cdec.cc])
+AM_INIT_AUTOMAKE
+AC_CONFIG_HEADERS(config.h)
+AC_PROG_LIBTOOL
+AC_PROG_LEX
+case $LEX in
+:) AC_MSG_ERROR([No lex (Flex, lex, etc.) program found]);;
+esac
+OLD_CXXFLAGS=$CXXFLAGS
+AC_PROG_CC
+AC_PROG_CXX
+CXXFLAGS=$OLD_CXXFLAGS
+AX_CXX_COMPILE_STDCXX_11([],[mandatory])
+AC_LANG_CPLUSPLUS
+AC_OPENMP
+BOOST_REQUIRE([1.44])
+BOOST_FILESYSTEM
+BOOST_PROGRAM_OPTIONS
+BOOST_SYSTEM
+BOOST_SERIALIZATION
+BOOST_TEST
+BOOST_THREADS
+AM_PATH_PYTHON
+AC_CHECK_HEADER(dlfcn.h,AC_DEFINE(HAVE_DLFCN_H))
+AC_CHECK_LIB(rt, clock_gettime)
+AC_CHECK_LIB(dl, dlopen)
+AC_CHECK_HEADERS(zlib.h,
+ AC_CHECK_LIB(z, gzread,[
+ AC_DEFINE(HAVE_ZLIB,[],[Do we have zlib])
+ ZLIBS="$ZLIBS -lz"
+ ]))
+
+AC_CHECK_HEADERS(bzlib.h,
+ AC_CHECK_LIB(bz2, BZ2_bzReadOpen,[
+ AC_DEFINE(HAVE_BZLIB,[],[Do we have bzlib])
+ ZLIBS="$ZLIBS -lbz2"
+ ]))
+
+AC_CHECK_HEADERS(lzma.h,
+ AC_CHECK_LIB(lzma, lzma_code,[
+ AC_DEFINE(HAVE_XZLIB,[],[Do we have lzma])
+ ZLIBS="$ZLIBS -llzma"
+ ]))
+
+AC_ARG_ENABLE(mpi,
+ [ --enable-mpi Build MPI binaries, assumes mpi.h is present ],
+ [ mpi=yes
+ ])
+AM_CONDITIONAL([MPI], [test "x$mpi" = xyes])
+
+if test "x$mpi" = xyes
+then
+ AC_DEFINE([HAVE_MPI], [1], [flag for MPI])
+ LIBS="$LIBS -lboost_mpi"
+fi
+
+AM_CONDITIONAL([HAVE_METEOR], false)
+AC_ARG_WITH(meteor,
+ [AC_HELP_STRING([--with-meteor=PATH], [(optional) path to METEOR jar])],
+ [with_meteor=$withval],
+ [with_meteor=no]
+ )
+
+if test "x$with_meteor" != 'xno'
+then
+ AC_CHECK_FILE([$with_meteor],
+ [AC_DEFINE([HAVE_METEOR], [1], [flag for METEOR jar library])],
+ [AC_MSG_ERROR([Cannot find METEOR jar!])])
+ AC_SUBST(METEOR_JAR,"${with_meteor}")
+ AM_CONDITIONAL([HAVE_METEOR], true)
+fi
+
+AM_CONDITIONAL([HAVE_CMPH], false)
+AC_ARG_WITH(cmph,
+ [AC_HELP_STRING([--with-cmph=PATH], [(optional) path to cmph perfect hashing library])],
+ [with_cmph=$withval],
+ [with_cmph=no]
+ )
+
+if test "x$with_cmph" != 'xno'
+then
+ SAVE_CPPFLAGS="$CPPFLAGS"
+ CPPFLAGS="$CPPFLAGS -I${with_cmph}/include"
+
+ AC_CHECK_HEADER(cmph.h,
+ [AC_DEFINE([HAVE_CMPH], [1], [flag for cmph perfect hashing library])],
+ [AC_MSG_ERROR([Cannot find cmph library!])])
+
+ LDFLAGS="$LDFLAGS -L${with_cmph}/lib"
+ AC_CHECK_LIB(cmph, cmph_search)
+ AM_CONDITIONAL([HAVE_CMPH], true)
+fi
+
+AM_CONDITIONAL([HAVE_GTEST], false)
+AC_ARG_WITH(gtest,
+ [AC_HELP_STRING([--with-gtest=DIR], [(optional) path to Google Test library])],
+ [with_gtest=$withval],
+ [with_gtest=no]
+ )
+
+AM_CONDITIONAL([HAVE_GMOCK], false)
+AC_ARG_WITH(gmock,
+ [AC_HELP_STRING([--with-gmock=DIR], [(optional) path to Google Mock library])],
+ [with_gmock=$withval],
+ [with_gmock=no]
+ )
+
+if test "x$with_gtest" != 'xno'
+then
+ gtest_CPPFLAGS="-I${with_gtest}/include"
+ gtest_LDFLAGS="-L${with_gtest} -L${with_gtest}/lib"
+ gtest_LIBS="-lgtest_main -lgtest -lpthread"
+
+ SAVECPP_FLAGS="$CPPFLAGS"
+ CPPFLAGS="$CPPFLAGS $gtest_CPPFLAGS"
+ AC_CHECK_HEADER(${with_gtest}/include/gtest/gtest.h,
+ [AC_DEFINE([HAVE_GTEST], [1], [flag for Google Test header])],
+ [AC_MSG_ERROR([Cannot find Google Test headers!])]
+ )
+
+ SAVE_LDFLAGS="$LDFLAGS"
+ LDFLAGS="$LDFLAGS $gtest_LDFLAGS"
+ SAVE_LIBS="$LIBS"
+ # Google Test needs pthreads.
+ AC_CHECK_LIB([pthread],
+ [pthread_mutex_init],
+ [],
+ [AC_MSG_ERROR([Cannot find pthread library])]
+ )
+ AX_CXX_CHECK_LIB([gtest],
+ [testing::TestInfo::name() const],
+ [],
+ [AC_MSG_ERROR([Cannot find Google Test library libgtest])]
+ )
+ AC_CHECK_LIB([gtest_main],
+ [main],
+ [],
+ [AC_MSG_ERROR([Cannot find Google Test library libgtest_main])]
+ )
+
+ AC_SUBST(AS_TR_CPP([GTEST_CPPFLAGS]), ["$gtest_CPPFLAGS"])
+ AC_SUBST(AS_TR_CPP([GTEST_LDFLAGS]), ["$gtest_LDFLAGS"])
+ AC_SUBST(AS_TR_CPP([GTEST_LIBS]), ["$gtest_LIBS"])
+
+
+ if test "x$with_gmock" != 'xno'
+ then
+ gmock_CPPFLAGS="-I${with_gmock}/include"
+ gmock_LDFLAGS="-L${with_gmock} -L${with_gmock}/lib"
+ gmock_LIBS="-lgmock"
+
+ CPPFLAGS="$CPPFLAGS $gmock_CPPFLAGS"
+ AC_CHECK_HEADER(${with_gmock}/include/gmock/gmock.h,
+ [AC_DEFINE([HAVE_GMOCK], [1], [flag for Google Mock header])],
+ [AC_MSG_ERROR([Cannot find Google Mock headers!])]
+ )
+
+ LDFLAGS="$LDFLAGS $gmock_LDFLAGS"
+ AX_CXX_CHECK_LIB([gmock],
+ [testing::Expectation],
+ [],
+ [AC_MSG_ERROR([Cannot find Google Mock library libgmock])]
+ )
+
+ AC_SUBST(AS_TR_CPP([GMOCK_CPPFLAGS]), ["$gmock_CPPFLAGS"])
+ AC_SUBST(AS_TR_CPP([GMOCK_LDFLAGS]), ["$gmock_LDFLAGS"])
+ AC_SUBST(AS_TR_CPP([GMOCK_LIBS]), ["$gmock_LIBS"])
+ AM_CONDITIONAL([HAVE_GMOCK], true)
+ fi
+
+ CPPFLAGS="$SAVE_CPPFLAGS"
+ LDFLAGS="$SAVE_LDFLAGS"
+ LIBS="$SAVE_LIBS"
+ AM_CONDITIONAL([HAVE_GTEST], true)
+fi
+
+#BOOST_THREADS
+CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS"
+LDFLAGS="$LDFLAGS $BOOST_PROGRAM_OPTIONS_LDFLAGS $BOOST_SERIALIZATION_LDFLAGS $BOOST_SYSTEM_LDFLAGS $BOOST_FILESYSTEM_LDFLAGS"
+# $BOOST_THREAD_LDFLAGS"
+LIBS="$LIBS $BOOST_PROGRAM_OPTIONS_LIBS $BOOST_SERIALIZATION_LIBS $BOOST_SYSTEM_LIBS $BOOST_FILESYSTEM_LIBS $ZLIBS"
+# $BOOST_THREAD_LIBS"
+
+AC_CHECK_HEADER(google/dense_hash_map,
+ [AC_DEFINE([HAVE_SPARSEHASH], [1], [flag for google::dense_hash_map])])
+
+AC_PROG_INSTALL
+
+CPPFLAGS="-DPIC $CPPFLAGS -DHAVE_CONFIG_H -DKENLM_MAX_ORDER=6"
+CXXFLAGS="$CXX11_SWITCH $CXXFLAGS -fPIC -g -O3"
+CFLAGS="$CFLAGS -fPIC -g -O3"
+
+if test "x$HAVE_CXX11" = "x0"; then
+ CPPFLAGS="$CPPFLAGS -DHAVE_OLD_CPP"
+fi
+
+# core cdec stuff
+AC_CONFIG_FILES([Makefile])
+AC_CONFIG_FILES([utils/Makefile])
+AC_CONFIG_FILES([mteval/Makefile])
+AC_CONFIG_FILES([mteval/meteor_jar.cc])
+AC_CONFIG_FILES([decoder/Makefile])
+AC_CONFIG_FILES([python/setup.py])
+AC_CONFIG_FILES([extractor/Makefile])
+AC_CONFIG_FILES([word-aligner/Makefile])
+
+# KenLM stuff
+AC_CONFIG_FILES([klm/util/double-conversion/Makefile])
+AC_CONFIG_FILES([klm/util/stream/Makefile])
+AC_CONFIG_FILES([klm/util/Makefile])
+AC_CONFIG_FILES([klm/lm/Makefile])
+AC_CONFIG_FILES([klm/search/Makefile])
+AC_CONFIG_FILES([klm/lm/builder/Makefile])
+
+# training stuff
+AC_CONFIG_FILES([training/Makefile])
+AC_CONFIG_FILES([training/utils/Makefile])
+AC_CONFIG_FILES([training/liblbfgs/Makefile])
+AC_CONFIG_FILES([training/crf/Makefile])
+AC_CONFIG_FILES([training/dpmert/Makefile])
+AC_CONFIG_FILES([training/pro/Makefile])
+AC_CONFIG_FILES([training/rampion/Makefile])
+AC_CONFIG_FILES([training/minrisk/Makefile])
+AC_CONFIG_FILES([training/mira/Makefile])
+AC_CONFIG_FILES([training/latent_svm/Makefile])
+AC_CONFIG_FILES([training/dtrain/Makefile])
+
+# external feature function example code
+AC_CONFIG_FILES([example_extff/Makefile])
+
+AC_OUTPUT
+
diff --git a/corpus/README.md b/corpus/README.md
new file mode 100644
index 000000000..adc35b849
--- /dev/null
+++ b/corpus/README.md
@@ -0,0 +1,37 @@
+This directory contains a number of scripts that are helpful for preprocessing parallel and monolingual corpora. They are provided for convenience and may be very useful, but their functionality will often be supplanted by other, more specialized tools.
+
+Many of these scripts assume that the input is [UTF-8 encoded](http://en.wikipedia.org/wiki/UTF-8).
+
+## Paste parallel files together
+
+This script reads one line at a time from a set of files and concatenates them with a triple pipe separator (`|||`) in the output. This is useful for generating parallel corpus files for training or evaluation:
+
+ ./paste-files.pl file.a file.b file.c [...]
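+
+For example (hypothetical inputs), if the first line of `file.a` is `das ist ein haus` and the first line of `file.b` is `this is a house`, the first output line is:
+
+ das ist ein haus ||| this is a house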
+
+## Punctuation Normalization and Tokenization
+
+This script tokenizes text in any language (well, it does a good job in most languages, and in some it will completely go crazy):
+
+ ./tokenize-anything.sh < input.txt > output.txt
+
+It also normalizes a lot of unicode symbols and even corrects some common encoding errors. It can be applied to monolingual and parallel corpora directly.
+
+## Text lowercasing
+
+This script also does what it says, provided your input is in UTF-8:
+
+ ./lowercase.pl < input.txt > output.txt
+
+## Length ratio filtering (for parallel corpora)
+
+This script computes statistics about sentence length ratios in a parallel corpus and removes sentences that are statistical outliers. This tends to remove extremely poorly aligned sentence pairs or sentence pairs that would otherwise be difficult to align:
+
+ ./filter-length.pl input.src-trg > output.src-trg
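+
+Concretely (as implemented in the script), each pair's length ratio is scored as z = |log(|f|/|e|) - mean| / stddev, where the mean and standard deviation of the log length ratios are estimated from the corpus itself in a first pass. Pairs with z above 1.8 are discarded, as are pairs with a sentence longer than 150 words (the default, adjustable with the -NNN flag); pairs where both sides are very short are always kept.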
+
+## Add infrequent self-translations to a parallel corpus
+
+This script identifies rare words (those that occur fewer than five times in the corpus) which have the same orthographic form in both the source and target language. Several copies of these words are then appended to the output corpus as self-translations, which improves alignment quality.
+
+ ./add-self-translations.pl input.src-trg > output.src-trg
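+
+For example (hypothetical data), if the rare token `Fermi` appears identically on both sides of a sentence pair, the script appends three extra lines of the form:
+
+ Fermi ||| Fermi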
+
+
diff --git a/corpus/add-self-translations.pl b/corpus/add-self-translations.pl
new file mode 100755
index 000000000..d707ce29c
--- /dev/null
+++ b/corpus/add-self-translations.pl
@@ -0,0 +1,29 @@
+#!/usr/bin/perl -w
+use strict;
+
+# ADDS SELF-TRANSLATIONS OF POORLY ATTESTED WORDS TO THE PARALLEL DATA
+
+my %df;
+my %def;
+while(<>) {
+# print;
+ chomp;
+ my ($sf, $se) = split / \|\|\| /;
+ die "Format error: $_\n" unless defined $sf && defined $se;
+ my @fs = split /\s+/, $sf;
+ my @es = split /\s+/, $se;
+ for my $f (@fs) {
+ $df{$f}++;
+ for my $e (@es) {
+ if ($f eq $e) { $def{$f}++; }
+ }
+ }
+}
+
+for my $k (sort keys %def) {
+ next if $df{$k} > 4;
+ print "$k ||| $k\n";
+ print "$k ||| $k\n";
+ print "$k ||| $k\n";
+}
+
diff --git a/corpus/add-sos-eos.pl b/corpus/add-sos-eos.pl
new file mode 100755
index 000000000..d7608c5ec
--- /dev/null
+++ b/corpus/add-sos-eos.pl
@@ -0,0 +1,63 @@
+#!/usr/bin/perl -w
+use strict;
+
+die "Usage: $0 corpus.fr[-en1-en2-...] [corpus.al out-corpus.al]\n" unless (scalar @ARGV == 1 || scalar @ARGV == 3);
+my $filec = shift @ARGV;
+my $filea = shift @ARGV;
+my $ofilea = shift @ARGV;
+open C, "<$filec" or die "Can't read $filec: $!";
+if ($filea) {
+ open A, "<$filea" or die "Can't read $filea: $!";
+ open OA, ">$ofilea" or die "Can't write $ofilea: $!";
+}
+binmode(C, ":utf8");
+binmode(STDOUT, ":utf8");
+print STDERR "Adding and markers to input...\n";
+print STDERR " Reading corpus: $filec\n";
+print STDERR " Writing corpus: STDOUT\n";
+print STDERR "Reading alignments: $filea\n" if $filea;
+print STDERR "Writing alignments: $ofilea\n" if $filea;
+
+my $lines = 0;
+while(<C>) {
+ $lines++;
+ die "ERROR. Input line $filec:$lines should not contain SGML markup" if /</;
+ chomp;
+ my @fields = split / \|\|\| /;
+ my $o = '<s> ' . join(' </s> ||| <s> ', @fields) . ' </s>';
+ if ($filea) {
+ my $aa = <A>;
+ die "ERROR. Mismatched number of lines between $filec and $filea\n" unless $aa;
+ chomp $aa;
+ my ($ff, $ee) = @fields;
+ die "ERROR in $filec:$lines: expected 'source ||| target'" unless defined $ee;
+ my @fs = split /\s+/, $ff;
+ my @es = split /\s+/, $ee;
+ my @as = split /\s+/, $aa;
+ my @oas = ();
+ push @oas, '0-0';
+ my $flen = scalar @fs;
+ my $elen = scalar @es;
+ for my $ap (@as) {
+ my ($a, $b) = split /-/, $ap;
+ die "ERROR. Bad format in: @as" unless defined $a && defined $b;
+ push @oas, ($a + 1) . '-' . ($b + 1);
+ }
+ push @oas, ($flen + 1) . '-' . ($elen + 1);
+ print OA "@oas\n";
+ }
+ print "$o\n";
+}
+if ($filea) {
+ close OA;
+ my $aa = <A>;
+ die "ERROR. Alignment input file $filea contains more lines than corpus file!\n" if $aa;
+}
+print STDERR "\nSUCCESS. Processed $lines lines.\n";
+
diff --git a/corpus/cut-corpus.pl b/corpus/cut-corpus.pl
new file mode 100755
index 000000000..0af3b23ca
--- /dev/null
+++ b/corpus/cut-corpus.pl
@@ -0,0 +1,35 @@
+#!/usr/bin/perl -w
+use strict;
+die "Usage: $0 N\nSplits a corpus separated by ||| symbols and returns the Nth field\n" unless scalar @ARGV > 0;
+
+my $x = shift @ARGV;
+my @ind = split /,/, $x;
+my @o = ();
+for my $ff (@ind) {
+ if ($ff =~ /^\d+$/) {
+ push @o, $ff - 1;
+ } elsif ($ff =~ /^(\d+)-(\d+)$/) {
+ my $a = $1;
+ my $b = $2;
+ die "$a-$b is a bad range in input: $x\n" unless $b > $a;
+ for (my $i=$a; $i <= $b; $i++) {
+ push @o, $i - 1;
+ }
+ } else {
+ die "Bad input: $x\n";
+ }
+}
+
+while(<>) {
+ chomp;
+ my @fields = split /\s*\|\|\|\s*/;
+ my @sf;
+ for my $i (@o) {
+ my $y = $fields[$i];
+ if (!defined $y) { $y= ''; }
+ push @sf, $y;
+ }
+ print join(' ||| ', @sf) . "\n";
+}
+
+
diff --git a/corpus/filter-length.pl b/corpus/filter-length.pl
new file mode 100755
index 000000000..2e257cdac
--- /dev/null
+++ b/corpus/filter-length.pl
@@ -0,0 +1,150 @@
+#!/usr/bin/perl -w
+use strict;
+use utf8;
+
+##### EDIT THESE SETTINGS ####################################################
+my $AUTOMATIC_INCLUDE_IF_SHORTER_THAN = 7; # if both are shorter, include
+my $MAX_ZSCORE = 1.8; # how far from the mean can the (log)ratio be?
+##############################################################################
+
+die "Usage: $0 [-NNN] corpus.fr-en\n\n Filter sentence pairs containing sentences longer than NNN words (where NNN\n is 150 by default) or whose log length ratios are $MAX_ZSCORE stddevs away from the\n mean log ratio.\n\n" unless scalar @ARGV == 1 || scalar @ARGV == 2;
+binmode(STDOUT,":utf8");
+binmode(STDERR,":utf8");
+
+my $MAX_LENGTH = 150; # discard a sentence if it is longer than this
+if (scalar @ARGV == 2) {
+ my $fp = shift @ARGV;
+ die "Expected -NNN for first parameter, but got $fp\n" unless $fp =~ /^-(\d+)$/;
+ $MAX_LENGTH=$1;
+}
+
+my $corpus = shift @ARGV;
+
+die "Cannot read from STDIN\n" if $corpus eq '-';
+my $ff = "<$corpus";
+$ff = "gunzip -c $corpus|" if $ff =~ /\.gz$/;
+
+print STDERR "Max line length (monolingual): $MAX_LENGTH\n";
+print STDERR " Parallel corpus: $corpus\n";
+
+open F,$ff or die "Can't read $corpus: $!";
+binmode(F,":utf8");
+
+my $rat_max = log(9);
+my $lrm = 0;
+my $zerof = 0;
+my $zeroe = 0;
+my $bad_format = 0;
+my $absbadrat = 0;
+my $overlene = 0;
+my $overlenf = 0;
+my $lines = 0;
+my @lograts = ();
+while(<F>) {
+ $lines++;
+ if ($lines % 100000 == 0) { print STDERR " [$lines]\n"; }
+ elsif ($lines % 2500 == 0) { print STDERR "."; }
+ my ($sf, $se, @d) = split /\s*\|\|\|\s*/;
+ if (scalar @d != 0 or !defined $se) {
+ $bad_format++;
+ if ($bad_format > 100 && ($bad_format / $lines) > 0.02) {
+ die "$bad_format / $lines : Corpus appears to be incorretly formatted, example: $_";
+ }
+ next;
+ }
+ my @fs = split /\s+/, $sf;
+ my @es = split /\s+/, $se;
+ my $flen = scalar @fs;
+ my $elen = scalar @es;
+ if ($flen == 0) {
+ $zerof++;
+ next;
+ }
+ if ($elen == 0) {
+ $zeroe++;
+ next;
+ }
+ if ($flen > $MAX_LENGTH) {
+ $overlenf++;
+ next;
+ }
+ if ($elen > $MAX_LENGTH) {
+ $overlene++;
+ next;
+ }
+ if ($elen >= $AUTOMATIC_INCLUDE_IF_SHORTER_THAN ||
+ $flen >= $AUTOMATIC_INCLUDE_IF_SHORTER_THAN) {
+ my $lograt = log($flen) - log($elen);
+ if (abs($lograt) > $rat_max) {
+ $absbadrat++;
+ next;
+ }
+ $lrm += $lograt;
+ push @lograts, $lograt;
+ }
+}
+close F;
+
+print STDERR "\nComputing statistics...\n";
+my $lmean = $lrm / scalar @lograts;
+
+my $lsd = 0;
+for my $lr (@lograts) {
+ $lsd += ($lr - $lmean)**2;
+}
+$lsd = sqrt($lsd / scalar @lograts);
+@lograts = ();
+
+my $pass1_discard = $zerof + $zeroe + $absbadrat + $overlene + $overlenf + $bad_format;
+my $discard_rate = int(10000 * $pass1_discard / $lines) / 100;
+print STDERR " Total lines: $lines\n";
+print STDERR " Already discared: $pass1_discard\t(discard rate = $discard_rate%)\n";
+print STDERR " Mean F:E ratio: " . exp($lmean) . "\n";
+print STDERR " StdDev F:E ratio: " . exp($lsd) . "\n";
+print STDERR "Writing...\n";
+open F,$ff or die "Can't reread $corpus: $!";
+binmode(F,":utf8");
+my $to = 0;
+my $zviol = 0;
+my $worstz = -1;
+my $worst = "\n";
+$lines = 0;
+while(<F>) {
+ $lines++;
+ if ($lines % 100000 == 0) { print STDERR " [$lines]\n"; }
+ elsif ($lines % 2500 == 0) { print STDERR "."; }
+ my ($sf, $se, @d) = split / \|\|\| /;
+ if (scalar @d != 0 or !defined $se) { next; }
+ my @fs = split /\s+/, $sf;
+ my @es = split /\s+/, $se;
+ my $flen = scalar @fs;
+ my $elen = scalar @es;
+ next if ($flen == 0);
+ next if ($elen == 0);
+ next if ($flen > $MAX_LENGTH);
+ next if ($elen > $MAX_LENGTH);
+ if ($elen >= $AUTOMATIC_INCLUDE_IF_SHORTER_THAN ||
+ $flen >= $AUTOMATIC_INCLUDE_IF_SHORTER_THAN) {
+ my $lograt = log($flen) - log($elen);
+ if (abs($lograt) > $rat_max) {
+ $absbadrat++;
+ next;
+ }
+ my $zscore = abs($lograt - $lmean) / $lsd;
+ if ($elen > $AUTOMATIC_INCLUDE_IF_SHORTER_THAN &&
+ $flen > $AUTOMATIC_INCLUDE_IF_SHORTER_THAN && $zscore > $worstz) { $worstz = $zscore; $worst = $_; }
+ if ($zscore > $MAX_ZSCORE) {
+ $zviol++;
+ next;
+ }
+ print;
+ } else {
+ print;
+ }
+ $to++;
+}
+my $discard_rate2 = int(10000 * $zviol / ($lines - $pass1_discard)) / 100;
+print STDERR "\n Lines printed: $to\n Ratio violations: $zviol\t(discard rate = $discard_rate2%)\n";
+print STDERR " Worst z-score: $worstz\n sentence: $worst";
+exit 0;
+
diff --git a/corpus/lowercase.pl b/corpus/lowercase.pl
new file mode 100755
index 000000000..9fd91dac2
--- /dev/null
+++ b/corpus/lowercase.pl
@@ -0,0 +1,9 @@
+#!/usr/bin/perl -w
+use strict;
+binmode(STDIN,":utf8");
+binmode(STDOUT,":utf8");
+while(<STDIN>) {
+ $_ = lc $_;
+ print;
+}
+
diff --git a/corpus/moses-scfg-to-cdec.pl b/corpus/moses-scfg-to-cdec.pl
new file mode 100755
index 000000000..9b8e36179
--- /dev/null
+++ b/corpus/moses-scfg-to-cdec.pl
@@ -0,0 +1,69 @@
+#!/usr/bin/perl -w
+use strict;
+
+while(<>) {
+ my ($src, $trg, $feats, $al) = split / \|\|\| /;
+ # [X][NP] von [X][NP] [X] ||| [X][NP] 's [X][NP] [S] ||| 0.00110169 0.0073223 2.84566e-06 0.0027702 0.0121867 2.718 0.606531 ||| 0-0 1-1 2-2 ||| 635 245838 2
+
+ my @srcs = split /\s+/, $src;
+ my @trgs = split /\s+/, $trg;
+ my $lhs = pop @trgs;
+ $lhs =~ s/&apos;/'/g;
+ $lhs =~ s/&#39;/'/g;
+ $lhs =~ s/,/COMMA/g;
+ my $ntc = 0;
+ my $sc = 0;
+ my @of = ();
+ my $x = pop @srcs;
+ my %d = (); # src index to nonterminal count
+ die "Expected [X]" unless $x eq '[X]';
+ my %amap = ();
+ my @als = split / /, $al;
+ for my $st (@als) {
+ my ($s, $t) = split /-/, $st;
+ $amap{$t} = $s;
+ }
+ for my $f (@srcs) {
+ if ($f =~ /^\[X\]\[([^]]+)\]$/) {
+ $ntc++;
+ my $nt = $1;
+ $nt =~ s/&apos;/'/g;
+ $nt =~ s/&#39;/'/g;
+ $nt =~ s/,/COMMA/g;
+ push @of, "[$nt]";
+ $d{$sc} = $ntc;
+ } elsif ($f =~ /^\[[^]]+\]$/) {
+ die "Unexpected $f";
+ } else {
+ push @of, $f;
+ }
+ $sc++;
+ }
+ my @oe = ();
+ my $ind = 0;
+ for my $e (@trgs) {
+ if ($e =~ /^\[X\]\[([^]]+)\]$/) {
+ my $imap = $d{$amap{$ind}};
+ push @oe, "[$imap]";
+ } else {
+ push @oe, $e;
+ }
+ $ind++;
+ }
+ my ($fe, $ef, $j, $lfe, $lef, $dummy, $of) = split / /, $feats;
+ next if $lef eq '0';
+ next if $lfe eq '0';
+ next if $ef eq '0';
+ next if $fe eq '0';
+ next if $j eq '0';
+ next if $of eq '0';
+ $ef = sprintf('%.6g', log($ef));
+ $fe = sprintf('%.6g',log($fe));
+ $j = sprintf('%.6g',log($j));
+ $lef = sprintf('%.6g',log($lef));
+ $lfe = sprintf('%.6g',log($lfe));
+ $of = sprintf('%.6g',log($of));
+ print "$lhs ||| @of ||| @oe ||| RuleCount=1 FgivenE=$fe EgivenF=$ef Joint=$j LexEgivenF=$lef LexFgivenE=$lfe Other=$of\n";
+}
+
+# [X][ADVP] angestiegen [X] ||| rose [X][ADVP] [VP] ||| 0.0538131 0.0097508 0.00744224 0.0249653 0.000698602 2.718 0.606531 ||| 0-1 1-0 ||| 13 94 2
diff --git a/corpus/paste-files.pl b/corpus/paste-files.pl
new file mode 100755
index 000000000..ef2cd9370
--- /dev/null
+++ b/corpus/paste-files.pl
@@ -0,0 +1,61 @@
+#!/usr/bin/perl -w
+use strict;
+
+die "Usage: $0 file1.txt file2.txt [file3.txt ...]\n\n Performs a per-line concatenation of all files using the ||| seperator.\n\n" unless scalar @ARGV > 1;
+
+my @fhs = ();
+for my $file (@ARGV) {
+ my $fh;
+ if ($file =~ /\.gz$/) {
+ open $fh, "gunzip -c $file|" or die "Can't fork gunzip -c $file: $!";
+ } else {
+ open $fh, "<$file" or die "Can't read $file: $!";
+ }
+ binmode($fh,":utf8");
+ push @fhs, $fh;
+}
+binmode(STDOUT,":utf8");
+binmode(STDERR,":utf8");
+
+my $bad = 0;
+my $lc = 0;
+my $done = 0;
+my $fl = 0;
+while(1) {
+ my @line;
+ $lc++;
+ if ($lc % 100000 == 0) { print STDERR " [$lc]\n"; $fl = 0; }
+ elsif ($lc % 2500 == 0) { print STDERR "."; $fl = 1; }
+ my $anum = 0;
+ for my $fh (@fhs) {
+ my $r = <$fh>;
+ if (!defined $r) {
+ die "Mismatched number of lines.\n" if scalar @line > 0;
+ $done = 1;
+ last;
+ }
+ $r =~ s/\r//g;
+ chomp $r;
+ if ($r =~ /\|\|\|/) {
+ warn "$ARGV[$anum]:$lc contains a ||| symbol - please remove.\n";
+ $r = '';
+ $bad++;
+ }
+ $r =~ s/\s+/ /g;
+ $r =~ s/^ +//;
+ $r =~ s/ +$//;
+ $anum++;
+ push @line, $r;
+ }
+ last if $done;
+ print STDOUT join(' ||| ', @line) . "\n";
+}
+print STDERR "\n" if $fl;
+for (my $i = 1; $i < scalar @fhs; $i++) {
+ my $fh = $fhs[$i];
+ my $r = <$fh>;
+ die "Mismatched number of lines.\n" if defined $r;
+}
+print STDERR "Number of lines containing ||| was: $bad\n" if $bad > 0;
+
diff --git a/corpus/support/README b/corpus/support/README
new file mode 100644
index 000000000..fdbd523e7
--- /dev/null
+++ b/corpus/support/README
@@ -0,0 +1,2 @@
+Run ./tokenize.sh to tokenize text
+Edit eng_token_patterns and eng_token_list to add rules for things not to segment
diff --git a/corpus/support/fix-contract.pl b/corpus/support/fix-contract.pl
new file mode 100755
index 000000000..49e889812
--- /dev/null
+++ b/corpus/support/fix-contract.pl
@@ -0,0 +1,12 @@
+#!/usr/bin/perl -w
+$|++;
+
+use strict;
+while(<>) {
+ #s/ (pre|anti|re|pro|inter|intra|multi|e|x|neo) - / $1- /ig;
+ #s/ - (year) - (old)/ -$1-$2/ig;
+ s/ ' (s|m|ll|re|d|ve) / '$1 /ig;
+ s/n ' t / n't /ig;
+ print;
+}
+
diff --git a/corpus/support/fix-eos.pl b/corpus/support/fix-eos.pl
new file mode 100755
index 000000000..fe03727b2
--- /dev/null
+++ b/corpus/support/fix-eos.pl
@@ -0,0 +1,12 @@
+#!/usr/bin/perl -w
+$|++;
+
+use strict;
+use utf8;
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+while(<STDIN>) {
+ s/(\p{Devanagari}{2}[A-Za-z0-9! ,.\@\p{Devanagari}]+?)\s+(\.)(\s*$|\s+\|\|\|)/$1 \x{0964}$3/s;
+ print;
+}
diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl
new file mode 100755
index 000000000..3eee06669
--- /dev/null
+++ b/corpus/support/quote-norm.pl
@@ -0,0 +1,193 @@
+#!/usr/bin/perl -w
+$|++;
+use strict;
+use utf8;
+binmode(STDIN,":utf8");
+binmode(STDOUT,":utf8");
+while(<STDIN>) {
+ chomp;
+ $_ = " $_ ";
+
+ # Delete control characters:
+ s/[\x{00}-\x{1f}]//g;
+
+ # PTB --> normal
+ s/-LRB-/(/g;
+ s/-RRB-/)/g;
+ s/-LSB-/[/g;
+ s/-RSB-/]/g;
+ s/-LCB-/{/g;
+ s/-RCB-/}/g;
+ s/ gon na / gonna /g;
+
+ # Regularize named HTML/XML escapes:
+ s/&\s*lt\s*;/</gi; # HTML opening angle bracket
+ s/&\s*gt\s*;/>/gi; # HTML closing angle bracket
+ s/&\s*squot\s*;/'/gi; # HTML single quote
+ s/&\s*quot\s*;/"/gi; # HTML double quote
+ s/&\s*nbsp\s*;/ /gi; # HTML non-breaking space
+ s/&apos;/\'/g; # HTML apostrophe
+ s/&\s*amp\s*;/&/gi; # HTML ampersand (last)
+
+ # Regularize known HTML numeric codes:
+ s/&\s*#\s*160\s*;/ /gi; # no-break space
+ s/&\s*#45\s*;\s*&\s*#45\s*;/--/g; # hyphen-minus hyphen-minus
+ s/&\s*#45\s*;/--/g; # hyphen-minus
+
+ # Convert arbitrary hex or decimal HTML entities to actual characters:
+ s/&\#x([0-9A-Fa-f]+);/pack("U", hex($1))/ge;
+ s/&\#([0-9]+);/pack("U", $1)/ge;
+
+ # Regularlize spaces:
+ s/\x{ad}//g; # soft hyphen
+ s/\x{200C}//g; # zero-width non-joiner
+ s/\x{a0}/ /g; # non-breaking space
+ s/\x{2009}/ /g; # thin space
+ s/\x{2028}/ /g; # "line separator"
+ s/\x{2029}/ /g; # "paragraph separator"
+ s/\x{202a}/ /g; # "left-to-right embedding"
+ s/\x{202b}/ /g; # "right-to-left embedding"
+ s/\x{202c}/ /g; # "pop directional formatting"
+ s/\x{202d}/ /g; # "left-to-right override"
+ s/\x{202e}/ /g; # "right-to-left override"
+ s/\x{85}/ /g; # "next line"
+ s/\x{fffd}/ /g; # "replacement character"
+ s/\x{feff}/ /g; # byte-order mark
+ s/\x{fdd3}/ /g; # "unicode non-character"
+
+ # Convert other Windows 1252 characters to UTF-8
+ s/\x{80}/\x{20ac}/g; # euro sign
+ s/\x{95}/\x{2022}/g; # bullet
+ s/\x{99}/\x{2122}/g; # trademark sign
+
+ # Currency and measure conversions:
+ s/ (\d\d): (\d\d)/ $1:$2/g;
+ s/[\x{20a0}\x{20ac}]/ EUR /g;
+ s/[\x{00A3}]/ GBP /g;
+ s/(\W)([A-Z]+\$?)(\d*\.\d+|\d+)/$1$2 $3/g;
+ s/(\W)(euro?)(\d*\.\d+|\d+)/$1EUR $3/gi;
+
+ # Ridiculous double conversions, UTF8 -> Windows 1252 -> UTF8:
+ s/\x{e2}\x{20ac}\x{94}/--/g; # long dash
+ s/\x{e2}\x{20ac}oe/\"/g; # opening double quote
+ s/\x{e2}\x{20ac}\x{9c}/\"/g; # opening double quote
+ s/\x{e2}\x{20ac}\x{9d}/\"/g; # closing double quote
+ s/\x{e2}\x{20ac}\x{2122}/\'/g; # apostrophe
+ s/\x{e2}\x{20ac}\x{201c}/ -- /g; # en dash?
+ s/\x{e2}\x{20ac}\x{201d}/ -- /g; # em dash?
+ s/â(\x{80}\x{99}|\x{80}\x{98})/'/g; # single quote?
+ s/â(\x{80}\x{9c}|\x{80}\x{9d})/"/g; # double quote?
+ s/\x{c3}\x{9f}/\x{df}/g; # eszett
+ s/\x{c3}\x{0178}/\x{df}/g; # eszett
+ s/\x{c3}\x{a4}/\x{e4}/g; # a umlaut
+ s/\x{c3}\x{b6}/\x{f6}/g; # o umlaut
+ s/\x{c3}\x{bc}/\x{fc}/g; # u umlaut
+ s/\x{c3}\x{84}/\x{c4}/g; # A umlaut: create no C4s after this
+ s/\x{c3}\x{201e}/\x{c4}/g; # A umlaut: create no C4s after this
+ s/\x{c3}\x{96}/\x{d6}/g; # O umlaut
+ s/\x{c3}\x{2013}/\x{d6}/g; # O umlaut
+ s/\x{c3}\x{9c}/\x{dc}/g; # U umlaut
+ s/\x{80}/\x{20ac}/g; # euro sign
+ s/\x{95}/\x{2022}/g; # bullet
+ s/\x{99}/\x{2122}/g; # trademark sign
+
+ # Regularize quotes:
+ s/ˇ/'/g; # caron
+ s/´/'/g; # acute accent
+ s/`/'/g; # grave accent
+ s/ˉ/'/g; # modified letter macron
+ s/ ,,/ "/g; # ghetto low-99 quote
+ s/``/"/g; # latex-style left quote
+ s/''/"/g; # latex-style right quote
+ s/\x{300c}/"/g; # left corner bracket
+ s/\x{300d}/"/g; # right corner bracket
+ s/\x{3003}/"/g; # ditto mark
+ s/\x{00a8}/"/g; # diaeresis
+ s/\x{92}/\'/g; # curly apostrophe
+ s/\x{2019}/\'/g; # curly apostrophe
+ s/\x{f03d}/\'/g; # curly apostrophe
+ s/\x{b4}/\'/g; # curly apostrophe
+ s/\x{2018}/\'/g; # curly single open quote
+ s/\x{201a}/\'/g; # low-9 quote
+ s/\x{93}/\"/g; # curly left quote
+ s/\x{201c}/\"/g; # curly left quote
+ s/\x{94}/\"/g; # curly right quote
+ s/\x{201d}/\"/g; # curly right quote
+ s/\x{2033}/\"/g; # curly right quote
+ s/\x{201e}/\"/g; # low-99 quote
+ s/\x{84}/\"/g; # low-99 quote (bad enc)
+ s/\x{201f}/\"/g; # high-rev-99 quote
+ s/\x{ab}/\"/g; # opening guillemet
+ s/\x{bb}/\"/g; # closing guillemet
+ s/\x{0301}/'/g; # combining acute accent
+ s/\x{203a}/\"/g; # angle quotation mark
+ s/\x{2039}/\"/g; # angle quotation mark
+
+ # Space inverted punctuation:
+ s/¡/ ¡ /g;
+ s/¿/ ¿ /g;
+
+ # Russian abbreviations:
+ s/ п. п. / п.п. /g;
+ s/ ст. л. / ст.л. /g;
+ s/ т. е. / т.е. /g;
+ s/ т. к. / т.к. /g;
+ s/ т. ч. / т.ч. /g;
+ s/ т. д. / т.д. /g;
+ s/ т. п. / т.п. /g;
+ s/ и. о. / и.о. /g;
+ s/ с. г. / с.г. /g;
+ s/ г. р. / г.р. /g;
+ s/ т. н. / т.н. /g;
+ s/ т. ч. / т.ч. /g;
+ s/ н. э. / н.э. /g;
+
+ # Convert foreign numerals into Arabic numerals
+ tr/०-९/0-9/; # devanagari
+ tr/౦-౯/0-9/; # telugu
+ tr/೦-೯/0-9/; # kannada
+ tr/௦-௯/0-9/; # tamil
+ tr/൦-൯/0-9/; # malayalam
+
+ # Random punctuation:
+ tr/！-～/!-~/; # fullwidth ASCII forms
+ s/、/,/g;
+ # s/。/./g;
+ s/\x{85}/.../g;
+ s/…/.../g;
+ s/―/--/g;
+ s/–/--/g;
+ s/─/--/g;
+ s/—/--/g;
+ s/\x{97}/--/g;
+ s/•/ * /g;
+ s/\*/ * /g;
+ s/،/,/g;
+ s/؟/?/g;
+ s/ـ/ /g;
+ s/Ã¯/i/g;
+ s/’/'/g;
+ s/â€"/"/g;
+ s/؛/;/g;
+
+ # Regularize ligatures:
+ s/\x{9c}/oe/g; # "oe" ligature
+ s/\x{0153}/oe/g; # "oe" ligature
+ s/\x{8c}/Oe/g; # "OE" ligature
+ s/\x{0152}/Oe/g; # "OE" ligature
+ s/\x{fb00}/ff/g; # "ff" ligature
+ s/\x{fb01}/fi/g; # "fi" ligature
+ s/\x{fb02}/fl/g; # "fl" ligature
+ s/\x{fb03}/ffi/g; # "ffi" ligature
+ s/\x{fb04}/ffl/g; # "ffl" ligature
+
+ s/β/ß/g; # WMT 2010 error
+
+ # Strip extra spaces:
+ s/\s+/ /g;
+ s/^\s+//;
+ s/\s+$//;
+
+ print "$_\n";
+}
+
diff --git a/corpus/support/token_list b/corpus/support/token_list
new file mode 100644
index 000000000..d38638cfd
--- /dev/null
+++ b/corpus/support/token_list
@@ -0,0 +1,509 @@
+##################### hyphenated words added by Fei since 3/7/05
+##X-ray
+
+# hindi abbreviation patterns
+जन.
+फर.
+अग.
+सित.
+अक्टू.
+अक्तू.
+नव.
+दिस.
+डी.एल.
+डी.टी.ओ.
+डी.ए.
+ए.एस.आई.
+डी.टी.ओ.
+एम.एस.आर.टी.सी.
+बी.बी.एम.बी.
+डी.एस.पी.
+सी.आर.पी.
+एस.डी.एम.
+सी.डी.पी.ओ.
+बी.डी.ओ.
+एस.डी.ओ.
+एम.पी.पी.
+पी.एच.ई.
+एस.एच.ओ.
+ए.सी.पी.
+यू.पी.
+पी.एम.
+आर.बी.डी.
+वी.पी.
+सी.ए.डी.पी.
+ए.
+बी.
+सी.
+डी.
+ई.
+एफ.
+जी.
+एच.
+आई.
+जे.
+के.
+एल.
+एम.
+एन.
+ओ.
+पी.
+क़यू.
+आर.
+एस.
+टी.
+यू.
+वी.
+डबल्यू.
+एक्स.
+वाई.
+ज़ेड.
+ज़ी.
+
+##################### words made of punct only
+:-
+:-)
+:-(
++=
+-=
+.=
+*=
+>=
+<=
+==
+&&
+||
+=>
+->
+<-
+:)
+:(
+;)
+
+#################### abbr added by Fei
+oz.
+fl.
+tel.
+1.
+2.
+3.
+4.
+5.
+6.
+7.
+8.
+9.
+10.
+
+##################### abbreviation: words that contain period.
+EE.UU.
+ee.uu.
+U.A.E
+Ala.
+Ph.D.
+min.
+max.
+z.B.
+d.h.
+ggf.
+ca.
+bzw.
+bzgl.
+Eng.
+i.e.
+a.m.
+am.
+A.M.
+Apr.
+Ariz.
+Ark.
+Aug.
+B.A.T.
+B.A.T
+Calif.
+Co.
+Conn.
+Corp.
+Cos.
+D.C.
+Dec.
+Dept.
+Dr.
+Drs.
+Feb.
+Fla.
+Fri.
+Ga.
+Gen.
+gen.
+GEN.
+Gov.
+Govt.
+Ill.
+Inc.
+Jan.
+Jr.
+Jul.
+Jun.
+Kan.
+L.A.
+Lieut.
+Lt.
+Ltd.
+Ma.
+Mar.
+Mass.
+Md.
+Mfg.
+Mgr.
+Mio.
+Mrd.
+Bio.
+Minn.
+Mo.
+Mon.
+Mr.
+Mrs.
+Ms.
+Mt.
+N.D.
+Neb.
+Nev.
+No.
+Nos.
+Nov.
+Oct.
+Okla.
+Op.
+Ore.
+Pa.
+p.m
+p.m.
+I.B.C.
+N.T.V
+Pres.
+Prof.
+Prop.
+Rd.
+Rev.
+R.J.
+C.L
+Rs.
+Rte.
+Sat.
+W.T
+Sen.
+Sep.
+Sept.
+Sgt.
+Sr.
+SR.
+St.
+Ste.
+Sun.
+Tenn.
+Tex.
+Thu.
+Tue.
+Univ.
+Va.
+Vt.
+Wed.
+approx.
+dept.
+e.g.
+E.G.
+eg.
+est.
+etc.
+ex.
+ext.
+ft.
+hon.
+hr.
+hrs.
+lab.
+lb.
+lbs.
+mass.
+misc.
+no.
+nos.
+nt.
+para.
+paras.
+pct.
+prod.
+rec.
+ref.
+rel.
+rep.
+sq.
+st.
+stg.
+vol.
+vs.
+U.S.
+J.S.
+U.N.
+u.n.
+A.
+B.
+C.
+D.
+E.
+F.
+G.
+H.
+I.
+J.
+K.
+L.
+M.
+N.
+O.
+P.
+Q.
+R.
+S.
+T.
+U.
+V.
+W.
+X.
+Y.
+Z.
+А.
+Б.
+В.
+Г.
+Д.
+Е.
+Ё.
+Ж.
+З.
+И.
+Й.
+К.
+Л.
+М.
+Н.
+О.
+П.
+Р.
+С.
+Т.
+У.
+Ф.
+Х.
+Ц.
+Ч.
+Ш.
+Щ.
+Ъ.
+Ы.
+Ь.
+Э.
+Ю.
+Я.
+л.
+г.
+обл.
+гг.
+в.
+вв.
+мин.
+ч.
+тыс.
+млн.
+млрд.
+трлн.
+кв.
+куб.
+руб.
+коп.
+долл.
+Прим.
+прим.
+чел.
+грн.
+мин.
+им.
+проф.
+акад.
+ред.
+авт.
+корр.
+соб.
+спец.
+см.
+тж.
+др.
+пр.
+букв.
+# Two-letter abbreviations - can be written with space
+п.п.
+ст.л.
+т.е.
+т.к.
+т.ч.
+т.д.
+т.п.
+и.о.
+с.г.
+г.р.
+т.н.
+т.ч.
+н.э.
+# Swahili
+A.D.
+Afr.
+A.G.
+agh.
+A.H.
+A.M.
+a.s.
+B.A.
+B.C.
+Bi.
+B.J.
+B.K.
+B.O.M.
+Brig.
+Bro.
+bt.
+bw.
+Bw.
+Cap.
+C.C.
+cCM.
+C.I.A.
+cit.
+C.M.S.
+Co.
+Corp.
+C.S.Sp.
+C.W.
+D.C.
+Dk.
+Dkt.
+Dk.B.
+Dr.
+E.C.
+e.g.
+E.M.
+E.n.
+etc.
+Feb.
+F.F.U.
+F.M.
+Fr.
+F.W.
+I.C.O.
+i.e.
+I.L.C.
+Inc.
+Jan.
+J.F.
+Jr.
+J.S.
+J.V.W.A.
+K.A.R.
+K.A.U.
+K.C.M.C.
+K.k.
+K.K.
+k.m.
+km.
+K.m.
+K.N.C.U.
+K.O.
+K.S.
+Ksh.
+kt.
+kumb.
+k.v.
+kv.
+L.G.
+ltd.
+Ltd.
+M.A.
+M.D.
+mf.
+Mh.
+Mhe.
+mil.
+m.m.
+M.m.
+Mm.
+M.M.
+Mr.
+Mrs.
+M.S.
+Mt.
+Mw.
+M.W.
+Mwl.
+na.
+Na.
+N.F.
+N.J.
+n.k.
+nk.
+n.k.w.
+N.N.
+Nov.
+O.C.D.
+op.
+P.C.
+Phd.
+Ph.D.
+P.J.
+P.o.
+P.O.
+P.O.P.
+P.P.F.
+Prof.
+P.s.
+P.S.
+Q.C.
+Rd.
+s.a.w.
+S.A.W.
+S.D.
+Sept.
+sh.
+Sh.
+SH.
+shs.
+Shs.
+S.J.
+S.L.
+S.L.P.
+S.s.
+S.S.
+St.
+s.w.
+s.w.T.
+taz.
+Taz.
+T.C.
+T.E.C.
+T.L.P.
+T.O.H.S.
+Tsh.
+T.V.
+tz.
+uk.
+Uk.
+U.M.C.A.
+U.N.
+U.S.
+Ush.
+U.W.T.
+Viii.
+Vol.
+V.T.C.
+W.H.
+yamb.
+Y.M.C.A.
diff --git a/corpus/support/token_patterns b/corpus/support/token_patterns
new file mode 100644
index 000000000..de64fb2a1
--- /dev/null
+++ b/corpus/support/token_patterns
@@ -0,0 +1,5 @@
+/^(al|el|ul|e)\-[a-z]+$/
+/^((а|А)(ль|ш)|уль)-\p{Cyrillic}+$/
+/^\p{Cyrillic}\.\p{Cyrillic}\.$/
+/^(\d|\d\d|\d\d\d)\.$/
+
diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl
new file mode 100755
index 000000000..f57bc87a9
--- /dev/null
+++ b/corpus/support/tokenizer.pl
@@ -0,0 +1,709 @@
+#!/usr/bin/env perl
+$|++;
+
+my $script_dir;
+BEGIN {$^W = 1; use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; }
+
+use strict;
+use utf8;
+
+binmode STDIN, ":utf8";
+binmode STDOUT, ":utf8";
+binmode STDERR, ":utf8";
+
+my $debug = 0;
+
+
+############ options:
+### for all options:
+### 0 means no split on that symbol
+### 1 means split on that symbol in all cases.
+### 2 means do not split in condition 1.
+### n means do not split in any of the conditions in the set {1, 2, ..., n-1}.
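+###
+### e.g., with $Split_On_Dash == 2 below, "22-23" is kept whole while
+### "well-known" still splits to "well - known"; a value of 1 would split both.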
+
+
+### prefix
+## for "#": #90
+my $Split_On_SharpSign = 2; # 2: do not split on Num, e.g., "#90"
+
+
+############## "infix"
+my $Split_On_Tilde = 2; # 2: do not split on Num, e.g., "12~13".
+
+my $Split_On_Circ = 2; # 2: do not split on Num, e.g., "2^3"
+
+## for "&"
+my $Split_On_AndSign = 2; # 2: do not split on short Name, e.g., "AT&T".
+
+## for hyphen: 1990-1992
+my $Split_On_Dash = 2; ## 2: do not split on number, e.g., "22-23".
+my $Split_On_Underscore = 0; ## 0: do not split on underscore
+
+## for ":": 5:4
+my $Split_On_Semicolon = 2; ## 2: don't split for num, e.g., "5:4"
+
+########### suffix
+## for percent sign: 5%
+my $Split_On_PercentSign = 1; ## 1: always split, e.g., "5%" => "5 %"
+
+############# others
+## for slash: 1/4
+my $Split_On_Slash = 2; ## 2: don't split on number, e.g., 1/4.
+my $Split_On_BackSlash = 0; ## 0: do not split on "\", e.g., \t
+
+### for "$": US$120
+my $Split_On_DollarSign = 2; ### 2: US$120 => "US$ 120"
+ ### 1: US$120 => "US $ 120"
+## for 's etc.
+my $Split_NAposT = 1; ## n't
+my $Split_AposS = 1; ## 's
+my $Split_AposM = 1; ## 'm
+my $Split_AposRE = 1; ## 're
+my $Split_AposVE = 1; ## 've
+my $Split_AposLL = 1; ## 'll
+my $Split_AposD = 1; ## 'd
+
+
+### some patterns
+my $common_right_punc = '\x{0964}|\.|\,|\;|\!|:|\?|\"|\)|\]|\}|\>|\-';
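+# (\x{0964} is the Devanagari danda, a sentence-final punctuation mark)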
+
+#### step 1: read files
+
+my $workdir = $script_dir;
+my $dict_file = "$workdir/token_list";
+my $word_patt_file = "$workdir/token_patterns";
+
+open(my $dict_fp, "$dict_file") or die;
+binmode($dict_fp, ":utf8");
+
+# read in the list of words that should not be segmented,
+## e.g.,"I.B.M.", co-operation.
+my %dict_hash = ();
+my $dict_entry = 0;
+while(<$dict_fp>){
+ chomp;
+ next if /^\s*$/;
+ s/^\s+//;
+ s/\s+$//;
+ tr/A-Z/a-z/;
+ $dict_hash{$_} = 1;
+ $dict_entry ++;
+}
+
+open(my $patt_fp, "$word_patt_file") or die;
+binmode($patt_fp, ":utf8");
+my @word_patts = ();
+my $word_patt_num = 0;
+while(<$patt_fp>){
+ chomp;
+ next if /^\s*$/;
+ s/^\s+//;
+ s/\s+$//;
+ s/^\/(.+)\/$/$1/; # remove / / around the pattern
+ push(@word_patts, $_);
+ $word_patt_num ++;
+}
+
+
+###### step 2: process the input file
+my $orig_token_total = 0;
+my $deep_proc_token_total = 0;
+my $new_token_total = 0;
+
+while(<STDIN>){
+ chomp();
+ s/\x{0970}/./g; # dev abbreviation character
+  if(/^(\[b\s+|\]b|\]f|\[f\s+)/ || (/^\[[bf]$/) || (/^\s*$/) || /^<DOC/ || /^<\/DOC/){
+    ## markup line: pass through unchanged
+    print STDOUT "$_\n";
+    next;
+  }
+
+  my $orig_num = 0;
+  my $deep_proc_num = 0;
+  my $new_line = proc_line($_, \$orig_num, \$deep_proc_num);
+
+  $orig_token_total += $orig_num;
+  $deep_proc_token_total += $deep_proc_num;
+
+  $new_line =~ s/\s+$//;
+  my @parts = split(/\s+/, $new_line);
+  $new_token_total += scalar @parts;
+
+  ## fix sgm-markup tokenization
+  $new_line =~ s/\s*<\s+seg\s+id\s+=\s+(\d+)\s+>/<seg id=$1>/;
+ $new_line =~ s/\s*<\s+(p|hl)\s+>/<$1>/;
+ $new_line =~ s/\s*<\s+\/\s+(p|hl|DOC)\s+>/<\/$1>/;
+ $new_line =~ s/<\s+\/\s+seg\s+>/<\/seg>/;
+ if ($new_line =~ /^\s*<\s+DOC\s+/) {
+ $new_line =~ s/\s+//g;
+ $new_line =~ s/DOC/DOC /;
+ $new_line =~ s/sys/ sys/;
+ }
+ if ($new_line =~ /^\s*<\s+(refset|srcset)\s+/) {
+ $new_line =~ s/\s+//g;
+ $new_line =~ s/(set|src|tgt|trg)/ $1/g;
+ }
+
+ chomp $new_line;
+ print STDOUT "$new_line\n";
+}
+
+########################################################################
+
+### tokenize a line.
+sub proc_line {
+ my @params = @_;
+ my $param_num = scalar @params;
+
+ if(($param_num < 1) || ($param_num > 3)){
+ die "wrong number of params for proc_line: $param_num\n";
+ }
+
+ my $orig_line = $params[0];
+
+ $orig_line =~ s/^\s+//;
+ $orig_line =~ s/\s+$//;
+
+ my @parts = split(/\s+/, $orig_line);
+
+ if($param_num >= 2){
+ my $orig_num_ptr = $params[1];
+ $$orig_num_ptr = scalar @parts;
+ }
+
+ my $new_line = "";
+
+ my $deep_proc_token = 0;
+ foreach my $part (@parts){
+ my $flag = -1;
+ $new_line .= proc_token($part, \$flag) . " ";
+ $deep_proc_token += $flag;
+ }
+
+ if($param_num == 3){
+ my $deep_num_ptr = $params[2];
+ $$deep_num_ptr = $deep_proc_token;
+ }
+
+ return $new_line;
+}
+
+
+
+## Tokenize a str that does not contain " ", return the new string
+## The function handles the cases where the token need not be segmented;
+## for other cases, it calls deep_proc_token()
+sub proc_token {
+ my @params = @_;
+ my $param_num = scalar @params;
+ if($param_num > 2){
+ die "proc_token: wrong number of params: $param_num\n";
+ }
+
+ my $token = $params[0];
+
+ if(!defined($token)){
+ return "";
+ }
+
+ my $deep_proc_flag;
+
+ if($param_num == 2){
+ $deep_proc_flag = $params[1];
+ $$deep_proc_flag = 0;
+ }
+
+ if($debug){
+ print STDERR "pro_token:+$token+\n";
+ }
+
+ ### step 0: it has only one char
+ if(($token eq "") || ($token=~ /^.$/)){
+ ## print STDERR "see +$token+\n";
+ return $token;
+ }
+
+ ## step 1: check the most common case
+ if($token =~ /^[a-z0-9\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}\p{Devanagari}]+$/i){
+ #if($token =~ /^[a-z0-9\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}]+$/i){
+ ### most common cases
+ return $token;
+ }
+
+ ## step 2: check whether it is some NE entity
+ ### 1.2.4.6
+  if($token =~ /^\d+(\.\d+)+$/){
+ return $token;
+ }
+
+  if($token =~ /^\d+(\.\d+)+(亿|百万|万|千)?$/){
+ return $token;
+ }
+
+ ## 1,234,345.34
+ if($token =~ /^\d+(\.\d{3})*,\d+$/){
+ ## number
+ return $token;
+ }
+ if($token =~ /^\d+(,\d{3})*\.\d+$/){
+ ## number
+ return $token;
+ }
+ if($token =~ /^(@|#)[A-Za-z0-9_\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}\p{Devanagari}]+.*$/){
+ ## twitter hashtag or address
+ return proc_rightpunc($token);
+ }
+
+ if($token =~ /^[a-z0-9\_\-]+\@[a-z\d\_\-]+(\.[a-z\d\_\-]+)*(.*)$/i){
+ ### email address: xxx@yy.zz
+ return proc_rightpunc($token);
+ }
+
+ if($token =~ /^(mailto|http|https|ftp|gopher|telnet|file)\:\/{0,2}([^\.]+)(\.(.+))*$/i){
+ ### URL: http://xx.yy.zz
+ return proc_rightpunc($token);
+ }
+
+ if($token =~ /^(www)(\.(.+))+$/i){
+ ### www.yy.dd/land/
+ return proc_rightpunc($token);
+ }
+
+ if($token =~ /^(\w+\.)+(com|co|edu|org|gov|ly|cz|ru|eu)(\.[a-z]{2,3})?\:{0,2}(\/\S*)?$/i){
+ ### URL: upenn.edu/~xx
+ return proc_rightpunc($token);
+ }
+
+ if($token =~ /^\(\d{3}\)\d{3}(\-\d{4})($common_right_punc)*$/){
+ ## only handle American phone numbers: e.g., (914)244-4567
+ return proc_rightpunc($token);
+ }
+
+ #my $t1 = '[\x{0600}-\x{06ff}a-z\d\_\.\-]';
+ my $t1 = '[a-z\d\_\-\.\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}\p{Devanagari}]';
+ if($token =~ /^\/(($t1)+\/)+($t1)+\/?$/i){
+ ### /nls/p/....
+ return $token;
+ }
+
+ if($token =~ /^\\(($t1)+\\)+($t1)+\\?$/i){
+ ### \nls\p\....
+ return $token;
+ }
+
+ ## step 3: check the dictionary
+ my $token_lc = $token;
+ $token_lc =~ tr/A-Z/a-z/;
+
+ if(defined($dict_hash{$token_lc})){
+ return $token;
+ }
+
+ ## step 4: check word_patterns
+ my $i=1;
+ foreach my $patt (@word_patts){
+ if($token_lc =~ /$patt/){
+ if($debug){
+ print STDERR "+$token+ match pattern $i: +$patt+\n";
+ }
+ return $token;
+ }else{
+ $i++;
+ }
+ }
+
+ ## step 5: call deep tokenization
+ if($param_num == 2){
+ $$deep_proc_flag = 1;
+ }
+ return deep_proc_token($token);
+}
+
+
+### remove punct on the right side
+### e.g., xxx@yy.zz, => xxx@yy.zz ,
+sub proc_rightpunc {
+ my ($token) = @_;
+
+ $token =~ s/(($common_right_punc)+)$/ $1 /;
+ if($token =~ /\s/){
+ return proc_line($token);
+ }else{
+ return $token;
+ }
+}
+
+
+
+#######################################
+### return the new token:
+### types of punct:
+## T1 (2): the punct is always a token by itself no matter where it
+### appears: " ;
+## T2 (15): the punct that can be a part of words made of puncts only.
+## ` ! @ + = [ ] ( ) { } | < > ?
+## T3 (15): the punct can be part of a word that contains [a-z\d]
+## T3: ~ ^ & : , # * % - _ \ / . $ '
+## infix: ~ (12~13), ^ (2^3), & (AT&T), : ,
+## prefix: # (#9), * (*3),
+## suffix: % (10%),
+## infix+prefix: - (-5), _ (_foo),
+## more than one position: \ / . $
+## Appos: 'm n't ...
+
+## 1. separate by puncts in T1
+## 2. separate by puncts in T2
+## 3. deal with punct T3 one by one according to options
+## 4. if the token remains unchanged after step 1-3, return the token
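+##
+## e.g., "(AT&T)." => "( AT&T ) ." -- the T2 puncts are split off everywhere,
+## while "AT&T" survives because $Split_On_AndSign == 2 keeps short names intact.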
+
+## $line contains at least 2 chars, and no space.
+sub deep_proc_token {
+ my ($line) = @_;
+ if($debug){
+ print STDERR "deep_proc_token: +$line+\n";
+ }
+
+  ##### step 0: if it is made up of puncts only, remove one punct at a time.
+ if($line !~ /[\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}\p{Devanagari}a-zA-Z\d]/){
+ if($line =~ /^(\!+|\@+|\++|\=+|\*+|\<+|\>+|\|+|\?+|\x{0964}+|\.+|\-+|\_+|\&+)$/){
+ ## ++ @@@@ !!! ....
+ return $line;
+ }
+
+ if($line =~ /^(.)(.+)$/){
+ my $t1 = $1;
+ my $t2 = $2;
+ return $t1 . " " . proc_token($t2);
+ }else{
+ ### one char only
+ print STDERR "deep_proc_token: this should not happen: +$line+\n";
+ return $line;
+ }
+ }
+
+ ##### step 1: separate by punct T2 on the boundary
+ my $t2 = '\`|\!|\@|\+|\=|\[|\]|\<|\>|\||\(|\)|\{|\}|\?|\"|;';
+ if($line =~ s/^(($t2)+)/$1 /){
+ return proc_line($line);
+ }
+
+ if($line =~ s/(($t2)+)$/ $1/){
+ return proc_line($line);
+ }
+
+ ## step 2: separate by punct T2 in any position
+ if($line =~ s/(($t2)+)/ $1 /g){
+ return proc_line($line);
+ }
+
+ ##### step 3: deal with special puncts in T3.
+ if($line =~ /^(\,+)(.+)$/){
+ my $t1 = $1;
+ my $t2 = $2;
+ return proc_token($t1) . " " . proc_token($t2);
+ }
+
+ if($line =~ /^(.*[^\,]+)(\,+)$/){
+ ## 19.3,,, => 19.3 ,,,
+ my $t1 = $1;
+ my $t2 = $2;
+ return proc_token($t1) . " " . proc_token($t2);
+ }
+
+ ## remove the ending periods that follow number etc.
+ if($line =~ /^(.*(\d|\~|\^|\&|\:|\,|\#|\*|\%|\-|\_|\/|\\|\$|\'))(\.+)$/){
+ ## 12~13. => 12~13 .
+ my $t1 = $1;
+ my $t3 = $3;
+ return proc_token($t1) . " " . proc_token($t3);
+ }
+
+ ### deal with "$"
+ if(($line =~ /\$/) && ($Split_On_DollarSign > 0)){
+ my $suc = 0;
+ if($Split_On_DollarSign == 1){
+      ## split on all occasions
+ $suc = ($line =~ s/(\$+)/ $1 /g);
+ }else{
+ ## split only between $ and number
+ $suc = ($line =~ s/(\$+)(\d)/$1 $2/g);
+ }
+
+ if($suc){
+ return proc_line($line);
+ }
+ }
+
+ ## deal with "#"
+ if(($line =~ /\#/) && ($Split_On_SharpSign > 0)){
+ my $suc = 0;
+ if($Split_On_SharpSign >= 2){
+ ### keep #50 as a token
+ $suc = ($line =~ s/(\#+)(\D)/ $1 $2/gi);
+ }else{
+ $suc = ($line =~ s/(\#+)/ $1 /gi);
+ }
+
+ if($suc){
+ return proc_line($line);
+ }
+ }
+
+ ## deal with '
+ if($line =~ /\'/){
+ my $suc = ($line =~ s/([^\'])([\']+)$/$1 $2/g); ## xxx'' => xxx ''
+
+ ### deal with ': e.g., 's, 't, 'm, 'll, 're, 've, n't
+
+ ## 'there => ' there '98 => the same
+ $suc += ($line =~ s/^(\'+)([a-z\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}\p{Devanagari}]+)/ $1 $2/gi);
+
+ ## note that \' and \. could interact: e.g., U.S.'s; 're.
+ if($Split_NAposT && ($line =~ /^(.*[a-z]+)(n\'t)([\.]*)$/i)){
+ ## doesn't => does n't
+ my $t1 = $1;
+ my $t2 = $2;
+ my $t3 = $3;
+ return proc_token($t1) . " " . $t2 . " " . proc_token($t3);
+ }
+
+ ## 's, 't, 'm, 'll, 're, 've: they've => they 've
+ ## 1950's => 1950 's Co.'s => Co. 's
+ if($Split_AposS && ($line =~ /^(.+)(\'s)(\W*)$/i)){
+ my $t1 = $1;
+ my $t2 = $2;
+ my $t3 = $3;
+ return proc_token($t1) . " " . $t2 . " " . proc_token($t3);
+ }
+
+ if($Split_AposM && ($line =~ /^(.*[a-z]+)(\'m)(\.*)$/i)){
+ my $t1 = $1;
+ my $t2 = $2;
+ my $t3 = $3;
+ return proc_token($t1) . " " . $t2 . " " . proc_token($t3);
+ }
+
+
+ if($Split_AposRE && ($line =~ /^(.*[a-z]+)(\'re)(\.*)$/i)){
+ my $t1 = $1;
+ my $t2 = $2;
+ my $t3 = $3;
+ return proc_token($t1) . " " . $t2 . " " . proc_token($t3);
+ }
+
+ if($Split_AposVE && ($line =~ /^(.*[a-z]+)(\'ve)(\.*)$/i)){
+ my $t1 = $1;
+ my $t2 = $2;
+ my $t3 = $3;
+ return proc_token($t1) . " " . $t2 . " " . proc_token($t3);
+ }
+
+ if($Split_AposLL && ($line =~ /^(.*[a-z]+)(\'ll)(\.*)$/i)){
+ my $t1 = $1;
+ my $t2 = $2;
+ my $t3 = $3;
+ return proc_token($t1) . " " . $t2 . " " . proc_token($t3);
+ }
+
+ if($Split_AposD && ($line =~ /^(.*[a-z]+)(\'d)(\.*)$/i)){
+ my $t1 = $1;
+ my $t2 = $2;
+ my $t3 = $3;
+ return proc_token($t1) . " " . $t2 . " " . proc_token($t3);
+ }
+
+ if($suc){
+ return proc_line($line);
+ }
+ }
+
+
+ ## deal with "~"
+ if(($line =~ /\~/) && ($Split_On_Tilde > 0)){
+ my $suc = 0;
+ if($Split_On_Tilde >= 2){
+ ## keep 12~13 as one token
+ $suc += ($line =~ s/(\D)(\~+)/$1 $2 /g);
+ $suc += ($line =~ s/(\~+)(\D)/ $1 $2/g);
+ $suc += ($line =~ s/^(\~+)(\d)/$1 $2/g);
+ $suc += ($line =~ s/(\d)(\~+)$/$1 $2/g);
+ }else{
+ $suc += ($line =~ s/(\~+)/ $1 /g);
+ }
+ if($suc){
+ return proc_line($line);
+ }
+ }
+
+ ## deal with "^"
+ if(($line =~ /\^/) && ($Split_On_Circ > 0)){
+ my $suc = 0;
+ if($Split_On_Circ >= 2){
+      ## keep 2^3 as one token
+ $suc += ($line =~ s/(\D)(\^+)/$1 $2 /g);
+ $suc += ($line =~ s/(\^+)(\D)/ $1 $2/g);
+ }else{
+ $suc = ($line =~ s/(\^+)/ $1 /g);
+ }
+ if($suc){
+ return proc_line($line);
+ }
+ }
+
+ ## deal with ":"
+ if(($line =~ /\:/) && ($Split_On_Semicolon > 0)){
+ ## 2: => 2 :
+ my $suc = ($line =~ s/^(\:+)/$1 /);
+ $suc += ($line =~ s/(\:+)$/ $1/);
+ if($Split_On_Semicolon >= 2){
+ ## keep 5:4 as one token
+ $suc += ($line =~ s/(\D)(\:+)/$1 $2 /g);
+ $suc += ($line =~ s/(\:+)(\D)/ $1 $2/g);
+ }else{
+ $suc += ($line =~ s/(\:+)/ $1 /g);
+ }
+
+ if($suc){
+ return proc_line($line);
+ }
+ }
+
+ ### deal with hyphen: 1992-1993. 21st-24th
+ if(($line =~ /\-/) && ($Split_On_Dash > 0)){
+ my $suc = ($line =~ s/(\-{2,})/ $1 /g);
+ if($Split_On_Dash >= 2){
+ ## keep 1992-1993 as one token
+ $suc += ($line =~ s/(\D)(\-+)/$1 $2 /g);
+ $suc += ($line =~ s/(\-+)(\D)/ $1 $2/g);
+ }else{
+ ### always split on "-"
+ $suc += ($line =~ s/([\-]+)/ $1 /g);
+ }
+
+ if($suc){
+ return proc_line($line);
+ }
+ }
+
+ ## deal with "_"
+ if(($line =~ /\_/) && ($Split_On_Underscore > 0)){
+     ### always split on "_"
+ if($line =~ s/([\_]+)/ $1 /g){
+ return proc_line($line);
+ }
+ }
+
+
+
+ ## deal with "%"
+ if(($line =~ /\%/) && ($Split_On_PercentSign > 0)){
+ my $suc = 0;
+ if($Split_On_PercentSign >= 2){
+ $suc += ($line =~ s/(\D)(\%+)/$1 $2/g);
+ }else{
+ $suc += ($line =~ s/(\%+)/ $1 /g);
+ }
+
+ if($suc){
+ return proc_line($line);
+ }
+ }
+
+
+ ### deal with "/": 4/5
+ if(($line =~ /\//) && ($Split_On_Slash > 0)){
+ my $suc = 0;
+ if($Split_On_Slash >= 2){
+ $suc += ($line =~ s/(\D)(\/+)/$1 $2 /g);
+ $suc += ($line =~ s/(\/+)(\D)/ $1 $2/g);
+ }else{
+ $suc += ($line =~ s/(\/+)/ $1 /g);
+ }
+
+ if($suc){
+ return proc_line($line);
+ }
+ }
+
+
+ ### deal with comma: 123,456
+ if($line =~ /\,/){
+ my $suc = 0;
+ $suc += ($line =~ s/([^\d]),/$1 , /g); ## xxx, 1923 => xxx , 1923
+ $suc += ($line =~ s/\,\s*([^\d])/ , $1/g); ## 1923, xxx => 1923 , xxx
+
+ $suc += ($line =~ s/,([\d]{1,2}[^\d])/ , $1/g); ## 1,23 => 1 , 23
+ $suc += ($line =~ s/,([\d]{4,}[^\d])/ , $1/g); ## 1,2345 => 1 , 2345
+
+ $suc += ($line =~ s/,([\d]{1,2})$/ , $1/g); ## 1,23 => 1 , 23
+ $suc += ($line =~ s/,([\d]{4,})$/ , $1/g); ## 1,2345 => 1 , 2345
+
+ if($suc){
+ return proc_line($line);
+ }
+ }
+
+
+ ## deal with "&"
+ if(($line =~ /\&/) && ($Split_On_AndSign > 0)){
+ my $suc = 0;
+ if($Split_On_AndSign >= 2){
+ $suc += ($line =~ s/([a-z]{3,})(\&+)/$1 $2 /gi);
+ $suc += ($line =~ s/(\&+)([a-z]{3,})/ $1 $2/gi);
+ }else{
+ $suc += ($line =~ s/(\&+)/ $1 /g);
+ }
+
+ if($suc){
+ return proc_line($line);
+ }
+ }
+
+ ## deal with period
+ if($line =~ /\./){
+ if($line =~ /^(([\+|\-])*(\d+\,)*\d*\.\d+\%*)$/){
+ ### numbers: 3.5
+ return $line;
+ }
+
+ if ($line =~ /^(([a-z]|ए|बी|सी|डी|ई|एफ|जी|एच|आई|जे|के|एल|एम|एन|ओ|पी|क़यू|आर|एस|टी|यू|वी|डबल्यू|एक्स|वाई|ज़ेड|ज़ी)(\.([a-z]|ए|बी|सी|डी|ई|एफ|जी|एच|आई|जे|के|एल|एम|एन|ओ|पी|क़यू|आर|एस|टी|यू|वी|डबल्यू|एक्स|वाई|ज़ेड|ज़ी))+)(\.?)(\.*)$/i){
+ ## I.B.M.
+ my $t1 = $1 . $5;
+ my $t3 = $6;
+ return $t1 . " ". proc_token($t3);
+ }
+
+ ## Feb.. => Feb. .
+ if($line =~ /^(.*[^\.])(\.)(\.*)$/){
+ my $p1 = $1;
+ my $p2 = $2;
+ my $p3 = $3;
+
+ my $p1_lc = $p1;
+ $p1_lc =~ tr/A-Z/a-z/;
+
+ if(defined($dict_hash{$p1_lc . $p2})){
+ ## Dec.. => Dec. .
+ return $p1 . $p2 . " " . proc_token($p3);
+ }elsif(defined($dict_hash{$p1_lc})){
+ return $p1 . " " . proc_token($p2 . $p3);
+ }else{
+ ## this. => this .
+ return proc_token($p1) . " " . proc_token($p2 . $p3);
+ }
+ }
+
+ if($line =~ s/(\.+)(.+)/$1 $2/g){
+ return proc_line($line);
+ }
+ }
+
+
+ ## no pattern applies
+ return $line;
+}
+
diff --git a/corpus/support/utf8-normalize-batch.pl b/corpus/support/utf8-normalize-batch.pl
new file mode 100755
index 000000000..e574f861a
--- /dev/null
+++ b/corpus/support/utf8-normalize-batch.pl
@@ -0,0 +1,28 @@
+#!/usr/bin/env perl
+
+use IPC::Open2;
+
+$|++;
+
+if (scalar(@ARGV) != 1) {
+ print STDERR "usage: $0 \"CMD\"\n";
+ exit(2);
+}
+
+$CMD = $ARGV[0];
+
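+# For each input line, spawn CMD once, feed it the line, and read back the
+# single normalized line it produces (one subprocess per line keeps the
+# stream line-buffered end to end).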
+while (<STDIN>) {
+ s/\r\n*/\n/g;
+ $PID = open2(*SOUT, *SIN, $CMD);
+ print SIN "$_\n";
+ close(SIN);
+    $_ = <SOUT>;
+ close(SOUT);
+ waitpid($PID, 0);
+ chomp;
+ s/[\x00-\x1F]+/ /g;
+ s/ +/ /g;
+ s/^ //;
+ s/ $//;
+ print "$_\n";
+}
diff --git a/corpus/support/utf8-normalize.sh b/corpus/support/utf8-normalize.sh
new file mode 100755
index 000000000..af9895ba0
--- /dev/null
+++ b/corpus/support/utf8-normalize.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+# This is the location on malbec; if you run this on another machine,
+# ICU may be installed in /usr or /usr/local.
+ICU_DIR=/usr0/tools/icu
+UCONV_BIN=$ICU_DIR/bin/uconv
+UCONV_LIB=$ICU_DIR/lib
+
+if [ -e $UCONV_BIN ] && [ -d $UCONV_LIB ]
+then
+ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$UCONV_LIB
+ if [ ! -x $UCONV_BIN ]
+ then
+ echo "$0: Cannot execute $UCONV_BIN! Please fix." 1>&2
+ exit
+ fi
+ CMD="$UCONV_BIN -f utf8 -t utf8 -x Any-NFKC --callback skip"
+else
+ if which uconv > /dev/null
+ then
+ CMD="uconv -f utf8 -t utf8 -x Any-NFKC --callback skip"
+ else
+ echo "$0: Cannot find ICU uconv (http://site.icu-project.org/) ... falling back to iconv. Quality may suffer." 1>&2
+ CMD="iconv -f utf8 -t utf8 -c"
+ fi
+fi
+
+if [[ $# == 1 && $1 == "--batchline" ]]; then
+ perl $(dirname $0)/utf8-normalize-batch.pl "$CMD"
+else
+ perl -e '$|++; while(<>){s/\r\n*/\n/g; print;}' \
+ |$CMD \
+ |/usr/bin/perl -w -e '
+ $|++;
+ while (<>) {
+ chomp;
+ s/[\x00-\x1F]+/ /g;
+ s/ +/ /g;
+ s/^ //;
+ s/ $//;
+ print "$_\n";
+ }'
+fi
diff --git a/corpus/tokenize-anything.sh b/corpus/tokenize-anything.sh
new file mode 100755
index 000000000..bca954d15
--- /dev/null
+++ b/corpus/tokenize-anything.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+
+ROOTDIR=`dirname $0`
+SUPPORT=$ROOTDIR/support
+
+if [[ $# == 1 && $1 == '-u' ]] ; then
+ NORMARGS="--batchline"
+ SEDFLAGS="-u"
+else
+ NORMARGS=""
+ SEDFLAGS=""
+fi
+
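+# Usage sketch (file names are hypothetical):
+#   ./tokenize-anything.sh < raw.txt > tokenized.txt
+#   ./tokenize-anything.sh -u < raw.txt > tokenized.txt   # unbuffered, line at a time
+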
+$SUPPORT/utf8-normalize.sh $NORMARGS |
+ $SUPPORT/quote-norm.pl |
+ $SUPPORT/tokenizer.pl |
+ $SUPPORT/fix-eos.pl |
+ sed $SEDFLAGS -e 's/ al - / al-/g' |
+ $SUPPORT/fix-contract.pl |
+ sed $SEDFLAGS -e 's/^ //' | sed $SEDFLAGS -e 's/ $//' |
+ perl -e '$|++; while(<>){s/(\d+)(\.+)$/$1 ./; s/(\d+)(\.+) \|\|\|/$1 . |||/; print;}'
+
diff --git a/corpus/untok.pl b/corpus/untok.pl
new file mode 100755
index 000000000..723e78cbe
--- /dev/null
+++ b/corpus/untok.pl
@@ -0,0 +1,63 @@
+#!/usr/bin/perl -w
+
+use IO::Handle;
+STDOUT->autoflush(1);
+
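+# Reverses tokenization, e.g. (illustrative):
+#   in:  He said : " It 's 28.45 % . "
+#   out: He said: "It's 28.45%."
+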
+while (<>) {
+ $output = "";
+ @tokens = split;
+ $lspace = 0;
+ $qflag = 0;
+ for ($i=0; $i<=$#tokens; $i++) {
+ $token = $tokens[$i];
+ $prev = $next = "";
+ $rspace = 1;
+ if ($i > 0) {
+ $prev = $tokens[$i-1];
+ }
+ if ($i < $#tokens) {
+ $next = $tokens[$i+1];
+ }
+
+ # possessives join to the left
+ if ($token =~ /^(n't|'(s|m|re|ll|ve|d))$/) {
+ $lspace = 0;
+ } elsif ($token eq "'" && $prev =~ /s$/) {
+ $lspace = 0;
+
+ # hyphen only when a hyphen, not a dash
+ } elsif ($token eq "-" && $prev =~ /[A-Za-z0-9]$/ && $next =~ /^[A-Za-z0-9]/) {
+ $lspace = $rspace = 0;
+
+ # quote marks alternate
+ } elsif ($token eq '"') {
+ if ($qflag) {
+ $lspace = 0;
+ } else {
+ $rspace = 0;
+ }
+ $qflag = !$qflag;
+
+ # period joins on both sides when a decimal point
+ } elsif ($token eq "." && $prev =~ /\d$/ && $next =~ /\d$/) {
+ $lspace = $rspace = 0;
+
+ # Left joiners
+ } elsif ($token =~ /^[.,:;?!%)\]]$/) {
+ $lspace = 0;
+ # Right joiners
+ } elsif ($token =~ /^[$(\[]$/) {
+ $rspace = 0;
+ # Joiners on both sides
+ } elsif ($token =~ /^[\/]$/) {
+ $lspace = $rspace = 0;
+ }
+
+ if ($lspace) {
+ $output .= " ";
+ }
+ $output .= $token;
+ $lspace = $rspace;
+ }
+ print "$output\n";
+}
diff --git a/corpus/utf8-normalize.sh b/corpus/utf8-normalize.sh
new file mode 100755
index 000000000..dcf8bc59d
--- /dev/null
+++ b/corpus/utf8-normalize.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+# This script uses ICU uconv (http://site.icu-project.org/), if it's available,
+# to normalize UTF8 text into a standard form. For information about this
+# process, refer to http://en.wikipedia.org/wiki/Unicode_equivalence#Normalization
+# Control characters in the range 0x00-0x1F are replaced with spaces.
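+# e.g., NFKC maps the ligature U+FB01 ("fi") to the two letters "fi" and
+# fullwidth digits like U+FF11 to their ASCII equivalents.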
+
+if which uconv > /dev/null
+then
+ CMD="uconv -f utf8 -t utf8 -x Any-NFKC --callback skip"
+else
+ echo "Cannot find ICU uconv (http://site.icu-project.org/) ... falling back to iconv. Normalization NOT taking place." 1>&2
+ CMD="iconv -f utf8 -t utf8 -c"
+fi
+
+$CMD | /usr/bin/perl -w -e '
+ while (<>) {
+ chomp;
+ s/[\x00-\x1F]+/ /g;
+ s/ +/ /g;
+ s/^ //;
+ s/ $//;
+ print "$_\n";
+ }'
+
diff --git a/corpus/xml-tok.py b/corpus/xml-tok.py
new file mode 100755
index 000000000..4357ced63
--- /dev/null
+++ b/corpus/xml-tok.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+
+import os
+import re
+import subprocess
+import sys
+
+# Tokenize XML files with tokenize-anything.sh
+# in: The earnings on its 10-year bonds are 28.45%.
+# out: The earnings on its 10 - year bonds are 28.45 % .
+
+def escape(s):
+    return s.replace('&', '&amp;').replace('>', '&gt;').replace('<', '&lt;').replace('"', '&quot;').replace('\'', '&#39;')
+
+def unescape(s):
+    return s.replace('&gt;', '>').replace('&lt;', '<').replace('&quot;', '"').replace('&#39;', '\'').replace('&amp;', '&')
+
+def main():
+ tok = subprocess.Popen([os.path.join(os.path.dirname(__file__), 'tokenize-anything.sh'), '-u'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+ while True:
+ line = sys.stdin.readline()
+ if not line:
+ break
+ line = line.strip()
+ pieces = []
+ eol = len(line)
+ pos = 0
+ while pos < eol:
+ next = line.find('<', pos)
+ if next == -1:
+ next = eol
+ tok.stdin.write('{}\n'.format(unescape(line[pos:next])))
+ pieces.append(escape(tok.stdout.readline().strip()))
+ if next == eol:
+ break
+ pos = line.find('>', next + 1)
+ if pos == -1:
+ pos = eol
+ else:
+ pos += 1
+ pieces.append(line[next:pos])
+ sys.stdout.write('{}\n'.format(' '.join(pieces).strip()))
+ tok.stdin.close()
+ tok.wait()
+
+if __name__ == '__main__':
+ main()
diff --git a/decoder/JSON_parser.c b/decoder/JSON_parser.c
new file mode 100644
index 000000000..5e392bc6c
--- /dev/null
+++ b/decoder/JSON_parser.c
@@ -0,0 +1,1012 @@
+/* JSON_parser.c */
+
+/* 2007-08-24 */
+
+/*
+Copyright (c) 2005 JSON.org
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+The Software shall be used for Good, not Evil.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+/*
+ Callbacks, comments, Unicode handling by Jean Gressmann (jean@0x42.de), 2007-2009.
+
+ For the added features the license above applies also.
+
+ Changelog:
+ 2009-05-17
+ Incorporated benrudiak@googlemail.com fix for UTF16 decoding.
+
+ 2009-05-14
+ Fixed float parsing bug related to a locale being set that didn't
+ use '.' as decimal point character (charles@transmissionbt.com).
+
+ 2008-10-14
+        Renamed states.IN to states.IT to avoid a name clash with the IN macro
+ defined in windef.h (alexey.pelykh@gmail.com)
+
+ 2008-07-19
+ Removed some duplicate code & debugging variable (charles@transmissionbt.com)
+
+ 2008-05-28
+        Made JSON_value structure ANSI C compliant. This bug was reported by
+ trisk@acm.jhu.edu
+
+ 2008-05-20
+ Fixed bug reported by charles@transmissionbt.com where the switching
+ from static to dynamic parse buffer did not copy the static parse
+ buffer's content.
+*/
+
+
+
+#include <assert.h>
+#include <ctype.h>
+#include <float.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <locale.h>
+
+#include "JSON_parser.h"
+
+#ifdef _MSC_VER
+# if _MSC_VER >= 1400 /* Visual Studio 2005 and up */
+# pragma warning(disable:4996) // unsecure sscanf
+# endif
+#endif
+
+
+#define true 1
+#define false 0
+#define __ -1 /* the universal error code */
+
+/* values chosen so that the object size is approx equal to one page (4K) */
+#ifndef JSON_PARSER_STACK_SIZE
+# define JSON_PARSER_STACK_SIZE 128
+#endif
+
+#ifndef JSON_PARSER_PARSE_BUFFER_SIZE
+# define JSON_PARSER_PARSE_BUFFER_SIZE 3500
+#endif
+
+typedef unsigned short UTF16;
+
+struct JSON_parser_struct {
+ JSON_parser_callback callback;
+ void* ctx;
+ signed char state, before_comment_state, type, escaped, comment, allow_comments, handle_floats_manually;
+ UTF16 utf16_high_surrogate;
+ long depth;
+ long top;
+ signed char* stack;
+ long stack_capacity;
+ char decimal_point;
+ char* parse_buffer;
+ size_t parse_buffer_capacity;
+ size_t parse_buffer_count;
+ size_t comment_begin_offset;
+ signed char static_stack[JSON_PARSER_STACK_SIZE];
+ char static_parse_buffer[JSON_PARSER_PARSE_BUFFER_SIZE];
+};
+
+#define COUNTOF(x) (sizeof(x)/sizeof(x[0]))
+
+/*
+ Characters are mapped into these character classes. This allows for
+ a significant reduction in the size of the state transition table.
+*/
+
+
+
+enum classes {
+ C_SPACE, /* space */
+ C_WHITE, /* other whitespace */
+ C_LCURB, /* { */
+ C_RCURB, /* } */
+ C_LSQRB, /* [ */
+ C_RSQRB, /* ] */
+ C_COLON, /* : */
+ C_COMMA, /* , */
+ C_QUOTE, /* " */
+ C_BACKS, /* \ */
+ C_SLASH, /* / */
+ C_PLUS, /* + */
+ C_MINUS, /* - */
+ C_POINT, /* . */
+ C_ZERO , /* 0 */
+ C_DIGIT, /* 123456789 */
+ C_LOW_A, /* a */
+ C_LOW_B, /* b */
+ C_LOW_C, /* c */
+ C_LOW_D, /* d */
+ C_LOW_E, /* e */
+ C_LOW_F, /* f */
+ C_LOW_L, /* l */
+ C_LOW_N, /* n */
+ C_LOW_R, /* r */
+ C_LOW_S, /* s */
+ C_LOW_T, /* t */
+ C_LOW_U, /* u */
+ C_ABCDF, /* ABCDF */
+ C_E, /* E */
+ C_ETC, /* everything else */
+ C_STAR, /* * */
+ NR_CLASSES
+};
+
+static int ascii_class[128] = {
+/*
+ This array maps the 128 ASCII characters into character classes.
+ The remaining Unicode characters should be mapped to C_ETC.
+ Non-whitespace control characters are errors.
+*/
+ __, __, __, __, __, __, __, __,
+ __, C_WHITE, C_WHITE, __, __, C_WHITE, __, __,
+ __, __, __, __, __, __, __, __,
+ __, __, __, __, __, __, __, __,
+
+ C_SPACE, C_ETC, C_QUOTE, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC,
+ C_ETC, C_ETC, C_STAR, C_PLUS, C_COMMA, C_MINUS, C_POINT, C_SLASH,
+ C_ZERO, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT,
+ C_DIGIT, C_DIGIT, C_COLON, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC,
+
+ C_ETC, C_ABCDF, C_ABCDF, C_ABCDF, C_ABCDF, C_E, C_ABCDF, C_ETC,
+ C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC,
+ C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC,
+ C_ETC, C_ETC, C_ETC, C_LSQRB, C_BACKS, C_RSQRB, C_ETC, C_ETC,
+
+ C_ETC, C_LOW_A, C_LOW_B, C_LOW_C, C_LOW_D, C_LOW_E, C_LOW_F, C_ETC,
+ C_ETC, C_ETC, C_ETC, C_ETC, C_LOW_L, C_ETC, C_LOW_N, C_ETC,
+ C_ETC, C_ETC, C_LOW_R, C_LOW_S, C_LOW_T, C_LOW_U, C_ETC, C_ETC,
+ C_ETC, C_ETC, C_ETC, C_LCURB, C_ETC, C_RCURB, C_ETC, C_ETC
+};
+
+
+/*
+ The state codes.
+*/
+enum states {
+ GO, /* start */
+ OK, /* ok */
+ OB, /* object */
+ KE, /* key */
+ CO, /* colon */
+ VA, /* value */
+ AR, /* array */
+ ST, /* string */
+ ES, /* escape */
+ U1, /* u1 */
+ U2, /* u2 */
+ U3, /* u3 */
+ U4, /* u4 */
+ MI, /* minus */
+ ZE, /* zero */
+ IT, /* integer */
+ FR, /* fraction */
+ E1, /* e */
+ E2, /* ex */
+ E3, /* exp */
+ T1, /* tr */
+ T2, /* tru */
+ T3, /* true */
+ F1, /* fa */
+ F2, /* fal */
+ F3, /* fals */
+ F4, /* false */
+ N1, /* nu */
+ N2, /* nul */
+ N3, /* null */
+ C1, /* / */
+ C2, /* / * */
+ C3, /* * */
+ FX, /* *.* *eE* */
+ D1, /* second UTF-16 character decoding started by \ */
+ D2, /* second UTF-16 character proceeded by u */
+ NR_STATES
+};
+
+enum actions
+{
+ CB = -10, /* comment begin */
+ CE = -11, /* comment end */
+ FA = -12, /* false */
+    TR = -13, /* true */
+ NU = -14, /* null */
+ DE = -15, /* double detected by exponent e E */
+ DF = -16, /* double detected by fraction . */
+ SB = -17, /* string begin */
+ MX = -18, /* integer detected by minus */
+ ZX = -19, /* integer detected by zero */
+ IX = -20, /* integer detected by 1-9 */
+ EX = -21, /* next char is escaped */
+ UC = -22 /* Unicode character read */
+};
+
+
+static int state_transition_table[NR_STATES][NR_CLASSES] = {
+/*
+ The state transition table takes the current state and the current symbol,
+ and returns either a new state or an action. An action is represented as a
+ negative number. A JSON text is accepted if at the end of the text the
+ state is OK and if the mode is MODE_DONE.
+
+ white 1-9 ABCDF etc
+ space | { } [ ] : , " \ / + - . 0 | a b c d e f l n r s t u | E | * */
+/*start GO*/ {GO,GO,-6,__,-5,__,__,__,__,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__},
+/*ok OK*/ {OK,OK,__,-8,__,-7,__,-3,__,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__},
+/*object OB*/ {OB,OB,__,-9,__,__,__,__,SB,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__},
+/*key KE*/ {KE,KE,__,__,__,__,__,__,SB,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__},
+/*colon CO*/ {CO,CO,__,__,__,__,-2,__,__,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__},
+/*value VA*/ {VA,VA,-6,__,-5,__,__,__,SB,__,CB,__,MX,__,ZX,IX,__,__,__,__,__,FA,__,NU,__,__,TR,__,__,__,__,__},
+/*array AR*/ {AR,AR,-6,__,-5,-7,__,__,SB,__,CB,__,MX,__,ZX,IX,__,__,__,__,__,FA,__,NU,__,__,TR,__,__,__,__,__},
+/*string ST*/ {ST,__,ST,ST,ST,ST,ST,ST,-4,EX,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST},
+/*escape ES*/ {__,__,__,__,__,__,__,__,ST,ST,ST,__,__,__,__,__,__,ST,__,__,__,ST,__,ST,ST,__,ST,U1,__,__,__,__},
+/*u1 U1*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,U2,U2,U2,U2,U2,U2,U2,U2,__,__,__,__,__,__,U2,U2,__,__},
+/*u2 U2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,U3,U3,U3,U3,U3,U3,U3,U3,__,__,__,__,__,__,U3,U3,__,__},
+/*u3 U3*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,U4,U4,U4,U4,U4,U4,U4,U4,__,__,__,__,__,__,U4,U4,__,__},
+/*u4 U4*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,UC,UC,UC,UC,UC,UC,UC,UC,__,__,__,__,__,__,UC,UC,__,__},
+/*minus MI*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,ZE,IT,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__},
+/*zero ZE*/ {OK,OK,__,-8,__,-7,__,-3,__,__,CB,__,__,DF,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__},
+/*int IT*/ {OK,OK,__,-8,__,-7,__,-3,__,__,CB,__,__,DF,IT,IT,__,__,__,__,DE,__,__,__,__,__,__,__,__,DE,__,__},
+/*frac FR*/ {OK,OK,__,-8,__,-7,__,-3,__,__,CB,__,__,__,FR,FR,__,__,__,__,E1,__,__,__,__,__,__,__,__,E1,__,__},
+/*e E1*/ {__,__,__,__,__,__,__,__,__,__,__,E2,E2,__,E3,E3,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__},
+/*ex E2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,E3,E3,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__},
+/*exp E3*/ {OK,OK,__,-8,__,-7,__,-3,__,__,__,__,__,__,E3,E3,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__},
+/*tr T1*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,T2,__,__,__,__,__,__,__},
+/*tru T2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,T3,__,__,__,__},
+/*true T3*/ {__,__,__,__,__,__,__,__,__,__,CB,__,__,__,__,__,__,__,__,__,OK,__,__,__,__,__,__,__,__,__,__,__},
+/*fa F1*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,F2,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__},
+/*fal F2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,F3,__,__,__,__,__,__,__,__,__},
+/*fals F3*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,F4,__,__,__,__,__,__},
+/*false F4*/ {__,__,__,__,__,__,__,__,__,__,CB,__,__,__,__,__,__,__,__,__,OK,__,__,__,__,__,__,__,__,__,__,__},
+/*nu N1*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,N2,__,__,__,__},
+/*nul N2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,N3,__,__,__,__,__,__,__,__,__},
+/*null N3*/ {__,__,__,__,__,__,__,__,__,__,CB,__,__,__,__,__,__,__,__,__,__,__,OK,__,__,__,__,__,__,__,__,__},
+/*/ C1*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,C2},
+/*/* C2*/ {C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C3},
+/** C3*/ {C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,CE,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C3},
+/*_. FX*/ {OK,OK,__,-8,__,-7,__,-3,__,__,__,__,__,__,FR,FR,__,__,__,__,E1,__,__,__,__,__,__,__,__,E1,__,__},
+/*\ D1*/ {__,__,__,__,__,__,__,__,__,D2,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__},
+/*\ D2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,U1,__,__,__,__},
+};
+
+
+/*
+ These modes can be pushed on the stack.
+*/
+enum modes {
+ MODE_ARRAY = 1,
+ MODE_DONE = 2,
+ MODE_KEY = 3,
+ MODE_OBJECT = 4
+};
+
+static int
+push(JSON_parser jc, int mode)
+{
+/*
+ Push a mode onto the stack. Return false if there is overflow.
+*/
+ jc->top += 1;
+ if (jc->depth < 0) {
+ if (jc->top >= jc->stack_capacity) {
+ size_t bytes_to_allocate;
+ jc->stack_capacity *= 2;
+ bytes_to_allocate = jc->stack_capacity * sizeof(jc->static_stack[0]);
+ if (jc->stack == &jc->static_stack[0]) {
+ jc->stack = (signed char*)malloc(bytes_to_allocate);
+ memcpy(jc->stack, jc->static_stack, sizeof(jc->static_stack));
+ } else {
+ jc->stack = (signed char*)realloc(jc->stack, bytes_to_allocate);
+ }
+ }
+ } else {
+ if (jc->top >= jc->depth) {
+ return false;
+ }
+ }
+
+ jc->stack[jc->top] = mode;
+ return true;
+}
+
+
+static int
+pop(JSON_parser jc, int mode)
+{
+/*
+ Pop the stack, assuring that the current mode matches the expectation.
+ Return false if there is underflow or if the modes mismatch.
+*/
+ if (jc->top < 0 || jc->stack[jc->top] != mode) {
+ return false;
+ }
+ jc->top -= 1;
+ return true;
+}
+
+
+#define parse_buffer_clear(jc) \
+ do {\
+ jc->parse_buffer_count = 0;\
+ jc->parse_buffer[0] = 0;\
+ } while (0)
+
+#define parse_buffer_pop_back_char(jc)\
+ do {\
+ assert(jc->parse_buffer_count >= 1);\
+ --jc->parse_buffer_count;\
+ jc->parse_buffer[jc->parse_buffer_count] = 0;\
+ } while (0)
+
+void delete_JSON_parser(JSON_parser jc)
+{
+ if (jc) {
+ if (jc->stack != &jc->static_stack[0]) {
+ free((void*)jc->stack);
+ }
+ if (jc->parse_buffer != &jc->static_parse_buffer[0]) {
+ free((void*)jc->parse_buffer);
+ }
+ free((void*)jc);
+ }
+}
+
+
+JSON_parser
+new_JSON_parser(JSON_config* config)
+{
+/*
+ new_JSON_parser starts the checking process by constructing a JSON_parser
+ object. It takes a depth parameter that restricts the level of maximum
+ nesting.
+
+ To continue the process, call JSON_parser_char for each character in the
+ JSON text, and then call JSON_parser_done to obtain the final result.
+ These functions are fully reentrant.
+*/
+
+ int depth = 0;
+ JSON_config default_config;
+
+ JSON_parser jc = (JSON_parser)malloc(sizeof(struct JSON_parser_struct));
+
+ memset(jc, 0, sizeof(*jc));
+
+
+ /* initialize configuration */
+ init_JSON_config(&default_config);
+
+ /* set to default configuration if none was provided */
+ if (config == NULL) {
+ config = &default_config;
+ }
+
+ depth = config->depth;
+
+ /* We need to be able to push at least one object */
+ if (depth == 0) {
+ depth = 1;
+ }
+
+ jc->state = GO;
+ jc->top = -1;
+
+ /* Do we want non-bound stack? */
+ if (depth > 0) {
+ jc->stack_capacity = depth;
+ jc->depth = depth;
+ if (depth <= (int)COUNTOF(jc->static_stack)) {
+ jc->stack = &jc->static_stack[0];
+ } else {
+ jc->stack = (signed char*)malloc(jc->stack_capacity * sizeof(jc->static_stack[0]));
+ }
+ } else {
+ jc->stack_capacity = COUNTOF(jc->static_stack);
+ jc->depth = -1;
+ jc->stack = &jc->static_stack[0];
+ }
+
+ /* set parser to start */
+ push(jc, MODE_DONE);
+
+ /* set up the parse buffer */
+ jc->parse_buffer = &jc->static_parse_buffer[0];
+ jc->parse_buffer_capacity = COUNTOF(jc->static_parse_buffer);
+ parse_buffer_clear(jc);
+
+ /* set up callback, comment & float handling */
+ jc->callback = config->callback;
+ jc->ctx = config->callback_ctx;
+ jc->allow_comments = config->allow_comments != 0;
+ jc->handle_floats_manually = config->handle_floats_manually != 0;
+
+ /* set up decimal point */
+ jc->decimal_point = *localeconv()->decimal_point;
+
+ return jc;
+}
+
+static void grow_parse_buffer(JSON_parser jc)
+{
+ size_t bytes_to_allocate;
+ jc->parse_buffer_capacity *= 2;
+ bytes_to_allocate = jc->parse_buffer_capacity * sizeof(jc->parse_buffer[0]);
+ if (jc->parse_buffer == &jc->static_parse_buffer[0]) {
+ jc->parse_buffer = (char*)malloc(bytes_to_allocate);
+ memcpy(jc->parse_buffer, jc->static_parse_buffer, jc->parse_buffer_count);
+ } else {
+ jc->parse_buffer = (char*)realloc(jc->parse_buffer, bytes_to_allocate);
+ }
+}
+
+#define parse_buffer_push_back_char(jc, c)\
+ do {\
+ if (jc->parse_buffer_count + 1 >= jc->parse_buffer_capacity) grow_parse_buffer(jc);\
+ jc->parse_buffer[jc->parse_buffer_count++] = c;\
+ jc->parse_buffer[jc->parse_buffer_count] = 0;\
+ } while (0)
+
+#define assert_is_non_container_type(jc) \
+ assert( \
+ jc->type == JSON_T_NULL || \
+ jc->type == JSON_T_FALSE || \
+ jc->type == JSON_T_TRUE || \
+ jc->type == JSON_T_FLOAT || \
+ jc->type == JSON_T_INTEGER || \
+ jc->type == JSON_T_STRING)
+
+
+static int parse_parse_buffer(JSON_parser jc)
+{
+ if (jc->callback) {
+ JSON_value value, *arg = NULL;
+
+ if (jc->type != JSON_T_NONE) {
+ assert_is_non_container_type(jc);
+
+ switch(jc->type) {
+ case JSON_T_FLOAT:
+ arg = &value;
+ if (jc->handle_floats_manually) {
+ value.vu.str.value = jc->parse_buffer;
+ value.vu.str.length = jc->parse_buffer_count;
+ } else {
+ /*sscanf(jc->parse_buffer, "%Lf", &value.vu.float_value);*/
+
+ /* not checking with end pointer b/c there may be trailing ws */
+ value.vu.float_value = strtod(jc->parse_buffer, NULL);
+ }
+ break;
+ case JSON_T_INTEGER:
+ arg = &value;
+ sscanf(jc->parse_buffer, JSON_PARSER_INTEGER_SSCANF_TOKEN, &value.vu.integer_value);
+ break;
+ case JSON_T_STRING:
+ arg = &value;
+ value.vu.str.value = jc->parse_buffer;
+ value.vu.str.length = jc->parse_buffer_count;
+ break;
+ }
+
+ if (!(*jc->callback)(jc->ctx, jc->type, arg)) {
+ return false;
+ }
+ }
+ }
+
+ parse_buffer_clear(jc);
+
+ return true;
+}
+
+#define IS_HIGH_SURROGATE(uc) (((uc) & 0xFC00) == 0xD800)
+#define IS_LOW_SURROGATE(uc) (((uc) & 0xFC00) == 0xDC00)
+#define DECODE_SURROGATE_PAIR(hi,lo) ((((hi) & 0x3FF) << 10) + ((lo) & 0x3FF) + 0x10000)
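+/* e.g. the escape pair \uD83D \uDE00 decodes to U+1F600:
+   ((0xD83D & 0x3FF) << 10) + (0xDE00 & 0x3FF) + 0x10000 == 0x1F600 */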
+static unsigned char utf8_lead_bits[4] = { 0x00, 0xC0, 0xE0, 0xF0 };
+
+static int decode_unicode_char(JSON_parser jc)
+{
+ int i;
+ unsigned uc = 0;
+ char* p;
+ int trail_bytes;
+
+ assert(jc->parse_buffer_count >= 6);
+
+ p = &jc->parse_buffer[jc->parse_buffer_count - 4];
+
+ for (i = 12; i >= 0; i -= 4, ++p) {
+ unsigned x = *p;
+
+ if (x >= 'a') {
+ x -= ('a' - 10);
+ } else if (x >= 'A') {
+ x -= ('A' - 10);
+ } else {
+ x &= ~0x30u;
+ }
+
+ assert(x < 16);
+
+ uc |= x << i;
+ }
+
+ /* clear UTF-16 char from buffer */
+ jc->parse_buffer_count -= 6;
+ jc->parse_buffer[jc->parse_buffer_count] = 0;
+
+ /* attempt decoding ... */
+ if (jc->utf16_high_surrogate) {
+ if (IS_LOW_SURROGATE(uc)) {
+ uc = DECODE_SURROGATE_PAIR(jc->utf16_high_surrogate, uc);
+ trail_bytes = 3;
+ jc->utf16_high_surrogate = 0;
+ } else {
+ /* high surrogate without a following low surrogate */
+ return false;
+ }
+ } else {
+ if (uc < 0x80) {
+ trail_bytes = 0;
+ } else if (uc < 0x800) {
+ trail_bytes = 1;
+ } else if (IS_HIGH_SURROGATE(uc)) {
+ /* save the high surrogate and wait for the low surrogate */
+ jc->utf16_high_surrogate = uc;
+ return true;
+ } else if (IS_LOW_SURROGATE(uc)) {
+ /* low surrogate without a preceding high surrogate */
+ return false;
+ } else {
+ trail_bytes = 2;
+ }
+ }
+
+ jc->parse_buffer[jc->parse_buffer_count++] = (char) ((uc >> (trail_bytes * 6)) | utf8_lead_bits[trail_bytes]);
+
+ for (i = trail_bytes * 6 - 6; i >= 0; i -= 6) {
+ jc->parse_buffer[jc->parse_buffer_count++] = (char) (((uc >> i) & 0x3F) | 0x80);
+ }
+
+ jc->parse_buffer[jc->parse_buffer_count] = 0;
+
+ return true;
+}
+
+static int add_escaped_char_to_parse_buffer(JSON_parser jc, int next_char)
+{
+ jc->escaped = 0;
+ /* remove the backslash */
+ parse_buffer_pop_back_char(jc);
+ switch(next_char) {
+ case 'b':
+ parse_buffer_push_back_char(jc, '\b');
+ break;
+ case 'f':
+ parse_buffer_push_back_char(jc, '\f');
+ break;
+ case 'n':
+ parse_buffer_push_back_char(jc, '\n');
+ break;
+ case 'r':
+ parse_buffer_push_back_char(jc, '\r');
+ break;
+ case 't':
+ parse_buffer_push_back_char(jc, '\t');
+ break;
+ case '"':
+ parse_buffer_push_back_char(jc, '"');
+ break;
+ case '\\':
+ parse_buffer_push_back_char(jc, '\\');
+ break;
+ case '/':
+ parse_buffer_push_back_char(jc, '/');
+ break;
+ case 'u':
+ parse_buffer_push_back_char(jc, '\\');
+ parse_buffer_push_back_char(jc, 'u');
+ break;
+ default:
+ return false;
+ }
+
+ return true;
+}
+
+#define add_char_to_parse_buffer(jc, next_char, next_class) \
+ do { \
+ if (jc->escaped) { \
+ if (!add_escaped_char_to_parse_buffer(jc, next_char)) \
+ return false; \
+ } else if (!jc->comment) { \
+ if ((jc->type != JSON_T_NONE) | !((next_class == C_SPACE) | (next_class == C_WHITE)) /* non-white-space */) { \
+ parse_buffer_push_back_char(jc, (char)next_char); \
+ } \
+ } \
+ } while (0)
+
+
+#define assert_type_isnt_string_null_or_bool(jc) \
+ assert(jc->type != JSON_T_FALSE); \
+ assert(jc->type != JSON_T_TRUE); \
+ assert(jc->type != JSON_T_NULL); \
+ assert(jc->type != JSON_T_STRING)
+
+
+int
+JSON_parser_char(JSON_parser jc, int next_char)
+{
+/*
+ After calling new_JSON_parser, call this function for each character (or
+ partial character) in your JSON text. It can accept UTF-8, UTF-16, or
+ UTF-32. It returns true if things are looking ok so far. If it rejects the
+ text, it returns false.
+*/
+ int next_class, next_state;
+
+/*
+ Determine the character's class.
+*/
+ if (next_char < 0) {
+ return false;
+ }
+ if (next_char >= 128) {
+ next_class = C_ETC;
+ } else {
+ next_class = ascii_class[next_char];
+ if (next_class <= __) {
+ return false;
+ }
+ }
+
+ add_char_to_parse_buffer(jc, next_char, next_class);
+
+/*
+ Get the next state from the state transition table.
+*/
+ next_state = state_transition_table[jc->state][next_class];
+ if (next_state >= 0) {
+/*
+ Change the state.
+*/
+ jc->state = next_state;
+ } else {
+/*
+ Or perform one of the actions.
+*/
+ switch (next_state) {
+/* Unicode character */
+ case UC:
+ if(!decode_unicode_char(jc)) {
+ return false;
+ }
+ /* check if we need to read a second UTF-16 char */
+ if (jc->utf16_high_surrogate) {
+ jc->state = D1;
+ } else {
+ jc->state = ST;
+ }
+ break;
+/* escaped char */
+ case EX:
+ jc->escaped = 1;
+ jc->state = ES;
+ break;
+/* integer detected by minus */
+ case MX:
+ jc->type = JSON_T_INTEGER;
+ jc->state = MI;
+ break;
+/* integer detected by zero */
+ case ZX:
+ jc->type = JSON_T_INTEGER;
+ jc->state = ZE;
+ break;
+/* integer detected by 1-9 */
+ case IX:
+ jc->type = JSON_T_INTEGER;
+ jc->state = IT;
+ break;
+
+/* floating point number detected by exponent*/
+ case DE:
+ assert_type_isnt_string_null_or_bool(jc);
+ jc->type = JSON_T_FLOAT;
+ jc->state = E1;
+ break;
+
+/* floating point number detected by fraction */
+ case DF:
+ assert_type_isnt_string_null_or_bool(jc);
+ if (!jc->handle_floats_manually) {
+/*
+ Some versions of strtod (which underlies sscanf) don't support converting
+  C-locale formatted floating point values.
+*/
+ assert(jc->parse_buffer[jc->parse_buffer_count-1] == '.');
+ jc->parse_buffer[jc->parse_buffer_count-1] = jc->decimal_point;
+ }
+ jc->type = JSON_T_FLOAT;
+ jc->state = FX;
+ break;
+/* string begin " */
+ case SB:
+ parse_buffer_clear(jc);
+ assert(jc->type == JSON_T_NONE);
+ jc->type = JSON_T_STRING;
+ jc->state = ST;
+ break;
+
+/* n */
+ case NU:
+ assert(jc->type == JSON_T_NONE);
+ jc->type = JSON_T_NULL;
+ jc->state = N1;
+ break;
+/* f */
+ case FA:
+ assert(jc->type == JSON_T_NONE);
+ jc->type = JSON_T_FALSE;
+ jc->state = F1;
+ break;
+/* t */
+ case TR:
+ assert(jc->type == JSON_T_NONE);
+ jc->type = JSON_T_TRUE;
+ jc->state = T1;
+ break;
+
+/* closing comment */
+ case CE:
+ jc->comment = 0;
+ assert(jc->parse_buffer_count == 0);
+ assert(jc->type == JSON_T_NONE);
+ jc->state = jc->before_comment_state;
+ break;
+
+/* opening comment */
+ case CB:
+ if (!jc->allow_comments) {
+ return false;
+ }
+ parse_buffer_pop_back_char(jc);
+ if (!parse_parse_buffer(jc)) {
+ return false;
+ }
+ assert(jc->parse_buffer_count == 0);
+ assert(jc->type != JSON_T_STRING);
+ switch (jc->stack[jc->top]) {
+ case MODE_ARRAY:
+ case MODE_OBJECT:
+ switch(jc->state) {
+ case VA:
+ case AR:
+ jc->before_comment_state = jc->state;
+ break;
+ default:
+ jc->before_comment_state = OK;
+ break;
+ }
+ break;
+ default:
+ jc->before_comment_state = jc->state;
+ break;
+ }
+ jc->type = JSON_T_NONE;
+ jc->state = C1;
+ jc->comment = 1;
+ break;
+/* empty } */
+ case -9:
+ parse_buffer_clear(jc);
+ if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_OBJECT_END, NULL)) {
+ return false;
+ }
+ if (!pop(jc, MODE_KEY)) {
+ return false;
+ }
+ jc->state = OK;
+ break;
+
+/* } */ case -8:
+ parse_buffer_pop_back_char(jc);
+ if (!parse_parse_buffer(jc)) {
+ return false;
+ }
+ if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_OBJECT_END, NULL)) {
+ return false;
+ }
+ if (!pop(jc, MODE_OBJECT)) {
+ return false;
+ }
+ jc->type = JSON_T_NONE;
+ jc->state = OK;
+ break;
+
+/* ] */ case -7:
+ parse_buffer_pop_back_char(jc);
+ if (!parse_parse_buffer(jc)) {
+ return false;
+ }
+ if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_ARRAY_END, NULL)) {
+ return false;
+ }
+ if (!pop(jc, MODE_ARRAY)) {
+ return false;
+ }
+
+ jc->type = JSON_T_NONE;
+ jc->state = OK;
+ break;
+
+/* { */ case -6:
+ parse_buffer_pop_back_char(jc);
+ if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_OBJECT_BEGIN, NULL)) {
+ return false;
+ }
+ if (!push(jc, MODE_KEY)) {
+ return false;
+ }
+ assert(jc->type == JSON_T_NONE);
+ jc->state = OB;
+ break;
+
+/* [ */ case -5:
+ parse_buffer_pop_back_char(jc);
+ if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_ARRAY_BEGIN, NULL)) {
+ return false;
+ }
+ if (!push(jc, MODE_ARRAY)) {
+ return false;
+ }
+ assert(jc->type == JSON_T_NONE);
+ jc->state = AR;
+ break;
+
+/* string end " */ case -4:
+ parse_buffer_pop_back_char(jc);
+ switch (jc->stack[jc->top]) {
+ case MODE_KEY:
+ assert(jc->type == JSON_T_STRING);
+ jc->type = JSON_T_NONE;
+ jc->state = CO;
+
+ if (jc->callback) {
+ JSON_value value;
+ value.vu.str.value = jc->parse_buffer;
+ value.vu.str.length = jc->parse_buffer_count;
+ if (!(*jc->callback)(jc->ctx, JSON_T_KEY, &value)) {
+ return false;
+ }
+ }
+ parse_buffer_clear(jc);
+ break;
+ case MODE_ARRAY:
+ case MODE_OBJECT:
+ assert(jc->type == JSON_T_STRING);
+ if (!parse_parse_buffer(jc)) {
+ return false;
+ }
+ jc->type = JSON_T_NONE;
+ jc->state = OK;
+ break;
+ default:
+ return false;
+ }
+ break;
+
+/* , */ case -3:
+ parse_buffer_pop_back_char(jc);
+ if (!parse_parse_buffer(jc)) {
+ return false;
+ }
+ switch (jc->stack[jc->top]) {
+ case MODE_OBJECT:
+/*
+ A comma causes a flip from object mode to key mode.
+*/
+ if (!pop(jc, MODE_OBJECT) || !push(jc, MODE_KEY)) {
+ return false;
+ }
+ assert(jc->type != JSON_T_STRING);
+ jc->type = JSON_T_NONE;
+ jc->state = KE;
+ break;
+ case MODE_ARRAY:
+ assert(jc->type != JSON_T_STRING);
+ jc->type = JSON_T_NONE;
+ jc->state = VA;
+ break;
+ default:
+ return false;
+ }
+ break;
+
+/* : */ case -2:
+/*
+ A colon causes a flip from key mode to object mode.
+*/
+ parse_buffer_pop_back_char(jc);
+ if (!pop(jc, MODE_KEY) || !push(jc, MODE_OBJECT)) {
+ return false;
+ }
+ assert(jc->type == JSON_T_NONE);
+ jc->state = VA;
+ break;
+/*
+ Bad action.
+*/
+ default:
+ return false;
+ }
+ }
+ return true;
+}
+
+
+int
+JSON_parser_done(JSON_parser jc)
+{
+ const int result = jc->state == OK && pop(jc, MODE_DONE);
+
+ return result;
+}
+
+
+int JSON_parser_is_legal_white_space_string(const char* s)
+{
+ int c, char_class;
+
+ if (s == NULL) {
+ return false;
+ }
+
+ for (; *s; ++s) {
+ c = *s;
+
+ if (c < 0 || c >= 128) {
+ return false;
+ }
+
+ char_class = ascii_class[c];
+
+ if (char_class != C_SPACE && char_class != C_WHITE) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+
+
+void init_JSON_config(JSON_config* config)
+{
+ if (config) {
+ memset(config, 0, sizeof(*config));
+
+ config->depth = JSON_PARSER_STACK_SIZE - 1;
+ }
+}
diff --git a/decoder/JSON_parser.h b/decoder/JSON_parser.h
new file mode 100644
index 000000000..de9800721
--- /dev/null
+++ b/decoder/JSON_parser.h
@@ -0,0 +1,152 @@
+#ifndef JSON_PARSER_H
+#define JSON_PARSER_H
+
+/* JSON_parser.h */
+
+
+#include <stddef.h>
+
+/* Windows DLL stuff */
+#ifdef _WIN32
+# ifdef JSON_PARSER_DLL_EXPORTS
+# define JSON_PARSER_DLL_API __declspec(dllexport)
+# else
+# define JSON_PARSER_DLL_API __declspec(dllimport)
+# endif
+#else
+# define JSON_PARSER_DLL_API
+#endif
+
+/* Determine the integer type use to parse non-floating point numbers */
+#if __STDC_VERSION__ >= 199901L || HAVE_LONG_LONG == 1
+typedef long long JSON_int_t;
+#define JSON_PARSER_INTEGER_SSCANF_TOKEN "%lld"
+#define JSON_PARSER_INTEGER_SPRINTF_TOKEN "%lld"
+#else
+typedef long JSON_int_t;
+#define JSON_PARSER_INTEGER_SSCANF_TOKEN "%ld"
+#define JSON_PARSER_INTEGER_SPRINTF_TOKEN "%ld"
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum
+{
+ JSON_T_NONE = 0,
+ JSON_T_ARRAY_BEGIN, // 1
+ JSON_T_ARRAY_END, // 2
+ JSON_T_OBJECT_BEGIN, // 3
+ JSON_T_OBJECT_END, // 4
+ JSON_T_INTEGER, // 5
+ JSON_T_FLOAT, // 6
+ JSON_T_NULL, // 7
+ JSON_T_TRUE, // 8
+ JSON_T_FALSE, // 9
+ JSON_T_STRING, // 10
+ JSON_T_KEY, // 11
+ JSON_T_MAX // 12
+} JSON_type;
+
+typedef struct JSON_value_struct {
+ union {
+ JSON_int_t integer_value;
+
+ double float_value;
+
+ struct {
+ const char* value;
+ size_t length;
+ } str;
+ } vu;
+} JSON_value;
+
+typedef struct JSON_parser_struct* JSON_parser;
+
+/*! \brief JSON parser callback
+
+ \param ctx The pointer passed to new_JSON_parser.
+ \param type An element of JSON_type but not JSON_T_NONE.
+ \param value A representation of the parsed value. This parameter is NULL for
+ JSON_T_ARRAY_BEGIN, JSON_T_ARRAY_END, JSON_T_OBJECT_BEGIN, JSON_T_OBJECT_END,
+  JSON_T_NULL, JSON_T_TRUE, and JSON_T_FALSE. String values are always returned
+ as zero-terminated C strings.
+
+ \return Non-zero if parsing should continue, else zero.
+*/
+typedef int (*JSON_parser_callback)(void* ctx, int type, const struct JSON_value_struct* value);
+
+
+/*! \brief The structure used to configure a JSON parser object
+
+    \param callback Pointer to a callback. This parameter may be NULL. In this case the input is merely checked for validity.
+    \param callback_ctx Callback context. This parameter may be NULL.
+    \param depth Specifies the levels of nested JSON to allow; negative numbers yield unlimited nesting.
+    \param allow_comments To allow C style comments in JSON, set to non-zero.
+    \param handle_floats_manually To decode floating point numbers manually, set this parameter to non-zero.
+*/
+typedef struct {
+ JSON_parser_callback callback;
+ void* callback_ctx;
+ int depth;
+ int allow_comments;
+ int handle_floats_manually;
+} JSON_config;
+
+
+/*! \brief Initializes the JSON parser configuration structure to default values.
+
+ The default configuration is
+ - 127 levels of nested JSON (depends on JSON_PARSER_STACK_SIZE, see json_parser.c)
+ - no parsing, just checking for JSON syntax
+ - no comments
+
+ \param config. Used to configure the parser.
+*/
+JSON_PARSER_DLL_API void init_JSON_config(JSON_config* config);
+
+/*! \brief Create a JSON parser object
+
+ \param config. Used to configure the parser. Set to NULL to use the default configuration.
+ See init_JSON_config
+
+ \return The parser object.
+*/
+JSON_PARSER_DLL_API extern JSON_parser new_JSON_parser(JSON_config* config);
+
+/*! \brief Destroy a previously created JSON parser object. */
+JSON_PARSER_DLL_API extern void delete_JSON_parser(JSON_parser jc);
+
+/*! \brief Parse a character.
+
+ \return Non-zero if all characters passed to this function so far are part of valid JSON.
+*/
+JSON_PARSER_DLL_API extern int JSON_parser_char(JSON_parser jc, int next_char);
+
+/*! \brief Finalize parsing.
+
+ Call this method once after all input characters have been consumed.
+
+ \return Non-zero, if all parsed characters are valid JSON, zero otherwise.
+*/
+JSON_PARSER_DLL_API extern int JSON_parser_done(JSON_parser jc);
+
+/*! \brief Determine if a given string is valid JSON white space
+
+ \return Non-zero if the string is valid, zero otherwise.
+*/
+JSON_PARSER_DLL_API extern int JSON_parser_is_legal_white_space_string(const char* s);
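+
+/*! \brief Typical usage (an illustrative sketch; my_callback and
+    parse_json_string are hypothetical names, not part of this header).
+
+   \code
+   static int my_callback(void* ctx, int type, const JSON_value* value)
+   {
+     (void)ctx; (void)type; (void)value;
+     return 1; // non-zero: continue parsing
+   }
+
+   static int parse_json_string(const char* json)
+   {
+     JSON_config config;
+     JSON_parser parser;
+     const char* p;
+     int ok = 1;
+     init_JSON_config(&config);
+     config.callback = my_callback;
+     parser = new_JSON_parser(&config);
+     for (p = json; ok && *p; ++p)
+       ok = JSON_parser_char(parser, (unsigned char)*p);
+     ok = ok && JSON_parser_done(parser);
+     delete_JSON_parser(parser);
+     return ok;
+   }
+   \endcode
+*/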
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif /* JSON_PARSER_H */
diff --git a/decoder/Makefile.am b/decoder/Makefile.am
new file mode 100644
index 000000000..c85f17ed5
--- /dev/null
+++ b/decoder/Makefile.am
@@ -0,0 +1,160 @@
+bin_PROGRAMS = cdec
+
+noinst_PROGRAMS = \
+ trule_test \
+ hg_test \
+ parser_test \
+ t2s_test \
+ grammar_test
+
+TESTS = trule_test parser_test grammar_test hg_test
+t2s_test_SOURCES = t2s_test.cc
+t2s_test_LDADD = $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a
+parser_test_SOURCES = parser_test.cc
+parser_test_LDADD = $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a
+grammar_test_SOURCES = grammar_test.cc
+grammar_test_LDADD = $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a
+hg_test_SOURCES = hg_test.cc
+hg_test_LDADD = $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a
+trule_test_SOURCES = trule_test.cc
+trule_test_LDADD = $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a
+
+cdec_SOURCES = cdec.cc
+cdec_LDFLAGS= -rdynamic
+cdec_LDADD = libcdec.a ../mteval/libmteval.a ../utils/libutils.a ../klm/search/libksearch.a ../klm/lm/libklm.a ../klm/util/libklm_util.a ../klm/util/double-conversion/libklm_util_double.a
+
+AM_CPPFLAGS = -DTEST_DATA=\"$(top_srcdir)/decoder/test_data\" -DBOOST_TEST_DYN_LINK -W -Wno-sign-compare -I$(top_srcdir) -I$(top_srcdir)/mteval -I$(top_srcdir)/utils -I$(top_srcdir)/klm
+
+rule_lexer.cc: rule_lexer.ll
+ $(LEX) -s -CF -8 -o$@ $<
+
+noinst_LIBRARIES = libcdec.a
+
+EXTRA_DIST = test_data rule_lexer.ll
+
+libcdec_a_SOURCES = \
+ JSON_parser.h \
+ aligner.h \
+ apply_models.h \
+ bottom_up_parser.h \
+ csplit.h \
+ decoder.h \
+ earley_composer.h \
+ factored_lexicon_helper.h \
+ ff.h \
+ ff_basic.h \
+ ff_bleu.h \
+ ff_charset.h \
+ ff_context.h \
+ ff_csplit.h \
+ ff_external.h \
+ ff_factory.h \
+ ff_klm.h \
+ ff_lm.h \
+ ff_ngrams.h \
+ ff_parse_match.h \
+ ff_register.h \
+ ff_rules.h \
+ ff_ruleshape.h \
+ ff_sample_fsa.h \
+ ff_soft_syntax.h \
+ ff_soft_syntax_mindist.h \
+ ff_source_path.h \
+ ff_source_syntax.h \
+ ff_source_syntax2.h \
+ ff_spans.h \
+ ff_tagger.h \
+ ff_wordalign.h \
+ ff_wordset.h \
+ ffset.h \
+ forest_writer.h \
+ freqdict.h \
+ grammar.h \
+ hg.h \
+ hg_intersect.h \
+ hg_io.h \
+ hg_remove_eps.h \
+ hg_sampler.h \
+ hg_test.h \
+ hg_union.h \
+ incremental.h \
+ inside_outside.h \
+ json_parse.h \
+ kbest.h \
+ lattice.h \
+ lexalign.h \
+ lextrans.h \
+ nt_span.h \
+ oracle_bleu.h \
+ phrasebased_translator.h \
+ phrasetable_fst.h \
+ program_options.h \
+ rule_lexer.h \
+ sentence_metadata.h \
+ sentences.h \
+ tagger.h \
+ translator.h \
+ trule.h \
+ viterbi.h \
+ aligner.cc \
+ apply_models.cc \
+ bottom_up_parser.cc \
+ cdec.cc \
+ cdec_ff.cc \
+ csplit.cc \
+ decoder.cc \
+ earley_composer.cc \
+ factored_lexicon_helper.cc \
+ ff.cc \
+ ff_basic.cc \
+ ff_bleu.cc \
+ ff_charset.cc \
+ ff_context.cc \
+ ff_csplit.cc \
+ ff_external.cc \
+ ff_factory.cc \
+ ff_klm.cc \
+ ff_lm.cc \
+ ff_ngrams.cc \
+ ff_parse_match.cc \
+ ff_rules.cc \
+ ff_ruleshape.cc \
+ ff_soft_syntax.cc \
+ ff_soft_syntax_mindist.cc \
+ ff_source_path.cc \
+ ff_source_syntax.cc \
+ ff_source_syntax2.cc \
+ ff_spans.cc \
+ ff_tagger.cc \
+ ff_wordalign.cc \
+ ff_wordset.cc \
+ ffset.cc \
+ forest_writer.cc \
+ fst_translator.cc \
+ tree2string_translator.cc \
+ grammar.cc \
+ hg.cc \
+ hg_intersect.cc \
+ hg_io.cc \
+ hg_remove_eps.cc \
+ hg_sampler.cc \
+ hg_union.cc \
+ incremental.cc \
+ json_parse.cc \
+ lattice.cc \
+ lexalign.cc \
+ lextrans.cc \
+ node_state_hash.h \
+ tree_fragment.cc \
+ tree_fragment.h \
+ maxtrans_blunsom.cc \
+ phrasebased_translator.cc \
+ phrasetable_fst.cc \
+ rescore_translator.cc \
+ rule_lexer.cc \
+ scfg_translator.cc \
+ tagger.cc \
+ translator.cc \
+ trule.cc \
+ viterbi.cc \
+ JSON_parser.c
diff --git a/decoder/aligner.cc b/decoder/aligner.cc
new file mode 100644
index 000000000..232e022ad
--- /dev/null
+++ b/decoder/aligner.cc
@@ -0,0 +1,306 @@
+#include "aligner.h"
+
+#include <cstdio>
+#include <set>
+
+#include <boost/scoped_ptr.hpp>
+
+#include "array2d.h"
+#include "hg.h"
+#include "kbest.h"
+#include "sentence_metadata.h"
+#include "inside_outside.h"
+#include "viterbi.h"
+#include "alignment_io.h"
+
+using namespace std;
+
+// used with lexical models since they may not fully generate the
+// source string
+void SourceEdgeCoveragesUsingParseIndices(const Hypergraph& g,
+ vector<set<int> >* src_cov) {
+ src_cov->clear();
+ src_cov->resize(g.edges_.size());
+
+ for (int i = 0; i < g.edges_.size(); ++i) {
+ const Hypergraph::Edge& edge = g.edges_[i];
+ set<int>& cov = (*src_cov)[i];
+ // no words
+ if (edge.rule_->EWords() == 0 || edge.rule_->FWords() == 0)
+ continue;
+ // aligned to NULL (crf ibm variant only)
+ if (edge.prev_i_ == -1 || edge.i_ == -1) {
+ cov.insert(-1);
+ continue;
+ }
+ assert(edge.j_ >= 0);
+ assert(edge.prev_j_ >= 0);
+ if (edge.Arity() == 0) {
+ for (int k = edge.prev_i_; k < edge.prev_j_; ++k)
+ cov.insert(k);
+ } else {
+ // note: this code, which handles mixed NT and terminal
+ // rules assumes that nodes uniquely define a src and trg
+ // span.
+ int k = edge.prev_i_;
+ int j = 0;
+ const vector<WordID>& f = edge.rule_->e(); // rules are inverted
+ while (k < edge.prev_j_) {
+ if (f[j] > 0) {
+ cov.insert(k);
+ // cerr << "src: " << k << endl;
+ ++k;
+ ++j;
+ } else {
+ const Hypergraph::Node& tailnode = g.nodes_[edge.tail_nodes_[-f[j]]];
+ assert(tailnode.in_edges_.size() > 0);
+ // any edge will do:
+ const Hypergraph::Edge& rep_edge = g.edges_[tailnode.in_edges_.front()];
+ //cerr << "skip " << (rep_edge.prev_j_ - rep_edge.prev_i_) << endl; // src span
+ k += (rep_edge.prev_j_ - rep_edge.prev_i_); // src span
+ ++j;
+ }
+ }
+ }
+ }
+}
+
+int SourceEdgeCoveragesUsingTree(const Hypergraph& g,
+ int node_id,
+ int span_start,
+ vector<int>* spans,
+ vector<set<int> >* src_cov) {
+ const Hypergraph::Node& node = g.nodes_[node_id];
+ int k = -1;
+ for (int i = 0; i < node.in_edges_.size(); ++i) {
+ const int edge_id = node.in_edges_[i];
+ const Hypergraph::Edge& edge = g.edges_[edge_id];
+ set<int>& cov = (*src_cov)[edge_id];
+ const vector<WordID>& f = edge.rule_->e(); // rules are inverted
+ int j = 0;
+ k = span_start;
+ while (j < f.size()) {
+ if (f[j] > 0) {
+ cov.insert(k);
+ ++k;
+ ++j;
+ } else {
+ const int tail_node_id = edge.tail_nodes_[-f[j]];
+ int &right_edge = (*spans)[tail_node_id];
+ if (right_edge < 0)
+ right_edge = SourceEdgeCoveragesUsingTree(g, tail_node_id, k, spans, src_cov);
+ k = right_edge;
+ ++j;
+ }
+ }
+ }
+ return k;
+}
+
+void SourceEdgeCoveragesUsingTree(const Hypergraph& g,
+ vector<set<int> >* src_cov) {
+ src_cov->clear();
+ src_cov->resize(g.edges_.size());
+ vector<int> span_sizes(g.nodes_.size(), -1);
+ SourceEdgeCoveragesUsingTree(g, g.nodes_.size() - 1, 0, &span_sizes, src_cov);
+}
+
+int TargetEdgeCoveragesUsingTree(const Hypergraph& g,
+ int node_id,
+ int span_start,
+ vector<int>* spans,
+ vector<set<int> >* trg_cov) {
+ const Hypergraph::Node& node = g.nodes_[node_id];
+ int k = -1;
+ for (int i = 0; i < node.in_edges_.size(); ++i) {
+ const int edge_id = node.in_edges_[i];
+ const Hypergraph::Edge& edge = g.edges_[edge_id];
+ set<int>& cov = (*trg_cov)[edge_id];
+ int ntc = 0;
+ const vector<WordID>& e = edge.rule_->f(); // rules are inverted
+ int j = 0;
+ k = span_start;
+ while (j < e.size()) {
+ if (e[j] > 0) {
+ cov.insert(k);
+ ++k;
+ ++j;
+ } else {
+ const int tail_node_id = edge.tail_nodes_[ntc];
+ ++ntc;
+ int &right_edge = (*spans)[tail_node_id];
+ if (right_edge < 0)
+ right_edge = TargetEdgeCoveragesUsingTree(g, tail_node_id, k, spans, trg_cov);
+ k = right_edge;
+ ++j;
+ }
+ }
+ // cerr << "node=" << node_id << ": k=" << k << endl;
+ }
+ return k;
+}
+
+void TargetEdgeCoveragesUsingTree(const Hypergraph& g,
+ vector<set<int> >* trg_cov) {
+ trg_cov->clear();
+ trg_cov->resize(g.edges_.size());
+ vector<int> span_sizes(g.nodes_.size(), -1);
+ TargetEdgeCoveragesUsingTree(g, g.nodes_.size() - 1, 0, &span_sizes, trg_cov);
+}
+
+struct TransitionEventWeightFunction {
+ typedef SparseVector<prob_t> Result;
+ inline SparseVector<prob_t> operator()(const Hypergraph::Edge& e) const {
+   SparseVector<prob_t> result;
+ result.set_value(e.id_, e.edge_prob_);
+ return result;
+ }
+};
+
+inline void WriteProbGrid(const Array2D<prob_t>& m, ostream* pos) {
+  ostream& os = *pos;
+  char b[1024];
+  for (int i = 0; i < m.width(); ++i) {
+    for (int j = 0; j < m.height(); ++j) {
+      if (m(i, j) == prob_t::Zero()) {
+        os << "\t---X---";
+      } else {
+        snprintf(b, 1024, "%0.5f", m(i, j).as_float());
+        os << '\t' << b;
+      }
+    }
+    os << '\n';
+  }
+}
+
+void AlignerTools::WriteAlignment(const Lattice& src_lattice,
+                                  const Lattice& trg_lattice,
+                                  const Hypergraph& in_g,
+                                  ostream* out,
+                                  bool map_instead_of_viterbi,
+                                  int k_best,
+                                  const vector<bool>* edges) {
+ bool fix_up_src_spans = false;
+ if (k_best > 1 && edges) {
+ cerr << "ERROR: cannot request multiple best alignments and provide an edge set!\n";
+ abort();
+ }
+ if (map_instead_of_viterbi) {
+ if (k_best != 0) {
+ cerr << "WARNING: K-best alignment extraction not available for MAP, use --aligner_use_viterbi\n";
+ }
+ k_best = 1;
+ } else {
+ if (k_best == 0) k_best = 1;
+ }
+ const Hypergraph* g = &in_g;
+ HypergraphP new_hg;
+ if (!src_lattice.IsSentence() ||
+ !trg_lattice.IsSentence()) {
+ if (map_instead_of_viterbi) {
+ cerr << " Lattice alignment: using Viterbi instead of MAP alignment\n";
+ }
+ map_instead_of_viterbi = false;
+ fix_up_src_spans = !src_lattice.IsSentence();
+ }
+
+ KBest::KBestDerivations<vector<Hypergraph::Edge const*>, ViterbiPathTraversal> kbest(in_g, k_best);
+ boost::scoped_ptr<vector<bool> > kbest_edges;
+
+ for (int best = 0; best < k_best; ++best) {
+ const KBest::KBestDerivations<vector<Hypergraph::Edge const*>, ViterbiPathTraversal>::Derivation* d = NULL;
+ if (!map_instead_of_viterbi) {
+ d = kbest.LazyKthBest(in_g.nodes_.size() - 1, best);
+ if (!d) break; // there are fewer than k_best derivations!
+ const vector<Hypergraph::Edge const*>& yield = d->yield;
+ kbest_edges.reset(new vector<bool>(in_g.edges_.size(), false));
+ for (int i = 0; i < yield.size(); ++i) {
+ assert(yield[i]->id_ < kbest_edges->size());
+ (*kbest_edges)[yield[i]->id_] = true;
+ }
+ }
+ if (!map_instead_of_viterbi || edges) {
+ if (kbest_edges) edges = kbest_edges.get();
+ new_hg = in_g.CreateViterbiHypergraph(edges);
+ for (int i = 0; i < new_hg->edges_.size(); ++i)
+ new_hg->edges_[i].edge_prob_ = prob_t::One();
+ g = new_hg.get();
+ }
+
+ vector<prob_t> edge_posteriors(g->edges_.size(), prob_t::Zero());
+ vector<WordID> trg_sent;
+ vector<WordID> src_sent;
+ if (fix_up_src_spans) {
+ ViterbiESentence(*g, &src_sent);
+ } else {
+ src_sent.resize(src_lattice.size());
+ for (int i = 0; i < src_sent.size(); ++i)
+ src_sent[i] = src_lattice[i][0].label;
+ }
+
+ ViterbiFSentence(*g, &trg_sent);
+
+ if (edges || !map_instead_of_viterbi) {
+ for (int i = 0; i < edge_posteriors.size(); ++i)
+ edge_posteriors[i] = prob_t::One();
+ } else {
+ SparseVector<prob_t> posts;
+ const prob_t z = InsideOutside<prob_t, EdgeProb, SparseVector<prob_t>, TransitionEventWeightFunction>(*g, &posts);
+ for (int i = 0; i < edge_posteriors.size(); ++i)
+ edge_posteriors[i] = posts.value(i) / z;
+ }
+ vector<set<int> > src_cov(g->edges_.size());
+ vector<set<int> > trg_cov(g->edges_.size());
+ TargetEdgeCoveragesUsingTree(*g, &trg_cov);
+
+ if (fix_up_src_spans)
+ SourceEdgeCoveragesUsingTree(*g, &src_cov);
+ else
+ SourceEdgeCoveragesUsingParseIndices(*g, &src_cov);
+
+ // figure out the src and reference size;
+ int src_size = src_sent.size();
+ int ref_size = trg_sent.size();
+ Array2D<prob_t> align(src_size + 1, ref_size, prob_t::Zero());
+ for (int c = 0; c < g->edges_.size(); ++c) {
+ const prob_t& p = edge_posteriors[c];
+ const set<int>& srcs = src_cov[c];
+ const set<int>& trgs = trg_cov[c];
+ for (set<int>::const_iterator si = srcs.begin();
+ si != srcs.end(); ++si) {
+ for (set<int>::const_iterator ti = trgs.begin();
+ ti != trgs.end(); ++ti) {
+ align(*si + 1, *ti) += p;
+ }
+ }
+ }
+ new_hg.reset();
+ //if (g != &in_g) { g.reset(); }
+
+ prob_t threshold(0.9);
+ const bool use_soft_threshold = true; // TODO configure
+
+ Array2D<bool> grid(src_size, ref_size, false);
+ for (int j = 0; j < ref_size; ++j) {
+ if (use_soft_threshold) {
+ threshold = prob_t::Zero();
+ for (int i = 0; i <= src_size; ++i)
+ if (align(i, j) > threshold) threshold = align(i, j);
+ //threshold *= prob_t(0.99);
+ }
+ for (int i = 0; i < src_size; ++i)
+ grid(i, j) = align(i+1, j) >= threshold;
+ }
+ if (out == &cout && k_best < 2) {
+ // TODO need to do some sort of verbose flag
+ WriteProbGrid(align, &cerr);
+ cerr << grid << endl;
+ }
+ (*out) << TD::GetString(src_sent) << " ||| " << TD::GetString(trg_sent) << " ||| ";
+ AlignmentIO::SerializePharaohFormat(grid, out);
+ }
+}
+
diff --git a/decoder/aligner.h b/decoder/aligner.h
new file mode 100644
index 000000000..a34795c91
--- /dev/null
+++ b/decoder/aligner.h
@@ -0,0 +1,26 @@
+#ifndef _ALIGNER_H_
+#define _ALIGNER_H_
+
+#include <string>
+#include <iostream>
+#include <vector>
+#include "array2d.h"
+#include "lattice.h"
+
+class Hypergraph;
+class SentenceMetadata;
+
+struct AlignerTools {
+
+ // assumption: g contains derivations of input/ref and
+ // ONLY input/ref.
+ // if edges is non-NULL, the alignment corresponding to the edge rules will be written
+ static void WriteAlignment(const Lattice& src,
+ const Lattice& ref,
+ const Hypergraph& g,
+ std::ostream* out,
+ bool map_instead_of_viterbi = true,
+ int k_best = 0,
+ const std::vector<bool>* edges = NULL);
+};
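+
+// Illustrative usage sketch (not from the original source; 'src', 'ref', and
+// 'forest' are assumed to be a source lattice, a reference lattice, and a
+// forest containing only their derivations):
+//   AlignerTools::WriteAlignment(src, ref, forest, &std::cout,
+//                                false /* map_instead_of_viterbi */,
+//                                1 /* k_best */);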
+
+#endif
diff --git a/decoder/apply_models.cc b/decoder/apply_models.cc
new file mode 100644
index 000000000..9f8bbeade
--- /dev/null
+++ b/decoder/apply_models.cc
@@ -0,0 +1,631 @@
+////TODO: keep model state in forest?
+
+//TODO: (for many nonterminals, or multi-rescoring pass) either global
+//best-first, or group by (NT,span) - use prev forest outside as a (admissable,
+//if models are a subset and weights are same) heuristic
+
+#include "apply_models.h"
+
+#include <vector>
+#include <algorithm>
+#ifndef HAVE_OLD_CPP
+# include <unordered_map>
+# include <unordered_set>
+#else
+# include <tr1/unordered_map>
+# include <tr1/unordered_set>
+namespace std { using std::tr1::unordered_map; using std::tr1::unordered_set; }
+#endif
+
+#include <boost/functional/hash.hpp>
+
+#include "node_state_hash.h"
+#include "verbose.h"
+#include "hg.h"
+#include "ff.h"
+#include "ffset.h"
+
+#define NORMAL_CP 1
+#define FAST_CP 2
+#define FAST_CP_2 3
+
+using namespace std;
+
+struct Candidate;
+typedef SmallVectorInt JVector;
+typedef vector<Candidate*> CandidateHeap;
+typedef vector<Candidate*> CandidateList;
+
+// default reserve size for the node-state vector (memory used is roughly this * sizeof(string))
+static const size_t kRESERVE_NUM_NODES = 500000ul;
+
+// life cycle: candidates are created, placed on the heap
+// and retrieved by their estimated cost, when they're
+// retrieved, they're incorporated into the +LM hypergraph
+// where they also know the head node index they are
+// attached to. After they are added to the +LM hypergraph
+// vit_prob_ and est_prob_ fields may be updated as better
+// derivations are found (this happens since the successor's
+// of derivation d may have a better score- they are
+// explored lazily). However, the updates don't happen
+// when a candidate is in the heap so maintaining the heap
+// property is not an issue.
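+//
+// Example: a candidate's position in the "cube" is its j vector, one k-best
+// index per tail node. From j = <1,2>, PushSucc generates the successors
+// <2,2> and <1,3>; the uniqueness set keeps <2,3> from being enqueued twice,
+// since it is reachable from both of those predecessors.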
+struct Candidate {
+ int node_index_; // -1 until incorporated
+ // into the +LM forest
+ const Hypergraph::Edge* in_edge_; // in -LM forest
+ Hypergraph::Edge out_edge_;
+ FFState state_;
+ const JVector j_;
+ prob_t vit_prob_; // these are fixed until the cand
+ // is popped, then they may be updated
+ prob_t est_prob_;
+
+ Candidate(const Hypergraph::Edge& e,
+ const JVector& j,
+ const Hypergraph& out_hg,
+ const vector<CandidateList>& D,
+ const FFStates& node_states,
+ const SentenceMetadata& smeta,
+ const ModelSet& models,
+ bool is_goal) :
+ node_index_(-1),
+ in_edge_(&e),
+ j_(j) {
+ InitializeCandidate(out_hg, smeta, D, node_states, models, is_goal);
+ }
+
+ // used to query uniqueness
+ Candidate(const Hypergraph::Edge& e,
+ const JVector& j) : in_edge_(&e), j_(j) {}
+
+ bool IsIncorporatedIntoHypergraph() const {
+ return node_index_ >= 0;
+ }
+
+ void InitializeCandidate(const Hypergraph& out_hg,
+ const SentenceMetadata& smeta,
+ const vector<CandidateList>& D,
+ const FFStates& node_states,
+ const ModelSet& models,
+ const bool is_goal) {
+ const Hypergraph::Edge& in_edge = *in_edge_;
+ out_edge_.rule_ = in_edge.rule_;
+ out_edge_.feature_values_ = in_edge.feature_values_;
+ out_edge_.i_ = in_edge.i_;
+ out_edge_.j_ = in_edge.j_;
+ out_edge_.prev_i_ = in_edge.prev_i_;
+ out_edge_.prev_j_ = in_edge.prev_j_;
+ Hypergraph::TailNodeVector& tail = out_edge_.tail_nodes_;
+ tail.resize(j_.size());
+ prob_t p = prob_t::One();
+ // cerr << "\nEstimating application of " << in_edge.rule_->AsString() << endl;
+ for (int i = 0; i < tail.size(); ++i) {
+ const Candidate& ant = *D[in_edge.tail_nodes_[i]][j_[i]];
+ assert(ant.IsIncorporatedIntoHypergraph());
+ tail[i] = ant.node_index_;
+ p *= ant.vit_prob_;
+ }
+ prob_t edge_estimate = prob_t::One();
+ if (is_goal) {
+ assert(tail.size() == 1);
+ const FFState& ant_state = node_states[tail.front()];
+ models.AddFinalFeatures(ant_state, &out_edge_, smeta);
+ } else {
+ models.AddFeaturesToEdge(smeta, out_hg, node_states, &out_edge_, &state_, &edge_estimate);
+ }
+ vit_prob_ = out_edge_.edge_prob_ * p;
+ est_prob_ = vit_prob_ * edge_estimate;
+ }
+};
+
+ostream& operator<<(ostream& os, const Candidate& cand) {
+ os << "CAND[";
+ if (!cand.IsIncorporatedIntoHypergraph()) { os << "PENDING "; }
+ else { os << "+LM_node=" << cand.node_index_; }
+ os << " edge=" << cand.in_edge_->id_;
+ os << " j=<";
+ for (int i = 0; i < cand.j_.size(); ++i)
+ os << (i==0 ? "" : " ") << cand.j_[i];
+ os << "> vit=" << log(cand.vit_prob_);
+ os << " est=" << log(cand.est_prob_);
+ return os << ']';
+}
+
+struct HeapCandCompare {
+ bool operator()(const Candidate* l, const Candidate* r) const {
+ return l->est_prob_ < r->est_prob_;
+ }
+};
+
+struct EstProbSorter {
+ bool operator()(const Candidate* l, const Candidate* r) const {
+ return l->est_prob_ > r->est_prob_;
+ }
+};
+
+// the same candidate can be added multiple times if
+// j is multidimensional (if you're going NW in Manhattan, you
+// can first go north, then west, or you can go west then north)
+// this is a hash function on the relevant variables from
+// Candidate to enforce this.
+struct CandidateUniquenessHash {
+ size_t operator()(const Candidate* c) const {
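+ // djb2-style hash over the in-edge id and the j vector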
+ size_t x = 5381;
+ x = ((x << 5) + x) ^ c->in_edge_->id_;
+ for (int i = 0; i < c->j_.size(); ++i)
+ x = ((x << 5) + x) ^ c->j_[i];
+ return x;
+ }
+};
+
+struct CandidateUniquenessEquals {
+ bool operator()(const Candidate* a, const Candidate* b) const {
+ return (a->in_edge_ == b->in_edge_) && (a->j_ == b->j_);
+ }
+};
+
+typedef unordered_set<const Candidate*, CandidateUniquenessHash, CandidateUniquenessEquals> UniqueCandidateSet;
+typedef unordered_map<FFState, Candidate*, boost::hash<FFState> > State2Node;
+
+class CubePruningRescorer {
+
+public:
+ CubePruningRescorer(const ModelSet& m,
+ const SentenceMetadata& sm,
+ const Hypergraph& i,
+ int pop_limit,
+ Hypergraph* o,
+ int s = NORMAL_CP ) :
+ models(m),
+ smeta(sm),
+ in(i),
+ out(*o),
+ D(in.nodes_.size()),
+ pop_limit_(pop_limit),
+ strategy_(s){
+ if (!SILENT) cerr << " Applying feature functions (cube pruning, pop_limit = " << pop_limit_ << ')' << endl;
+ node_states_.reserve(kRESERVE_NUM_NODES);
+ }
+
+ void Apply() {
+ int num_nodes = in.nodes_.size();
+ assert(num_nodes >= 2);
+ int goal_id = num_nodes - 1;
+ int pregoal = goal_id - 1;
+ assert(in.nodes_[pregoal].out_edges_.size() == 1);
+ if (!SILENT) cerr << " ";
+ int has = 0;
+ for (int i = 0; i < in.nodes_.size(); ++i) {
+ if (!SILENT) {
+ int needs = (50 * i / in.nodes_.size());
+ while (has < needs) { cerr << '.'; ++has; }
+ }
+ if (strategy_==NORMAL_CP){
+ KBest(i, i == goal_id);
+ }
+ if (strategy_==FAST_CP){
+ KBestFast(i, i == goal_id);
+ }
+ if (strategy_==FAST_CP_2){
+ KBestFast2(i, i == goal_id);
+ }
+ }
+ if (!SILENT) {
+ cerr << endl;
+ cerr << " Best path: " << log(D[goal_id].front()->vit_prob_)
+ << "\t" << log(D[goal_id].front()->est_prob_) << endl;
+ }
+ out.PruneUnreachable(D[goal_id].front()->node_index_);
+ FreeAll();
+ }
+
+ private:
+ void FreeAll() {
+ for (int i = 0; i < D.size(); ++i) {
+ CandidateList& D_i = D[i];
+ for (int j = 0; j < D_i.size(); ++j)
+ delete D_i[j];
+ }
+ D.clear();
+ }
+
+ void IncorporateIntoPlusLMForest(size_t head_node_hash, Candidate* item, State2Node* s2n, CandidateList* freelist) {
+ Hypergraph::Edge* new_edge = out.AddEdge(item->out_edge_);
+ new_edge->edge_prob_ = item->out_edge_.edge_prob_;
+ Candidate*& o_item = (*s2n)[item->state_];
+ if (!o_item) o_item = item;
+
+ int& node_id = o_item->node_index_;
+ if (node_id < 0) {
+ Hypergraph::Node* new_node = out.AddNode(in.nodes_[item->in_edge_->head_node_].cat_);
+ new_node->node_hash = cdec::HashNode(head_node_hash, item->state_); // ID is combination of existing state + residual state
+ node_states_.push_back(item->state_);
+ node_id = new_node->id_;
+ }
+#if 0
+ Hypergraph::Node* node = &out.nodes_[node_id];
+ out.ConnectEdgeToHeadNode(new_edge, node);
+#else
+ out.ConnectEdgeToHeadNode(new_edge, node_id);
+#endif
+ // update candidate if we have a better derivation
+ // note: the difference between the vit score and the estimated
+ // score is the same for all items with a common residual DP
+ // state
+ if (item->vit_prob_ > o_item->vit_prob_) {
+ assert(o_item->state_ == item->state_); // sanity check!
+ o_item->est_prob_ = item->est_prob_;
+ o_item->vit_prob_ = item->vit_prob_;
+ }
+ if (item != o_item) freelist->push_back(item);
+ }
+
+ void KBest(const int vert_index, const bool is_goal) {
+ // cerr << "KBest(" << vert_index << ")\n";
+ CandidateList& D_v = D[vert_index];
+ assert(D_v.empty());
+ const Hypergraph::Node& v = in.nodes_[vert_index];
+ // cerr << " has " << v.in_edges_.size() << " in-coming edges\n";
+ const vector<int>& in_edges = v.in_edges_;
+ CandidateHeap cand;
+ CandidateList freelist;
+ cand.reserve(in_edges.size());
+ UniqueCandidateSet unique_cands;
+ for (int i = 0; i < in_edges.size(); ++i) {
+ const Hypergraph::Edge& edge = in.edges_[in_edges[i]];
+ const JVector j(edge.tail_nodes_.size(), 0);
+ cand.push_back(new Candidate(edge, j, out, D, node_states_, smeta, models, is_goal));
+ bool is_new = unique_cands.insert(cand.back()).second;
+ assert(is_new); // these should all be unique!
+ }
+// cerr << " making heap of " << cand.size() << " candidates\n";
+ make_heap(cand.begin(), cand.end(), HeapCandCompare());
+ State2Node state2node; // "buf" in Figure 2
+ int pops = 0;
+ while(!cand.empty() && pops < pop_limit_) {
+ pop_heap(cand.begin(), cand.end(), HeapCandCompare());
+ Candidate* item = cand.back();
+ cand.pop_back();
+ // cerr << "POPPED: " << *item << endl;
+ PushSucc(*item, is_goal, &cand, &unique_cands);
+ IncorporateIntoPlusLMForest(v.node_hash, item, &state2node, &freelist);
+ ++pops;
+ }
+ D_v.resize(state2node.size());
+ int c = 0;
+ for (State2Node::iterator i = state2node.begin(); i != state2node.end(); ++i)
+ D_v[c++] = i->second;
+ sort(D_v.begin(), D_v.end(), EstProbSorter());
+ // cerr << " expanded to " << D_v.size() << " nodes\n";
+
+ for (int i = 0; i < cand.size(); ++i)
+ delete cand[i];
+ // freelist is necessary since even after an item merged, it still stays in
+ // the unique set so it can't be deleted til now
+ for (int i = 0; i < freelist.size(); ++i)
+ delete freelist[i];
+ }
+
+ void KBestFast(const int vert_index, const bool is_goal) {
+ // cerr << "KBest(" << vert_index << ")\n";
+ CandidateList& D_v = D[vert_index];
+ assert(D_v.empty());
+ const Hypergraph::Node& v = in.nodes_[vert_index];
+ // cerr << " has " << v.in_edges_.size() << " in-coming edges\n";
+ const vector<int>& in_edges = v.in_edges_;
+ CandidateHeap cand;
+ CandidateList freelist;
+ cand.reserve(in_edges.size());
+ //init with j<0,0> for all rules-edges that lead to node-(NT-span)
+ for (int i = 0; i < in_edges.size(); ++i) {
+ const Hypergraph::Edge& edge = in.edges_[in_edges[i]];
+ const JVector j(edge.tail_nodes_.size(), 0);
+ cand.push_back(new Candidate(edge, j, out, D, node_states_, smeta, models, is_goal));
+ }
+ // cerr << " making heap of " << cand.size() << " candidates\n";
+ make_heap(cand.begin(), cand.end(), HeapCandCompare());
+ State2Node state2node; // "buf" in Figure 2
+ int pops = 0;
+ while(!cand.empty() && pops < pop_limit_) {
+ pop_heap(cand.begin(), cand.end(), HeapCandCompare());
+ Candidate* item = cand.back();
+ cand.pop_back();
+ // cerr << "POPPED: " << *item << endl;
+
+ PushSuccFast(*item, is_goal, &cand);
+ IncorporateIntoPlusLMForest(v.node_hash, item, &state2node, &freelist);
+ ++pops;
+ }
+ D_v.resize(state2node.size());
+ int c = 0;
+ for (auto& i : state2node) {
+ D_v[c++] = i.second;
+ // cerr << "MERGED: " << *i.second << endl;
+ }
+ //cerr <<"Node id: "<< vert_index<< endl;
+ //#ifdef MEASURE_CA
+ // cerr << "countInProcess (pop/tot): node id: " << vert_index << " (" << count_in_process_pop << "/" << count_in_process_tot << ")" << endl;
+ // cerr << "countAtEnd (pop/tot): node id: " << vert_index << " (" << count_at_end_pop << "/" << count_at_end_tot << ")" << endl;
+ //#endif
+
+ sort(D_v.begin(), D_v.end(), EstProbSorter());
+
+ for (int i = 0; i < cand.size(); ++i)
+ delete cand[i];
+ // freelist is necessary since even after an item is merged, it still stays in
+ // the unique set so it can't be deleted till now
+ for (int i = 0; i < freelist.size(); ++i)
+ delete freelist[i];
+ }
+
+ void KBestFast2(const int vert_index, const bool is_goal) {
+ // cerr << "KBest(" << vert_index << ")\n";
+ CandidateList& D_v = D[vert_index];
+ assert(D_v.empty());
+ const Hypergraph::Node& v = in.nodes_[vert_index];
+ // cerr << " has " << v.in_edges_.size() << " in-coming edges\n";
+ const vector<int>& in_edges = v.in_edges_;
+ CandidateHeap cand;
+ CandidateList freelist;
+ cand.reserve(in_edges.size());
+ UniqueCandidateSet unique_accepted;
+ //init with j<0,0> for all rules-edges that lead to node-(NT-span)
+ for (int i = 0; i < in_edges.size(); ++i) {
+ const Hypergraph::Edge& edge = in.edges_[in_edges[i]];
+ const JVector j(edge.tail_nodes_.size(), 0);
+ cand.push_back(new Candidate(edge, j, out, D, node_states_, smeta, models, is_goal));
+ }
+ // cerr << " making heap of " << cand.size() << " candidates\n";
+ make_heap(cand.begin(), cand.end(), HeapCandCompare());
+ State2Node state2node; // "buf" in Figure 2
+ int pops = 0;
+ while(!cand.empty() && pops < pop_limit_) {
+ pop_heap(cand.begin(), cand.end(), HeapCandCompare());
+ Candidate* item = cand.back();
+ cand.pop_back();
+ bool is_new = unique_accepted.insert(item).second;
+ assert(is_new); // these should all be unique!
+ // cerr << "POPPED: " << *item << endl;
+
+ PushSuccFast2(*item, is_goal, &cand, &unique_accepted);
+ IncorporateIntoPlusLMForest(v.node_hash, item, &state2node, &freelist);
+ ++pops;
+ }
+ D_v.resize(state2node.size());
+ int c = 0;
+ for (State2Node::iterator i = state2node.begin(); i != state2node.end(); ++i){
+ D_v[c++] = i->second;
+ // cerr << "MERGED: " << *i->second << endl;
+ }
+ //cerr <<"Node id: "<< vert_index<< endl;
+ //#ifdef MEASURE_CA
+ // cerr << "countInProcess (pop/tot): node id: " << vert_index << " (" << count_in_process_pop << "/" << count_in_process_tot << ")" << endl;
+ // cerr << "countAtEnd (pop/tot): node id: " << vert_index << " (" << count_at_end_pop << "/" << count_at_end_tot << ")" << endl;
+ //#endif
+
+ sort(D_v.begin(), D_v.end(), EstProbSorter());
+
+ for (int i = 0; i < cand.size(); ++i)
+ delete cand[i];
+ for (int i = 0; i < freelist.size(); ++i)
+ delete freelist[i];
+ }
+
+ void PushSucc(const Candidate& item, const bool is_goal, CandidateHeap* pcand, UniqueCandidateSet* cs) {
+ CandidateHeap& cand = *pcand;
+ for (int i = 0; i < item.j_.size(); ++i) {
+ JVector j = item.j_;
+ ++j[i];
+ if (j[i] < D[item.in_edge_->tail_nodes_[i]].size()) {
+ Candidate query_unique(*item.in_edge_, j);
+ if (cs->count(&query_unique) == 0) {
+ Candidate* new_cand = new Candidate(*item.in_edge_, j, out, D, node_states_, smeta, models, is_goal);
+ cand.push_back(new_cand);
+ push_heap(cand.begin(), cand.end(), HeapCandCompare());
+ bool is_new = cs->insert(new_cand).second;
+ assert(is_new); // insert into uniqueness set, sanity check
+ }
+ }
+ }
+ }
+
+ //PushSucc following unique ancestor generation function
+ void PushSuccFast(const Candidate& item, const bool is_goal, CandidateHeap* pcand){
+ CandidateHeap& cand = *pcand;
+ for (int i = 0; i < item.j_.size(); ++i) {
+ JVector j = item.j_;
+ ++j[i];
+ if (j[i] < D[item.in_edge_->tail_nodes_[i]].size()) {
+ Candidate* new_cand = new Candidate(*item.in_edge_, j, out, D, node_states_, smeta, models, is_goal);
+ cand.push_back(new_cand);
+ push_heap(cand.begin(), cand.end(), HeapCandCompare());
+ }
+ if(item.j_[i]!=0){
+ return;
+ }
+ }
+ }
+
+ //PushSucc only if all ancest Cand are added
+ void PushSuccFast2(const Candidate& item, const bool is_goal, CandidateHeap* pcand, UniqueCandidateSet* ps){
+ CandidateHeap& cand = *pcand;
+ for (int i = 0; i < item.j_.size(); ++i) {
+ JVector j = item.j_;
+ ++j[i];
+ if (j[i] < D[item.in_edge_->tail_nodes_[i]].size()) {
+ Candidate query_unique(*item.in_edge_, j);
+ if (HasAllAncestors(&query_unique,ps)) {
+ Candidate* new_cand = new Candidate(*item.in_edge_, j, out, D, node_states_, smeta, models, is_goal);
+ cand.push_back(new_cand);
+ push_heap(cand.begin(), cand.end(), HeapCandCompare());
+ }
+ }
+ }
+ }
+
+ bool HasAllAncestors(const Candidate* item, UniqueCandidateSet* cs){
+ for (int i = 0; i < item->j_.size(); ++i) {
+ JVector j = item->j_;
+ --j[i];
+ if (j[i] >=0) {
+ Candidate query_unique(*item->in_edge_, j);
+ if (cs->count(&query_unique) == 0) {
+ return false;
+ }
+ }
+ }
+ return true;
+ }
+
+ const ModelSet& models;
+ const SentenceMetadata& smeta;
+ const Hypergraph& in;
+ Hypergraph& out;
+
+ vector<CandidateList> D; // maps nodes in in-HG to the
+ // equivalent nodes (many due to state
+ // splits) in the out-HG.
+ FFStates node_states_; // for each node in the out-HG what is
+ // its q function value?
+ const int pop_limit_;
+ const int strategy_; //switch Cube Pruning strategy: 1 normal, 2 fast (alg 2), 3 fast_2 (alg 3). (see: Gesmundo A., Henderson J., Faster Cube Pruning, IWSLT 2010)
+};
+
+struct NoPruningRescorer {
+ NoPruningRescorer(const ModelSet& m, const SentenceMetadata &sm, const Hypergraph& i, Hypergraph* o) :
+ models(m),
+ smeta(sm),
+ in(i),
+ out(*o),
+ nodemap(i.nodes_.size()) {
+ if (!SILENT) cerr << " Rescoring forest (full intersection)\n";
+ node_states_.reserve(kRESERVE_NUM_NODES);
+ }
+
+ typedef unordered_map<FFState, int, boost::hash<FFState> > State2NodeIndex;
+
+ void ExpandEdge(const Hypergraph::Edge& in_edge, bool is_goal, size_t head_node_hash, State2NodeIndex* state2node) {
+ const int arity = in_edge.Arity();
+ Hypergraph::TailNodeVector ends(arity);
+ for (int i = 0; i < arity; ++i)
+ ends[i] = nodemap[in_edge.tail_nodes_[i]].size();
+
+ Hypergraph::TailNodeVector tail_iter(arity, 0);
+ bool done = false;
+ while (!done) {
+ Hypergraph::TailNodeVector tail(arity);
+ for (int i = 0; i < arity; ++i)
+ tail[i] = nodemap[in_edge.tail_nodes_[i]][tail_iter[i]];
+ Hypergraph::Edge* new_edge = out.AddEdge(in_edge, tail);
+ FFState head_state;
+ if (is_goal) {
+ assert(tail.size() == 1);
+ const FFState& ant_state = node_states_[tail.front()];
+ models.AddFinalFeatures(ant_state, new_edge,smeta);
+ } else {
+ prob_t edge_estimate; // this is a full intersection, so we disregard this
+ models.AddFeaturesToEdge(smeta, out, node_states_, new_edge, &head_state, &edge_estimate);
+ }
+ int& head_plus1 = (*state2node)[head_state];
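+ // head_plus1 stores node id + 1 so the map's default value 0 means "no node yet"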
+ if (!head_plus1) {
+ HG::Node* new_node = out.AddNode(in_edge.rule_->GetLHS());
+ new_node->node_hash = cdec::HashNode(head_node_hash, head_state); // ID is combination of existing state + residual state
+ head_plus1 = new_node->id_ + 1;
+ node_states_.push_back(head_state);
+ nodemap[in_edge.head_node_].push_back(head_plus1 - 1);
+ }
+ const int head_index = head_plus1 - 1;
+ out.ConnectEdgeToHeadNode(new_edge->id_, head_index);
+
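+ // advance tail_iter like a mixed-radix odometer over all tail-node combinations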
+ int ii = 0;
+ for (; ii < arity; ++ii) {
+ ++tail_iter[ii];
+ if (tail_iter[ii] < ends[ii]) break;
+ tail_iter[ii] = 0;
+ }
+ done = (ii == arity);
+ }
+ }
+
+ void ProcessOneNode(const int node_num, const bool is_goal) {
+ State2NodeIndex state2node;
+ const Hypergraph::Node& node = in.nodes_[node_num];
+ for (int i = 0; i < node.in_edges_.size(); ++i) {
+ const Hypergraph::Edge& edge = in.edges_[node.in_edges_[i]];
+ ExpandEdge(edge, is_goal, node.node_hash, &state2node);
+ }
+ }
+
+ void Apply() {
+ int num_nodes = in.nodes_.size();
+ int goal_id = num_nodes - 1;
+ int pregoal = goal_id - 1;
+ assert(in.nodes_[pregoal].out_edges_.size() == 1);
+ if (!SILENT) cerr << " ";
+ int has = 0;
+ for (int i = 0; i < in.nodes_.size(); ++i) {
+ if (!SILENT) {
+ int needs = (50 * i / in.nodes_.size());
+ while (has < needs) { cerr << '.'; ++has; }
+ }
+ ProcessOneNode(i, i == goal_id);
+ }
+ if (!SILENT) cerr << endl;
+ }
+
+ private:
+ const ModelSet& models;
+ const SentenceMetadata& smeta;
+ const Hypergraph& in;
+ Hypergraph& out;
+
+ vector<vector<int> > nodemap;
+ FFStates node_states_; // for each node in the out-HG what is
+ // its q function value?
+};
+
+// Rescore the hypergraph 'in' with the given models, writing the result to *out.
+void ApplyModelSet(const Hypergraph& in,
+ const SentenceMetadata& smeta,
+ const ModelSet& models,
+ const IntersectionConfiguration& config,
+ Hypergraph* out) {
+ //force exhaustive if there's no state req. for model
+ if (models.stateless() || config.algorithm == IntersectionConfiguration::FULL) {
+ NoPruningRescorer ma(models, smeta, in, out); // avoid overhead of best-first when no state
+ ma.Apply();
+ } else if (config.algorithm == IntersectionConfiguration::CUBE
+ || config.algorithm == IntersectionConfiguration::FAST_CUBE_PRUNING
+ || config.algorithm == IntersectionConfiguration::FAST_CUBE_PRUNING_2) {
+ int pl = config.pop_limit;
+ const int max_pl_for_large=50;
+ if (pl > max_pl_for_large && in.nodes_.size() > 80000) {
+ pl = max_pl_for_large;
+ cerr << " Note: reducing pop_limit to " << pl << " for very large forest\n";
+ }
+ if (config.algorithm == IntersectionConfiguration::CUBE) {
+ CubePruningRescorer ma(models, smeta, in, pl, out);
+ ma.Apply();
+ }
+ else if (config.algorithm == IntersectionConfiguration::FAST_CUBE_PRUNING){
+ CubePruningRescorer ma(models, smeta, in, pl, out, FAST_CP);
+ ma.Apply();
+ }
+ else if (config.algorithm == IntersectionConfiguration::FAST_CUBE_PRUNING_2){
+ CubePruningRescorer ma(models, smeta, in, pl, out, FAST_CP_2);
+ ma.Apply();
+ }
+
+ } else {
+ cerr << "Don't understand intersection algorithm " << config.algorithm << endl;
+ exit(1);
+ }
+ out->is_linear_chain_ = in.is_linear_chain_; // TODO remove when this is computed
+ // automatically
+}
+
diff --git a/decoder/apply_models.h b/decoder/apply_models.h
new file mode 100644
index 000000000..19a4c7be2
--- /dev/null
+++ b/decoder/apply_models.h
@@ -0,0 +1,43 @@
+#ifndef _APPLY_MODELS_H_
+#define _APPLY_MODELS_H_
+
+#include <ostream>
+
+struct ModelSet;
+struct Hypergraph;
+struct SentenceMetadata;
+
+struct exhaustive_t {};
+
+struct IntersectionConfiguration {
+enum {
+ FULL,
+ CUBE,
+ FAST_CUBE_PRUNING,
+ FAST_CUBE_PRUNING_2,
+ N_ALGORITHMS
+};
+
+ const int algorithm; // one of the enum values above (FULL, CUBE, FAST_CUBE_PRUNING, ...)
+ const int pop_limit; // max number of pops off the heap at each node
+ IntersectionConfiguration(int alg, int k) : algorithm(alg), pop_limit(k) {}
+ IntersectionConfiguration(exhaustive_t /* t */) : algorithm(0), pop_limit() {}
+};
+
+inline std::ostream& operator<<(std::ostream& os, const IntersectionConfiguration& c) {
+ if (c.algorithm == 0) { os << "FULL"; }
+ else if (c.algorithm == 1) { os << "CUBE:k=" << c.pop_limit; }
+ else if (c.algorithm == 2) { os << "FAST_CUBE_PRUNING"; }
+ else if (c.algorithm == 3) { os << "FAST_CUBE_PRUNING_2"; }
+ else if (c.algorithm == 4) { os << "N_ALGORITHMS"; }
+ else os << "OTHER";
+ return os;
+}
+
+void ApplyModelSet(const Hypergraph& in,
+ const SentenceMetadata& smeta,
+ const ModelSet& models,
+ const IntersectionConfiguration& config,
+ Hypergraph* out);
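+
+// Illustrative usage sketch (not from the original source; 'forest', 'smeta',
+// and 'models' are assumed to exist): rescore with cube pruning, pop limit 200:
+//   IntersectionConfiguration config(IntersectionConfiguration::CUBE, 200);
+//   Hypergraph rescored;
+//   ApplyModelSet(forest, smeta, models, config, &rescored);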
+
+#endif
diff --git a/decoder/bottom_up_parser.cc b/decoder/bottom_up_parser.cc
new file mode 100644
index 000000000..b30f1ec69
--- /dev/null
+++ b/decoder/bottom_up_parser.cc
@@ -0,0 +1,370 @@
+//TODO: when using many nonterminals, group passive edges for a span (treat all as a single X for the active items).
+
+//TODO: figure out what cdyer was talking about when he said that having unary rules A->B and B->A doesn't make cycles appear in the result, provided the rules are sorted in some way (as they typically are)
+
+#include "bottom_up_parser.h"
+
+#include <iostream>
+#include <map>