Skip to content

Commit

Permalink
emnlp code upload
Browse files Browse the repository at this point in the history
  • Loading branch information
changlongyu committed Oct 9, 2020
1 parent aa67bc2 commit 7547de1
Show file tree
Hide file tree
Showing 19 changed files with 4,968 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# macOS Finder metadata files
.DS_Store
28 changes: 28 additions & 0 deletions config/bert.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
;TIP: one can comment lines in this config format by adding a ; at the start of a line

[data]

; gzipped Hearst pattern counts from the hypernymysuite release
pattern_filename = /home/shared/hypernymysuite/hearst_counts.txt.gz
; presumably directories of harvested context data (in-pattern vs. OOV pairs)
; — confirm against the data-loading code
context = /home/shared/hearst_full_context/
context_oov = /home/shared/context/
; pretrained BERT-base (uncased) model directory
bert_path = /home/shared/pretrained-lm/bert/bert_base_uncased/
; checkpoint output directory
; NOTE(review): was "comHyper"; normalized to "compHyper" to match
; bert_large.cfg / context.cfg / word.cfg — confirm the directory name on disk
ckpt = /home/cyuaq/compHyper/checkpoints_binary


[hyperparameters]

model = bert_base
svd_dimension = 50
number_hidden_layers = 2
hidden_layer_size = 300
batch_size = 32
negative_num = 1
max_epochs = 500
learning_rate = 0.00001
weight_decay = 0

context_num = 10
context_len = 10
max_seq_length = 64

; comma-separated list of GPU device ids
gpu_device = 0,1,2,3
28 changes: 28 additions & 0 deletions config/bert_large.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
;TIP: one can comment lines in this config format by adding a ; at the start of a line

[data]

; gzipped Hearst pattern counts from the hypernymysuite release
; (spacing normalized to the file's "key = value" style)
pattern_filename = /home/shared/hypernymysuite/hearst_counts.txt.gz
; presumably directories of harvested context data (in-pattern vs. OOV pairs)
; — confirm against the data-loading code
context = /home/shared/hearst_full_context/
context_oov = /home/shared/context/
; pretrained BERT-large (uncased) model directory
bert_path = /home/shared/pretrained-lm/bert/bert_large_uncased/
; checkpoint output directory
ckpt = /home/cyuaq/compHyper/checkpoints_binary


[hyperparameters]

model = bert_large
svd_dimension = 50
number_hidden_layers = 2
hidden_layer_size = 300
; smaller batch than bert.cfg, consistent with the larger model's memory footprint
batch_size = 8
negative_num = 1
max_epochs = 500
learning_rate = 0.00001
weight_decay = 0

context_num = 10
context_len = 10
max_seq_length = 64

; comma-separated list of GPU device ids
gpu_device = 0,1,2,3
26 changes: 26 additions & 0 deletions config/context.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
;TIP: one can comment lines in this config format by adding a ; at the start of a line

[data]

; gzipped Hearst pattern counts from the hypernymysuite release
; (spacing normalized to the file's "key = value" style)
pattern_filename = /home/shared/hypernymysuite/hearst_counts.txt.gz
; presumably directories of harvested context data (in-pattern vs. OOV pairs)
; — confirm against the data-loading code
context = /home/shared/task1_hearst_full_context/
context_oov = /home/shared/task1_context/
; checkpoint output directory
ckpt = /home/cyuaq/compHyper/checkpoints_context


[hyperparameters]

model = han
svd_dimension = 50
number_hidden_layers = 2
hidden_layer_size = 300
batch_size = 32
negative_num = 1
max_epochs = 500
learning_rate = 0.001
weight_decay = 0

context_num = 10
context_len = 10

; single GPU device id
gpu_device = 3
25 changes: 25 additions & 0 deletions config/word.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
;TIP: one can comment lines in this config format by adding a ; at the start of a line

[data]


; two training data files have to be aligned (the two vectors of the same word in the same line)

; gzipped Hearst pattern counts from the hypernymysuite release
; (spacing normalized to the file's "key = value" style)
pattern_filename = /home/shared/hypernymysuite/hearst_counts.txt.gz
context = /home/shared/context
; checkpoint output directory
; NOTE(review): only this config carries a trailing slash on ckpt — harmless
; if the consumer joins paths, but confirm for consistency
ckpt = /home/cyuaq/compHyper/checkpoints_word/


[hyperparameters]

model = mlp_unisample_svd
svd_dimension = 50
number_hidden_layers = 2
hidden_layer_size = 300
batch_size = 128
negative_num = 400
max_epochs = 500
learning_rate = 0.001
weight_decay = 0

; single GPU device id
gpu_device = 3
194 changes: 194 additions & 0 deletions data/download_data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
#!/bin/bash
#
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
# -------------------------------------------------------------------------------
# This shell script downloads and preprocesses all the datasets
# -------------------------------------------------------------------------------

# Directly from the repo: https://github.com/facebookresearch/hypernymysuite/blob/master/download_data.sh

# Immediately quit on error
set -e

# if you have any proxies, etc., put them here
# (expanded unquoted into every curl call below; -s = silent, no progress bar)
CURL_OPTIONS="-s"


# URLS of each of the different datasets
# lexical_inference.zip: contains both the BLESS and LEDS entailment splits
OMER_URL="http://u.cs.biu.ac.il/~nlp/wp-content/uploads/lexical_inference.zip"
# RAR archive with the Shwartz random split (requires unrar)
SHWARTZ_URL="https://raw.githubusercontent.com/vered1986/HypeNET/v2/dataset/datasets.rar"
# pinned commit of the UnsupervisedHypernymy datasets (EVALution files)
VERED_REPO_URL="https://raw.githubusercontent.com/vered1986/UnsupervisedHypernymy/e3b22709365c7b3042126e5887c9baa03631354e/datasets"
# pinned commit of the HyperVec classification datasets (WBLESS / BiBLESS)
KIMANH_REPO_URL="https://raw.githubusercontent.com/nguyenkh/HyperVec/bd2cb15a6be2a4726ffbf9c0d7e742144790dee3/datasets_classification"
HYPERLEX_URL="https://raw.githubusercontent.com/ivulic/hyperlex/master/hyperlex-data.zip"

# Print a message on stderr, keeping stdout free for dataset rows.
function warning () {
    printf '%s\n' "$1" 1>&2
}

# Emit an endless byte stream that is fully determined by the seed in $1:
# the AES-256-CTR keystream over /dev/zero. Same seed -> same bytes, so it
# can serve as a reproducible --random-source for sort.
get_seeded_random()
{
    openssl enc -aes-256-ctr -pass pass:"$1" -nosalt \
        </dev/zero 2>/dev/null
}

# Shuffle stdin into a pseudo-random but reproducible order: the fixed seed
# (42) means every run on every machine produces the same ordering, so the
# val/test splits cut from this stream downstream are stable.
function deterministic_shuffle () {
# sort randomly but with a predictable seed
sort --random-sort --random-source=<(get_seeded_random 42)
}

# Download HyperLex and emit one TSV on stdout with columns
# word1/word2/pos/label/score/fold, where fold is train/val/test taken from
# the dataset's own "random" split files (dev is relabelled val).
function download_hyperlex () {
    local archive scratch pair split fold
    archive="$(mktemp)"
    scratch="$(mktemp -d)"
    curl $CURL_OPTIONS "$HYPERLEX_URL" > "$archive"
    unzip "$archive" -d "$scratch" > /dev/null

    echo -e 'word1\tword2\tpos\tlabel\tscore\tfold'
    # split-file prefix : fold name emitted in the last column
    for pair in training:train dev:val test:test; do
        split="${pair%%:*}"
        fold="${pair#*:}"
        # drop the header row, keep the first 5 space-separated fields,
        # convert to tabs, and append the fold label
        grep -v WORD1 "$scratch/splits/random/hyperlex_${split}_all_random.txt" | \
            cut -d' ' -f1-5 | tr ' ' '\t' | \
            awk -F'\t' -v f="$fold" '$0=$0"\t"f'
    done

    rm -rf "$archive" "$scratch"
}

# Download BLESS (inside the Omer lexical_inference bundle) and emit a TSV
# of word1/word2/label/fold. All three original splits are pooled, CRs are
# stripped, and a deterministic shuffle assigns the first 1453 rows to val
# and the rest to test.
function download_bless () {
    local archive scratch bless_dir
    archive="$(mktemp)"
    scratch="$(mktemp -d)"
    curl $CURL_OPTIONS "$OMER_URL" > "$archive"
    unzip "$archive" -d "$scratch" > /dev/null

    bless_dir="${scratch}/lexical_entailment/bless2011"
    echo -e 'word1\tword2\tlabel\tfold'
    cat "$bless_dir/data_rnd_test.tsv" \
        "$bless_dir/data_rnd_train.tsv" \
        "$bless_dir/data_rnd_val.tsv" | \
        tr -d '\15' | \
        deterministic_shuffle | \
        awk '{ print $0 "\t" (NR < 1454 ? "val" : "test") }'

    rm -rf "$archive" "$scratch"
}

# Download LEDS (baroni2012, same Omer bundle as BLESS) and emit a TSV of
# word1/word2/label/fold: splits pooled, CRs stripped, deterministic shuffle,
# first 275 rows -> val, remainder -> test.
function download_leds () {
    local archive scratch leds_dir
    archive="$(mktemp)"
    scratch="$(mktemp -d)"
    curl $CURL_OPTIONS "$OMER_URL" > "$archive"
    unzip "$archive" -d "$scratch" > /dev/null

    leds_dir="${scratch}/lexical_entailment/baroni2012"
    echo -e 'word1\tword2\tlabel\tfold'
    cat "$leds_dir/data_rnd_test.tsv" \
        "$leds_dir/data_rnd_train.tsv" \
        "$leds_dir/data_rnd_val.tsv" | \
        tr -d '\15' | \
        deterministic_shuffle | \
        awk '{ print $0 "\t" (NR < 276 ? "val" : "test") }'

    rm -rf "$archive" "$scratch"
}

# Download the Shwartz random split (RAR archive; needs unrar) and emit a TSV
# of word1/word2/label/fold. Rows containing spaces (multi-word terms) are
# dropped; deterministic shuffle, first 5256 rows -> val, remainder -> test.
function download_shwartz () {
    local archive scratch
    archive="$(mktemp)"
    scratch="$(mktemp -d)"
    curl $CURL_OPTIONS "$SHWARTZ_URL" > "$archive"

    unrar x "$archive" "$scratch" >/dev/null
    echo -e 'word1\tword2\tlabel\tfold'
    cat "$scratch/dataset_rnd/train.tsv" \
        "$scratch/dataset_rnd/test.tsv" \
        "$scratch/dataset_rnd/val.tsv" | \
        grep -v ' ' | \
        deterministic_shuffle | \
        awk '{ print $0 "\t" (NR < 5257 ? "val" : "test") }'

    rm -rf "$archive" "$scratch"
}

# Download BiBLESS and emit a TSV of word1/word2/relation/label, mapping the
# relation column to a numeric label: hyper -> 1, other -> 0, anything
# else -> -1.
function download_bibless () {
    echo -e 'word1\tword2\trelation\tlabel'
    curl $CURL_OPTIONS "$KIMANH_REPO_URL/ABIBLESS.txt" | \
        cut -f1,2,4 | \
        awk -F'\t' '{
            if ($3 == "hyper")      { print $0 "\t1" }
            else if ($3 == "other") { print $0 "\t0" }
            else                    { print $0 "\t-1" }
        }'
}

# Download WBLESS and emit a TSV of word1/word2/label/relation/fold, with a
# deterministic shuffle assigning the first 167 rows to val and the rest to
# test.
function download_wbless () {
    echo -e 'word1\tword2\tlabel\trelation\tfold'
    curl $CURL_OPTIONS "$KIMANH_REPO_URL/AWBLESS.txt" | \
        deterministic_shuffle | \
        awk '{ print $0 "\t" (NR < 168 ? "val" : "test") }'
}

# Download EVALution (val + test files), de-duplicate, strip the -j/-v/-n
# POS suffixes from terms, and emit a TSV of word1/word2/label/relation/fold
# with a deterministic shuffle: first 736 rows -> val, remainder -> test.
function download_eval () {
    echo -e 'word1\tword2\tlabel\trelation\tfold'
    curl $CURL_OPTIONS "$VERED_REPO_URL/EVALution.val" "$VERED_REPO_URL/EVALution.test" | \
        sort | uniq | sed 's/-[jvn]\t/\t/g' | \
        deterministic_shuffle | \
        awk '{ print $0 "\t" (NR < 737 ? "val" : "test") }'
}


# ---------------------------------------------------------------------------
# Driver: resolve the output directory, verify required tools are present,
# then write each dataset to "$HYPERNYMY_DATA_OUTPUT/<name>.tsv".
# ---------------------------------------------------------------------------

# Let the user specify output directory, default to `data`
# Ex: `HYPERNYMY_DATA_OUTPUT=.my_data_dir bash download_data.sh`
# (variable quoted: the original unquoted `[ -z $VAR ]` mis-parses when the
# value is empty-with-spaces or contains globs)
if [ -z "$HYPERNYMY_DATA_OUTPUT" ]; then
    HYPERNYMY_DATA_OUTPUT="data"
fi

# Refuse to overwrite an existing download
if [ -d "$HYPERNYMY_DATA_OUTPUT" ]
then
    echo "Warning: Already found the data. Please run 'rm -rf $HYPERNYMY_DATA_OUTPUT'" >&2
    exit 1
fi

# unrar is needed to unpack the Shwartz dataset archive
if [ ! -x "$(command -v unrar)" ]
then
    warning "This script requires the 'unrar' tool. Please run"
    warning "  brew install unrar"
    warning "or whatever your system's equivalent is."
    exit 1
fi

# openssl powers the deterministic shuffle (get_seeded_random)
if [ ! -x "$(command -v openssl)" ]
then
    warning "This script requires the 'openssl' tool. Please run"
    # BUGFIX: this message previously told the user to install 'unrar'
    warning "  brew install openssl"
    warning "or whatever your system's equivalent is."
    exit 1
fi



# prep the output folder
mkdir -p "$HYPERNYMY_DATA_OUTPUT"


warning "[1/7] Downloading BLESS"
download_bless > "$HYPERNYMY_DATA_OUTPUT/bless.tsv"

warning "[2/7] Downloading LEDS"
download_leds > "$HYPERNYMY_DATA_OUTPUT/leds.tsv"

warning "[3/7] Downloading EVAL"
download_eval > "$HYPERNYMY_DATA_OUTPUT/eval.tsv"

warning "[4/7] Downloading Shwartz"
download_shwartz > "$HYPERNYMY_DATA_OUTPUT/shwartz.tsv"

warning "[5/7] Downloading Hyperlex"
download_hyperlex > "$HYPERNYMY_DATA_OUTPUT/hyperlex_rnd.tsv"

warning "[6/7] Downloading WBLESS"
download_wbless > "$HYPERNYMY_DATA_OUTPUT/wbless.tsv"

warning "[7/7] Downloading BiBLESS"
download_bibless > "$HYPERNYMY_DATA_OUTPUT/bibless.tsv"

warning "All done."
Loading

0 comments on commit 7547de1

Please sign in to comment.