Skip to content

Commit

Permalink
emnlp code upload
Browse files Browse the repository at this point in the history
  • Loading branch information
changlongyu committed Oct 9, 2020
1 parent aa67bc2 commit 7547de1
Show file tree
Hide file tree
Showing 19 changed files with 4,968 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# macOS Finder metadata files
.DS_Store
28 changes: 28 additions & 0 deletions config/bert.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
;TIP: one can comment lines in this config format by adding a ; at the start of a line

[data]

; gzipped Hearst pattern counts from the hypernymysuite release
pattern_filename = /home/shared/hypernymysuite/hearst_counts.txt.gz
; presumably directories of harvested context data (in-pattern vs. OOV pairs)
; — confirm against the data-loading code
context = /home/shared/hearst_full_context/
context_oov = /home/shared/context/
; pretrained BERT-base (uncased) model directory
bert_path = /home/shared/pretrained-lm/bert/bert_base_uncased/
; checkpoint output directory
; NOTE(review): was "comHyper"; normalized to "compHyper" to match
; bert_large.cfg / context.cfg / word.cfg — confirm the directory name on disk
ckpt = /home/cyuaq/compHyper/checkpoints_binary


[hyperparameters]

model = bert_base
svd_dimension = 50
number_hidden_layers = 2
hidden_layer_size = 300
batch_size = 32
negative_num = 1
max_epochs = 500
learning_rate = 0.00001
weight_decay = 0

context_num = 10
context_len = 10
max_seq_length = 64

; comma-separated list of GPU device ids
gpu_device = 0,1,2,3
28 changes: 28 additions & 0 deletions config/bert_large.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
;TIP: one can comment lines in this config format by adding a ; at the start of a line

[data]

; gzipped Hearst pattern counts from the hypernymysuite release
; (spacing normalized to the file's "key = value" style)
pattern_filename = /home/shared/hypernymysuite/hearst_counts.txt.gz
; presumably directories of harvested context data (in-pattern vs. OOV pairs)
; — confirm against the data-loading code
context = /home/shared/hearst_full_context/
context_oov = /home/shared/context/
; pretrained BERT-large (uncased) model directory
bert_path = /home/shared/pretrained-lm/bert/bert_large_uncased/
; checkpoint output directory
ckpt = /home/cyuaq/compHyper/checkpoints_binary


[hyperparameters]

model = bert_large
svd_dimension = 50
number_hidden_layers = 2
hidden_layer_size = 300
; smaller batch than bert.cfg, consistent with the larger model's memory footprint
batch_size = 8
negative_num = 1
max_epochs = 500
learning_rate = 0.00001
weight_decay = 0

context_num = 10
context_len = 10
max_seq_length = 64

; comma-separated list of GPU device ids
gpu_device = 0,1,2,3
26 changes: 26 additions & 0 deletions config/context.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
;TIP: one can comment lines in this config format by adding a ; at the start of a line

[data]

; gzipped Hearst pattern counts from the hypernymysuite release
; (spacing normalized to the file's "key = value" style)
pattern_filename = /home/shared/hypernymysuite/hearst_counts.txt.gz
; presumably directories of harvested context data (in-pattern vs. OOV pairs)
; — confirm against the data-loading code
context = /home/shared/task1_hearst_full_context/
context_oov = /home/shared/task1_context/
; checkpoint output directory
ckpt = /home/cyuaq/compHyper/checkpoints_context


[hyperparameters]

model = han
svd_dimension = 50
number_hidden_layers = 2
hidden_layer_size = 300
batch_size = 32
negative_num = 1
max_epochs = 500
learning_rate = 0.001
weight_decay = 0

context_num = 10
context_len = 10

; single GPU device id
gpu_device = 3
25 changes: 25 additions & 0 deletions config/word.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
;TIP: one can comment lines in this config format by adding a ; at the start of a line

[data]


; two training data files have to be aligned (the two vectors of the same word in the same line)

; gzipped Hearst pattern counts from the hypernymysuite release
; (spacing normalized to the file's "key = value" style)
pattern_filename = /home/shared/hypernymysuite/hearst_counts.txt.gz
context = /home/shared/context
; checkpoint output directory
; NOTE(review): only this config carries a trailing slash on ckpt — harmless
; if the consumer joins paths, but confirm for consistency
ckpt = /home/cyuaq/compHyper/checkpoints_word/


[hyperparameters]

model = mlp_unisample_svd
svd_dimension = 50
number_hidden_layers = 2
hidden_layer_size = 300
batch_size = 128
negative_num = 400
max_epochs = 500
learning_rate = 0.001
weight_decay = 0

; single GPU device id
gpu_device = 3
194 changes: 194 additions & 0 deletions data/download_data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
#!/bin/bash
#
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
# -------------------------------------------------------------------------------
# This shell script downloads and preprocesses all the datasets
# -------------------------------------------------------------------------------

# Directly from the repo: https://github.com/facebookresearch/hypernymysuite/blob/master/download_data.sh

# Immediately quit on error
set -e

# if you have any proxies, etc., put them here
# (expanded unquoted into every curl call below; -s = silent, no progress bar)
CURL_OPTIONS="-s"


# URLS of each of the different datasets
# lexical_inference.zip: contains both the BLESS and LEDS entailment splits
OMER_URL="http://u.cs.biu.ac.il/~nlp/wp-content/uploads/lexical_inference.zip"
# RAR archive with the Shwartz random split (requires unrar)
SHWARTZ_URL="https://raw.githubusercontent.com/vered1986/HypeNET/v2/dataset/datasets.rar"
# pinned commit of the UnsupervisedHypernymy datasets (EVALution files)
VERED_REPO_URL="https://raw.githubusercontent.com/vered1986/UnsupervisedHypernymy/e3b22709365c7b3042126e5887c9baa03631354e/datasets"
# pinned commit of the HyperVec classification datasets (WBLESS / BiBLESS)
KIMANH_REPO_URL="https://raw.githubusercontent.com/nguyenkh/HyperVec/bd2cb15a6be2a4726ffbf9c0d7e742144790dee3/datasets_classification"
HYPERLEX_URL="https://raw.githubusercontent.com/ivulic/hyperlex/master/hyperlex-data.zip"

# Print a message on stderr, keeping stdout free for dataset rows.
function warning () {
    printf '%s\n' "$1" 1>&2
}

# Emit an endless byte stream that is fully determined by the seed in $1:
# the AES-256-CTR keystream over /dev/zero. Same seed -> same bytes, so it
# can serve as a reproducible --random-source for sort.
get_seeded_random()
{
    openssl enc -aes-256-ctr -pass pass:"$1" -nosalt \
        </dev/zero 2>/dev/null
}

# Shuffle stdin into a pseudo-random but reproducible order: the fixed seed
# (42) means every run on every machine produces the same ordering, so the
# val/test splits cut from this stream downstream are stable.
function deterministic_shuffle () {
# sort randomly but with a predictable seed
sort --random-sort --random-source=<(get_seeded_random 42)
}

# Download HyperLex and emit one TSV on stdout with columns
# word1/word2/pos/label/score/fold, where fold is train/val/test taken from
# the dataset's own "random" split files (dev is relabelled val).
function download_hyperlex () {
    local archive scratch pair split fold
    archive="$(mktemp)"
    scratch="$(mktemp -d)"
    curl $CURL_OPTIONS "$HYPERLEX_URL" > "$archive"
    unzip "$archive" -d "$scratch" > /dev/null

    echo -e 'word1\tword2\tpos\tlabel\tscore\tfold'
    # split-file prefix : fold name emitted in the last column
    for pair in training:train dev:val test:test; do
        split="${pair%%:*}"
        fold="${pair#*:}"
        # drop the header row, keep the first 5 space-separated fields,
        # convert to tabs, and append the fold label
        grep -v WORD1 "$scratch/splits/random/hyperlex_${split}_all_random.txt" | \
            cut -d' ' -f1-5 | tr ' ' '\t' | \
            awk -F'\t' -v f="$fold" '$0=$0"\t"f'
    done

    rm -rf "$archive" "$scratch"
}

# Download BLESS (inside the Omer lexical_inference bundle) and emit a TSV
# of word1/word2/label/fold. All three original splits are pooled, CRs are
# stripped, and a deterministic shuffle assigns the first 1453 rows to val
# and the rest to test.
function download_bless () {
    local archive scratch bless_dir
    archive="$(mktemp)"
    scratch="$(mktemp -d)"
    curl $CURL_OPTIONS "$OMER_URL" > "$archive"
    unzip "$archive" -d "$scratch" > /dev/null

    bless_dir="${scratch}/lexical_entailment/bless2011"
    echo -e 'word1\tword2\tlabel\tfold'
    cat "$bless_dir/data_rnd_test.tsv" \
        "$bless_dir/data_rnd_train.tsv" \
        "$bless_dir/data_rnd_val.tsv" | \
        tr -d '\15' | \
        deterministic_shuffle | \
        awk '{ print $0 "\t" (NR < 1454 ? "val" : "test") }'

    rm -rf "$archive" "$scratch"
}

# Download LEDS (baroni2012, same Omer bundle as BLESS) and emit a TSV of
# word1/word2/label/fold: splits pooled, CRs stripped, deterministic shuffle,
# first 275 rows -> val, remainder -> test.
function download_leds () {
    local archive scratch leds_dir
    archive="$(mktemp)"
    scratch="$(mktemp -d)"
    curl $CURL_OPTIONS "$OMER_URL" > "$archive"
    unzip "$archive" -d "$scratch" > /dev/null

    leds_dir="${scratch}/lexical_entailment/baroni2012"
    echo -e 'word1\tword2\tlabel\tfold'
    cat "$leds_dir/data_rnd_test.tsv" \
        "$leds_dir/data_rnd_train.tsv" \
        "$leds_dir/data_rnd_val.tsv" | \
        tr -d '\15' | \
        deterministic_shuffle | \
        awk '{ print $0 "\t" (NR < 276 ? "val" : "test") }'

    rm -rf "$archive" "$scratch"
}

# Download the Shwartz random split (RAR archive; needs unrar) and emit a TSV
# of word1/word2/label/fold. Rows containing spaces (multi-word terms) are
# dropped; deterministic shuffle, first 5256 rows -> val, remainder -> test.
function download_shwartz () {
    local archive scratch
    archive="$(mktemp)"
    scratch="$(mktemp -d)"
    curl $CURL_OPTIONS "$SHWARTZ_URL" > "$archive"

    unrar x "$archive" "$scratch" >/dev/null
    echo -e 'word1\tword2\tlabel\tfold'
    cat "$scratch/dataset_rnd/train.tsv" \
        "$scratch/dataset_rnd/test.tsv" \
        "$scratch/dataset_rnd/val.tsv" | \
        grep -v ' ' | \
        deterministic_shuffle | \
        awk '{ print $0 "\t" (NR < 5257 ? "val" : "test") }'

    rm -rf "$archive" "$scratch"
}

# Download BiBLESS and emit a TSV of word1/word2/relation/label, mapping the
# relation column to a numeric label: hyper -> 1, other -> 0, anything
# else -> -1.
function download_bibless () {
    echo -e 'word1\tword2\trelation\tlabel'
    curl $CURL_OPTIONS "$KIMANH_REPO_URL/ABIBLESS.txt" | \
        cut -f1,2,4 | \
        awk -F'\t' '{
            if ($3 == "hyper")      { print $0 "\t1" }
            else if ($3 == "other") { print $0 "\t0" }
            else                    { print $0 "\t-1" }
        }'
}

# Download WBLESS and emit a TSV of word1/word2/label/relation/fold, with a
# deterministic shuffle assigning the first 167 rows to val and the rest to
# test.
function download_wbless () {
    echo -e 'word1\tword2\tlabel\trelation\tfold'
    curl $CURL_OPTIONS "$KIMANH_REPO_URL/AWBLESS.txt" | \
        deterministic_shuffle | \
        awk '{ print $0 "\t" (NR < 168 ? "val" : "test") }'
}

# Download EVALution (val + test files), de-duplicate, strip the -j/-v/-n
# POS suffixes from terms, and emit a TSV of word1/word2/label/relation/fold
# with a deterministic shuffle: first 736 rows -> val, remainder -> test.
function download_eval () {
    echo -e 'word1\tword2\tlabel\trelation\tfold'
    curl $CURL_OPTIONS "$VERED_REPO_URL/EVALution.val" "$VERED_REPO_URL/EVALution.test" | \
        sort | uniq | sed 's/-[jvn]\t/\t/g' | \
        deterministic_shuffle | \
        awk '{ print $0 "\t" (NR < 737 ? "val" : "test") }'
}


# ---------------------------------------------------------------------------
# Driver: resolve the output directory, verify required tools are present,
# then write each dataset to "$HYPERNYMY_DATA_OUTPUT/<name>.tsv".
# ---------------------------------------------------------------------------

# Let the user specify output directory, default to `data`
# Ex: `HYPERNYMY_DATA_OUTPUT=.my_data_dir bash download_data.sh`
# (variable quoted: the original unquoted `[ -z $VAR ]` mis-parses when the
# value is empty-with-spaces or contains globs)
if [ -z "$HYPERNYMY_DATA_OUTPUT" ]; then
    HYPERNYMY_DATA_OUTPUT="data"
fi

# Refuse to overwrite an existing download
if [ -d "$HYPERNYMY_DATA_OUTPUT" ]
then
    echo "Warning: Already found the data. Please run 'rm -rf $HYPERNYMY_DATA_OUTPUT'" >&2
    exit 1
fi

# unrar is needed to unpack the Shwartz dataset archive
if [ ! -x "$(command -v unrar)" ]
then
    warning "This script requires the 'unrar' tool. Please run"
    warning "  brew install unrar"
    warning "or whatever your system's equivalent is."
    exit 1
fi

# openssl powers the deterministic shuffle (get_seeded_random)
if [ ! -x "$(command -v openssl)" ]
then
    warning "This script requires the 'openssl' tool. Please run"
    # BUGFIX: this message previously told the user to install 'unrar'
    warning "  brew install openssl"
    warning "or whatever your system's equivalent is."
    exit 1
fi



# prep the output folder
mkdir -p "$HYPERNYMY_DATA_OUTPUT"


warning "[1/7] Downloading BLESS"
download_bless > "$HYPERNYMY_DATA_OUTPUT/bless.tsv"

warning "[2/7] Downloading LEDS"
download_leds > "$HYPERNYMY_DATA_OUTPUT/leds.tsv"

warning "[3/7] Downloading EVAL"
download_eval > "$HYPERNYMY_DATA_OUTPUT/eval.tsv"

warning "[4/7] Downloading Shwartz"
download_shwartz > "$HYPERNYMY_DATA_OUTPUT/shwartz.tsv"

warning "[5/7] Downloading Hyperlex"
download_hyperlex > "$HYPERNYMY_DATA_OUTPUT/hyperlex_rnd.tsv"

warning "[6/7] Downloading WBLESS"
download_wbless > "$HYPERNYMY_DATA_OUTPUT/wbless.tsv"

warning "[7/7] Downloading BiBLESS"
download_bibless > "$HYPERNYMY_DATA_OUTPUT/bibless.tsv"

warning "All done."
Loading

0 comments on commit 7547de1

Please sign in to comment.