From 2f4461d1d0a38d6a06c51a6a9568d0ce2bab094b Mon Sep 17 00:00:00 2001
From: Dave
Date: Thu, 7 Jul 2022 17:25:44 -0400
Subject: [PATCH] revert training params, UTF-8 chars in metadata

---
 examples/trainers/news_lstm.py        |  6 +++---
 examples/trainers/news_transformer.py | 10 +++++-----
 setup.py                              |  2 +-
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/examples/trainers/news_lstm.py b/examples/trainers/news_lstm.py
index 5b136ba..87f8b9a 100644
--- a/examples/trainers/news_lstm.py
+++ b/examples/trainers/news_lstm.py
@@ -86,9 +86,9 @@ def main(save_path: str):
         os.mkdir(save_path)
 
     tokenizer, data = train_tokenizer()
-    with open(f"{save_path}/metadata.tsv", "w") as tsv:
+    with open(f"{save_path}/metadata.tsv", "w", encoding="utf8") as tsv:
         for token, _ in sorted(tokenizer.get_vocab().items(), key=lambda s: s[-1]):
-            tsv.write(f"{token}\n")
+            tsv.write(f"{token.encode('utf8')}\n")
 
     model = LstmAutoEncoder(
         max_sequence_len=MAX_SEQUENCE_LENGTH,
@@ -115,7 +115,7 @@ def main(save_path: str):
     projector.visualize_embeddings(logdir=save_path, config=config)
 
     model.fit(
-        x=data.prefetch(8).batch(64),
+        x=data.prefetch(8).shuffle(10_000).batch(64),
         callbacks=[
             callbacks.TensorBoard(log_dir=save_path, write_graph=True, update_freq=100),
             callbacks.LambdaCallback(
diff --git a/examples/trainers/news_transformer.py b/examples/trainers/news_transformer.py
index 95941f2..780ce1c 100644
--- a/examples/trainers/news_transformer.py
+++ b/examples/trainers/news_transformer.py
@@ -37,7 +37,7 @@ def train_tokenizer() -> Tuple[tokenizers.Tokenizer, tf.data.Dataset]:
         pre_tokenizers.Digits(individual_digits=False)
     ])
 
-    dataset = datasets.load_dataset("multi_news", split="test")
+    dataset = datasets.load_dataset("multi_news", split="train")
 
     def batch_iterator(batch_size=1000):
         for i in range(0, len(dataset), batch_size):
@@ -86,9 +86,9 @@ def main(save_path: str):
         os.mkdir(save_path)
 
     tokenizer, data = train_tokenizer()
-    with open(f"{save_path}/metadata.tsv", "w") as tsv:
+    with open(f"{save_path}/metadata.tsv", "w", encoding="utf8") as tsv:
         for token, _ in sorted(tokenizer.get_vocab().items(), key=lambda s: s[-1]):
-            tsv.write(f"{token}\n")
+            tsv.write(f"{token.encode('utf8')}\n")
 
     model = TransformerAutoEncoder(
         max_sequence_len=MAX_SEQUENCE_LENGTH,
@@ -115,7 +115,7 @@ def main(save_path: str):
     projector.visualize_embeddings(logdir=save_path, config=config)
 
     model.fit(
-        x=data.prefetch(8).batch(64),
+        x=data.prefetch(8).shuffle(10_000).batch(64),
         callbacks=[
             callbacks.TensorBoard(log_dir=save_path, write_graph=True, update_freq=100),
             callbacks.LambdaCallback(
@@ -126,7 +126,7 @@ def main(save_path: str):
                 )
             )
         ],
-        epochs=2
+        epochs=10
     )
 
     model.save(
diff --git a/setup.py b/setup.py
index 7d85d2a..2d06597 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 
 setup(
     name="text2vec",
-    version="2.0.0",
+    version="2.0.1",
     description="Building blocks for text vectorization and embedding",
     author="Dave Hollander",
     author_url="https://github.com/brainsqueeze",