Merge pull request #14 from brainsqueeze/dev

revert training params, UTF-8 chars in metadata

brainsqueeze authored Jul 7, 2022
2 parents c00e3a8 + 2f4461d · commit 27fd3aa
Showing 3 changed files with 9 additions and 9 deletions.

examples/trainers/news_lstm.py (6 changes: 3 additions & 3 deletions)

@@ -86,9 +86,9 @@ def main(save_path: str):
         os.mkdir(save_path)
 
     tokenizer, data = train_tokenizer()
-    with open(f"{save_path}/metadata.tsv", "w") as tsv:
+    with open(f"{save_path}/metadata.tsv", "w", encoding="utf8") as tsv:
         for token, _ in sorted(tokenizer.get_vocab().items(), key=lambda s: s[-1]):
-            tsv.write(f"{token}\n")
+            tsv.write(f"{token.encode('utf8')}\n")
 
     model = LstmAutoEncoder(
         max_sequence_len=MAX_SEQUENCE_LENGTH,
@@ -115,7 +115,7 @@ def main(save_path: str):
     projector.visualize_embeddings(logdir=save_path, config=config)
 
     model.fit(
-        x=data.prefetch(8).batch(64),
+        x=data.prefetch(8).shuffle(10_000).batch(64),
         callbacks=[
             callbacks.TensorBoard(log_dir=save_path, write_graph=True, update_freq=100),
             callbacks.LambdaCallback(
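
For reference, a minimal standalone sketch of the metadata-writing step above (the same hunk appears in news_transformer.py below), assuming a trained Hugging Face tokenizers.Tokenizer; write_metadata is a hypothetical helper, and only Tokenizer.get_vocab() is real API. Because the file handle carries the UTF-8 encoding, tokens can be written as plain strings:

    # Sketch: dump a tokenizer vocabulary as TensorBoard projector metadata.
    import tokenizers

    def write_metadata(tokenizer: tokenizers.Tokenizer, save_path: str) -> None:
        # Sort tokens by vocabulary index so TSV row N labels embedding row N.
        vocab = sorted(tokenizer.get_vocab().items(), key=lambda item: item[1])
        with open(f"{save_path}/metadata.tsv", "w", encoding="utf8") as tsv:
            for token, _index in vocab:
                # The handle's encoding already covers non-ASCII tokens.
                tsv.write(f"{token}\n")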

examples/trainers/news_transformer.py (10 changes: 5 additions & 5 deletions)

@@ -37,7 +37,7 @@ def train_tokenizer() -> Tuple[tokenizers.Tokenizer, tf.data.Dataset]:
         pre_tokenizers.Digits(individual_digits=False)
     ])
 
-    dataset = datasets.load_dataset("multi_news", split="test")
+    dataset = datasets.load_dataset("multi_news", split="train")
 
     def batch_iterator(batch_size=1000):
         for i in range(0, len(dataset), batch_size):
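
As an aside before the diff continues, a hedged sketch of the batch-iterator pattern this hunk touches: training a tokenizer from a datasets split in fixed-size batches so the corpus is never fully materialized. The WordPiece model, trainer settings, and the "document" column are illustrative assumptions, not taken from this repository:

    # Sketch: stream multi_news text into a tokenizer trainer batch by batch.
    import datasets
    import tokenizers
    from tokenizers import models, trainers

    dataset = datasets.load_dataset("multi_news", split="train")

    def batch_iterator(batch_size=1000):
        # Slice the dataset so only one batch of documents is held in memory.
        for i in range(0, len(dataset), batch_size):
            yield dataset[i:i + batch_size]["document"]

    tokenizer = tokenizers.Tokenizer(models.WordPiece(unk_token="[UNK]"))
    trainer = trainers.WordPieceTrainer(vocab_size=30_000, special_tokens=["[UNK]"])
    tokenizer.train_from_iterator(batch_iterator(), trainer=trainer, length=len(dataset))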

@@ -86,9 +86,9 @@ def main(save_path: str):
         os.mkdir(save_path)
 
     tokenizer, data = train_tokenizer()
-    with open(f"{save_path}/metadata.tsv", "w") as tsv:
+    with open(f"{save_path}/metadata.tsv", "w", encoding="utf8") as tsv:
         for token, _ in sorted(tokenizer.get_vocab().items(), key=lambda s: s[-1]):
-            tsv.write(f"{token}\n")
+            tsv.write(f"{token.encode('utf8')}\n")
 
     model = TransformerAutoEncoder(
         max_sequence_len=MAX_SEQUENCE_LENGTH,
@@ -115,7 +115,7 @@ def main(save_path: str):
     projector.visualize_embeddings(logdir=save_path, config=config)
 
     model.fit(
-        x=data.prefetch(8).batch(64),
+        x=data.prefetch(8).shuffle(10_000).batch(64),
         callbacks=[
             callbacks.TensorBoard(log_dir=save_path, write_graph=True, update_freq=100),
             callbacks.LambdaCallback(
@@ -126,7 +126,7 @@ def main(save_path: str):
                 )
             )
         ],
-        epochs=2
+        epochs=10
     )
 
     model.save(
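
The restored shuffle(10_000) sits between prefetch and batch; the important property is that shuffling happens before batching, so individual examples are remixed each epoch rather than whole batches being reordered. A runnable toy illustration (the range dataset is a stand-in, not the news corpus):

    # Sketch: shuffle-before-batch vs. batch-before-shuffle in tf.data.
    import tensorflow as tf

    examples = tf.data.Dataset.range(10)

    # Shuffling first draws single examples from a buffer, so batch
    # composition changes from epoch to epoch.
    shuffled_then_batched = examples.shuffle(buffer_size=10).batch(4)

    # Batching first freezes batch membership; shuffle then only
    # reorders the fixed batches.
    batched_then_shuffled = examples.batch(4).shuffle(buffer_size=10)

    for batch in shuffled_then_batched.take(1):
        print(batch.numpy())  # e.g. [7 2 9 0]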

setup.py (2 changes: 1 addition & 1 deletion)

@@ -3,7 +3,7 @@
 
 setup(
     name="text2vec",
-    version="2.0.0",
+    version="2.0.1",
     description="Building blocks for text vectorization and embedding",
     author="Dave Hollander",
     author_url="https://github.com/brainsqueeze",
