From 13ee8091ed4eb85430f5507740f149d83010d953 Mon Sep 17 00:00:00 2001
From: Dave
Date: Fri, 1 Oct 2021 16:44:05 -0400
Subject: [PATCH] Self contained auto-encoder classes

---
 examples/trainers/wiki_lstm.py        | 145 +++++++++++++++++++++
 examples/trainers/wiki_transformer.py |  16 +--
 setup.py                              |   2 +-
 text2vec/autoencoders.py              | 178 +++++++++++++++++++++++---
 text2vec/models/sequential.py         |   4 +-
 5 files changed, 316 insertions(+), 29 deletions(-)
 create mode 100644 examples/trainers/wiki_lstm.py

diff --git a/examples/trainers/wiki_lstm.py b/examples/trainers/wiki_lstm.py
new file mode 100644
index 0000000..60f9a30
--- /dev/null
+++ b/examples/trainers/wiki_lstm.py
@@ -0,0 +1,145 @@
+from typing import Generator, List, Tuple, Union
+import os
+
+import datasets
+import tokenizers
+from tokenizers import models
+from tokenizers import decoders
+from tokenizers import normalizers
+from tokenizers import pre_tokenizers
+from tokenizers import processors
+from tokenizers import trainers
+from nltk.tokenize import PunktSentenceTokenizer
+
+import numpy as np
+import tensorflow as tf
+from tensorboard.plugins import projector
+
+from text2vec.autoencoders import LstmAutoEncoder
+from text2vec.optimizer_tools import RampUpDecaySchedule
+
+os.environ["TOKENIZERS_PARALLELISM"] = "true"
+sent_tokenizer = PunktSentenceTokenizer().tokenize
+
+
+def train_tokenizer() -> Tuple[tokenizers.Tokenizer, tf.data.Dataset]:
+    tokenizer = tokenizers.Tokenizer(models.WordPiece(unk_token="<unk>"))
+    tokenizer.decoder = decoders.WordPiece()
+
+    tokenizer.normalizer = normalizers.Sequence([
+        normalizers.NFD(),  # NFD unicode normalizer
+        normalizers.Lowercase(),
+        normalizers.StripAccents()
+    ])
+    tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Sequence([
+        pre_tokenizers.Whitespace(),
+        pre_tokenizers.Digits(individual_digits=False)
+    ])
+    tokenizer.post_processor = processors.TemplateProcessing(
+        single="$A </s>",
+        pair="$A </s> [SEP] <s> $B:1 </s>",
+        special_tokens=[("[SEP]", 1), ("<s>", 2), ("</s>", 3)]
+    )
+
+    dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1", split="test")
+
+    def batch_iterator(batch_size=1000):
+        for i in range(0, len(dataset), batch_size):
+            yield dataset[i: i + batch_size]["text"]
+
+    tokenizer.train_from_iterator(
+        batch_iterator(),
+        trainer=trainers.WordPieceTrainer(
+            vocab_size=10000,
+            special_tokens=["<unk>", "[SEP]", "<s>", "</s>"]
+        )
+    )
+
+    def generator():
+        for record in dataset:
+            if record['text'].strip() != '':
+                for sentence in sent_tokenizer(record['text']):
+                    yield sentence
+
+    data = tf.data.Dataset.from_generator(generator, output_signature=(tf.TensorSpec(shape=(None), dtype=tf.string)))
+    data = data.map(tf.strings.strip, num_parallel_calls=tf.data.experimental.AUTOTUNE)
+    return tokenizer, data
+
+
+def main(save_path: str):
+    if not os.path.isdir(save_path):
+        os.mkdir(save_path)
+
+    tokenizer, data = train_tokenizer()
+    tokenizer.enable_truncation(2 * 512 + 1)  # encoding + decoding + [SEP] token
+
+    with open(f"{save_path}/metadata.tsv", "w") as tsv:
+        for token, _ in sorted(tokenizer.get_vocab().items(), key=lambda s: s[-1]):
+            tsv.write(f"{token}\n")
+
+    def encode(x):
+        def token_mapper(text: Union[str, List[str]]):
+            text = text.numpy()
+
+            if isinstance(text, np.ndarray):
+                enc, dec = [], []
+                for batch in tokenizer.encode_batch([(t.decode('utf8'), t.decode('utf8')) for t in text]):
+                    enc_, dec_ = ' '.join(batch.tokens).split(' [SEP] ')
+                    enc.append(enc_)
+                    dec.append(dec_)
+                return (enc, dec)
+
+            text = text.decode('utf8')
+            enc, dec = ' '.join(tokenizer.encode(text, pair=text).tokens).split(' [SEP] ')
+            return (enc, dec)
+
+        return tf.py_function(token_mapper, inp=[x], Tout=[tf.string, tf.string])
+
+    model = LstmAutoEncoder(
+        max_sequence_len=512,
+        embedding_size=128,
+        token_hash=tokenizer.get_vocab(),
+        input_keep_prob=0.7,
+        hidden_keep_prob=0.5
+    )
+    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=RampUpDecaySchedule(embedding_size=128)))
+    checkpoint = tf.train.Checkpoint(Classifier=model, optimizer=model.optimizer)
+    checkpoint_manager = tf.train.CheckpointManager(checkpoint, save_path, max_to_keep=3)
+
+    # add word labels to the projector
+    config = projector.ProjectorConfig()
+    # pylint: disable=no-member
+    embeddings_config = config.embeddings.add()
+
+    checkpoint_manager.save()
+    reader = tf.train.load_checkpoint(save_path)
+    embeddings_config.tensor_name = [key for key in reader.get_variable_to_shape_map() if "embedding" in key][0]
+    embeddings_config.metadata_path = f"{save_path}/metadata.tsv"
+    projector.visualize_embeddings(logdir=save_path, config=config)
+
+    data = data.map(encode, num_parallel_calls=tf.data.experimental.AUTOTUNE)
+    model.fit(
+        x=data.prefetch(8).batch(64),
+        callbacks=[
+            tf.keras.callbacks.TensorBoard(
+                log_dir=save_path,
+                write_graph=True,
+                update_freq=100
+            ),
+            tf.keras.callbacks.LambdaCallback(on_epoch_end=lambda epoch, logs: checkpoint_manager.save())
+        ],
+        epochs=1
+    )
+
+    model.save(
+        filepath=f"{save_path}/saved_model",
+        save_format="tf",
+        include_optimizer=False,
+        signatures={"serving_default": model.embed, "token_embed": model.token_embed}
+    )
+    tokenizer.save(path=f"{save_path}/tokenizer.json")
+    return model
+
+
+if __name__ == '__main__':
+    main(save_path='./wiki_t2v')
diff --git a/examples/trainers/wiki_transformer.py b/examples/trainers/wiki_transformer.py
index de5cd15..f0dde3e 100644
--- a/examples/trainers/wiki_transformer.py
+++ b/examples/trainers/wiki_transformer.py
@@ -17,7 +17,6 @@
 
 from text2vec.autoencoders import TransformerAutoEncoder
 from text2vec.optimizer_tools import RampUpDecaySchedule
-from text2vec.training_tools import ServingModel
 
 os.environ["TOKENIZERS_PARALLELISM"] = "true"
 sent_tokenizer = PunktSentenceTokenizer().tokenize
@@ -132,16 +131,11 @@ def token_mapper(text: Union[str, List[str]]):
         epochs=1
     )
 
-    # tf.keras.models.save_model(model, filepath=f"{save_path}/saved_model", include_optimizer=False, save_format="tf")
-    serve_model = ServingModel(
-        tokenizer=model.tokenizer,
-        embed_layer=model.embed_layer,
-        encode_layer=model.encode_layer
-    )
-    tf.saved_model.save(
-        obj=serve_model,
-        export_dir=f"{save_path}/saved_model",
-        signatures={"serving_default": serve_model.embed, "token_embed": serve_model.token_embed}
+    model.save(
+        filepath=f"{save_path}/saved_model",
+        save_format="tf",
+        include_optimizer=False,
+        signatures={"serving_default": model.embed, "token_embed": model.token_embed}
     )
     tokenizer.save(path=f"{save_path}/tokenizer.json")
     return model
diff --git a/setup.py b/setup.py
index 0e0841e..37d2ef0 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 
 setup(
     name="text2vec",
-    version="1.3.1",
+    version="1.4.0",
     description="Building blocks for text vectorization and embedding",
     author="Dave Hollander",
     author_url="https://github.com/brainsqueeze",
diff --git a/text2vec/autoencoders.py b/text2vec/autoencoders.py
index 074e4ed..8997ecc 100644
--- a/text2vec/autoencoders.py
+++ b/text2vec/autoencoders.py
@@ -1,4 +1,6 @@
 # pylint: disable=too-many-ancestors
+from typing import Dict
+
 import tensorflow as tf
 
 from text2vec.models.components.feeder import Tokenizer
@@ -85,15 +87,19 @@ def __init__(self, max_sequence_len: int, embedding_size: int,
         self.encode_layer = TransformerEncoder(n_stacks=1, layers=8, **params)
         self.decode_layer = TransformerDecoder(n_stacks=1, layers=8, **params)
 
+    def call(self, tokens, **kwargs):
+        tokens = self.tokenizer(tokens)
+        x_enc, enc_mask, _ = self.embed_layer(tokens, **kwargs)
+        x_enc, context = self.encode_layer(x_enc, mask=enc_mask, training=kwargs.get("training", False))
+        return x_enc, context, enc_mask
+
     def train_step(self, data):
         encoding_tok, decoding_tok = data
-        encoding_tok = self.tokenizer(encoding_tok)
         decoding_tok = self.tokenizer(decoding_tok)
 
         with tf.GradientTape() as tape:
             with tf.name_scope('Encoding'):
-                x_enc, enc_mask, _ = self.embed_layer(encoding_tok)
-                x_enc, context = self.encode_layer(x_enc, mask=enc_mask, training=True)
+                x_enc, context, enc_mask = self(encoding_tok, training=True)
 
             with tf.name_scope('Decoding'):
                 targets = decoding_tok[:, 1:]  # skip the <s> token with the slice on axis=1
@@ -128,10 +134,78 @@ def train_step(self, data):
             return {"loss": loss, 'learning_rate': self.optimizer.learning_rate(self.optimizer.iterations)}
         return {"loss": loss, 'learning_rate': self.optimizer.learning_rate}
 
-    def call(self, tokens, **kwargs):
-        tokens = self.tokenizer(tf.squeeze(tokens))
-        x_enc, enc_mask, _ = self.embed_layer(tokens, **kwargs)
-        return self.encode_layer(x_enc, mask=enc_mask, **kwargs)
+    def test_step(self, data):
+        encoding_tok, decoding_tok = data
+        decoding_tok = self.tokenizer(decoding_tok)
+
+        with tf.name_scope('Encoding'):
+            x_enc, context, enc_mask = self(encoding_tok, training=False)
+
+        with tf.name_scope('Decoding'):
+            targets = tf.ragged.map_flat_values(self.embed_layer.table.lookup, decoding_tok[:, 1:])  # skip <s>
+            targets = self.embed_layer.slicer(targets)
+
+            decoding_tok, dec_mask, _ = self.embed_layer(decoding_tok[:, :-1])  # skip </s>
+            decoding_tok = self.decode_layer(
+                x_enc=x_enc,
+                enc_mask=enc_mask,
+                x_dec=decoding_tok,
+                dec_mask=dec_mask,
+                context=context,
+                attention=self.encode_layer.attention,
+                training=False
+            )
+            decoding_tok = tf.tensordot(decoding_tok, self.embed_layer.embeddings, axes=[2, 1])
+
+        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
+            logits=decoding_tok,
+            labels=targets.to_tensor(default_value=0)
+        )
+        loss = loss * dec_mask
+        loss = tf.math.reduce_sum(loss, axis=1)
+        loss = tf.reduce_mean(loss)
+        return {"loss": loss, **{m.name: m.result() for m in self.metrics}}
+
+    @tf.function(input_signature=[tf.TensorSpec(shape=[None], dtype=tf.string)])
+    def embed(self, sentences) -> Dict[str, tf.Tensor]:
+        """Takes batches of free text and returns context vectors for each example.
+
+        Parameters
+        ----------
+        sentences : tf.Tensor
+            Tensor of dtype tf.string.
+
+        Returns
+        -------
+        Dict[str, tf.Tensor]
+            Attention vector and hidden state sequences with shapes (batch_size, embedding_size)
+            and (batch_size, max_sequence_len, embedding_size) respectively.
+        """
+
+        sequences, attention, _ = self(sentences, training=False)
+        return {"sequences": sequences, "attention": attention}
+
+    @tf.function(input_signature=[tf.TensorSpec(shape=[None], dtype=tf.string)])
+    def token_embed(self, sentences) -> Dict[str, tf.Tensor]:
+        """Takes batches of free text and returns word embeddings along with the associated token.
+
+        Parameters
+        ----------
+        sentences : tf.Tensor
+            Tensor of dtype tf.string.
+
+        Returns
+        -------
+        Dict[str, tf.Tensor]
+            Padded tokens and embedding vectors with shapes (batch_size, max_sequence_len)
+            and (batch_size, max_sequence_len, embedding_size) respectively.
+        """
+
+        tokens = self.tokenizer(sentences)
+        return {
+            "tokens": tokens.to_tensor(''),
+            "embeddings": self.embed_layer.get_embedding(tokens).to_tensor(0)
+        }
 
 
 class LstmAutoEncoder(tf.keras.Model):
@@ -211,15 +285,19 @@ def __init__(self, max_sequence_len: int, embedding_size: int, num_hidden: int =
         self.encode_layer = RecurrentEncoder(num_hidden=num_hidden, **params)
         self.decode_layer = RecurrentDecoder(num_hidden=num_hidden, **params)
 
+    def call(self, tokens, **kwargs):
+        tokens = self.tokenizer(tokens)
+        x_enc, enc_mask, _ = self.embed_layer(tokens, **kwargs)
+        x_enc, context, *states = self.encode_layer(x_enc, mask=enc_mask, training=kwargs.get("training", False))
+        return x_enc, context, enc_mask, states
+
     def train_step(self, data):
         encoding_tok, decoding_tok = data
-        encoding_tok = self.tokenizer(encoding_tok)
         decoding_tok = self.tokenizer(decoding_tok)
 
         with tf.GradientTape() as tape:
             with tf.name_scope('Encoding'):
-                x_enc, enc_mask, _ = self.embed_layer(encoding_tok)
-                x_enc, context, *states = self.encode_layer(x_enc, mask=enc_mask, training=True)
+                x_enc, context, enc_mask, states = self(encoding_tok, training=True)
 
             with tf.name_scope('Decoding'):
                 targets = decoding_tok[:, 1:]  # skip the <s> token with the slice on axis=1
@@ -234,7 +312,6 @@ def train_step(self, data):
                 x_dec=decoding_tok,
                 dec_mask=dec_mask,
                 context=context,
-                attention=None,
                 initial_state=states,
                 training=True
             )
@@ -255,7 +332,78 @@ def train_step(self, data):
             return {"loss": loss, 'learning_rate': self.optimizer.learning_rate(self.optimizer.iterations)}
         return {"loss": loss, 'learning_rate': self.optimizer.learning_rate}
 
-    def call(self, tokens, **kwargs):
-        tokens = self.tokenizer(tf.squeeze(tokens))
-        x_enc, enc_mask, _ = self.embed_layer(tokens, **kwargs)
-        return self.encode_layer(x_enc, mask=enc_mask, **kwargs)
+    def test_step(self, data):
+        encoding_tok, decoding_tok = data
+        decoding_tok = self.tokenizer(decoding_tok)
+
+        with tf.name_scope('Encoding'):
+            x_enc, context, enc_mask, states = self(encoding_tok, training=False)
+
+        with tf.name_scope('Decoding'):
+            targets = decoding_tok[:, 1:]  # skip the <s> token with the slice on axis=1
+            if isinstance(self.embed_layer, TokenEmbed):
+                targets = tf.ragged.map_flat_values(self.embed_layer.table.lookup, targets)
+            targets = self.embed_layer.slicer(targets)
+
+            decoding_tok, dec_mask, _ = self.embed_layer(decoding_tok[:, :-1])
+            decoding_tok = self.decode_layer(
+                x_enc=x_enc,
+                enc_mask=enc_mask,
+                x_dec=decoding_tok,
+                dec_mask=dec_mask,
+                context=context,
+                initial_state=states,
+                training=False
+            )
+            decoding_tok = tf.tensordot(decoding_tok, self.embed_layer.embeddings, axes=[2, 1])
+
+        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
+            logits=decoding_tok,
+            labels=targets.to_tensor(default_value=0)
+        )
+        loss = loss * dec_mask
+        loss = tf.math.reduce_sum(loss, axis=1)
+        loss = tf.reduce_mean(loss)
+
+        return {"loss": loss, **{m.name: m.result() for m in self.metrics}}
+
+    @tf.function(input_signature=[tf.TensorSpec(shape=[None], dtype=tf.string)])
+    def embed(self, sentences) -> Dict[str, tf.Tensor]:
+        """Takes batches of free text and returns context vectors for each example.
+
+        Parameters
+        ----------
+        sentences : tf.Tensor
+            Tensor of dtype tf.string.
+
+        Returns
+        -------
+        Dict[str, tf.Tensor]
+            Attention vector and hidden state sequences with shapes (batch_size, embedding_size)
+            and (batch_size, max_sequence_len, embedding_size) respectively.
+        """
+
+        sequences, attention, *args = self(sentences, training=False)
+        return {"sequences": sequences, "attention": attention}
+
+    @tf.function(input_signature=[tf.TensorSpec(shape=[None], dtype=tf.string)])
+    def token_embed(self, sentences) -> Dict[str, tf.Tensor]:
+        """Takes batches of free text and returns word embeddings along with the associated token.
+
+        Parameters
+        ----------
+        sentences : tf.Tensor
+            Tensor of dtype tf.string.
+
+        Returns
+        -------
+        Dict[str, tf.Tensor]
+            Padded tokens and embedding vectors with shapes (batch_size, max_sequence_len)
+            and (batch_size, max_sequence_len, embedding_size) respectively.
+        """
+
+        tokens = self.tokenizer(sentences)
+        return {
+            "tokens": tokens.to_tensor(''),
+            "embeddings": self.embed_layer.get_embedding(tokens).to_tensor(0)
+        }
diff --git a/text2vec/models/sequential.py b/text2vec/models/sequential.py
index 6b252dd..c740cf5 100644
--- a/text2vec/models/sequential.py
+++ b/text2vec/models/sequential.py
@@ -52,7 +52,7 @@ def call(self, x, mask, training=False, **kwargs):
             mask = tf.expand_dims(mask, axis=-1)
             x = self.drop(x, training=training)
             x, states = self.bi_lstm(x * mask, training=training)
-            x, context = self.attention(x * mask)
+            context = self.attention(x * mask)
 
             if training:
                 return x, context, states
@@ -91,7 +91,7 @@ def __init__(self, max_sequence_len, num_hidden, embedding_size=50, num_layers=2
         self.bi_lstm = BidirectionalLSTM(num_layers=num_layers, num_hidden=num_hidden, return_states=False)
         self.dense = tf.keras.layers.Dense(units=dims, activation=tf.nn.relu)
 
-    def call(self, x_enc, enc_mask, x_dec, dec_mask, context, attention, training=False, **kwargs):
+    def call(self, x_enc, enc_mask, x_dec, dec_mask, context, training=False, **kwargs):
         with tf.name_scope("RecurrentDecoder"):
             enc_mask = tf.expand_dims(enc_mask, axis=-1)
             dec_mask = tf.expand_dims(dec_mask, axis=-1)
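
Usage sketch (illustrative, not part of the diffs above): the trainer scripts export a SavedModel with two serving signatures, "serving_default" (the `embed` method, sentence-level context vectors) and "token_embed" (per-token embeddings), plus the trained WordPiece tokenizer. A minimal way to query those artifacts might look like the following; the './wiki_t2v' path, the variable names, and the step of space-joining WordPiece tokens (mirroring what `token_mapper` feeds the model during training) are assumptions for illustration only.

    import tensorflow as tf
    import tokenizers

    # Assumed location: the default save_path './wiki_t2v' used by examples/trainers/wiki_lstm.py.
    save_path = "./wiki_t2v"
    loaded = tf.saved_model.load(f"{save_path}/saved_model")
    wordpiece = tokenizers.Tokenizer.from_file(f"{save_path}/tokenizer.json")

    # The trainers feed the model space-joined WordPiece tokens, so the same
    # preprocessing is applied here before calling the exported signatures.
    texts = ["the cat sat on the mat", "another free-text example"]
    sentences = tf.constant([" ".join(wordpiece.encode(t).tokens) for t in texts])

    # "serving_default" -> embed(): attention/context vector per sentence plus hidden-state sequences.
    sentence_out = loaded.signatures["serving_default"](sentences=sentences)
    print(sentence_out["attention"].shape)  # (batch_size, embedding_size)

    # "token_embed" -> token_embed(): padded tokens alongside their embedding vectors.
    token_out = loaded.signatures["token_embed"](sentences=sentences)
    print(token_out["tokens"].shape, token_out["embeddings"].shape)

Exporting both signatures keeps a single artifact usable for sentence-level embedding lookups and for inspecting individual token vectors, which is also what the TensorBoard projector setup in wiki_lstm.py (metadata.tsv plus the embedding checkpoint variable) visualizes.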