From 47154303f266d81b01bac073515cde0db9456042 Mon Sep 17 00:00:00 2001
From: Dave
Date: Mon, 27 Sep 2021 13:32:27 -0400
Subject: [PATCH] breaking changes on ServingModel

---
 examples/trainers/wiki_transformer.py | 17 ++++++--
 setup.py                              |  2 +-
 text2vec/training_tools.py            | 57 +++++++++++++++------------
 3 files changed, 46 insertions(+), 30 deletions(-)

diff --git a/examples/trainers/wiki_transformer.py b/examples/trainers/wiki_transformer.py
index ca64967..de5cd15 100644
--- a/examples/trainers/wiki_transformer.py
+++ b/examples/trainers/wiki_transformer.py
@@ -17,6 +17,7 @@
 
 from text2vec.autoencoders import TransformerAutoEncoder
 from text2vec.optimizer_tools import RampUpDecaySchedule
+from text2vec.training_tools import ServingModel
 
 os.environ["TOKENIZERS_PARALLELISM"] = "true"
 sent_tokenizer = PunktSentenceTokenizer().tokenize
@@ -41,7 +42,7 @@ def train_tokenizer() -> Tuple[tokenizers.Tokenizer, Generator, int]:
         special_tokens=[("[SEP]", 1), ("", 2), ("", 3)]
     )
 
-    dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1", split="train")
+    dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1", split="test")
 
     def batch_iterator(batch_size=1000):
         for i in range(0, len(dataset), batch_size):
@@ -71,7 +72,6 @@ def main(save_path: str):
         os.mkdir(save_path)
 
     tokenizer, data = train_tokenizer()
-    tokenizer.save(path=f"{save_path}/tokenizer.json")
    tokenizer.enable_truncation(2 * 512 + 1)  # encoding + decoding + [SEP] token
 
     with open(f"{save_path}/metadata.tsv", "w") as tsv:
@@ -132,7 +132,18 @@ def token_mapper(text: Union[str, List[str]]):
         epochs=1
     )
 
-    tf.keras.models.save_model(model, filepath=f"{save_path}/saved_model", include_optimizer=False, save_format="tf")
+    # tf.keras.models.save_model(model, filepath=f"{save_path}/saved_model", include_optimizer=False, save_format="tf")
+    serve_model = ServingModel(
+        tokenizer=model.tokenizer,
+        embed_layer=model.embed_layer,
+        encode_layer=model.encode_layer
+    )
+    tf.saved_model.save(
+        obj=serve_model,
+        export_dir=f"{save_path}/saved_model",
+        signatures={"serving_default": serve_model.embed, "token_embed": serve_model.token_embed}
+    )
+    tokenizer.save(path=f"{save_path}/tokenizer.json")
     return model
 
 
diff --git a/setup.py b/setup.py
index 0323625..0085c75 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 
 setup(
     name="text2vec",
-    version="1.1.7",
+    version="1.2.0",
     description="Building blocks for text vectorization and embedding",
     author="Dave Hollander",
     author_url="https://github.com/brainsqueeze",
diff --git a/text2vec/training_tools.py b/text2vec/training_tools.py
index 237c4f9..795abde 100644
--- a/text2vec/training_tools.py
+++ b/text2vec/training_tools.py
@@ -1,7 +1,10 @@
+from typing import Dict, Union
 import tensorflow as tf
 
 from text2vec.models import TextInput
 from text2vec.models import Tokenizer
+from text2vec.models import TokenEmbed
+from text2vec.models import Embed
 from text2vec.models import TransformerEncoder
 from text2vec.models import TransformerDecoder
 from text2vec.models import RecurrentEncoder
@@ -98,7 +101,7 @@ def call(self, sentences, training=False, return_vectors=False):
 
             eos = tf.fill([batch_size], value='', name='eos-tag')
             eos = tf.expand_dims(eos, axis=-1, name='eos-tag-expand')
-            targets = tf.concat([tokens, eos], axis=1, name='eos-concat')
+            targets = tf.concat([tokens, eos], 1, name='eos-concat')
             targets = tf.ragged.map_flat_values(self.embed_layer.table.lookup, targets)
             targets = self.embed_layer.slicer(targets)
 
@@ -106,7 +109,7 @@ def call(self, sentences, training=False, return_vectors=False):
 
             bos = tf.fill([batch_size], value='', name='bos-tag')
             bos = tf.expand_dims(bos, axis=-1, name='bos-tag-expand')
-            dec_tokens = tf.concat([bos, tokens], axis=-1, name='bos-concat')
+            dec_tokens = tf.concat([bos, tokens], -1, name='bos-concat')
             x_dec, dec_mask, dec_time_steps = self.embed_layer(dec_tokens)
             x_out = self.decode_layer(
                 x_enc=x_enc,
@@ -166,28 +169,25 @@ def token_embed(self, sentences):
 class ServingModel(tf.keras.Model):
     """Wrapper class for packaging final layers prior to saving.
 
-    Parameters
-    ----------
-    embed_layer : TextInput
-        Trained embedding layer.
-    encode_layer : (TransformerEncoder or RecurrentEncoder)
-        Trained encoding layer.
-    sep : str, optional
-        Token separator, by default ' '
-    """
+    Parameters
+    ----------
+    embed_layer : Union[TokenEmbed, Embed]
+        text2vec `TokenEmbed` or `Embed` layer
+    encode_layer : Union[TransformerEncoder, RecurrentEncoder]
+        text2vec `TransformerEncoder` or `RecurrentEncoder` layer
+    tokenizer : Tokenizer
+        text2vec `Tokenizer` layer
+    """
 
-    def __init__(self, embed_layer, encode_layer, sep=' '):
+    def __init__(self, embed_layer: Union[TokenEmbed, Embed],
+                 encode_layer: Union[TransformerEncoder, RecurrentEncoder], tokenizer: Tokenizer):
         super().__init__()
-
-        assert isinstance(embed_layer, TextInput)
-        assert type(encode_layer) in {RecurrentEncoder, TransformerEncoder}
-
         self.embed_layer = embed_layer
-        self.tokenizer = Tokenizer(sep)
+        self.tokenizer = tokenizer
         self.encode_layer = encode_layer
 
     @tf.function(input_signature=[tf.TensorSpec(shape=[None], dtype=tf.string)])
-    def embed(self, sentences):
+    def embed(self, sentences) -> Dict[str, tf.Tensor]:
         """Takes batches of free text and returns context vectors for each example.
 
         Parameters
@@ -197,16 +197,18 @@ def embed(self, sentences):
 
         Returns
         -------
-        tf.Tensor
-            Context vectors of shape (batch_size, embedding_size)
+        Dict[str, tf.Tensor]
+            Attention vector and hidden state sequences with shapes (batch_size, embedding_size)
+            and (batch_size, max_sequence_len, embedding_size) respectively.
         """
 
         tokens = self.tokenizer(sentences)  # turn sentences into ragged tensors of tokens
         x_enc, enc_mask, _ = self.embed_layer(tokens)
-        return self.encode_layer(x_enc, mask=enc_mask, training=False)
+        sequences, attention = self.encode_layer(x_enc, mask=enc_mask, training=False)
+        return {"sequences": sequences, "attention": attention}
 
     @tf.function(input_signature=[tf.TensorSpec(shape=[None], dtype=tf.string)])
-    def token_embed(self, sentences):
+    def token_embed(self, sentences) -> Dict[str, tf.Tensor]:
         """Takes batches of free text and returns word embeddings along with the associate token.
 
         Parameters
@@ -216,13 +218,16 @@ def token_embed(self, sentences):
 
         Returns
         -------
-        (tf.Tensor, tf.Tensor)
-            Tuple of (tokens, word_embeddings) with shapes (batch_size, max_sequence_len)
+        Dict[str, tf.Tensor]
+            Padded tokens and embedding vectors with shapes (batch_size, max_sequence_len)
             and (batch_size, max_sequence_len, embedding_size) respectively.
         """
 
         tokens = self.tokenizer(sentences)  # turn sentences into ragged tensors of tokens
-        return tokens.to_tensor(''), self.embed_layer(tokens, output_embeddings=True).to_tensor(0)
+        return {
+            "tokens": tokens.to_tensor(''),
+            "embeddings": self.embed_layer.get_embedding(tokens).to_tensor(0)
+        }
 
 
 def sequence_cost(target_sequences, sequence_logits, num_labels, smoothing=False):
@@ -264,7 +269,7 @@ def sequence_cost(target_sequences, sequence_logits, num_labels, smoothing=False):
 
 def vector_cost(context_vectors):
     """Cost constraint on the cosine similarity of context vectors. Diagonal elements (self-context)
-    are coerced to be closer to 1 (self-consistency). Off-diagonal elements are pushed toward 0,
+    are coerced to be closer to 1 (self-consistency). Off-diagonal elements are pushed toward 0,
     indicating not contextually similar.
 
     Parameters