breaking changes on ServingModel
brainsqueeze committed Sep 27, 2021
1 parent b313c8a commit 4715430
Showing 3 changed files with 46 additions and 30 deletions.
17 changes: 14 additions & 3 deletions examples/trainers/wiki_transformer.py
@@ -17,6 +17,7 @@

from text2vec.autoencoders import TransformerAutoEncoder
from text2vec.optimizer_tools import RampUpDecaySchedule
+ from text2vec.training_tools import ServingModel

os.environ["TOKENIZERS_PARALLELISM"] = "true"
sent_tokenizer = PunktSentenceTokenizer().tokenize
@@ -41,7 +42,7 @@ def train_tokenizer() -> Tuple[tokenizers.Tokenizer, Generator, int]:
special_tokens=[("[SEP]", 1), ("<s>", 2), ("</s>", 3)]
)

- dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1", split="train")
+ dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1", split="test")

def batch_iterator(batch_size=1000):
for i in range(0, len(dataset), batch_size):
@@ -71,7 +72,6 @@ def main(save_path: str):
os.mkdir(save_path)

tokenizer, data = train_tokenizer()
- tokenizer.save(path=f"{save_path}/tokenizer.json")
tokenizer.enable_truncation(2 * 512 + 1) # encoding + decoding + [SEP] token

with open(f"{save_path}/metadata.tsv", "w") as tsv:
@@ -132,7 +132,18 @@ def token_mapper(text: Union[str, List[str]]):
epochs=1
)

- tf.keras.models.save_model(model, filepath=f"{save_path}/saved_model", include_optimizer=False, save_format="tf")
+ # tf.keras.models.save_model(model, filepath=f"{save_path}/saved_model", include_optimizer=False, save_format="tf")
+ serve_model = ServingModel(
+     tokenizer=model.tokenizer,
+     embed_layer=model.embed_layer,
+     encode_layer=model.encode_layer
+ )
+ tf.saved_model.save(
+     obj=serve_model,
+     export_dir=f"{save_path}/saved_model",
+     signatures={"serving_default": serve_model.embed, "token_embed": serve_model.token_embed}
+ )
+ tokenizer.save(path=f"{save_path}/tokenizer.json")
return model


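Not part of the commit: a rough sketch of how the two signatures exported above might be queried once examples/trainers/wiki_transformer.py has run. The model path and example sentences are placeholders, and the output keys follow the updated ServingModel methods shown further down in this diff.

import tensorflow as tf

loaded = tf.saved_model.load("saved_model")  # placeholder path; wherever {save_path}/saved_model was written
sentences = tf.constant(["an example sentence", "another example sentence"])

# "serving_default" maps to ServingModel.embed and returns a dict of tensors
context = loaded.signatures["serving_default"](sentences=sentences)
attention = context["attention"]      # expected shape (batch_size, embedding_size)
sequences = context["sequences"]      # expected shape (batch_size, max_sequence_len, embedding_size)

# "token_embed" maps to ServingModel.token_embed and returns padded tokens with their embeddings
token_out = loaded.signatures["token_embed"](sentences=sentences)
tokens = token_out["tokens"]          # expected shape (batch_size, max_sequence_len)
embeddings = token_out["embeddings"]  # expected shape (batch_size, max_sequence_len, embedding_size)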
2 changes: 1 addition & 1 deletion setup.py
@@ -3,7 +3,7 @@

setup(
name="text2vec",
version="1.1.7",
version="1.2.0",
description="Building blocks for text vectorization and embedding",
author="Dave Hollander",
author_url="https://github.com/brainsqueeze",
57 changes: 31 additions & 26 deletions text2vec/training_tools.py
@@ -1,7 +1,10 @@
+ from typing import Dict, Union
import tensorflow as tf

from text2vec.models import TextInput
from text2vec.models import Tokenizer
+ from text2vec.models import TokenEmbed
+ from text2vec.models import Embed
from text2vec.models import TransformerEncoder
from text2vec.models import TransformerDecoder
from text2vec.models import RecurrentEncoder
@@ -98,15 +101,15 @@ def call(self, sentences, training=False, return_vectors=False):
eos = tf.fill([batch_size], value='</s>', name='eos-tag')
eos = tf.expand_dims(eos, axis=-1, name='eos-tag-expand')

- targets = tf.concat([tokens, eos], axis=1, name='eos-concat')
+ targets = tf.concat([tokens, eos], 1, name='eos-concat')
targets = tf.ragged.map_flat_values(self.embed_layer.table.lookup, targets)
targets = self.embed_layer.slicer(targets)

with tf.name_scope('decode-tokens'):
bos = tf.fill([batch_size], value='<s>', name='bos-tag')
bos = tf.expand_dims(bos, axis=-1, name='bos-tag-expand')

- dec_tokens = tf.concat([bos, tokens], axis=-1, name='bos-concat')
+ dec_tokens = tf.concat([bos, tokens], -1, name='bos-concat')
x_dec, dec_mask, dec_time_steps = self.embed_layer(dec_tokens)
x_out = self.decode_layer(
x_enc=x_enc,
@@ -166,28 +169,25 @@ def token_embed(self, sentences):
class ServingModel(tf.keras.Model):
"""Wrapper class for packaging final layers prior to saving.
- Parameters
- ----------
- embed_layer : TextInput
-     Trained embedding layer.
- encode_layer : (TransformerEncoder or RecurrentEncoder)
-     Trained encoding layer.
- sep : str, optional
-     Token separator, by default ' '
- """
+ Parameters
+ ----------
+ embed_layer : Union[TokenEmbed, Embed]
+     text2vec `TokenEmbed` or `Embed` layer
+ encode_layer : Union[TransformerEncoder, RecurrentEncoder]
+     text2vec `TransformerEncoder` or `RecurrentEncoder` layer
+ tokenizer : Tokenizer
+     text2vec `Tokenizer` layer
+ """

- def __init__(self, embed_layer, encode_layer, sep=' '):
+ def __init__(self, embed_layer: Union[TokenEmbed, Embed],
+              encode_layer: Union[TransformerEncoder, RecurrentEncoder], tokenizer: Tokenizer):
super().__init__()

- assert isinstance(embed_layer, TextInput)
- assert type(encode_layer) in {RecurrentEncoder, TransformerEncoder}

self.embed_layer = embed_layer
- self.tokenizer = Tokenizer(sep)
+ self.tokenizer = tokenizer
self.encode_layer = encode_layer

@tf.function(input_signature=[tf.TensorSpec(shape=[None], dtype=tf.string)])
- def embed(self, sentences):
+ def embed(self, sentences) -> Dict[str, tf.Tensor]:
"""Takes batches of free text and returns context vectors for each example.
Parameters
@@ -197,16 +197,18 @@ def embed(self, sentences):
Returns
-------
- tf.Tensor
-     Context vectors of shape (batch_size, embedding_size)
+ Dict[str, tf.Tensor]
+     Attention vector and hidden state sequences with shapes (batch_size, embedding_size)
+     and (batch_size, max_sequence_len, embedding_size) respectively.
"""

tokens = self.tokenizer(sentences) # turn sentences into ragged tensors of tokens
x_enc, enc_mask, _ = self.embed_layer(tokens)
- return self.encode_layer(x_enc, mask=enc_mask, training=False)
+ sequences, attention = self.encode_layer(x_enc, mask=enc_mask, training=False)
+ return {"sequences": sequences, "attention": attention}

@tf.function(input_signature=[tf.TensorSpec(shape=[None], dtype=tf.string)])
- def token_embed(self, sentences):
+ def token_embed(self, sentences) -> Dict[str, tf.Tensor]:
"""Takes batches of free text and returns word embeddings along with the associate token.
Parameters
@@ -216,13 +218,16 @@ def token_embed(self, sentences):
Returns
-------
- (tf.Tensor, tf.Tensor)
-     Tuple of (tokens, word_embeddings) with shapes (batch_size, max_sequence_len)
+ Dict[str, tf.Tensor]
+     Padded tokens and embedding vectors with shapes (batch_size, max_sequence_len)
and (batch_size, max_sequence_len, embedding_size) respectively.
"""

tokens = self.tokenizer(sentences) # turn sentences into ragged tensors of tokens
- return tokens.to_tensor(''), self.embed_layer(tokens, output_embeddings=True).to_tensor(0)
+ return {
+     "tokens": tokens.to_tensor('</>'),
+     "embeddings": self.embed_layer.get_embedding(tokens).to_tensor(0)
+ }


def sequence_cost(target_sequences, sequence_logits, num_labels, smoothing=False):
@@ -264,7 +269,7 @@ def sequence_cost(target_sequences, sequence_logits, num_labels, smoothing=False):

def vector_cost(context_vectors):
"""Cost constraint on the cosine similarity of context vectors. Diagonal elements (self-context)
- are coerced to be closer to 1 (self-consistency). Off-diagonal elements are pushed toward 0,
+ are coerced to be closer to 1 (self-consistency). Off-diagonal elements are pushed toward 0,
indicating not contextually similar.
Parameters
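Not part of the commit: a sketch of what the breaking change means for downstream callers of ServingModel, assuming a trained model exposing the embed_layer, encode_layer and tokenizer attributes used in examples/trainers/wiki_transformer.py above. Variable names here are hypothetical.

import tensorflow as tf
from text2vec.training_tools import ServingModel

# `model` stands in for a trained TransformerAutoEncoder from the example trainer above
sentences = tf.constant(["an example sentence"])

# Before this commit: no Tokenizer layer was passed (a `sep` string was used instead)
# and token_embed returned a (tokens, embeddings) tuple:
# serve_model = ServingModel(model.embed_layer, model.encode_layer, sep=' ')
# tokens, embeddings = serve_model.token_embed(sentences)

# After this commit: a Tokenizer layer is required and both methods return dicts of tensors
serve_model = ServingModel(
    embed_layer=model.embed_layer,
    encode_layer=model.encode_layer,
    tokenizer=model.tokenizer
)
context = serve_model.embed(sentences)
attention, sequences = context["attention"], context["sequences"]
token_out = serve_model.token_embed(sentences)
tokens, embeddings = token_out["tokens"], token_out["embeddings"]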
