diff --git a/.gitignore b/.gitignore
index 8456922..f25bfbc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,6 +16,8 @@ multi_news_t2v*/
 **/node_modules
 
 # ignore trained model files
+*.tflite
+*.onnx
 **/scratch*
 **/training/
 **/validation/
diff --git a/examples/configurations/minimal_sequence.yml b/examples/configurations/minimal_sequence.yml
deleted file mode 100644
index 85ac114..0000000
--- a/examples/configurations/minimal_sequence.yml
+++ /dev/null
@@ -1,16 +0,0 @@
-training:
-  tokens: 10000
-  max_sequence_length: 512
-  epochs: 100
-  batch_size: 64
-  data:
-    -
-
-model:
-  name: sequence_test
-  parameters:
-    embedding: 128
-    hidden: 128
-  storage_dir: /path/to/save/model
-  eval_sentences:
-    - ""
\ No newline at end of file
diff --git a/examples/configurations/minimal_transformer.yml b/examples/configurations/minimal_transformer.yml
deleted file mode 100644
index 17b9ad5..0000000
--- a/examples/configurations/minimal_transformer.yml
+++ /dev/null
@@ -1,12 +0,0 @@
-training:
-  tokens: 10000
-  max_sequence_length: 512
-  epochs: 100
-  batch_size: 64
-
-model:
-  name: transformer_test
-  parameters:
-    embedding: 128
-    layers: 8
-  storage_dir: /path/to/save/model
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 2d06597..f5de045 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 
 setup(
     name="text2vec",
-    version="2.0.1",
+    version="2.0.2",
     description="Building blocks for text vectorization and embedding",
     author="Dave Hollander",
     author_url="https://github.com/brainsqueeze",
diff --git a/text2vec/models/components/attention.py b/text2vec/models/components/attention.py
index 50400ec..e643b5c 100644
--- a/text2vec/models/components/attention.py
+++ b/text2vec/models/components/attention.py
@@ -37,7 +37,7 @@ class ScaledDotAttention(layers.Layer):
     """
 
     def __init__(self):
-        super().__init__(name="ScaledDotAttention")
+        super().__init__()
         self.neg_inf = tf.constant(-1e9, dtype=tf.float32)
 
     # pylint: disable=missing-function-docstring
@@ -78,7 +78,7 @@ class BahdanauAttention(layers.Layer):
     dims = 12
     encoded_sequences = tf.random.uniform(shape=[4, 7, dims])
     decoded_sequences = tf.random.uniform(shape=[4, 11, dims])
-    attention = BahdanauAttention(dims)
+    attention = BahdanauAttention(dims, drop_rate=0.25)
 
     # self attention
     attention(encoded_sequences)
@@ -89,7 +89,7 @@ class BahdanauAttention(layers.Layer):
     """
 
     def __init__(self, size: int, drop_rate: float = 0.):
-        super().__init__(name="BahdanauAttention")
+        super().__init__()
         self.hidden = layers.Dense(units=size, activation="tanh")
         self.U = tf.Variable(initializers.GlorotUniform()(shape=[size]), name="U", dtype=tf.float32, trainable=True)
 
@@ -141,7 +141,7 @@ class SingleHeadAttention(layers.Layer):
     V = tf.random.uniform(shape=[4, 5, 12])
 
     # 25% dropout rate
-    attention = SingleHeadAttention(emb_dims=12, keep_prob=0.75)
+    attention = SingleHeadAttention(emb_dims=12, drop_rate=0.25)
 
     # masking and dropout turned on
     attention(inputs=(Q, K, V), mask_future=True, training=True)
@@ -149,7 +149,7 @@ class SingleHeadAttention(layers.Layer):
     """
 
     def __init__(self, emb_dims, num_layers: int = 8, drop_rate: float = 0.):
-        super().__init__(name="SingleHeadAttention")
+        super().__init__()
         assert isinstance(num_layers, int) and num_layers > 0
         dims = emb_dims
 
@@ -205,7 +205,7 @@ class MultiHeadAttention(layers.Layer):
     V = tf.random.uniform(shape=[4, 5, 12])
 
     # 25% dropout rate
-    attention = MultiHeadAttention(emb_dims=12, keep_prob=0.75)
+    attention = MultiHeadAttention(emb_dims=12, drop_rate=0.25)
 
     # masking and dropout turned on
     attention(inputs=(Q, K, V), mask_future=True, training=True)
@@ -213,7 +213,7 @@ class MultiHeadAttention(layers.Layer):
     """
 
     def __init__(self, emb_dims: int, num_layers: int = 8, drop_rate: float = 0.):
-        super().__init__(name="MultiHeadAttention")
+        super().__init__()
         self.layer_heads = [
             SingleHeadAttention(emb_dims=emb_dims, num_layers=num_layers, drop_rate=drop_rate)
             for _ in range(num_layers)
diff --git a/text2vec/models/components/strings.py b/text2vec/models/components/strings.py
index b8781f8..7de811f 100644
--- a/text2vec/models/components/strings.py
+++ b/text2vec/models/components/strings.py
@@ -1,5 +1,5 @@
 import tensorflow as tf
-from tensorflow.kersa import layers
+from tensorflow.keras import layers
 
 from text2vec.models import Tokenizer
 
@@ -74,3 +74,7 @@ def call(self, texts: tf.Tensor, substrings: tf.RaggedTensor) -> tf.RaggedTensor
             row_lengths=substrings.row_lengths()
         )
         return tf.ragged.map_flat_values(self.find_match, ragged_texts, tf.strings.join([pre, substrings, post]))
+
+    def get_config(self):
+        base_config = super().get_config()
+        return {**base_config, "sep": self.sep}
diff --git a/text2vec/models/components/text_inputs.py b/text2vec/models/components/text_inputs.py
index 26a1beb..c8177a7 100644
--- a/text2vec/models/components/text_inputs.py
+++ b/text2vec/models/components/text_inputs.py
@@ -26,12 +26,16 @@ class Tokenizer(layers.Layer):
     """
 
     def __init__(self, sep: str = ' '):
-        super().__init__(name="Tokenizer")
+        super().__init__()
         self.sep = sep
 
     def call(self, corpus):
         return tf.strings.split(corpus, self.sep)
 
+    def get_config(self):
+        base_config = super().get_config()
+        return {**base_config, "sep": self.sep}
+
 
 class Embed(layers.Layer):
     """This layer handles the primary text feature transformations and word-embeddings to be passed off
@@ -113,7 +117,7 @@ def get_embedding(self, token_ids: tf.RaggedTensor) -> tf.RaggedTensor:
         return tf.ragged.map_flat_values(tf.nn.embedding_lookup, self.embeddings, token_ids)
 
 
-class TokenEmbed(tf.keras.layers.Layer):
+class TokenEmbed(layers.Layer):
     """This layer handles the primary text feature transformations and word-embeddings to be passed off
     to the sequence-aware parts of the encoder/decoder pipeline.
 
@@ -155,13 +159,17 @@ class TokenEmbed(tf.keras.layers.Layer):
 
     def __init__(self, token_hash: dict, embedding_size: int, max_sequence_len: int, unknown_token: str = ''):
         super().__init__()
-        self.table = tf.lookup.StaticHashTable(
-            tf.lookup.KeyValueTensorInitializer(
-                keys=list(token_hash.keys()),
-                values=list(token_hash.values())
-            ),
-            default_value=token_hash.get(unknown_token)
-        )
+        self.lookup = token_hash
+        self.unknown_token = unknown_token
+
+        with tf.init_scope():
+            self.table = tf.lookup.StaticHashTable(
+                tf.lookup.KeyValueTensorInitializer(
+                    keys=list(token_hash.keys()),
+                    values=list(token_hash.values())
+                ),
+                default_value=token_hash.get(unknown_token)
+            )
         self.embed_layer = Embed(
             vocab_size=len(token_hash),
             embedding_size=embedding_size,
@@ -169,9 +177,8 @@ def __init__(self, token_hash: dict, embedding_size: int, max_sequence_len: int
         )
 
     def call(self, tokens, **kwargs):
-        with tf.name_scope("TextInput"):
-            hashed = tf.ragged.map_flat_values(self.table.lookup, tokens)
-            return self.embed_layer(hashed, **kwargs)
+        hashed = tf.ragged.map_flat_values(self.table.lookup, tokens)
+        return self.embed_layer(hashed, **kwargs)
 
     def get_embedding(self, tokens: tf.RaggedTensor) -> tf.RaggedTensor:
         """Get the token embeddings for the input tokens.
@@ -187,9 +194,18 @@ def get_embedding(self, tokens: tf.RaggedTensor) -> tf.RaggedTensor:
             Sequences of token embeddings with the same number of time steps as `tokens`
         """
 
-        with tf.name_scope("TextToEmbedding"):
-            hashed = tf.ragged.map_flat_values(self.table.lookup, tokens)
-            return self.embed_layer.get_embedding(hashed)
+        hashed = tf.ragged.map_flat_values(self.table.lookup, tokens)
+        return self.embed_layer.get_embedding(hashed)
+
+    def get_config(self):
+        base_config = super().get_config()
+        return {
+            **base_config,
+            "token_hash": self.lookup,
+            "embedding_size": int(tf.shape(self.embeddings)[1].numpy()),
+            "max_sequence_len": int(self.embed_layer.max_len.numpy()),
+            "unknown_token": self.unknown_token
+        }
 
     @property
     def slicer(self):
diff --git a/text2vec/models/components/utils.py b/text2vec/models/components/utils.py
index 77e5165..837c4d1 100644
--- a/text2vec/models/components/utils.py
+++ b/text2vec/models/components/utils.py
@@ -28,7 +28,7 @@ class LayerNorm(layers.Layer):
     """
 
     def __init__(self, epsilon: float = 1e-8, scale: float = 1.0, bias: float = 0):
-        super().__init__(name="LayerNorm")
+        super().__init__()
         self.epsilon = tf.constant(epsilon, dtype=tf.float32)
         self.scale = tf.constant(scale, dtype=tf.float32)
         self.bias = tf.constant(bias, dtype=tf.float32)
@@ -60,7 +60,7 @@ class TensorProjection(layers.Layer):
     """
 
     def __init__(self):
-        super().__init__(name="TensorProjection")
+        super().__init__()
 
     def call(self, x, projection_vector):
         projection_vector = tf.math.l2_normalize(projection_vector, axis=-1)
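
Note: the new get_config overrides follow the standard Keras pattern of merging the layer's constructor arguments into the base config so the layer can be rebuilt from its config. A minimal, self-contained sketch of that pattern; DelimitedSplit is a hypothetical stand-in, not part of text2vec, and its __init__ forwards extra kwargs so from_config round-trips:

import tensorflow as tf
from tensorflow.keras import layers


class DelimitedSplit(layers.Layer):
    """Hypothetical stand-in mirroring the Tokenizer layer touched above."""

    def __init__(self, sep: str = ' ', **kwargs):
        super().__init__(**kwargs)
        self.sep = sep

    def call(self, corpus):
        return tf.strings.split(corpus, self.sep)

    def get_config(self):
        # merge the constructor argument into the base Keras config
        base_config = super().get_config()
        return {**base_config, "sep": self.sep}


layer = DelimitedSplit(sep="|")
restored = DelimitedSplit.from_config(layer.get_config())  # rebuilds an equivalent layer
print(restored(tf.constant(["a|b|c"])))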
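Note: wrapping the StaticHashTable construction in tf.init_scope() lifts it out of any enclosing tf.function trace, so the table is built once, eagerly, rather than re-created on every retrace. A rough sketch of the same pattern with a hypothetical VocabLookup layer (not the actual TokenEmbed):

import tensorflow as tf
from tensorflow.keras import layers


class VocabLookup(layers.Layer):
    """Hypothetical layer illustrating the init_scope pattern used above."""

    def __init__(self, vocab: dict, **kwargs):
        super().__init__(**kwargs)
        # init_scope escapes any enclosing graph/tf.function context, so the
        # table is always constructed eagerly, exactly once
        with tf.init_scope():
            self.table = tf.lookup.StaticHashTable(
                tf.lookup.KeyValueTensorInitializer(
                    keys=list(vocab.keys()),
                    values=list(vocab.values())
                ),
                default_value=-1
            )

    def call(self, tokens):
        return self.table.lookup(tokens)


lookup = VocabLookup({"hello": 0, "world": 1})


@tf.function
def lookup_fn(tokens):
    # the table built in __init__ is reused here; graph mode lookups still work
    return lookup(tokens)


print(lookup_fn(tf.constant(["hello", "unknown"])))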
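Note: dropping the hard-coded name= arguments from super().__init__() lets Keras auto-generate unique per-instance names, which avoids name collisions when the same layer class is instantiated many times (as MultiHeadAttention does with its SingleHeadAttention heads). A tiny illustration with a hypothetical layer:

from tensorflow.keras import layers


class Probe(layers.Layer):
    # no explicit name passed to super().__init__(), mirroring the change above
    def __init__(self):
        super().__init__()


# Keras derives unique snake_case names per instance, e.g. "probe", "probe_1"
print(Probe().name, Probe().name)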