From 3b974a5a82d03376999a5e4b8f70997b404f900f Mon Sep 17 00:00:00 2001 From: Dave Date: Thu, 7 Jul 2022 15:33:42 -0400 Subject: [PATCH] updated demo, remove cruft --- README.md | 13 +- demo/api.py | 3 - examples/trainers/news_transformer.py | 4 +- setup.py | 2 - text2vec/bin/__init__.py | 0 text2vec/bin/main.py | 266 -------------------------- text2vec/bin/serving_tools.py | 53 ----- text2vec/bin/text_summarize.py | 232 ---------------------- text2vec/bin/utils.py | 26 --- 9 files changed, 10 insertions(+), 589 deletions(-) delete mode 100644 text2vec/bin/__init__.py delete mode 100644 text2vec/bin/main.py delete mode 100644 text2vec/bin/serving_tools.py delete mode 100644 text2vec/bin/text_summarize.py delete mode 100644 text2vec/bin/utils.py diff --git a/README.md b/README.md index 592eaf9..1b808ae 100644 --- a/README.md +++ b/README.md @@ -130,14 +130,17 @@ Text2vec includes a Python API with convenient classes for handling attention an ## Inference Demo --- -Once a model is fully trained then a demo API can be run, along with a small UI to interact with the REST API. This demo attempts to use the trained model to condense long bodies of text into the most important sentences, using the inferred embedded context vectors. - +Trained text2vec models can be demonstrated from a lightweight app included in this repository. The demo runs extractive summarization from long bodies of text using the attention vectors of the encoding latent space. To get started, you will need to clone the repository and then install additional dependencies: +```bash +git clone https://github.com/brainsqueeze/text2vec.git +cd text2vec +pip install flask tornado +``` To start the model server simply run ```bash -text2vec_main --run=infer --yaml_config=/path/to/config.yml +python demo/api.py --model_dir /absolute/saved_model/parent/dir ``` -A demonstration webpage is included in [demo](demo) at -[context.html](demo/context.html). +The `model_dir` CLI parameter must be an absolute path to the directory containing the `/saved_model` folder and the `tokenizer.json` file from a text2vec model with an `embed` signature. A demonstration app is served on port 9090. ## References --- diff --git a/demo/api.py b/demo/api.py index 8bbd9c2..e238f6b 100644 --- a/demo/api.py +++ b/demo/api.py @@ -5,13 +5,11 @@ import re from flask import Flask, request, Response, send_from_directory -from flask_cors import cross_origin from tornado.log import enable_pretty_logging from tornado.httpserver import HTTPServer from tornado.wsgi import WSGIContainer from tornado.ioloop import IOLoop import tornado.autoreload -# from tornado import web import tornado import tensorflow as tf @@ -79,7 +77,6 @@ def root(): @app.route("/summarize", methods=["GET", "POST"]) -# @cross_origin(origins=['*'], allow_headers=['Content-Type', 'Authorization']) def summarize(): if request.is_json: payload = request.json diff --git a/examples/trainers/news_transformer.py b/examples/trainers/news_transformer.py index 5d8777b..95941f2 100644 --- a/examples/trainers/news_transformer.py +++ b/examples/trainers/news_transformer.py @@ -37,7 +37,7 @@ def train_tokenizer() -> Tuple[tokenizers.Tokenizer, tf.data.Dataset]: pre_tokenizers.Digits(individual_digits=False) ]) - dataset = datasets.load_dataset("multi_news", split="train") + dataset = datasets.load_dataset("multi_news", split="test") def batch_iterator(batch_size=1000): for i in range(0, len(dataset), batch_size): @@ -126,7 +126,7 @@ def main(save_path: str): ) ) ], - epochs=10 + epochs=2 ) model.save( diff --git a/setup.py b/setup.py index 117b590..7d85d2a 100644 --- a/setup.py +++ b/setup.py @@ -18,8 +18,6 @@ extras_require=dict( serving=[ "flask", - "flask-cors", - "nltk", "tornado" ] ), diff --git a/text2vec/bin/__init__.py b/text2vec/bin/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/text2vec/bin/main.py b/text2vec/bin/main.py deleted file mode 100644 index 1086eca..0000000 --- a/text2vec/bin/main.py +++ /dev/null @@ -1,266 +0,0 @@ -import itertools -import argparse -import os - -import yaml - -import numpy as np -import tensorflow as tf -from tensorboard.plugins import projector - -from text2vec.training_tools import EncodingModel -from text2vec.training_tools import ServingModel -from text2vec.training_tools import sequence_cost -from text2vec.training_tools import vector_cost -from text2vec.optimizer_tools import RampUpDecaySchedule -from text2vec.preprocessing.text import clean_and_split -from text2vec.preprocessing import utils as data_tools -from . import utils - -root = os.path.dirname(os.path.abspath(__file__)) -os.environ['KMP_DUPLICATE_LIB_OK'] = 'True' - - -def train(model_folder, num_tokens=10000, embedding_size=256, num_hidden=128, max_allowed_seq=-1, - layers=8, batch_size=32, num_epochs=10, data_files=None, model_path=".", use_attention=False, - eval_sentences=None, orthogonal_cost=False): - """Core training algorithm. - - Parameters - ---------- - model_folder : str - Name of the folder to create for the trained model - num_tokens : int, optional - Number of vocab tokens to keep from the training corpus, by default 10000 - embedding_size : int, optional - Size of the word-embedding dimensions, by default 256 - num_hidden : int, optional - Number of hidden model dimensions, by default 128 - max_allowed_seq : int, optional - The maximum sequence length allowed, model will truncate if longer, by default -1 - layers : int, optional - Number of multi-head attention mechanisms for transformer model, by default 8 - batch_size : int, optional - Size of each mini-batch, by default 32 - num_epochs : int, optional - Number of training epochs, by default 10 - data_files : list, optional - List of absolute paths to training data sets, by default None - model_path : str, optional - Valid path to where the model will be saved, by default "." - use_attention : bool, optional - Set to True to use the self-attention only model, by default False - eval_sentences : List, optional - List of sentences to check the context angles, by default None - orthogonal_cost : bool, optional - Set to True to add a cost to mutually parallel context vector, by default False - - Returns - ------- - str - Model checkpoint file path. - """ - - # GPU config - for gpu in tf.config.experimental.list_physical_devices('GPU'): - # tf.config.experimental.set_memory_growth(gpu, True) - tf.config.experimental.set_memory_growth(gpu, False) - tf.config.set_soft_device_placement(True) - - log_dir = f"{model_path}/{model_folder}" if model_path else f"{root}/../../text2vec/{model_folder}" - if not os.path.exists(log_dir): - os.mkdir(log_dir) - - utils.log("Fetching corpus and creating data pipeline") - corpus = data_tools.load_text_files(data_files=data_files, max_length=max_allowed_seq) - - utils.log("Fitting embedding lookup", end="...") - hash_map, max_seq_len, train_set_size = data_tools.get_top_tokens(corpus, n_top=num_tokens) - print(f"{train_set_size} sentences. max sequence length: {max_seq_len}") - - with open(log_dir + "/metadata.tsv", "w") as tsv: - for token, _ in sorted(hash_map.items(), key=lambda s: s[-1]): - # since tensorflow converts strings to byets we will decode from UTF-8 here for display purposes - tsv.write(f"{token.decode('utf8', 'replace')}\n") - tsv.write("\n") - - utils.log("Building computation graph") - log_step = (train_set_size // batch_size) // 25 - dims = embedding_size - - params = dict( - max_sequence_len=max_seq_len, - embedding_size=dims, - input_keep_prob=0.9, - hidden_keep_prob=0.75 - ) - if use_attention: - model = EncodingModel(token_hash=hash_map, layers=layers, **params) - else: - model = EncodingModel(token_hash=hash_map, num_hidden=num_hidden, recurrent=True, **params) - - warmup_steps = max(train_set_size // batch_size, 4000) - learning_rate = RampUpDecaySchedule(embedding_size=dims, warmup_steps=warmup_steps) - optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate) - train_loss = tf.keras.metrics.Mean('train-loss', dtype=tf.float32) - - def compute_loss(sentences): - y_hat, time_steps, targets, vectors = model(sentences, training=True, return_vectors=True) - loss_val = sequence_cost( - target_sequences=targets, - sequence_logits=y_hat[:, :time_steps], - num_labels=model.embed_layer.num_labels, - smoothing=False - ) - - if orthogonal_cost: - return loss_val + vector_cost(context_vectors=vectors) - return loss_val - - @tf.function(input_signature=[tf.TensorSpec(shape=(None,), dtype=tf.string)]) - def train_step(sentences): - loss_val = compute_loss(sentences) - gradients = tf.gradients(loss_val, model.trainable_variables) - optimizer.apply_gradients(zip(gradients, model.trainable_variables)) - train_loss(loss_val) # log the loss value to TensorBoard - - model_file_name = None - if isinstance(eval_sentences, list) and len(eval_sentences) > 1: - test_sentences = eval_sentences - else: - test_sentences = ["The movie was great!", "The movie was terrible."] - test_tokens = [' '.join(clean_and_split(text)) for text in test_sentences] - - summary_writer_train = tf.summary.create_file_writer(log_dir + "/training") - summary_writer_dev = tf.summary.create_file_writer(log_dir + "/validation") - checkpoint = tf.train.Checkpoint(EmbeddingModel=model, optimizer=optimizer) - checkpoint_manager = tf.train.CheckpointManager(checkpoint, log_dir, max_to_keep=5) - - # add word labels to the projector - config = projector.ProjectorConfig() - # pylint: disable=no-member - embeddings_config = config.embeddings.add() - - checkpoint_manager.save() - reader = tf.train.load_checkpoint(log_dir) - embeddings_config.tensor_name = [key for key in reader.get_variable_to_shape_map() if "embedding" in key][0] - embeddings_config.metadata_path = log_dir + "/metadata.tsv" - projector.visualize_embeddings(logdir=log_dir + "/training", config=config) - - step = 1 - for epoch in range(num_epochs): - try: - corpus = corpus.unbatch() - except ValueError: - print("Corpus not batched") - corpus = corpus.shuffle(train_set_size) - corpus = corpus.batch(batch_size).prefetch(10) # pre-fetch 10 batches for queuing - - print(f"\t Epoch: {epoch + 1}") - i = 1 - train_loss.reset_states() - - for x in corpus: - if step == 1: - tf.summary.trace_on(graph=True, profiler=False) - - train_step(x) - with summary_writer_train.as_default(): - if step == 1: - tf.summary.trace_export(name='graph', step=1, profiler_outdir=log_dir) - tf.summary.trace_off() - summary_writer_train.flush() - - if i % log_step == 0: - print(f"\t\t iteration {i} - loss: {train_loss.result()}") - tf.summary.scalar(name='loss', data=train_loss.result(), step=step) - tf.summary.scalar(name='learning-rate', data=learning_rate.callback(step=step), step=step) - summary_writer_train.flush() - train_loss.reset_states() - i += 1 - step += 1 - - vectors = model.embed(test_tokens) - angles = utils.compute_angles(vectors.numpy()) - - with summary_writer_dev.as_default(): - for idx, (i, j) in enumerate(itertools.combinations(range(len(test_sentences)), r=2), start=1): - angle = angles[i, j] - print(f"The angle between '{test_sentences[i]}' and '{test_sentences[j]}' is {angle} degrees") - - # log the angle to tensorboard - desc = f"'{test_sentences[i]}' : '{test_sentences[j]}'" - tf.summary.scalar(f'similarity-angle/{idx}', angle, step=step, description=desc) - summary_writer_dev.flush() - model_file_name = checkpoint_manager.save() - - utils.log("Saving a frozen model") - serve_model_ = ServingModel(embed_layer=model.embed_layer, encode_layer=model.encode_layer, sep=' ') - tf.saved_model.save( - obj=serve_model_, - export_dir=f"{log_dir}/frozen/1", - signatures={"serving_default": serve_model_.embed, "token_embed": serve_model_.token_embed} - ) - - utils.log("Reloading frozen model and comparing output to in-memory model") - test = tf.saved_model.load(f"{log_dir}/frozen/1") - test_model = test.signatures["serving_default"] - test_output = test_model(tf.constant(test_tokens))["output_0"] - utils.log(f"Outputs on CV set are approximately the same?: {np.allclose(test_output, model.embed(test_tokens))}") - return model_file_name - - -def main(): - """Training and inferencing entrypoint for CLI. - - Raises - ------ - NotImplementedError - Raised if a `run` mode other than `train` or `infer` are passed. - """ - - parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument("--run", choices=["train", "infer"], help="Run type.", required=True) - parser.add_argument("--attention", action='store_true', help="Set to use attention transformer model.") - parser.add_argument("--orthogonal", action='store_true', help="Set to add a cost to mutually parallel contexts.") - parser.add_argument("--yaml_config", type=str, help="Path to a valid training config YAML file.", required=True) - args = parser.parse_args() - - config_path = args.yaml_config - if config_path.startswith("${HOME}"): - config_path = config_path.replace('${HOME}', os.getenv('HOME')) - elif config_path.startswith("$HOME"): - config_path = config_path.replace('$HOME', os.getenv('HOME')) - - config = yaml.safe_load(open(config_path, 'r')) - training_config = config.get("training", {}) - model_config = config.get("model", {}) - model_params = model_config.get("parameters", {}) - - if args.run == "train": - train( - model_folder=model_config["name"], - use_attention=args.attention, - num_tokens=training_config.get("tokens", 10000), - max_allowed_seq=training_config.get("max_sequence_length", 512), - embedding_size=model_params.get("embedding", 128), - num_hidden=model_params.get("hidden", 128), - layers=model_params.get("layers", 8), - batch_size=training_config.get("batch_size", 32), - num_epochs=training_config.get("epochs", 20), - data_files=training_config.get("data_files"), - model_path=model_config.get("storage_dir", "."), - eval_sentences=training_config.get("eval_sentences"), - orthogonal_cost=args.orthogonal - ) - elif args.run == "infer": - os.environ["MODEL_PATH"] = f'{model_config.get("storage_dir", ".")}/{model_config["name"]}' - from .text_summarize import run_server - run_server(port=8008) - else: - raise NotImplementedError("Only training and inferencing is enabled right now.") - return - - -if __name__ == '__main__': - main() diff --git a/text2vec/bin/serving_tools.py b/text2vec/bin/serving_tools.py deleted file mode 100644 index 8305c13..0000000 --- a/text2vec/bin/serving_tools.py +++ /dev/null @@ -1,53 +0,0 @@ -import os - -from nltk.tokenize import sent_tokenize - -import tensorflow as tf -from text2vec.preprocessing.text import normalize_text, clean_and_split - - -class Embedder(): - """Wrapper class which handles contextual embedding of documents. - """ - - def __init__(self): - log_dir = f"{os.environ['MODEL_PATH']}/frozen/1" - self.__model = tf.saved_model.load(log_dir) - - @staticmethod - def __get_sentences(text): - data = [(sent, ' '.join(clean_and_split(normalize_text(sent)))) for sent in sent_tokenize(text)] - data = [(orig, clean) for orig, clean in data if len(clean.split()) >= 5] - original, clean = map(list, zip(*data)) - return original, clean - - def __normalize(self, vectors: tf.Tensor): - return tf.math.l2_normalize(vectors, axis=-1).numpy() - - def __doc_vector(self, doc: tf.Tensor): - net_vector = tf.reduce_sum(doc, axis=0) - return self.__normalize(net_vector) - - def __embed(self, corpus: list): - return self.__model.embed(corpus) - - def embed(self, text: str): - """String preparation and embedding. Returns the context vector representing the input document. - - Parameters - ---------- - text : str - - Returns - ------- - (list, tf.Tensor, tf.Tensor) - ( - Segmented sentences, - L2-normalized context vectors (num_sentences, embedding_size), - Single unit vector representing the entire document (embedding_size,) - ) - """ - - sentences, clean_sentences = self.__get_sentences(text) - vectors = self.__embed(clean_sentences) - return sentences, self.__normalize(vectors), self.__doc_vector(vectors) diff --git a/text2vec/bin/text_summarize.py b/text2vec/bin/text_summarize.py deleted file mode 100644 index b9a6428..0000000 --- a/text2vec/bin/text_summarize.py +++ /dev/null @@ -1,232 +0,0 @@ -import time -import json - -from flask import Flask, request, Response -from flask_cors import cross_origin - -from tornado.httpserver import HTTPServer -from tornado.wsgi import WSGIContainer -from tornado.ioloop import IOLoop -import tornado.autoreload -import tornado - -import numpy as np -from .serving_tools import Embedder - -app = Flask(__name__) -model = Embedder() - - -def responder(results, error, message): - """Boilerplate Flask response item. - - Parameters - ---------- - results : dict - API response - error : int - Error code - message : str - Message to send to the client - - Returns - ------- - flask.Reponse - """ - - assert isinstance(results, dict) - results["message"] = message - results = json.dumps(results, indent=2) - - return Response( - response=results, - status=error, - mimetype="application/json" - ) - - -def cosine_similarity_sort(net_vector, embedding_matrix): - """ - Computes the cosine similarity scores and then returns - the sorted results - - Parameters - ---------- - net_vector : np.ndarray - The context vector for the entire document - embedding_matrix : np.ndarray - The context vectors (row vectors) for each constituent body of text - - Returns - ------- - (ndarray, ndarray) - (sorted order of documents, cosine similarity scores) - """ - - similarity = np.dot(embedding_matrix, net_vector) - similarity = np.clip(similarity, -1, 1) - # sort = np.argsort(1 - similarity) - sort = np.argsort(similarity - 1) - - return sort, similarity.flatten()[sort] - - -def angle_from_cosine(cosine_similarity): - """ - Computes the angles in degrees from cosine similarity scores - - Parameters - ---------- - cosine_similarity : np.ndarray - - Returns - ------- - ndarray - Cosine angles (num_sentences,) - """ - - return np.arccos(cosine_similarity) * (180 / np.pi) - - -def choose(sentences, scores, embeddings): - """ - Selects the best constituent texts from the similarity scores - - Parameters - ---------- - sentences : np.ndarray - Array of the input texts, sorted by scores. - scores : np.ndarray - Cosine similarity scores, sorted - embeddings : np.ndarray - Embedding matrix for input texts, sorted by scores - - Returns - ------- - (np.ndarray, np.ndarray, np.ndarray) - (best sentences sorted, best scores sorted, best embeddings sorted) - """ - - if scores.shape[0] == 1: - return sentences, scores, embeddings - - angles = angle_from_cosine(scores) - cut = angles < angles.mean() - angles.std() - return sentences[cut], scores[cut], embeddings[cut] - - -def text_pass_filter(texts, texts_embeddings, net_vector): - """ - Runs the scoring + filtering process on input texts - - Parameters - ---------- - texts : np.ndarray - Input texts. - texts_embeddings : np.ndarray - Context embedding matrix for input texts. - net_vector : np.ndarray - The context vector for the entire document - - Returns - ------- - (np.ndarray, np.ndarray, np.ndarray) - (best sentences sorted, best scores sorted, best embeddings sorted) - """ - - sorted_order, scores = cosine_similarity_sort(net_vector=net_vector, embedding_matrix=texts_embeddings) - texts = np.array(texts)[sorted_order] - filtered_texts, filtered_scores, filtered_embeddings = choose( - sentences=texts, - scores=scores, - embeddings=texts_embeddings[sorted_order] - ) - - return filtered_texts, filtered_scores, filtered_embeddings - - -def softmax(logits): - """ - Computes the softmax of the input logits. - - Parameters - ---------- - logits : np.ndarray - - Returns - ------- - np.ndarray - Softmax output array with the same shape as the input. - """ - - soft = np.exp(logits) - soft[np.isinf(soft)] = 1e10 - soft /= np.sum(soft, axis=0) - soft = np.clip(soft, 0.0, 1.0) - return soft - - -@app.route('/condense', methods=['POST', 'GET']) -@cross_origin(origins=['*'], allow_headers=['Content-Type', 'Authorization']) -def compute(): - """ - Main Flask handler function - - Returns - ------- - flask.Response - """ - - j = request.get_json() - if j is None: - j = request.args - if not j: - j = request.form - - st = time.time() - body = j.get("body", "") - if not body: - results = { - "elapsed_time": time.time() - st, - "data": None - } - return responder(results=results, error=400, message="No text provided") - - # get the embedding vectors for each sentence in the document - sentences, vectors, doc_vector = model.embed(body) - top_sentences, top_scores, _ = text_pass_filter(texts=sentences, texts_embeddings=vectors, net_vector=doc_vector) - - results = { - "elapsed_time": time.time() - st, - "data": [{ - "text": text, - "relevanceScore": score - } for text, score in zip(top_sentences, top_scores.astype(float))] - } - return responder(results=results, error=200, message="Success") - - -def run_server(port=8008): - """This initializes the Tornad WSGI server to allow robust request handling. - - Parameters - ---------- - port : int, optional - Port number to serve the app on, by default 8008 - """ - - http_server = HTTPServer(WSGIContainer(app)) - http_server.listen(port) - - io_loop = IOLoop.instance() - tornado.autoreload.start(check_time=500) - print("Listening to port", port) - - try: - io_loop.start() - except KeyboardInterrupt: - pass - - -if __name__ == '__main__': - run_server(port=8008) diff --git a/text2vec/bin/utils.py b/text2vec/bin/utils.py deleted file mode 100644 index 57ac1ea..0000000 --- a/text2vec/bin/utils.py +++ /dev/null @@ -1,26 +0,0 @@ -import numpy as np - - -def log(message, **kwargs): - print(f"[INFO] {message}", flush=True, end=kwargs.get("end", "\n")) - - -def compute_angles(vectors): - """Computes the angles between vectors - - Parameters - ---------- - vectors : np.ndarray - (batch_size, embedding_size) - - Returns - ------- - np.ndarray - Cosine angles in degrees (batch_size, batch_size) - """ - - vectors /= np.linalg.norm(vectors, axis=1, keepdims=True) - cosine = np.dot(vectors, vectors.T) - cosine = np.clip(cosine, -1, 1) - degrees = np.arccos(cosine) * (180 / np.pi) - return degrees