📦 NERInference fix

raynardj · Nov 16, 2021 · 9bb4a79 · 9bb4a79
1 parent 425a41f
commit 9bb4a79
Show file tree

Hide file tree

Showing 6 changed files with 413 additions and 108 deletions.
diff --git a/docs/pl_training.html b/docs/pl_training.html
@@ -33,6 +33,13 @@
 
 <div class="cell border-box-sizing code_cell rendered">
 
+</div>
+    {% endraw %}
+
+    {% raw %}
+
+<div class="cell border-box-sizing code_cell rendered">
+
 </div>
     {% endraw %}
 
@@ -71,7 +78,7 @@ <h2 id="Load-model-and-tokenizer">Load model and tokenizer<a class="anchor-link"
 
 
 <div class="output_markdown rendered_html output_subarea ">
-<h4 id="ner_model_from" class="doc_header"><code>ner_model_from</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L20" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>ner_model_from</code>(<strong><code>name</code></strong>:<code>str</code>, <strong><code>dataset</code></strong>:<a href="/forgebox/hf_transformer_data.html#IOBES"><code>IOBES</code></a>)</p>
+<h4 id="ner_model_from" class="doc_header"><code>ner_model_from</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L31" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>ner_model_from</code>(<strong><code>name</code></strong>:<code>str</code>, <strong><code>dataset</code></strong>:<a href="/forgebox/hf_transformer_data.html#IOBES"><code>IOBES</code></a>)</p>
 </blockquote>
 <p>name: from_pretrain(name)</p>
 
@@ -96,7 +103,7 @@ <h4 id="ner_model_from" class="doc_header"><code>ner_model_from</code><a href="h
 
 
 <div class="output_markdown rendered_html output_subarea ">
-<h4 id="ner_tokenizer_from" class="doc_header"><code>ner_tokenizer_from</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L33" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>ner_tokenizer_from</code>(<strong><code>name</code></strong>:<code>str</code>)</p>
+<h4 id="ner_tokenizer_from" class="doc_header"><code>ner_tokenizer_from</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L44" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>ner_tokenizer_from</code>(<strong><code>name</code></strong>:<code>str</code>)</p>
 </blockquote>
 
 </div>
@@ -133,7 +140,7 @@ <h2 id="Lightning-data-module">Lightning data module<a class="anchor-link" href=
 
 
 <div class="output_markdown rendered_html output_subarea ">
-<h2 id="NERDataModule" class="doc_header"><code>class</code> <code>NERDataModule</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L42" class="source_link" style="float:right">[source]</a></h2><blockquote><p><code>NERDataModule</code>(<strong>*<code>args</code></strong>:<code>Any</code>, <strong>**<code>kwargs</code></strong>:<code>Any</code>) :: <code>LightningDataModule</code></p>
+<h2 id="NERDataModule" class="doc_header"><code>class</code> <code>NERDataModule</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L53" class="source_link" style="float:right">[source]</a></h2><blockquote><p><code>NERDataModule</code>(<strong>*<code>args</code></strong>:<code>Any</code>, <strong>**<code>kwargs</code></strong>:<code>Any</code>) :: <code>LightningDataModule</code></p>
 </blockquote>
 <p>A DataModule standardizes the training, val, test splits, data preparation and transforms.
 The main advantage is consistent data splits, data preparation and transforms across models.</p>
@@ -202,7 +209,7 @@ <h2 id="NERDataModule" class="doc_header"><code>class</code> <code>NERDataModule
 
 
 <div class="output_markdown rendered_html output_subarea ">
-<h2 id="NERModule" class="doc_header"><code>class</code> <code>NERModule</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L58" class="source_link" style="float:right">[source]</a></h2><blockquote><p><code>NERModule</code>(<strong><code>model</code></strong>) :: <code>LightningModule</code></p>
+<h2 id="NERModule" class="doc_header"><code>class</code> <code>NERModule</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L69" class="source_link" style="float:right">[source]</a></h2><blockquote><p><code>NERModule</code>(<strong><code>model</code></strong>) :: <code>LightningModule</code></p>
 </blockquote>
 <p>PyTorch lightning module for training ner model</p>
 
@@ -240,7 +247,7 @@ <h2 id="Enhance-pipeline">Enhance pipeline<a class="anchor-link" href="#Enhance-
 
 
 <div class="output_markdown rendered_html output_subarea ">
-<h4 id="clean_ner_output" class="doc_header"><code>clean_ner_output</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L107" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>clean_ner_output</code>(<strong><code>outputs</code></strong>)</p>
+<h4 id="clean_ner_output" class="doc_header"><code>clean_ner_output</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L118" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>clean_ner_output</code>(<strong><code>outputs</code></strong>)</p>
 </blockquote>
 <p>Cleaning output for NER task</p>
 
@@ -258,47 +265,6 @@ <h4 id="clean_ner_output" class="doc_header"><code>clean_ner_output</code><a hre
 
 <div class="cell border-box-sizing code_cell rendered">
 
-<div class="output_wrapper">
-<div class="output">
-
-<div class="output_area">
-
-
-<div class="output_markdown rendered_html output_subarea ">
-<h4 id="predict_ner_table" class="doc_header"><code>predict_ner_table</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L146" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>predict_ner_table</code>(<strong><code>pipeline_kw</code></strong>)</p>
-</blockquote>
-
-</div>
-
-</div>
-
-</div>
-</div>
-
-</div>
-    {% endraw %}
-
-    {% raw %}
-
-<div class="cell border-box-sizing code_cell rendered">
-
-<div class="output_wrapper">
-<div class="output">
-
-<div class="output_area">
-
-
-<div class="output_markdown rendered_html output_subarea ">
-<h4 id="refined_ner_pipeline" class="doc_header"><code>refined_ner_pipeline</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L155" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>refined_ner_pipeline</code>(<strong><code>model</code></strong>, <strong><code>tokenizer</code></strong>, <strong>**<code>pipeline_kw</code></strong>)</p>
-</blockquote>
-
-</div>
-
-</div>
-
-</div>
-</div>
-
 </div>
     {% endraw %}
 
@@ -313,8 +279,11 @@ <h4 id="refined_ner_pipeline" class="doc_header"><code>refined_ner_pipeline</cod
 
 
 <div class="output_markdown rendered_html output_subarea ">
-<h4 id="refined_ner_from_pretrained" class="doc_header"><code>refined_ner_from_pretrained</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L165" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>refined_ner_from_pretrained</code>(<strong><code>pretrained</code></strong>, <strong>**<code>pipeline_kw</code></strong>)</p>
+<h2 id="NERInference" class="doc_header"><code>class</code> <code>NERInference</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L158" class="source_link" style="float:right">[source]</a></h2><blockquote><p><code>NERInference</code>(<strong><code>model</code></strong>, <strong><code>tokenizer</code></strong>, <strong><code>name</code></strong>=<em><code>None</code></em>)</p>
 </blockquote>
+<p>NER Inference pipeline
+ner = NERInference.from_pretrained('xxxx/xxxx')
+ner.predict(['text1','text2'])</p>
 
 </div>
 

diff --git a/forgebox/__init__.py b/forgebox/__init__.py
@@ -1 +1 @@
-__version__ = "0.4.11"
+__version__ = "0.4.17"
diff --git a/forgebox/_nbdev.py b/forgebox/_nbdev.py
@@ -114,9 +114,7 @@
          "NERDataModule": "72_pl_training.ipynb",
          "NERModule": "72_pl_training.ipynb",
          "clean_ner_output": "72_pl_training.ipynb",
-         "predict_ner_table": "72_pl_training.ipynb",
-         "refined_ner_pipeline": "72_pl_training.ipynb",
-         "refined_ner_from_pretrained": "72_pl_training.ipynb",
+         "NERInference": "72_pl_training.ipynb",
          "CudaDevice": "CUDA_GPU_Management.ipynb",
          "CudaHandler": "CUDA_GPU_Management.ipynb",
          "MLMVisualizer": "bert_visualize.ipynb",

diff --git a/forgebox/hf/train.py b/forgebox/hf/train.py
@@ -1,18 +1,29 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: nbs/72_pl_training.ipynb (unless otherwise specified).
 
-__all__ = ['ner_model_from', 'ner_tokenizer_from', 'NERDataModule', 'NERModule', 'clean_ner_output',
-           'predict_ner_table', 'refined_ner_pipeline', 'refined_ner_from_pretrained']
+__all__ = ['ner_model_from', 'ner_tokenizer_from', 'NERDataModule', 'NERModule', 'clean_ner_output', 'NERInference']
 
 # Cell
 from .data import IOBES
 from ..imports import *
+from ..loop import chunkify
 import pytorch_lightning as pl
 from transformers import (
     AutoModelForTokenClassification,
     AutoTokenizer,
     pipeline
 )
-from typing import Callable
+from tqdm.notebook import tqdm
+from typing import Callable, List
+from torch import device
+
+# Cell
+try:
+    ishell = get_ipython()  # NameError if undefined (presumably: not under IPython/Jupyter)
+    IS_JUPYTER = True
+    from tqdm.notebook import tqdm  # notebook flavour of the progress bar
+except NameError:
+    IS_JUPYTER = False
+    from tqdm import tqdm  # fall back to the plain tqdm progress bar
 
 # Cell
 
@@ -113,12 +124,12 @@ def clean_ner_output(self, outputs):
     last_idx = 0
     # make to sub group by position
     for output in outputs:
-        if output["index"]-1 == last_idx:
+        if output["start"] in [last_idx, last_idx-1]:
             current.append(output)
         else:
             results.append(current)
             current = [output, ]
-        last_idx = output["index"]
+        last_idx = output["end"]
     if len(current) > 0:
         results.append(current)
 
@@ -139,30 +150,177 @@ def clean_ner_output(self, outputs):
                 word=new_str,
                 start=min(starts),
                 end=max(ends),
-                entity=c[0]['entity']
+                entity=c[0]['entity_group']
             ))
     return strings
 
-def predict_ner_table(pipeline_kw):
-    def predict_ner_table_(self, text: str) -> pd.DataFrame:
-        return pd.DataFrame(
-            self.clean_output(
-                self(text, **pipeline_kw)
-                )
-            )
-    return predict_ner_table_
-
-def refined_ner_pipeline(model, tokenizer, **pipeline_kw):
-    if "aggregation_strategy" not in pipeline_kw:
-        pipeline_kw["aggregation_strategy"] = "first"
-
-    ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)
-
-    ner_pipeline.__class__.clean_output = clean_ner_output
-    ner_pipeline.__class__.predict_table = predict_ner_table(pipeline_kw)
-    return ner_pipeline
-
-def refined_ner_from_pretrained(pretrained, **pipeline_kw):
-    model = AutoModelForTokenClassification.from_pretrained(pretrained)
-    tokenizer = AutoTokenizer.from_pretrained(pretrained)
-    return refined_ner_pipeline(model, tokenizer, **pipeline_kw)
+# Cell
+class NERInference:
+    """
+    NER Inference pipeline
+    ner = NERInference.from_pretrained('xxxx/xxxx')
+    ner.predict(['text1','text2'])
+    """
+
+    def __init__(self, model, tokenizer, name=None):
+        super().__init__()
+        self.model = model.eval()
+        self.tokenizer = tokenizer
+        self.name = name if name else "NER model"
+
+    def __repr__(self):
+        return f"[NERInference on {self.name}]"
+
+    def to(self, device_str):
+        self.model = self.model.to(device(device_str))
+        return self
+
+    @classmethod
+    def from_pretrained(cls, tag):
+        """
+        Load from pretrained model and tokenizer
+        """
+        model = AutoModelForTokenClassification.from_pretrained(tag)
+        tokenizer = AutoTokenizer.from_pretrained(tag)
+        return cls(model=model, tokenizer=tokenizer, name=model.config._name_or_path)
+
+    def __call__(self, data, batch_size=32, dev=device("cpu")):
+        if type(data) == str:
+            return self.batch_predict([data,])
+        else:
+            return self.predict(data, dev=dev, batch_size=batch_size)
+
+    def predict(
+        self,
+        texts: List[str],
+        dev=device("cpu"),
+        batch_size: int = 32,
+        progress_bar: bool = True
+    ) -> pd.DataFrame:
+        """
+        Predict a list of sentences/ paragraphs
+        """
+        # place the model into device
+        self.model = self.model.to(dev)
+        iterator = list(enumerate(chunkify(texts, bs=batch_size)))
+        if progress_bar:
+            iterator = tqdm(iterator, leave=False)
+
+        # run through iterator
+        all_dfs = []
+        for i, text_b in iterator:
+            # by batch prediction
+            batch_df = self.batch_predict(text_b)
+            if len(batch_df) > 0:
+                # calculate the row number
+                batch_df['text_id'] = batch_df.apply(
+                    lambda row: i*batch_size+row.batch_row_sn, axis=1)
+                all_dfs.append(batch_df)
+
+        # place the model back to cpu
+        self.model = self.model.to("cpu")
+        return pd.concat(all_dfs).reset_index(drop=True)
+
+    def tokenizing(self, texts):
+        inputs = self.tokenizer(
+            texts,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            return_attention_mask=True,
+            return_tensors='pt', truncation=True, return_offsets_mapping=True
+        ).to(self.model.device)
+        return inputs
+
+
+    def batch_predict(self, texts:List[str])-> pd.DataFrame:
+        """
+        Predict a single batch of sentences
+        """
+        id2label = self.model.config.id2label
+        inputs = self.tokenizing(texts)
+
+        with torch.no_grad():
+            outputs = self.model(input_ids=inputs.input_ids,
+                                 attention_mask=inputs.attention_mask)
+        inputs = inputs.to(device('cpu'))
+
+        pred_idx = outputs.logits.argmax(-1).to(device("cpu"))
+        batch_size = pred_idx.size(0)
+        offsets = inputs.offset_mapping
+        results = []
+        for bi in range(batch_size):
+            text = texts[bi]
+            input_ids = inputs.input_ids[bi]
+            word_ids = inputs.word_ids(bi)
+            pred_ids = pred_idx[bi]
+            # initial  values for the row
+            last_pos = 0
+            previous_has_positive = False
+            current_start = 0
+            current_index = 0
+            current_id = 0
+            line = []
+            for ti in range(1, len(input_ids)):
+                if input_ids[ti] == self.tokenizer.sep_token_id:
+                    break
+                # is the current token an appending sub-word?
+                if word_ids[ti] == last_pos:
+                    pass
+                # is current token negative
+                elif pred_ids[ti].item() == 0:
+                    # store the previous hanging prediction
+                    if previous_has_positive:
+                        start = current_start
+                        end = offsets[bi, ti, 0].item()
+                        line.append({
+                            "start": start, "end": end,
+                            "entity": id2label[current_id],
+                            "word": text[start:end],
+                            "index": current_index,
+                        })
+
+                    current_start = offsets[bi, ti, 0].item()
+                    previous_has_positive = False
+                    current_id = 0
+                    current_index = ti
+                # has positive prediction index, other than zero
+                else:
+                    if previous_has_positive:
+                        # different than the previous
+                        if current_id != pred_ids[ti].item():
+                            start = current_start
+                            end = offsets[bi, ti, 0].item()
+                            line.append({
+                                        "start": start,
+                                        "end": end,
+                                        "entity": id2label[current_id],
+                                        "word": text[start:end],
+                                        "index": current_index,
+                                        })
+                            current_start = offsets[bi, ti, 0].item()
+                    # this is the 1st postive predict for a while
+                    else:
+                        current_start = offsets[bi, ti, 0].item()
+                    previous_has_positive = True
+                    current_index = ti
+                    current_id = pred_ids[ti].item()
+
+                last_pos = word_ids[ti]
+            if previous_has_positive:
+                start = current_start
+                end = offsets[bi, ti, 1].item()
+                line.append({
+                            "start": start,
+                            "end": end,
+                            "entity": id2label[current_id],
+                            "word": text[start:end],
+                            "index": current_index,
+                            })
+
+            results.append(line)
+        all_dfs = []
+        for i, res in enumerate(results):
+            sub_df = pd.DataFrame(res)
+            sub_df["batch_row_sn"] = i
+            all_dfs.append(sub_df)
+        return pd.concat(all_dfs)