🎻 ner inference pipeline

raynardj · Nov 5, 2021 · cac6b26 · cac6b26
1 parent 3b8a702
commit cac6b26
Show file tree

Hide file tree

Showing 9 changed files with 276 additions and 122 deletions.
diff --git a/docs/hf_transformer_data.html b/docs/hf_transformer_data.html
@@ -135,31 +135,6 @@ <h2 id="IOBES" class="doc_header"><code>class</code> <code>IOBES</code><a href="
 
 <div class="cell border-box-sizing code_cell rendered">
 
-<div class="output_wrapper">
-<div class="output">
-
-<div class="output_area">
-
-
-<div class="output_markdown rendered_html output_subarea ">
-<h4 id="clean_output" class="doc_header"><code>clean_output</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/data.py#L220" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>clean_output</code>(<strong><code>outputs</code></strong>)</p>
-</blockquote>
-<p>Cleaning output for NER task</p>
-
-</div>
-
-</div>
-
-</div>
-</div>
-
-</div>
-    {% endraw %}
-
-    {% raw %}
-
-<div class="cell border-box-sizing code_cell rendered">
-
 </div>
     {% endraw %}
 

diff --git a/docs/pl_training.html b/docs/pl_training.html
@@ -71,7 +71,7 @@ <h2 id="Load-model-and-tokenizer">Load model and tokenizer<a class="anchor-link"
 
 
 <div class="output_markdown rendered_html output_subarea ">
-<h4 id="ner_model_from" class="doc_header"><code>ner_model_from</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L14" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>ner_model_from</code>(<strong><code>name</code></strong>:<code>str</code>, <strong><code>dataset</code></strong>:<a href="/forgebox/hf_transformer_data.html#IOBES"><code>IOBES</code></a>)</p>
+<h4 id="ner_model_from" class="doc_header"><code>ner_model_from</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L20" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>ner_model_from</code>(<strong><code>name</code></strong>:<code>str</code>, <strong><code>dataset</code></strong>:<a href="/forgebox/hf_transformer_data.html#IOBES"><code>IOBES</code></a>)</p>
 </blockquote>
 <p>name: from_pretrain(name)</p>
 
@@ -96,7 +96,7 @@ <h4 id="ner_model_from" class="doc_header"><code>ner_model_from</code><a href="h
 
 
 <div class="output_markdown rendered_html output_subarea ">
-<h4 id="ner_tokenizer_from" class="doc_header"><code>ner_tokenizer_from</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L27" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>ner_tokenizer_from</code>(<strong><code>name</code></strong>:<code>str</code>)</p>
+<h4 id="ner_tokenizer_from" class="doc_header"><code>ner_tokenizer_from</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L33" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>ner_tokenizer_from</code>(<strong><code>name</code></strong>:<code>str</code>)</p>
 </blockquote>
 
 </div>
@@ -133,7 +133,7 @@ <h2 id="Lightning-data-module">Lightning data module<a class="anchor-link" href=
 
 
 <div class="output_markdown rendered_html output_subarea ">
-<h2 id="NERDataModule" class="doc_header"><code>class</code> <code>NERDataModule</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L36" class="source_link" style="float:right">[source]</a></h2><blockquote><p><code>NERDataModule</code>(<strong>*<code>args</code></strong>:<code>Any</code>, <strong>**<code>kwargs</code></strong>:<code>Any</code>) :: <code>LightningDataModule</code></p>
+<h2 id="NERDataModule" class="doc_header"><code>class</code> <code>NERDataModule</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L42" class="source_link" style="float:right">[source]</a></h2><blockquote><p><code>NERDataModule</code>(<strong>*<code>args</code></strong>:<code>Any</code>, <strong>**<code>kwargs</code></strong>:<code>Any</code>) :: <code>LightningDataModule</code></p>
 </blockquote>
 <p>A DataModule standardizes the training, val, test splits, data preparation and transforms.
 The main advantage is consistent data splits, data preparation and transforms across models.</p>
@@ -202,7 +202,7 @@ <h2 id="NERDataModule" class="doc_header"><code>class</code> <code>NERDataModule
 
 
 <div class="output_markdown rendered_html output_subarea ">
-<h2 id="NERModule" class="doc_header"><code>class</code> <code>NERModule</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L52" class="source_link" style="float:right">[source]</a></h2><blockquote><p><code>NERModule</code>(<strong><code>model</code></strong>) :: <code>LightningModule</code></p>
+<h2 id="NERModule" class="doc_header"><code>class</code> <code>NERModule</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L58" class="source_link" style="float:right">[source]</a></h2><blockquote><p><code>NERModule</code>(<strong><code>model</code></strong>) :: <code>LightningModule</code></p>
 </blockquote>
 <p>PyTorch lightning module for training ner model</p>
 
@@ -220,6 +220,116 @@ <h2 id="NERModule" class="doc_header"><code>class</code> <code>NERModule</code><
 
 <div class="cell border-box-sizing code_cell rendered">
 
+</div>
+    {% endraw %}
+
+<div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<h2 id="Enhance-pipeline">Enhance pipeline<a class="anchor-link" href="#Enhance-pipeline"> </a></h2>
+</div>
+</div>
+</div>
+    {% raw %}
+
+<div class="cell border-box-sizing code_cell rendered">
+
+<div class="output_wrapper">
+<div class="output">
+
+<div class="output_area">
+
+
+<div class="output_markdown rendered_html output_subarea ">
+<h4 id="clean_ner_output" class="doc_header"><code>clean_ner_output</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L107" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>clean_ner_output</code>(<strong><code>outputs</code></strong>)</p>
+</blockquote>
+<p>Cleaning output for NER task</p>
+
+</div>
+
+</div>
+
+</div>
+</div>
+
+</div>
+    {% endraw %}
+
+    {% raw %}
+
+<div class="cell border-box-sizing code_cell rendered">
+
+<div class="output_wrapper">
+<div class="output">
+
+<div class="output_area">
+
+
+<div class="output_markdown rendered_html output_subarea ">
+<h4 id="predict_ner_table" class="doc_header"><code>predict_ner_table</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L146" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>predict_ner_table</code>(<strong><code>pipeline_kw</code></strong>)</p>
+</blockquote>
+
+</div>
+
+</div>
+
+</div>
+</div>
+
+</div>
+    {% endraw %}
+
+    {% raw %}
+
+<div class="cell border-box-sizing code_cell rendered">
+
+<div class="output_wrapper">
+<div class="output">
+
+<div class="output_area">
+
+
+<div class="output_markdown rendered_html output_subarea ">
+<h4 id="refined_ner_pipeline" class="doc_header"><code>refined_ner_pipeline</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L155" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>refined_ner_pipeline</code>(<strong><code>model</code></strong>, <strong><code>tokenizer</code></strong>, <strong>**<code>pipeline_kw</code></strong>)</p>
+</blockquote>
+
+</div>
+
+</div>
+
+</div>
+</div>
+
+</div>
+    {% endraw %}
+
+    {% raw %}
+
+<div class="cell border-box-sizing code_cell rendered">
+
+<div class="output_wrapper">
+<div class="output">
+
+<div class="output_area">
+
+
+<div class="output_markdown rendered_html output_subarea ">
+<h4 id="refined_ner_from_pretrained" class="doc_header"><code>refined_ner_from_pretrained</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L165" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>refined_ner_from_pretrained</code>(<strong><code>pretrained</code></strong>, <strong>**<code>pipeline_kw</code></strong>)</p>
+</blockquote>
+
+</div>
+
+</div>
+
+</div>
+</div>
+
+</div>
+    {% endraw %}
+
+    {% raw %}
+
+<div class="cell border-box-sizing code_cell rendered">
+
 </div>
     {% endraw %}
 

diff --git a/forgebox/__init__.py b/forgebox/__init__.py
@@ -1 +1 @@
-__version__ = "0.4.9"
+__version__ = "0.4.11"
diff --git a/forgebox/_nbdev.py b/forgebox/_nbdev.py
@@ -109,11 +109,14 @@
          "convert_iob2_file_to_iobes": "70_hf_transformer_data.ipynb",
          "conbine_iobes_file": "70_hf_transformer_data.ipynb",
          "IOBES": "70_hf_transformer_data.ipynb",
-         "clean_output": "70_hf_transformer_data.ipynb",
          "ner_model_from": "72_pl_training.ipynb",
          "ner_tokenizer_from": "72_pl_training.ipynb",
          "NERDataModule": "72_pl_training.ipynb",
          "NERModule": "72_pl_training.ipynb",
+         "clean_ner_output": "72_pl_training.ipynb",
+         "predict_ner_table": "72_pl_training.ipynb",
+         "refined_ner_pipeline": "72_pl_training.ipynb",
+         "refined_ner_from_pretrained": "72_pl_training.ipynb",
          "CudaDevice": "CUDA_GPU_Management.ipynb",
          "CudaHandler": "CUDA_GPU_Management.ipynb",
          "MLMVisualizer": "bert_visualize.ipynb",

diff --git a/forgebox/hf/data.py b/forgebox/hf/data.py
@@ -1,6 +1,6 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: nbs/70_hf_transformer_data.ipynb (unless otherwise specified).
 
-__all__ = ['convert_iob2_file_to_iobes', 'conbine_iobes_file', 'IOBES', 'clean_output']
+__all__ = ['convert_iob2_file_to_iobes', 'conbine_iobes_file', 'IOBES']
 
 # Cell
 from ..imports import *
@@ -214,44 +214,4 @@ def set_hfconfig(self, config):
         """
         config.num_labels = len(self.cates)
         config.id2label = {i: label for i, label in enumerate(self.cates.i2c)}
-        config.label2id = {label: i for i, label in enumerate(self.cates.i2c)}
-
-
-def clean_output(outputs):
-    """
-    Cleaning output for NER task
-    """
-    results = []
-    current = []
-    last_idx = 0
-    # make to sub group by position
-    for output in outputs:
-        if output["index"]-1 == last_idx:
-            current.append(output)
-        else:
-            results.append(current)
-            current = [output, ]
-        last_idx = output["index"]
-    if len(current) > 0:
-        results.append(current)
-
-    # from tokens to string
-    strings = []
-    for c in results:
-        tokens = []
-        starts = []
-        ends = []
-        for o in c:
-            tokens.append(o['word'])
-            starts.append(o['start'])
-            ends.append(o['end'])
-
-        new_str = tokenizer.convert_tokens_to_string(tokens)
-        if new_str != '':
-            strings.append(dict(
-                word=new_str,
-                start=min(starts),
-                end=max(ends),
-                entity=c[0]['entity']
-            ))
-    return strings
+        config.label2id = {label: i for i, label in enumerate(self.cates.i2c)}
diff --git a/forgebox/hf/train.py b/forgebox/hf/train.py
@@ -1,12 +1,18 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: nbs/72_pl_training.ipynb (unless otherwise specified).
 
-__all__ = ['ner_model_from', 'ner_tokenizer_from', 'NERDataModule', 'NERModule']
+__all__ = ['ner_model_from', 'ner_tokenizer_from', 'NERDataModule', 'NERModule', 'clean_ner_output',
+           'predict_ner_table', 'refined_ner_pipeline', 'refined_ner_from_pretrained']
 
 # Cell
 from .data import IOBES
 from ..imports import *
 import pytorch_lightning as pl
-from transformers import AutoModelForTokenClassification, AutoTokenizer
+from transformers import (
+    AutoModelForTokenClassification,
+    AutoTokenizer,
+    pipeline
+)
+from typing import Callable
 
 # Cell
 
@@ -95,4 +101,68 @@ def configure_optimizers(self):
             {'params': self.model.classifier.parameters(), 'lr': 1e-3},
         ]
         optimizer = torch.optim.Adam(param_groups, lr=1e-3)
-        return optimizer
+        return optimizer
+
+# Cell
+def clean_ner_output(self, outputs):
+    """
+    Cleaning output for NER task
+
+    Merge token-level NER predictions into one entry per run of
+    consecutive tokens.
+
+    Although defined at module level, this function takes ``self``:
+    ``refined_ner_pipeline`` assigns it onto the pipeline class as
+    ``clean_output``, and it reads ``self.tokenizer``.
+
+    outputs: iterable of dicts; the keys read here are ``"index"``,
+        ``"word"``, ``"start"``, ``"end"`` and ``"entity"``.
+
+    Returns a list of dicts with keys ``word`` (the tokens of the run
+    joined by ``self.tokenizer.convert_tokens_to_string``), ``start``
+    (minimum start of the run), ``end`` (maximum end of the run) and
+    ``entity`` (the ``entity`` of the first token in the run). Runs whose
+    joined string is empty are dropped.
+
+    NOTE(review): when the pipeline is called with an
+    ``aggregation_strategy`` the returned dicts may expose
+    ``entity_group`` and no ``index``/``entity`` keys, which would raise
+    ``KeyError`` here -- confirm against the transformers version in use.
+    """
+    results = []
+    current = []
+    last_idx = 0
+    # Split the outputs into runs: an output joins the current run only
+    # when its index is exactly one greater than the previous index.
+    for output in outputs:
+        if output["index"]-1 == last_idx:
+            current.append(output)
+        else:
+            # `current` may still be empty here (first output whose index
+            # is not 1); the empty run is appended anyway.
+            results.append(current)
+            current = [output, ]
+        last_idx = output["index"]
+    if len(current) > 0:
+        results.append(current)
+
+    # Turn each run of tokens back into a single string with its span.
+    strings = []
+    for c in results:
+        tokens = []
+        starts = []
+        ends = []
+        for o in c:
+            tokens.append(o['word'])
+            starts.append(o['start'])
+            ends.append(o['end'])
+
+        new_str = self.tokenizer.convert_tokens_to_string(tokens)
+        # Presumably an empty run converts to '' and is skipped by this
+        # check; otherwise min()/c[0] below would fail -- TODO confirm.
+        if new_str != '':
+            strings.append(dict(
+                word=new_str,
+                start=min(starts),
+                end=max(ends),
+                entity=c[0]['entity']
+            ))
+    return strings
+
+def predict_ner_table(pipeline_kw):
+    """
+    Build a ``predict_table``-style method bound to ``pipeline_kw``.
+
+    pipeline_kw: dict of keyword arguments; it is captured by the closure
+        and unpacked into every call of the pipeline object.
+
+    Returns a function ``(self, text) -> pd.DataFrame`` meant to be
+    attached to a pipeline class: it calls ``self(text, **pipeline_kw)``,
+    passes the result through ``self.clean_output`` and wraps that in a
+    ``pd.DataFrame``.
+
+    NOTE(review): ``pd`` is not imported explicitly in this module;
+    presumably it comes from ``from ..imports import *`` -- confirm.
+    """
+    def predict_ner_table_(self, text: str) -> pd.DataFrame:
+        # `self` must provide both __call__ and `clean_output`.
+        return pd.DataFrame(
+            self.clean_output(
+                self(text, **pipeline_kw)
+                )
+            )
+    return predict_ner_table_
+
+def refined_ner_pipeline(model, tokenizer, **pipeline_kw):
+    """
+    Create a ``"ner"`` pipeline and attach the cleaning/table helpers.
+
+    model, tokenizer: passed straight to ``pipeline("ner", ...)``.
+    pipeline_kw: extra keyword arguments; ``aggregation_strategy`` is set
+        to ``"first"`` when the caller does not supply it.
+
+    Returns the pipeline object, whose class has gained ``clean_output``
+    and ``predict_table``.
+    """
+    if "aggregation_strategy" not in pipeline_kw:
+        pipeline_kw["aggregation_strategy"] = "first"
+
+    # `pipeline_kw` is not passed to the constructor here; it is only used
+    # later, at call time, through the `predict_table` closure.
+    ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)
+
+    # NOTE(review): these assignments target the *class* of the pipeline,
+    # not the instance, so every object of that class sees them and a later
+    # call with different `pipeline_kw` replaces `predict_table` for all.
+    ner_pipeline.__class__.clean_output = clean_ner_output
+    ner_pipeline.__class__.predict_table = predict_ner_table(pipeline_kw)
+    return ner_pipeline
+
+def refined_ner_from_pretrained(pretrained, **pipeline_kw):
+    """
+    Load a model and tokenizer, then build the refined NER pipeline.
+
+    pretrained: value handed unchanged to both
+        ``AutoModelForTokenClassification.from_pretrained`` and
+        ``AutoTokenizer.from_pretrained``.
+    pipeline_kw: forwarded to ``refined_ner_pipeline``.
+
+    Returns whatever ``refined_ner_pipeline`` returns.
+    """
+    model = AutoModelForTokenClassification.from_pretrained(pretrained)
+    tokenizer = AutoTokenizer.from_pretrained(pretrained)
+    return refined_ner_pipeline(model, tokenizer, **pipeline_kw)