Skip to content

Commit

Permalink
🎻 ner inference pipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
raynardj committed Nov 5, 2021
1 parent 3b8a702 commit cac6b26
Show file tree
Hide file tree
Showing 9 changed files with 276 additions and 122 deletions.
25 changes: 0 additions & 25 deletions docs/hf_transformer_data.html
Original file line number Diff line number Diff line change
Expand Up @@ -135,31 +135,6 @@ <h2 id="IOBES" class="doc_header"><code>class</code> <code>IOBES</code><a href="

<div class="cell border-box-sizing code_cell rendered">

<div class="output_wrapper">
<div class="output">

<div class="output_area">


<div class="output_markdown rendered_html output_subarea ">
<h4 id="clean_output" class="doc_header"><code>clean_output</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/data.py#L220" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>clean_output</code>(<strong><code>outputs</code></strong>)</p>
</blockquote>
<p>Cleaning output for NER task</p>

</div>

</div>

</div>
</div>

</div>
{% endraw %}

{% raw %}

<div class="cell border-box-sizing code_cell rendered">

</div>
{% endraw %}

Expand Down
118 changes: 114 additions & 4 deletions docs/pl_training.html
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ <h2 id="Load-model-and-tokenizer">Load model and tokenizer<a class="anchor-link"


<div class="output_markdown rendered_html output_subarea ">
<h4 id="ner_model_from" class="doc_header"><code>ner_model_from</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L14" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>ner_model_from</code>(<strong><code>name</code></strong>:<code>str</code>, <strong><code>dataset</code></strong>:<a href="/forgebox/hf_transformer_data.html#IOBES"><code>IOBES</code></a>)</p>
<h4 id="ner_model_from" class="doc_header"><code>ner_model_from</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L20" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>ner_model_from</code>(<strong><code>name</code></strong>:<code>str</code>, <strong><code>dataset</code></strong>:<a href="/forgebox/hf_transformer_data.html#IOBES"><code>IOBES</code></a>)</p>
</blockquote>
<p>name: from_pretrain(name)</p>

Expand All @@ -96,7 +96,7 @@ <h4 id="ner_model_from" class="doc_header"><code>ner_model_from</code><a href="h


<div class="output_markdown rendered_html output_subarea ">
<h4 id="ner_tokenizer_from" class="doc_header"><code>ner_tokenizer_from</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L27" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>ner_tokenizer_from</code>(<strong><code>name</code></strong>:<code>str</code>)</p>
<h4 id="ner_tokenizer_from" class="doc_header"><code>ner_tokenizer_from</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L33" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>ner_tokenizer_from</code>(<strong><code>name</code></strong>:<code>str</code>)</p>
</blockquote>

</div>
Expand Down Expand Up @@ -133,7 +133,7 @@ <h2 id="Lightning-data-module">Lightning data module<a class="anchor-link" href=


<div class="output_markdown rendered_html output_subarea ">
<h2 id="NERDataModule" class="doc_header"><code>class</code> <code>NERDataModule</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L36" class="source_link" style="float:right">[source]</a></h2><blockquote><p><code>NERDataModule</code>(<strong>*<code>args</code></strong>:<code>Any</code>, <strong>**<code>kwargs</code></strong>:<code>Any</code>) :: <code>LightningDataModule</code></p>
<h2 id="NERDataModule" class="doc_header"><code>class</code> <code>NERDataModule</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L42" class="source_link" style="float:right">[source]</a></h2><blockquote><p><code>NERDataModule</code>(<strong>*<code>args</code></strong>:<code>Any</code>, <strong>**<code>kwargs</code></strong>:<code>Any</code>) :: <code>LightningDataModule</code></p>
</blockquote>
<p>A DataModule standardizes the training, val, test splits, data preparation and transforms.
The main advantage is consistent data splits, data preparation and transforms across models.</p>
Expand Down Expand Up @@ -202,7 +202,7 @@ <h2 id="NERDataModule" class="doc_header"><code>class</code> <code>NERDataModule


<div class="output_markdown rendered_html output_subarea ">
<h2 id="NERModule" class="doc_header"><code>class</code> <code>NERModule</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L52" class="source_link" style="float:right">[source]</a></h2><blockquote><p><code>NERModule</code>(<strong><code>model</code></strong>) :: <code>LightningModule</code></p>
<h2 id="NERModule" class="doc_header"><code>class</code> <code>NERModule</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L58" class="source_link" style="float:right">[source]</a></h2><blockquote><p><code>NERModule</code>(<strong><code>model</code></strong>) :: <code>LightningModule</code></p>
</blockquote>
<p>PyTorch lightning module for training ner model</p>

Expand All @@ -220,6 +220,116 @@ <h2 id="NERModule" class="doc_header"><code>class</code> <code>NERModule</code><

<div class="cell border-box-sizing code_cell rendered">

</div>
{% endraw %}

<div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
<div class="text_cell_render border-box-sizing rendered_html">
<h2 id="Enhance-pipeline">Enhance pipeline<a class="anchor-link" href="#Enhance-pipeline"> </a></h2>
</div>
</div>
</div>
{% raw %}

<div class="cell border-box-sizing code_cell rendered">

<div class="output_wrapper">
<div class="output">

<div class="output_area">


<div class="output_markdown rendered_html output_subarea ">
<h4 id="clean_ner_output" class="doc_header"><code>clean_ner_output</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L107" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>clean_ner_output</code>(<strong><code>outputs</code></strong>)</p>
</blockquote>
<p>Cleaning output for NER task</p>

</div>

</div>

</div>
</div>

</div>
{% endraw %}

{% raw %}

<div class="cell border-box-sizing code_cell rendered">

<div class="output_wrapper">
<div class="output">

<div class="output_area">


<div class="output_markdown rendered_html output_subarea ">
<h4 id="predict_ner_table" class="doc_header"><code>predict_ner_table</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L146" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>predict_ner_table</code>(<strong><code>pipeline_kw</code></strong>)</p>
</blockquote>

</div>

</div>

</div>
</div>

</div>
{% endraw %}

{% raw %}

<div class="cell border-box-sizing code_cell rendered">

<div class="output_wrapper">
<div class="output">

<div class="output_area">


<div class="output_markdown rendered_html output_subarea ">
<h4 id="refined_ner_pipeline" class="doc_header"><code>refined_ner_pipeline</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L155" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>refined_ner_pipeline</code>(<strong><code>model</code></strong>, <strong><code>tokenizer</code></strong>, <strong>**<code>pipeline_kw</code></strong>)</p>
</blockquote>

</div>

</div>

</div>
</div>

</div>
{% endraw %}

{% raw %}

<div class="cell border-box-sizing code_cell rendered">

<div class="output_wrapper">
<div class="output">

<div class="output_area">


<div class="output_markdown rendered_html output_subarea ">
<h4 id="refined_ner_from_pretrained" class="doc_header"><code>refined_ner_from_pretrained</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L165" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>refined_ner_from_pretrained</code>(<strong><code>pretrained</code></strong>, <strong>**<code>pipeline_kw</code></strong>)</p>
</blockquote>

</div>

</div>

</div>
</div>

</div>
{% endraw %}

{% raw %}

<div class="cell border-box-sizing code_cell rendered">

</div>
{% endraw %}

Expand Down
2 changes: 1 addition & 1 deletion forgebox/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.4.9"
__version__ = "0.4.11"
5 changes: 4 additions & 1 deletion forgebox/_nbdev.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,11 +109,14 @@
"convert_iob2_file_to_iobes": "70_hf_transformer_data.ipynb",
"conbine_iobes_file": "70_hf_transformer_data.ipynb",
"IOBES": "70_hf_transformer_data.ipynb",
"clean_output": "70_hf_transformer_data.ipynb",
"ner_model_from": "72_pl_training.ipynb",
"ner_tokenizer_from": "72_pl_training.ipynb",
"NERDataModule": "72_pl_training.ipynb",
"NERModule": "72_pl_training.ipynb",
"clean_ner_output": "72_pl_training.ipynb",
"predict_ner_table": "72_pl_training.ipynb",
"refined_ner_pipeline": "72_pl_training.ipynb",
"refined_ner_from_pretrained": "72_pl_training.ipynb",
"CudaDevice": "CUDA_GPU_Management.ipynb",
"CudaHandler": "CUDA_GPU_Management.ipynb",
"MLMVisualizer": "bert_visualize.ipynb",
Expand Down
44 changes: 2 additions & 42 deletions forgebox/hf/data.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: nbs/70_hf_transformer_data.ipynb (unless otherwise specified).

__all__ = ['convert_iob2_file_to_iobes', 'conbine_iobes_file', 'IOBES', 'clean_output']
__all__ = ['convert_iob2_file_to_iobes', 'conbine_iobes_file', 'IOBES']

# Cell
from ..imports import *
Expand Down Expand Up @@ -214,44 +214,4 @@ def set_hfconfig(self, config):
"""
config.num_labels = len(self.cates)
config.id2label = {i: label for i, label in enumerate(self.cates.i2c)}
config.label2id = {label: i for i, label in enumerate(self.cates.i2c)}


def clean_output(outputs, tokenizer=None):
    """
    Cleaning output for NER task

    Groups token-level predictions whose ``index`` values are consecutive
    and joins each group back into one string with
    ``tokenizer.convert_tokens_to_string``.

    Args:
        outputs: iterable of dicts, each providing the keys ``index``,
            ``word``, ``start``, ``end`` and ``entity``.
        tokenizer: object with a ``convert_tokens_to_string(tokens)``
            method. When omitted, a module-level ``tokenizer`` is used if
            one exists (the previous implementation read that free name).

    Returns:
        list of dicts with the keys ``word`` (joined string), ``start``
        (smallest start in the group), ``end`` (largest end in the group)
        and ``entity`` (entity of the first token in the group). Groups
        that join to an empty string are dropped.

    Raises:
        NameError: if there is something to join and no tokenizer was
            passed nor defined at module level.
    """
    groups = []
    current = []
    last_idx = 0
    # split the predictions into runs of consecutive token positions
    for output in outputs:
        if output["index"] - 1 == last_idx:
            current.append(output)
        else:
            # never store an empty run (happens when the first index != 1)
            if current:
                groups.append(current)
            current = [output]
        last_idx = output["index"]
    if current:
        groups.append(current)

    if not groups:
        return []

    if tokenizer is None:
        # Backward compatible fallback: the original body referenced a
        # global named `tokenizer`; keep the same exception type when
        # it is missing, but with an actionable message.
        try:
            tokenizer = globals()["tokenizer"]
        except KeyError:
            raise NameError(
                "name 'tokenizer' is not defined; "
                "pass it explicitly: clean_output(outputs, tokenizer=...)"
            ) from None

    # from tokens to string
    strings = []
    for group in groups:
        tokens = [o['word'] for o in group]
        starts = [o['start'] for o in group]
        ends = [o['end'] for o in group]

        new_str = tokenizer.convert_tokens_to_string(tokens)
        if new_str != '':
            strings.append(dict(
                word=new_str,
                start=min(starts),
                end=max(ends),
                entity=group[0]['entity']
            ))
    return strings
config.label2id = {label: i for i, label in enumerate(self.cates.i2c)}
76 changes: 73 additions & 3 deletions forgebox/hf/train.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: nbs/72_pl_training.ipynb (unless otherwise specified).

__all__ = ['ner_model_from', 'ner_tokenizer_from', 'NERDataModule', 'NERModule']
__all__ = ['ner_model_from', 'ner_tokenizer_from', 'NERDataModule', 'NERModule', 'clean_ner_output',
'predict_ner_table', 'refined_ner_pipeline', 'refined_ner_from_pretrained']

# Cell
from .data import IOBES
from ..imports import *
import pytorch_lightning as pl
from transformers import AutoModelForTokenClassification, AutoTokenizer
from transformers import (
AutoModelForTokenClassification,
AutoTokenizer,
pipeline
)
from typing import Callable

# Cell

Expand Down Expand Up @@ -95,4 +101,68 @@ def configure_optimizers(self):
{'params': self.model.classifier.parameters(), 'lr': 1e-3},
]
optimizer = torch.optim.Adam(param_groups, lr=1e-3)
return optimizer
return optimizer

# Cell
def clean_ner_output(self, outputs):
    """
    Cleaning output for NER task

    Groups token-level predictions whose ``index`` values are consecutive
    and joins each group back into one string with
    ``self.tokenizer.convert_tokens_to_string``.

    Args:
        self: object exposing a ``tokenizer`` attribute that has a
            ``convert_tokens_to_string(tokens)`` method. The function is
            written with a ``self`` parameter so that it can be attached
            to a pipeline object as a method.
        outputs: iterable of dicts, each providing the keys ``index``,
            ``word``, ``start``, ``end`` and ``entity``.

    Returns:
        list of dicts with the keys ``word`` (joined string), ``start``
        (smallest start in the group), ``end`` (largest end in the group)
        and ``entity`` (entity of the first token in the group). Groups
        that join to an empty string are dropped.
    """
    # NOTE(review): refined_ner_pipeline defaults aggregation_strategy to
    # "first"; confirm that the pipeline output in that mode still carries
    # the `index` / `entity` keys read below.
    groups = []
    current = []
    last_idx = 0
    # split the predictions into runs of consecutive token positions
    for output in outputs:
        if output["index"] - 1 == last_idx:
            current.append(output)
        else:
            # never store an empty run (happens when the first index != 1),
            # so the tokenizer is not asked to join an empty token list
            if current:
                groups.append(current)
            current = [output]
        last_idx = output["index"]
    if current:
        groups.append(current)

    # from tokens to string
    strings = []
    for group in groups:
        tokens = [o['word'] for o in group]
        starts = [o['start'] for o in group]
        ends = [o['end'] for o in group]

        new_str = self.tokenizer.convert_tokens_to_string(tokens)
        if new_str != '':
            strings.append(dict(
                word=new_str,
                start=min(starts),
                end=max(ends),
                entity=group[0]['entity']
            ))
    return strings

def predict_ner_table(pipeline_kw):
    """
    Build a function, usable as a method, that returns predictions as a table

    Args:
        pipeline_kw: dict of keyword arguments forwarded to every
            ``self(text, **pipeline_kw)`` call made by the returned function.

    Returns:
        a function ``(self, text) -> pd.DataFrame`` that calls ``self`` on
        the text, passes the result through ``self.clean_output`` and wraps
        the cleaned result in a DataFrame.
    """
    def predict_ner_table_(self, text: str) -> pd.DataFrame:
        # raw predictions -> cleaned records -> table
        raw_outputs = self(text, **pipeline_kw)
        cleaned = self.clean_output(raw_outputs)
        return pd.DataFrame(cleaned)

    return predict_ner_table_

def refined_ner_pipeline(model, tokenizer, **pipeline_kw):
    """
    Create a "ner" pipeline extended with ``clean_output`` and ``predict_table``

    Args:
        model: passed to ``pipeline("ner", model=...)``.
        tokenizer: passed to ``pipeline("ner", tokenizer=...)``.
        **pipeline_kw: keyword arguments forwarded to the pipeline on every
            ``predict_table`` call; ``aggregation_strategy`` defaults to
            ``"first"`` when not given.

    Returns:
        the pipeline object, with two extra bound methods:
        ``clean_output(outputs)`` and ``predict_table(text)``.
    """
    from types import MethodType

    # only fill in the default, an explicit caller choice wins
    pipeline_kw.setdefault("aggregation_strategy", "first")

    ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

    # Bind the helpers to this instance rather than to its class: patching
    # the class would overwrite `predict_table` (and the kwargs captured in
    # it) for every other pipeline object of the same class.
    ner_pipeline.clean_output = MethodType(clean_ner_output, ner_pipeline)
    ner_pipeline.predict_table = MethodType(
        predict_ner_table(pipeline_kw), ner_pipeline)
    return ner_pipeline

def refined_ner_from_pretrained(pretrained, **pipeline_kw):
    """
    Load model and tokenizer from ``pretrained`` and build the refined pipeline

    Args:
        pretrained: value handed to both
            ``AutoModelForTokenClassification.from_pretrained`` and
            ``AutoTokenizer.from_pretrained``.
        **pipeline_kw: forwarded unchanged to ``refined_ner_pipeline``.

    Returns:
        whatever ``refined_ner_pipeline`` returns for the loaded pair.
    """
    return refined_ner_pipeline(
        AutoModelForTokenClassification.from_pretrained(pretrained),
        AutoTokenizer.from_pretrained(pretrained),
        **pipeline_kw
    )
Loading

0 comments on commit cac6b26

Please sign in to comment.