Skip to content

Commit

Permalink
📦 NERInference fix
Browse files Browse the repository at this point in the history
  • Loading branch information
raynardj committed Nov 16, 2021
1 parent 425a41f commit 9bb4a79
Show file tree
Hide file tree
Showing 6 changed files with 413 additions and 108 deletions.
63 changes: 16 additions & 47 deletions docs/pl_training.html
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,13 @@

<div class="cell border-box-sizing code_cell rendered">

</div>
{% endraw %}

{% raw %}

<div class="cell border-box-sizing code_cell rendered">

</div>
{% endraw %}

Expand Down Expand Up @@ -71,7 +78,7 @@ <h2 id="Load-model-and-tokenizer">Load model and tokenizer<a class="anchor-link"


<div class="output_markdown rendered_html output_subarea ">
<h4 id="ner_model_from" class="doc_header"><code>ner_model_from</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L20" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>ner_model_from</code>(<strong><code>name</code></strong>:<code>str</code>, <strong><code>dataset</code></strong>:<a href="/forgebox/hf_transformer_data.html#IOBES"><code>IOBES</code></a>)</p>
<h4 id="ner_model_from" class="doc_header"><code>ner_model_from</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L31" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>ner_model_from</code>(<strong><code>name</code></strong>:<code>str</code>, <strong><code>dataset</code></strong>:<a href="/forgebox/hf_transformer_data.html#IOBES"><code>IOBES</code></a>)</p>
</blockquote>
<p>name: from_pretrain(name)</p>

Expand All @@ -96,7 +103,7 @@ <h4 id="ner_model_from" class="doc_header"><code>ner_model_from</code><a href="h


<div class="output_markdown rendered_html output_subarea ">
<h4 id="ner_tokenizer_from" class="doc_header"><code>ner_tokenizer_from</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L33" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>ner_tokenizer_from</code>(<strong><code>name</code></strong>:<code>str</code>)</p>
<h4 id="ner_tokenizer_from" class="doc_header"><code>ner_tokenizer_from</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L44" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>ner_tokenizer_from</code>(<strong><code>name</code></strong>:<code>str</code>)</p>
</blockquote>

</div>
Expand Down Expand Up @@ -133,7 +140,7 @@ <h2 id="Lightning-data-module">Lightning data module<a class="anchor-link" href=


<div class="output_markdown rendered_html output_subarea ">
<h2 id="NERDataModule" class="doc_header"><code>class</code> <code>NERDataModule</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L42" class="source_link" style="float:right">[source]</a></h2><blockquote><p><code>NERDataModule</code>(<strong>*<code>args</code></strong>:<code>Any</code>, <strong>**<code>kwargs</code></strong>:<code>Any</code>) :: <code>LightningDataModule</code></p>
<h2 id="NERDataModule" class="doc_header"><code>class</code> <code>NERDataModule</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L53" class="source_link" style="float:right">[source]</a></h2><blockquote><p><code>NERDataModule</code>(<strong>*<code>args</code></strong>:<code>Any</code>, <strong>**<code>kwargs</code></strong>:<code>Any</code>) :: <code>LightningDataModule</code></p>
</blockquote>
<p>A DataModule standardizes the training, val, test splits, data preparation and transforms.
The main advantage is consistent data splits, data preparation and transforms across models.</p>
Expand Down Expand Up @@ -202,7 +209,7 @@ <h2 id="NERDataModule" class="doc_header"><code>class</code> <code>NERDataModule


<div class="output_markdown rendered_html output_subarea ">
<h2 id="NERModule" class="doc_header"><code>class</code> <code>NERModule</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L58" class="source_link" style="float:right">[source]</a></h2><blockquote><p><code>NERModule</code>(<strong><code>model</code></strong>) :: <code>LightningModule</code></p>
<h2 id="NERModule" class="doc_header"><code>class</code> <code>NERModule</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L69" class="source_link" style="float:right">[source]</a></h2><blockquote><p><code>NERModule</code>(<strong><code>model</code></strong>) :: <code>LightningModule</code></p>
</blockquote>
<p>PyTorch lightning module for training ner model</p>

Expand Down Expand Up @@ -240,7 +247,7 @@ <h2 id="Enhance-pipeline">Enhance pipeline<a class="anchor-link" href="#Enhance-


<div class="output_markdown rendered_html output_subarea ">
<h4 id="clean_ner_output" class="doc_header"><code>clean_ner_output</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L107" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>clean_ner_output</code>(<strong><code>outputs</code></strong>)</p>
<h4 id="clean_ner_output" class="doc_header"><code>clean_ner_output</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L118" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>clean_ner_output</code>(<strong><code>outputs</code></strong>)</p>
</blockquote>
<p>Cleaning output for NER task</p>

Expand All @@ -258,47 +265,6 @@ <h4 id="clean_ner_output" class="doc_header"><code>clean_ner_output</code><a hre

<div class="cell border-box-sizing code_cell rendered">

<div class="output_wrapper">
<div class="output">

<div class="output_area">


<div class="output_markdown rendered_html output_subarea ">
<h4 id="predict_ner_table" class="doc_header"><code>predict_ner_table</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L146" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>predict_ner_table</code>(<strong><code>pipeline_kw</code></strong>)</p>
</blockquote>

</div>

</div>

</div>
</div>

</div>
{% endraw %}

{% raw %}

<div class="cell border-box-sizing code_cell rendered">

<div class="output_wrapper">
<div class="output">

<div class="output_area">


<div class="output_markdown rendered_html output_subarea ">
<h4 id="refined_ner_pipeline" class="doc_header"><code>refined_ner_pipeline</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L155" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>refined_ner_pipeline</code>(<strong><code>model</code></strong>, <strong><code>tokenizer</code></strong>, <strong>**<code>pipeline_kw</code></strong>)</p>
</blockquote>

</div>

</div>

</div>
</div>

</div>
{% endraw %}

Expand All @@ -313,8 +279,11 @@ <h4 id="refined_ner_pipeline" class="doc_header"><code>refined_ner_pipeline</cod


<div class="output_markdown rendered_html output_subarea ">
<h4 id="refined_ner_from_pretrained" class="doc_header"><code>refined_ner_from_pretrained</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L165" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>refined_ner_from_pretrained</code>(<strong><code>pretrained</code></strong>, <strong>**<code>pipeline_kw</code></strong>)</p>
<h2 id="NERInference" class="doc_header"><code>class</code> <code>NERInference</code><a href="https://github.com/raynardj/forgebox/tree/master/forgebox/hf/train.py#L158" class="source_link" style="float:right">[source]</a></h2><blockquote><p><code>NERInference</code>(<strong><code>model</code></strong>, <strong><code>tokenizer</code></strong>, <strong><code>name</code></strong>=<em><code>None</code></em>)</p>
</blockquote>
<p>NER Inference pipeline
ner = NERInference.from_pretrained('xxxx/xxxx')
ner.predict(['text1','text2'])</p>

</div>

Expand Down
2 changes: 1 addition & 1 deletion forgebox/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.4.11"
__version__ = "0.4.17"
4 changes: 1 addition & 3 deletions forgebox/_nbdev.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,9 +114,7 @@
"NERDataModule": "72_pl_training.ipynb",
"NERModule": "72_pl_training.ipynb",
"clean_ner_output": "72_pl_training.ipynb",
"predict_ner_table": "72_pl_training.ipynb",
"refined_ner_pipeline": "72_pl_training.ipynb",
"refined_ner_from_pretrained": "72_pl_training.ipynb",
"NERInference": "72_pl_training.ipynb",
"CudaDevice": "CUDA_GPU_Management.ipynb",
"CudaHandler": "CUDA_GPU_Management.ipynb",
"MLMVisualizer": "bert_visualize.ipynb",
Expand Down
216 changes: 187 additions & 29 deletions forgebox/hf/train.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,29 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: nbs/72_pl_training.ipynb (unless otherwise specified).

__all__ = ['ner_model_from', 'ner_tokenizer_from', 'NERDataModule', 'NERModule', 'clean_ner_output',
'predict_ner_table', 'refined_ner_pipeline', 'refined_ner_from_pretrained']
__all__ = ['ner_model_from', 'ner_tokenizer_from', 'NERDataModule', 'NERModule', 'clean_ner_output', 'NERInference']

# Cell
from .data import IOBES
from ..imports import *
from ..loop import chunkify
import pytorch_lightning as pl
from transformers import (
AutoModelForTokenClassification,
AutoTokenizer,
pipeline
)
from typing import Callable
from tqdm.notebook import tqdm
from typing import Callable, List
from torch import device

# Cell
# Detect an IPython/Jupyter shell: `get_ipython` is only defined as a builtin
# when running under IPython, so a NameError means a plain interpreter.
try:
    ishell = get_ipython()
except NameError:
    # plain Python process: use the console progress bar
    IS_JUPYTER = False
    from tqdm import tqdm
else:
    # IPython shell found: use the notebook (widget) progress bar
    IS_JUPYTER = True
    from tqdm.notebook import tqdm

# Cell

Expand Down Expand Up @@ -113,12 +124,12 @@ def clean_ner_output(self, outputs):
last_idx = 0
# make to sub group by position
for output in outputs:
if output["index"]-1 == last_idx:
if output["start"] in [last_idx, last_idx-1]:
current.append(output)
else:
results.append(current)
current = [output, ]
last_idx = output["index"]
last_idx = output["end"]
if len(current) > 0:
results.append(current)

Expand All @@ -139,30 +150,177 @@ def clean_ner_output(self, outputs):
word=new_str,
start=min(starts),
end=max(ends),
entity=c[0]['entity']
entity=c[0]['entity_group']
))
return strings

def predict_ner_table(pipeline_kw):
    """
    Build a `predict_table` method for an NER pipeline object.

    pipeline_kw: keyword arguments forwarded to the pipeline on every call.

    The returned function takes the pipeline as `self` plus a text, calls
    the pipeline on the text, passes the raw output through
    `self.clean_output` and wraps the cleaned result in a DataFrame.
    """
    def predict_ner_table_(self, text: str) -> pd.DataFrame:
        raw_outputs = self(text, **pipeline_kw)
        cleaned = self.clean_output(raw_outputs)
        return pd.DataFrame(cleaned)

    return predict_ner_table_

def refined_ner_pipeline(model, tokenizer, **pipeline_kw):
    """
    Create a transformers "ner" pipeline extended with two helper methods.

    model: token classification model handed to `pipeline`
    tokenizer: tokenizer handed to `pipeline`
    pipeline_kw: keyword arguments forwarded to the pipeline each time
        `predict_table` is called; `aggregation_strategy` defaults
        to "first" when not supplied.

    Returns the pipeline object with
        - `clean_output(outputs)`: see `clean_ner_output`
        - `predict_table(text)`: cleaned prediction as a DataFrame
    """
    # local import keeps this block self-contained
    from types import MethodType

    pipeline_kw.setdefault("aggregation_strategy", "first")

    ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

    # Bind the helpers to THIS pipeline object instead of patching its class:
    # patching the class made every later call overwrite `predict_table`
    # (and the `pipeline_kw` it captured) for all pipelines already created.
    ner_pipeline.clean_output = MethodType(clean_ner_output, ner_pipeline)
    ner_pipeline.predict_table = MethodType(
        predict_ner_table(pipeline_kw), ner_pipeline)
    return ner_pipeline

def refined_ner_from_pretrained(pretrained, **pipeline_kw):
    """
    Load model and tokenizer from the same pretrained tag and build the
    refined NER pipeline from them.

    pretrained: name or path passed to both `from_pretrained` calls
    pipeline_kw: forwarded unchanged to `refined_ner_pipeline`
    """
    return refined_ner_pipeline(
        AutoModelForTokenClassification.from_pretrained(pretrained),
        AutoTokenizer.from_pretrained(pretrained),
        **pipeline_kw
    )
# Cell
class NERInference:
    """
    NER Inference pipeline
    ner = NERInference.from_pretrained('xxxx/xxxx')
    ner.predict(['text1','text2'])

    Predictions are returned as DataFrames with one row per detected
    entity: `start` / `end` character offsets into the text, the `entity`
    label, the covered `word`, the token `index` and `batch_row_sn`
    (position of the text inside its batch). `predict` adds `text_id`.
    """

    # column layout of the frames returned by `batch_predict`,
    # also used to build a well-formed frame when nothing was detected
    _BATCH_COLUMNS = ["start", "end", "entity", "word", "index", "batch_row_sn"]

    def __init__(self, model, tokenizer, name=None):
        super().__init__()
        self.model = model.eval()
        self.tokenizer = tokenizer
        self.name = name if name else "NER model"

    def __repr__(self):
        return f"[NERInference on {self.name}]"

    def to(self, device_str):
        """
        Move the model to the device described by `device_str`, return self
        """
        self.model = self.model.to(device(device_str))
        return self

    @classmethod
    def from_pretrained(cls, tag):
        """
        Load from pretrained model and tokenizer
        """
        model = AutoModelForTokenClassification.from_pretrained(tag)
        tokenizer = AutoTokenizer.from_pretrained(tag)
        return cls(model=model, tokenizer=tokenizer, name=model.config._name_or_path)

    def __call__(self, data, batch_size=32, dev=device("cpu")):
        """
        Predict on a single string (one batch, on the model's current
        device) or on a list of strings (through `predict`)
        """
        if isinstance(data, str):
            return self.batch_predict([data, ])
        return self.predict(data, dev=dev, batch_size=batch_size)

    def predict(
        self,
        texts: List[str],
        dev=device("cpu"),
        batch_size: int = 32,
        progress_bar: bool = True
    ) -> pd.DataFrame:
        """
        Predict a list of sentences/ paragraphs

        texts: list of input strings
        dev: device the model runs on during prediction
        batch_size: number of texts per batch
        progress_bar: wrap the batch loop in tqdm

        Returns one row per entity; `text_id` is computed as
        batch number * batch_size + batch_row_sn.
        An empty frame with the usual columns is returned when there is
        nothing to report.
        """
        # place the model into device
        self.model = self.model.to(dev)
        all_dfs = []
        try:
            iterator = list(enumerate(chunkify(texts, bs=batch_size)))
            if progress_bar:
                iterator = tqdm(iterator, leave=False)

            # run through iterator
            for i, text_b in iterator:
                # by batch prediction
                batch_df = self.batch_predict(text_b)
                if len(batch_df) > 0:
                    # calculate the row number
                    batch_df['text_id'] = i * batch_size + batch_df['batch_row_sn']
                    all_dfs.append(batch_df)
        finally:
            # place the model back to cpu, even if a batch failed
            self.model = self.model.to("cpu")

        # pd.concat raises ValueError on an empty list
        if not all_dfs:
            return pd.DataFrame(columns=[*self._BATCH_COLUMNS, "text_id"])
        return pd.concat(all_dfs).reset_index(drop=True)

    def tokenizing(self, texts):
        """
        Tokenize texts into padded/truncated tensors (with offset mapping)
        placed on the model's device
        """
        inputs = self.tokenizer(
            texts,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            return_attention_mask=True,
            return_tensors='pt', truncation=True, return_offsets_mapping=True
        ).to(self.model.device)
        return inputs

    @staticmethod
    def _entity_record(text, start, end, label, index):
        """Build the output row for one entity span"""
        return {
            "start": start,
            "end": end,
            "entity": label,
            "word": text[start:end],
            "index": index,
        }

    def _decode_row(self, text, input_ids, word_ids, pred_ids, offsets, id2label):
        """
        Turn the token level predictions of one text into entity records

        All arguments are plain python lists for a single text. Only the
        first token of each word is looked at, tokens that continue the
        previous word are skipped. Label id 0 is treated as "no entity".
        """
        sep_token_id = self.tokenizer.sep_token_id
        records = []
        # -1 never equals a word id, so the first word is not mistaken for
        # a continuation of a previous word
        last_word = -1
        in_entity = False
        current_start = 0
        current_index = 0
        current_id = 0
        # end offset of the most recent real token: special tokens carry
        # no usable offsets, so spans are closed with this value
        prev_end = 0
        # position 0 is skipped (presumably the leading special token)
        for ti in range(1, len(input_ids)):
            if input_ids[ti] == sep_token_id:
                break
            token_start, token_end = offsets[ti]
            label_id = pred_ids[ti]
            # is the current token an appending sub-word?
            if word_ids[ti] == last_word:
                pass
            # is current token negative
            elif label_id == 0:
                # store the previous hanging prediction
                if in_entity:
                    records.append(self._entity_record(
                        text, current_start, prev_end,
                        id2label[current_id], current_index))
                current_start = token_start
                in_entity = False
                current_id = 0
                current_index = ti
            # has positive prediction index, other than zero
            else:
                if in_entity and label_id != current_id:
                    # label changed: close the previous span, open a new one
                    records.append(self._entity_record(
                        text, current_start, prev_end,
                        id2label[current_id], current_index))
                    current_start = token_start
                elif not in_entity:
                    # this is the 1st positive predict for a while
                    current_start = token_start
                in_entity = True
                current_index = ti
                current_id = label_id

            last_word = word_ids[ti]
            prev_end = token_end

        # entity still open when the text ended
        if in_entity:
            records.append(self._entity_record(
                text, current_start, prev_end,
                id2label[current_id], current_index))
        return records

    def batch_predict(self, texts: List[str]) -> pd.DataFrame:
        """
        Predict a single batch of sentences

        Returns one row per entity with the columns in `_BATCH_COLUMNS`;
        an empty frame with those columns when `texts` is empty or no
        entity was predicted.
        """
        if len(texts) == 0:
            return pd.DataFrame(columns=self._BATCH_COLUMNS)

        id2label = self.model.config.id2label
        inputs = self.tokenizing(texts)

        with torch.no_grad():
            outputs = self.model(input_ids=inputs.input_ids,
                                 attention_mask=inputs.attention_mask)
        inputs = inputs.to(device('cpu'))

        # plain lists are cheaper to index token by token than tensors
        pred_idx = outputs.logits.argmax(-1).to(device("cpu")).tolist()
        all_input_ids = inputs.input_ids.tolist()
        all_offsets = inputs.offset_mapping.tolist()

        all_dfs = []
        for bi, text in enumerate(texts):
            records = self._decode_row(
                text,
                all_input_ids[bi],
                inputs.word_ids(bi),
                pred_idx[bi],
                all_offsets[bi],
                id2label,
            )
            # texts without entities contribute no rows
            if not records:
                continue
            sub_df = pd.DataFrame(records)
            sub_df["batch_row_sn"] = bi
            all_dfs.append(sub_df)

        # pd.concat raises ValueError on an empty list
        if not all_dfs:
            return pd.DataFrame(columns=self._BATCH_COLUMNS)
        return pd.concat(all_dfs)
Loading

0 comments on commit 9bb4a79

Please sign in to comment.