Skip to content

Commit

Permalink
⚡️ preset tag
Browse files Browse the repository at this point in the history
  • Loading branch information
raynardj committed Sep 1, 2021
1 parent 9f9e8bc commit 0b3b64e
Show file tree
Hide file tree
Showing 9 changed files with 408 additions and 35 deletions.
20 changes: 20 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,26 @@ app = ClassifyTask.from_df(
cross_verify_num=2,)
```

#### Preset the tagging
You can use a column in the dataframe, e.g. one called ```guessed_tags```, to preset the tagging result.

Each cell should contain a tagging result in the following format, e.g.
```json
{"tags":[
{"text": "Genomicare Bio Tech", "offset":32, "label":"company"},
{"text": "East China University of Politic Science & Law", "offset":96, "label":"company"}
]}
```

Then you can run the app with the preset tag column:
```python
app = NERTask.from_df(
df, text_col="description",
options=["institution", "company", "name"],
preset_tag_col="guessed_tags")
app.run("0.0.0.0", port=5000)
```

#### Order strategy
The order in which texts get tagged is determined by `order_strategy`.

Expand Down
4 changes: 3 additions & 1 deletion docs/loader/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,9 @@ len(train_ds), len(val_ds)


```python
x, y = data_ds.one_batch(5)
batch = data_ds.one_batch(5)
x = batch['input_ids']
y = batch['targets']
```


Expand Down
2 changes: 1 addition & 1 deletion langhuan/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def __init__(
dict(options=self.option_vals, idx=self.df_idx))

self.option_col = self.df["options"]
self.df.set_index("idx")
self.df = self.df.set_index("idx")

def __len__(self):
    """Return ``len(self.df)``, the size of the frame held by this object."""
    row_count = len(self.df)
    return row_count

Expand Down
18 changes: 17 additions & 1 deletion langhuan/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def from_df(
task_name: str = None,
options: List[str] = None,
load_history: bool = False,
preset_tag_col: str = None,
save_frequency: int = 42,
order_strategy: Union[str, Callable] = "forward_march",
order_by_column: str = None,
Expand All @@ -42,6 +43,8 @@ def from_df(
you don't even have to decide this now, you can input
None and configure it on /admin page later
- load_history: bool = False, load the saved history if True
- preset_tag_col: str = None, a column name
that contains preset tags
- task_name: str, name of your task, if not provided
- order_strategy: Union[str, Callable] = "forward_march",
a function defining how progress move one by one. As a
Expand Down Expand Up @@ -90,7 +93,7 @@ def from_df(
app.config['TEMPLATES_AUTO_RELOAD'] = True
HyperFlask(app)

app.register(df, text_col, Options(df, options))
app.register(df, text_col, preset_tag_col, Options(df, options))
app.create_progress(
order_strategy,
order_by_column,
Expand Down Expand Up @@ -219,9 +222,16 @@ def raw_data():

rt = dict(idx=idx, index=index, text=text, options=list(options))

# Scenario 1, when we have tagged data in progress
if user_id in self.progress.depth[index].keys():
rt.update({"record": self.progress.depth[index][user_id]})

# Scenario 2, when we have preset tag defined in dataframe
elif self.preset_tag_col is not None:
preset_tag = self.df.loc[idx, self.preset_tag_col]
if preset_tag is not None:
rt.update({"record": preset_tag})

return jsonify(rt)

@self.route("/tagging", methods=["POST"])
Expand Down Expand Up @@ -357,10 +367,16 @@ def register(
self,
df: pd.DataFrame,
text_col: Union[List[str], str],
preset_tag_col: str,
options: List[str],
) -> None:
"""
Register properties to the class
"""
self.df = df
self.text_col = text_col
self.preset_tag_col = preset_tag_col

self.options = options
self.register_functions()

Expand Down
47 changes: 28 additions & 19 deletions langhuan/templates/ner.html
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
{{super()}}
<h4 class="m-3 bg-warning p-2 text-white text-center rounded" title="Named Entity Recognition">
<i class="fas fa-dove"></i>
NER task</h4>
NER task
</h4>
{%endblock%}

{%block controls%}
Expand Down Expand Up @@ -54,7 +55,7 @@ <h5 class="mt-2 mb-2">
const create_option_list = (options) => {
var option_list = [];
var option_to_data = {}
var color_ct=0;
var color_ct = 0;
for (var i in options) {
if (color_ct >= flavors.length) {
color_ct = 0
Expand All @@ -80,18 +81,24 @@ <h5 class="mt-2 mb-2">
this.node.title = d.label;
var node = this.node

/*
Click the selected to remove tag
*/
this.node.addEventListener("click", () => {
node.after(node.innerText)
node.remove()
})
}

get_node(){
get_node() {
return this.node
}
}

const calc_label = () => {
/*
From the DOM, get the tagged data
*/
var nodes = document.querySelector("#raw").childNodes

var tags = [];
Expand Down Expand Up @@ -119,7 +126,6 @@ <h5 class="mt-2 mb-2">
tagging(data)
}


document.querySelector("#next").addEventListener("click", next_btn)
document.querySelector("#skip").addEventListener("click", skipping)

Expand Down Expand Up @@ -155,24 +161,27 @@ <h5 class="mt-2 mb-2">
const get_history = async () => {
var user_id = get_user_id()
fetch(`/personal_history?user_id=${user_id}`)
.then(res=>res.json())
.then(data=>{
d3.select("#histories")
.selectAll(".history_entry")
.data(data)
.enter()
.append("div")
.attr("class",
"history_entry m-3 pt-1 pb-1 pl-3 pr-3 border border-primary rounded-pill border-3 border-top-0 border-bottom-0")
.text(d=>{return `${d.time.substring(9,17)} (${d.tags}x🏷)`})
.on("click", (e, data) => {
window.location = `/?index=${data.index}`
.then(res => res.json())
.then(data => {
d3.select("#histories")
.selectAll(".history_entry")
.data(data)
.enter()
.append("div")
.attr("class",
"history_entry m-3 pt-1 pb-1 pl-3 pr-3 border border-primary rounded-pill border-3 border-top-0 border-bottom-0")
.text(d => { return `${d.time.substring(9, 17)} (${d.tags}x🏷)` })
.on("click", (e, data) => {
window.location = `/?index=${data.index}`
})
})
})
.catch(console.error)
.catch(console.error)
}

const visualize_options = (options) => {
/*
Visualize the options as buttons
*/
d3.select("#label_pool").selectAll("button")
.data(options)
.enter().append("button")
Expand Down Expand Up @@ -214,7 +223,7 @@ <h5 class="mt-2 mb-2">
subs.push({ text: text.substring(ct, text.length) })
// console.log(subs)

raw.innerHTML=""
raw.innerHTML = ""

for (var i in subs) {
var sub = subs[i];
Expand Down
10 changes: 10 additions & 0 deletions langhuan/utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import Union
import json
from pathlib import Path
import regex


def now_str():
    """Return the current time from ``datetime.now()`` as ``yymmdd_HHMMSS``."""
    current = datetime.now()
    return current.strftime("%y%m%d_%H%M%S")
Expand Down Expand Up @@ -38,3 +39,12 @@ def arg_by_key(key: str) -> Union[str, int, float]:

def get_root() -> Path:
    """Return the absolute path of the directory containing this module."""
    module_dir = Path(__file__).parent
    return module_dir.absolute()


def findall_word_position(text: str, word: str) -> list:
    """
    Find every position of word in text, ignoring case

    - text: str, the text to search in
    - word: str, the literal string to look for; it is escaped before
        matching, so characters such as ".", "+" or "(" carry no
        regular-expression meaning
    - return: list of int, the start offset (into the original text) of
        each non-overlapping occurrence, in ascending order; an empty
        list when word is empty or never occurs
    """
    # Local import: a literal, case-insensitive search only needs the
    # standard library
    import re

    # An empty pattern would "match" at every offset, which is never a
    # meaningful position for a word
    if not word:
        return []

    # Search the original text case-insensitively instead of lower-casing
    # it first: str.lower() can change the length of a string
    # (e.g. "\u0130" becomes two code points), which would shift every
    # later offset away from its position in the original text
    pattern = re.compile(re.escape(word), re.IGNORECASE)
    return [match.start() for match in pattern.finditer(text)]
2 changes: 1 addition & 1 deletion settings.ini
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ keywords = python pandas label data science
author = xiaochen(ray) zhang
author_email = [email protected]
branch = main
version = 0.0.18
version = 0.1.0
min_python = 3.6
audience = Developers
language = English
Expand Down
92 changes: 92 additions & 0 deletions tests/loader.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,98 @@
"len(train_ds), len(val_ds)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'index': 0,\n",
" 'now': '21-02-01_15:39:20',\n",
" 'pandas': 0,\n",
" 'remote_addr': '127.0.0.1',\n",
" 'tags': [{'label': 'school',\n",
" 'offset': 122,\n",
" 'text': 'University of Maryland'},\n",
" {'label': 'company', 'offset': 346, 'text': 'Bricklin'}],\n",
" 'user_id': '4de71c07fa'},\n",
" {'index': 1,\n",
" 'now': '21-02-01_15:38:29',\n",
" 'pandas': 1,\n",
" 'remote_addr': '127.0.0.1',\n",
" 'tags': [{'label': 'school',\n",
" 'offset': 213,\n",
" 'text': 'University of Washington'},\n",
" {'label': 'company', 'offset': 340, 'text': 'SI'}],\n",
" 'user_id': '4de71c07fa'},\n",
" {'index': 2,\n",
" 'now': '21-02-01_15:39:03',\n",
" 'pandas': 2,\n",
" 'remote_addr': '127.0.0.1',\n",
" 'tags': [{'label': 'school', 'offset': 89, 'text': 'Purdue University'},\n",
" {'label': 'company', 'offset': 107, 'text': 'Engineering Computer Network'},\n",
" {'label': 'company',\n",
" 'offset': 1795,\n",
" 'text': 'Purdue Electrical Engineering'}],\n",
" 'user_id': '4de71c07fa'},\n",
" {'index': 3,\n",
" 'now': '21-02-01_15:39:11',\n",
" 'pandas': 3,\n",
" 'remote_addr': '127.0.0.1',\n",
" 'tags': [{'label': 'company',\n",
" 'offset': 73,\n",
" 'text': 'Harris Computer Systems Division'},\n",
" {'label': 'company', 'offset': 645, 'text': 'Harris Corporation'}],\n",
" 'user_id': '4de71c07fa'},\n",
" {'index': 4,\n",
" 'now': '21-02-01_15:40:03',\n",
" 'pandas': 4,\n",
" 'remote_addr': '127.0.0.1',\n",
" 'tags': [{'label': 'school',\n",
" 'offset': 102,\n",
" 'text': 'Smithsonian Astrophysical Observatory'}],\n",
" 'user_id': '4de71c07fa'},\n",
" {'index': 5,\n",
" 'now': '21-02-01_15:40:38',\n",
" 'pandas': 5,\n",
" 'remote_addr': '127.0.0.1',\n",
" 'tags': [],\n",
" 'user_id': '4de71c07fa'},\n",
" {'index': 6,\n",
" 'now': '21-02-01_15:40:55',\n",
" 'pandas': 6,\n",
" 'remote_addr': '127.0.0.1',\n",
" 'tags': [{'label': 'school',\n",
" 'offset': 151,\n",
" 'text': 'University of Chicago'}],\n",
" 'user_id': '4de71c07fa'},\n",
" {'index': 7,\n",
" 'now': '21-02-01_15:43:01',\n",
" 'pandas': 7,\n",
" 'remote_addr': '127.0.0.1',\n",
" 'tags': [{'label': 'school',\n",
" 'offset': 75,\n",
" 'text': 'New Mexico State University'},\n",
" {'label': 'company', 'offset': 1936, 'text': 'IBM'},\n",
" {'label': 'company', 'offset': 2151, 'text': 'IBM'},\n",
" {'label': 'company', 'offset': 2238, 'text': 'Quadra'},\n",
" {'label': 'company', 'offset': 2266, 'text': 'Apple'},\n",
" {'label': 'company', 'offset': 2753, 'text': 'Quadra'},\n",
" {'label': 'company', 'offset': 2790, 'text': 'Digital Review'}],\n",
" 'user_id': '4de71c07fa'}]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_ds.labels"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down
Loading

0 comments on commit 0b3b64e

Please sign in to comment.