From 9088b1ae73efa75939622d2b101700a12cb96e67 Mon Sep 17 00:00:00 2001 From: raynardj Date: Thu, 11 Feb 2021 16:22:25 +0800 Subject: [PATCH] =?UTF-8?q?=F0=9F=92=8C=20easier=20and=20faster=20dispatch?= =?UTF-8?q?er?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- langhuan/progress.py | 109 +++++++++++------------------------- langhuan/tasks.py | 2 + langhuan/version.py | 2 +- ner_test.ipynb | 99 ++------------------------------ settings.ini | 2 +- tests/dispatcher_test.ipynb | 91 ++++++++++++++++++++++++++---- 6 files changed, 124 insertions(+), 181 deletions(-) diff --git a/langhuan/progress.py b/langhuan/progress.py index 1b692bd..4997866 100644 --- a/langhuan/progress.py +++ b/langhuan/progress.py @@ -1,89 +1,47 @@ from typing import List, Callable, Union import logging + class Dispatcher: def __init__(self, n, v): self.n = n self.v = v - self.sent = -1 - self.started = dict() - self.by_user = dict() - self.busy_by_user = dict() - - def new_user_todo(self, user_id): - return {user_id: list(range(self.n))[max(0, self.sent):]} - - def user_progress(self, user_id): - try: - user_progress = self.by_user[user_id] - except KeyError: - self.by_user.update(self.new_user_todo(user_id)) - user_progress = self.by_user[user_id] - return user_progress + self.cache_data = dict() + self.new = list(range(n)) + self.processing = dict() def __repr__(self): - return str(self.by_user)+"\n"+str(self.sent) + return f"Job Dispatcher: n:{self.n},v:{self.v}" def __getitem__(self, user_id): - """ - get index by user_id - """ - if user_id in self.busy_by_user: - # read cache - return self.busy_by_user[user_id] - - self.user_clear_progress(user_id) - user = self.user_progress(user_id) - try: - index = user[0] - self.after_get_update(user_id, index) - - except IndexError: - return -1 - return index - - def after_get_update(self, user_id, index): - # save cache - self.busy_by_user[user_id] = index - user = self.user_progress(user_id) - if index in user: - user.remove(index) - if self.started.get(index) is None: - self.started[index] = [user_id, ] + if user_id in self.cache_data: + return self.cache_data[user_id] else: - self.started[index].append(user_id) - if len(self.started[index]) >= self.v: - self.tick_sent(index) - - def finish_update( - self, - user_id: str, - index: int, - callbacks: List[Callable] = [] - ): - """ - callbacks allow for furthur manuvers - each callback function can process: - callback(user_id, index) - """ - # delete cache - if self.busy_by_user.get(user_id) == index: - del self.busy_by_user[user_id] - for callback in callbacks: - callback(user_id, index) - - def user_clear_progress(self, user_id): - user_progress = self.user_progress(user_id) - - new_progress = [] - for i in user_progress: - if i > self.sent: - new_progress.append(i) - self.by_user[user_id] = new_progress + for k, v in self.processing.items(): + if len(self.processing[k]) >= self.v: + continue + if user_id in v: + continue + else: + v.append(user_id) + self.cache_data[user_id] = k + return k + + # read_new + if len(self.new) > 0: + item = self.new[0] + self.processing[item] = [] + self.new.remove(item) + return self[user_id] + else: + return -1 - def tick_sent(self, index): - self.sent = index - del self.started[index] + def finish_update(self, user_id, index): + if user_id in self.cache_data: + del self.cache_data[user_id] + if index in self.processing: + if len(self.processing[index]) >= self.v: + del self.processing[index] class Progress: @@ -129,11 +87,12 @@ def recover_history(self, data): Recover single entry of history data to the dispatcher """ - self.tagging(data) user_id = data["user_id"] pandas = data["pandas"] + + # using pandas to map to new index index = self.idx_to_index[pandas] - self.dispatcher.after_get_update(user_id=user_id, index=index) + self.tagging(data) self.dispatcher.finish_update(user_id=user_id, index=index) def tagging(self, data): diff --git a/langhuan/tasks.py b/langhuan/tasks.py index 9507f9b..4193f22 100644 --- a/langhuan/tasks.py +++ b/langhuan/tasks.py @@ -97,10 +97,12 @@ def from_df( cross_verify_num=cross_verify_num) if load_history: + logging.info(f"start loading history") # loading the history to progress if len(app.task_history.history) > 0: for data in app.task_history.history: app.progress.recover_history(data) + logging.info(f"history loaded") return app def create_progress( diff --git a/langhuan/version.py b/langhuan/version.py index 9b36b86..b2f0155 100644 --- a/langhuan/version.py +++ b/langhuan/version.py @@ -1 +1 @@ -__version__ = "0.0.10" +__version__ = "0.0.11" diff --git a/ner_test.ipynb b/ner_test.ipynb index 7b3f938..3dcf034 100644 --- a/ner_test.ipynb +++ b/ner_test.ipynb @@ -65,23 +65,23 @@ " \n", " \n", " \n", - " 0\n", + " 0\n", " From: lerxst@wam.umd.edu (where's my thing)\\nS...\n", " \n", " \n", - " 1\n", + " 1\n", " From: guykuo@carson.u.washington.edu (Guy Kuo)...\n", " \n", " \n", - " 2\n", + " 2\n", " From: twillis@ec.ecn.purdue.edu (Thomas E Will...\n", " \n", " \n", - " 3\n", + " 3\n", " From: jgreen@amber (Joe Green)\\nSubject: Re: W...\n", " \n", " \n", - " 4\n", + " 4\n", " From: jcm@head-cfa.harvard.edu (Jonathan McDow...\n", " \n", " \n", @@ -130,95 +130,6 @@ "app.run(\"0.0.0.0\", port=5000)" ] }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'Corporation\\njgreen@csd.harris.com\\t\\t\\tComputer Systems Division\\n\"The only thing that really scares me is a person with no sense of humor.\"\\n\\t\\t\\t\\t\\t\\t-- Jonathan Winters\\n'" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.loc[3][\"text\"][652:]" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "from langhuan.utility import cleanup_tags" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "3" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(\"˃˃˃\")" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "3" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(\"˂˂˂\")" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'Harris Corporation\\njgreen@csd.harris.com\\t\\t\\tComputer Systems Division\\n\"The only thing that really scares me is a person with no sense of humor.\"\\n\\t\\t\\t\\t\\t\\t-- Jonathan Winters\\n'" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cleanup_tags(df.loc[3][\"text\"])[652:]" - ] - }, { "cell_type": "markdown", "metadata": {}, diff --git a/settings.ini b/settings.ini index b99db52..c6edc03 100644 --- a/settings.ini +++ b/settings.ini @@ -6,7 +6,7 @@ keywords = python pandas label data science author = xiaochen(ray) zhang author_email = b2ray2c@gmail.com branch = main -version = 0.0.10 +version = 0.0.11 min_python = 3.6 audience = Developers language = English diff --git a/tests/dispatcher_test.ipynb b/tests/dispatcher_test.ipynb index b8a6233..24475d8 100644 --- a/tests/dispatcher_test.ipynb +++ b/tests/dispatcher_test.ipynb @@ -15,23 +15,80 @@ "metadata": {}, "outputs": [], "source": [ - "logger = logging.getLogger()\n", - "logger.setLevel(logging.DEBUG)" + "import unittest" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "class Dispatcher:\n", + " def __init__(self, n, v):\n", + " self.n = n\n", + " self.v = v\n", + " self.cache_data = dict()\n", + " self.new = list(range(n))\n", + " self.processing = dict()\n", + " \n", + " def __repr__(self):\n", + " return f\"Job Dispatcher: n:{self.n},v{self.v}\"\n", + " \n", + " def __getitem__(self, user_id):\n", + " if user_id in self.cache_data:\n", + " return self.cache_data[user_id]\n", + " else:\n", + " for k, v in self.processing.items():\n", + " if len(self.processing[k]) >= self.v:\n", + " continue\n", + " if user_id in v:\n", + " continue\n", + " else:\n", + " v.append(user_id)\n", + " self.cache_data[user_id] = k\n", + " return k\n", + "\n", + " # read_new\n", + " if len(self.new)>0:\n", + " item = self.new[0]\n", + " self.processing[item] = []\n", + " self.new.remove(item)\n", + " return self[user_id]\n", + " else:\n", + " return -1\n", + " \n", + " def finish_update(self, user_id, index):\n", + " if user_id in self.cache_data:\n", + " del self.cache_data[user_id]\n", + " if index in self.processing:\n", + " if len(self.processing[index]) >= self.v:\n", + " del self.processing[index]\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "# from langhuan.progress import Dispatcher" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 51, "metadata": {}, "outputs": [], "source": [ - "import unittest\n", - "from langhuan.progress import Dispatcher" + "logger = logging.getLogger()\n", + "logger.setLevel(logging.DEBUG)" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 52, "metadata": {}, "outputs": [], "source": [ @@ -87,7 +144,7 @@ " dispatcher.finish_update(2, index=i2)\n", " if i % 3 != 2:\n", " dispatcher.finish_update(3, index=i3)\n", - "\n", + " print(result)\n", " expected = {\n", " 'step_0': [0, 0, 1],\n", " 'step_1': [0, 1, 2],\n", @@ -107,16 +164,30 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 53, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "...\n", + "..." + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'step_0': [0, 0, 1], 'step_1': [0, 1, 2], 'step_2': [2, 1, 3], 'step_3': [3, 4, 3], 'step_4': [3, 5, 4], 'step_5': [5, 5, 6], 'step_6': [6, 7, 6], 'step_7': [6, 8, 7], 'step_8': [8, 8, 9], 'step_9': [9, -1, 9], 'step_10': [9, -1, -1], 'step_11': [-1, -1, -1]}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", "----------------------------------------------------------------------\n", - "Ran 3 tests in 0.004s\n", + "Ran 3 tests in 0.008s\n", "\n", "OK\n" ]