From eadad438b7866ebdbe5a3cb935f0e326ed956127 Mon Sep 17 00:00:00 2001 From: Eric Ma Date: Wed, 30 Aug 2017 13:47:34 -0400 Subject: [PATCH 1/4] Added optional parameter to API to allow for preservation of certain characters. --- .gitignore | 2 ++ hanziconv/charmap.py | 3 +++ hanziconv/hanziconv.py | 50 ++++++++++++++++++++++++++-------------- hanziconv/specialnoun.py | 1 - requirements.txt | 3 +++ 5 files changed, 41 insertions(+), 18 deletions(-) diff --git a/.gitignore b/.gitignore index e773429..dec8bef 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,5 @@ docs/_build hanziconv.egg-info htmlcov MANIFEST + +.cache/v/cache/lastfailed diff --git a/hanziconv/charmap.py b/hanziconv/charmap.py index 0262c85..7c535c4 100644 --- a/hanziconv/charmap.py +++ b/hanziconv/charmap.py @@ -168,3 +168,6 @@ simplified_charmap = cuhk_simplified + extra_simplified traditional_charmap = cuhk_traditional + extra_traditional + +simp_to_trad = {s: t for s, t in zip(simplified_charmap, traditional_charmap)} +trad_to_simp = {t: s for s, t in zip(simplified_charmap, traditional_charmap)} diff --git a/hanziconv/hanziconv.py b/hanziconv/hanziconv.py index 7cd3983..9f4a0db 100644 --- a/hanziconv/hanziconv.py +++ b/hanziconv/hanziconv.py @@ -29,7 +29,10 @@ class by doing: """ import os -from .charmap import simplified_charmap, traditional_charmap +from .charmap import (simplified_charmap, + traditional_charmap, + simp_to_trad, + trad_to_simp) class HanziConv(object): @@ -39,7 +42,7 @@ class HanziConv(object): __simplified_charmap = simplified_charmap @classmethod - def __convert(cls, text, toTraditional=True): + def __convert(cls, text, toTraditional=True, preserve=None): """Convert `text` to Traditional characters if `toTraditional` is True, else convert to simplified characters @@ -51,23 +54,31 @@ def __convert(cls, text, toTraditional=True): if isinstance(text, bytes): text = text.decode('utf-8') - fromMap = cls.__simplified_charmap - toMap = cls.__traditional_charmap + # fromMap = cls.__simplified_charmap + # toMap = cls.__traditional_charmap + # if not toTraditional: + # fromMap = cls.__traditional_charmap + # toMap = cls.__simplified_charmap + + mapper = simp_to_trad if not toTraditional: - fromMap = cls.__traditional_charmap - toMap = cls.__simplified_charmap + mapper = trad_to_simp + + if preserve: + assert isinstance(preserve, str), \ + 'Preserve should be a string of characters' + mapper.update({c: c for c in preserve}) final = [] for c in text: - index = fromMap.find(c) - if index != -1: - final.append(toMap[index]) + if c in mapper.keys(): + final.append(mapper[c]) else: final.append(c) return ''.join(final) @classmethod - def toSimplified(cls, text): + def toSimplified(cls, text, preserve=None): """Convert `text` to simplified character string. Assuming text is traditional character string @@ -78,10 +89,10 @@ def toSimplified(cls, text): >>> print(HanziConv.toSimplified('繁簡轉換器')) 繁简转换器 """ - return cls.__convert(text, toTraditional=False) + return cls.__convert(text, toTraditional=False, preserve=preserve) @classmethod - def toTraditional(cls, text): + def toTraditional(cls, text, preserve=None): """Convert `text` to traditional character string. Assuming text is simplified character string @@ -91,25 +102,30 @@ def toTraditional(cls, text): >>> from hanziconv import HanziConv >>> print(HanziConv.toTraditional('繁简转换器')) 繁簡轉換器 + >>> print(HanziConv.toTraditional('祢是我的荣耀', preserve='祢')) + 祢是我的榮耀 """ - return cls.__convert(text, toTraditional=True) + return cls.__convert(text, toTraditional=True, preserve=preserve) @classmethod - def same(cls, text1, text2): + def same(cls, text1, text2, preserve=None): """Return True if text1 and text2 meant literally the same, False otherwise :param text1: string to compare to ``text2`` :param text2: string to compare to ``text1`` - :returns: **True** -- ``text1`` and ``text2`` are the same in meaning, + :returns: **True** -- ``text1`` and ``text2`` are the same in \ + meaning, **False** -- otherwise >>> from hanziconv import HanziConv >>> print(HanziConv.same('繁简转换器', '繁簡轉換器')) True + >>> print(HanziConv.same('祢是我的荣耀', '祢是我的榮耀', preserve='祢')) + True """ - t1 = cls.toSimplified(text1) - t2 = cls.toSimplified(text2) + t1 = cls.toSimplified(text1, preserve=preserve) + t2 = cls.toSimplified(text2, preserve=preserve) return t1 == t2 diff --git a/hanziconv/specialnoun.py b/hanziconv/specialnoun.py index 5ce8dab..6960984 100644 --- a/hanziconv/specialnoun.py +++ b/hanziconv/specialnoun.py @@ -66,4 +66,3 @@ for data in data1, data2: for d in textwrap.dedent(data).strip().split(' '): s2t_exceptions[key].append(d) - diff --git a/requirements.txt b/requirements.txt index e69de29..dd0c136 100644 --- a/requirements.txt +++ b/requirements.txt @@ -0,0 +1,3 @@ +requests==2.18.4 +beautifulsoup4==4.6.0 +setuptools==36.3.0 From 4d6caa569a3a692087d5243cae42b2d649ab0bf2 Mon Sep 17 00:00:00 2001 From: Eric Ma Date: Sun, 3 Mar 2019 16:14:38 -0500 Subject: [PATCH 2/4] Ignoring mypy_cache and vscode --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index dec8bef..689efa9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,7 @@ +# Custom +.mypy_cache/* +.vscode/* + .coverage *.sw[op] *.~ From e23c85f2a069361fefe92f7b38d4a4274d638ea1 Mon Sep 17 00:00:00 2001 From: Eric Ma Date: Sun, 3 Mar 2019 16:32:23 -0500 Subject: [PATCH 3/4] added tests for custom mapping --- hanziconv/hanziconv.py | 41 ++++++++++++++++++----------------------- tests/test_convert.py | 19 +++++++++++++++++++ 2 files changed, 37 insertions(+), 23 deletions(-) create mode 100644 tests/test_convert.py diff --git a/hanziconv/hanziconv.py b/hanziconv/hanziconv.py index 9f4a0db..e8e76f7 100644 --- a/hanziconv/hanziconv.py +++ b/hanziconv/hanziconv.py @@ -28,7 +28,6 @@ class by doing: >>> from hanziconv import HanziConv """ -import os from .charmap import (simplified_charmap, traditional_charmap, simp_to_trad, @@ -42,7 +41,7 @@ class HanziConv(object): __simplified_charmap = simplified_charmap @classmethod - def __convert(cls, text, toTraditional=True, preserve=None): + def __convert(cls, text, toTraditional=True, custom_mapping=None): """Convert `text` to Traditional characters if `toTraditional` is True, else convert to simplified characters @@ -54,20 +53,14 @@ def __convert(cls, text, toTraditional=True, preserve=None): if isinstance(text, bytes): text = text.decode('utf-8') - # fromMap = cls.__simplified_charmap - # toMap = cls.__traditional_charmap - # if not toTraditional: - # fromMap = cls.__traditional_charmap - # toMap = cls.__simplified_charmap - mapper = simp_to_trad if not toTraditional: mapper = trad_to_simp - if preserve: - assert isinstance(preserve, str), \ - 'Preserve should be a string of characters' - mapper.update({c: c for c in preserve}) + if custom_mapping: + assert isinstance(custom_mapping, dict), \ + 'custom_mapping should be a dictionary' + mapper.update(custom_mapping) final = [] for c in text: @@ -78,21 +71,23 @@ def __convert(cls, text, toTraditional=True, preserve=None): return ''.join(final) @classmethod - def toSimplified(cls, text, preserve=None): + def toSimplified(cls, text, custom_mapping=None): """Convert `text` to simplified character string. Assuming text is traditional character string - :param text: text to convert - :returns: converted UTF-8 characters + :param text: text to convert + :param custom_mapping: A dictionary of custom mappings to override + hanziconv's defaults. + :returns: converted UTF-8 characters >>> from hanziconv import HanziConv >>> print(HanziConv.toSimplified('繁簡轉換器')) 繁简转换器 """ - return cls.__convert(text, toTraditional=False, preserve=preserve) + return cls.__convert(text, toTraditional=False, custom_mapping=custom_mapping) @classmethod - def toTraditional(cls, text, preserve=None): + def toTraditional(cls, text, custom_mapping=None): """Convert `text` to traditional character string. Assuming text is simplified character string @@ -102,13 +97,13 @@ def toTraditional(cls, text, preserve=None): >>> from hanziconv import HanziConv >>> print(HanziConv.toTraditional('繁简转换器')) 繁簡轉換器 - >>> print(HanziConv.toTraditional('祢是我的荣耀', preserve='祢')) + >>> print(HanziConv.toTraditional('祢是我的荣耀', custom_mapping={'祢': '祢'})) 祢是我的榮耀 """ - return cls.__convert(text, toTraditional=True, preserve=preserve) + return cls.__convert(text, toTraditional=True, custom_mapping=custom_mapping) @classmethod - def same(cls, text1, text2, preserve=None): + def same(cls, text1, text2, custom_mapping=None): """Return True if text1 and text2 meant literally the same, False otherwise @@ -121,11 +116,11 @@ def same(cls, text1, text2, preserve=None): >>> from hanziconv import HanziConv >>> print(HanziConv.same('繁简转换器', '繁簡轉換器')) True - >>> print(HanziConv.same('祢是我的荣耀', '祢是我的榮耀', preserve='祢')) + >>> print(HanziConv.same('祢是我的荣耀', '祢是我的榮耀', custom_mapping={'祢': '祢'})) True """ - t1 = cls.toSimplified(text1, preserve=preserve) - t2 = cls.toSimplified(text2, preserve=preserve) + t1 = cls.toSimplified(text1, custom_mapping=custom_mapping) + t2 = cls.toSimplified(text2, custom_mapping=custom_mapping) return t1 == t2 diff --git a/tests/test_convert.py b/tests/test_convert.py new file mode 100644 index 0000000..c5b132d --- /dev/null +++ b/tests/test_convert.py @@ -0,0 +1,19 @@ +import pytest +from hanziconv import HanziConv + + +convert = HanziConv.toSimplified + + +def test_convert_custom_mapping(): + custom_mapping = { + '祢': '祢', + '面': '面', + '里': '裡', + '傢': '家', + } + + text = '住在祢傢里面' + expected = '住在祢家裡面' + + assert convert(text, custom_mapping=custom_mapping) == expected From 5ce5223db458a8a4338b59269ffcb6ecb038c4dc Mon Sep 17 00:00:00 2001 From: Eric Ma Date: Sun, 3 Mar 2019 16:32:30 -0500 Subject: [PATCH 4/4] minor fixes to readme --- README.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 9f52296..41df868 100644 --- a/README.rst +++ b/README.rst @@ -98,7 +98,7 @@ Python API Example ******* -.. code-block:: pycon +.. code-block:: python >>> from hanziconv import HanziConv >>> print(HanziConv.toSimplified('繁簡轉換器')) @@ -138,4 +138,3 @@ License The character map used in this module is based on the Multi-function Chinese Character Database developed by Chinese University of Hong Kong. -