diff --git a/.gitignore b/.gitignore index e773429..689efa9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,7 @@ +# Custom +.mypy_cache/* +.vscode/* + .coverage *.sw[op] *.~ @@ -9,3 +13,5 @@ docs/_build hanziconv.egg-info htmlcov MANIFEST + +.cache/v/cache/lastfailed diff --git a/README.rst b/README.rst index 9f52296..41df868 100644 --- a/README.rst +++ b/README.rst @@ -98,7 +98,7 @@ Python API Example ******* -.. code-block:: pycon +.. code-block:: python >>> from hanziconv import HanziConv >>> print(HanziConv.toSimplified('繁簡轉換器')) @@ -138,4 +138,3 @@ License The character map used in this module is based on the Multi-function Chinese Character Database developed by Chinese University of Hong Kong. - diff --git a/hanziconv/charmap.py b/hanziconv/charmap.py index 0262c85..7c535c4 100644 --- a/hanziconv/charmap.py +++ b/hanziconv/charmap.py @@ -168,3 +168,6 @@ simplified_charmap = cuhk_simplified + extra_simplified traditional_charmap = cuhk_traditional + extra_traditional + +simp_to_trad = {s: t for s, t in zip(simplified_charmap, traditional_charmap)} +trad_to_simp = {t: s for s, t in zip(simplified_charmap, traditional_charmap)} diff --git a/hanziconv/hanziconv.py b/hanziconv/hanziconv.py index 7cd3983..e8e76f7 100644 --- a/hanziconv/hanziconv.py +++ b/hanziconv/hanziconv.py @@ -28,8 +28,10 @@ class by doing: >>> from hanziconv import HanziConv """ -import os -from .charmap import simplified_charmap, traditional_charmap +from .charmap import (simplified_charmap, + traditional_charmap, + simp_to_trad, + trad_to_simp) class HanziConv(object): @@ -39,7 +41,7 @@ class HanziConv(object): __simplified_charmap = simplified_charmap @classmethod - def __convert(cls, text, toTraditional=True): + def __convert(cls, text, toTraditional=True, custom_mapping=None): """Convert `text` to Traditional characters if `toTraditional` is True, else convert to simplified characters @@ -51,37 +53,41 @@ def __convert(cls, text, toTraditional=True): if isinstance(text, bytes): text = text.decode('utf-8') - fromMap = cls.__simplified_charmap - toMap = cls.__traditional_charmap + mapper = simp_to_trad if not toTraditional: - fromMap = cls.__traditional_charmap - toMap = cls.__simplified_charmap + mapper = trad_to_simp + + if custom_mapping: + assert isinstance(custom_mapping, dict), \ + 'custom_mapping should be a dictionary' + mapper.update(custom_mapping) final = [] for c in text: - index = fromMap.find(c) - if index != -1: - final.append(toMap[index]) + if c in mapper.keys(): + final.append(mapper[c]) else: final.append(c) return ''.join(final) @classmethod - def toSimplified(cls, text): + def toSimplified(cls, text, custom_mapping=None): """Convert `text` to simplified character string. Assuming text is traditional character string - :param text: text to convert - :returns: converted UTF-8 characters + :param text: text to convert + :param custom_mapping: A dictionary of custom mappings to override + hanziconv's defaults. + :returns: converted UTF-8 characters >>> from hanziconv import HanziConv >>> print(HanziConv.toSimplified('繁簡轉換器')) 繁简转换器 """ - return cls.__convert(text, toTraditional=False) + return cls.__convert(text, toTraditional=False, custom_mapping=custom_mapping) @classmethod - def toTraditional(cls, text): + def toTraditional(cls, text, custom_mapping=None): """Convert `text` to traditional character string. Assuming text is simplified character string @@ -91,25 +97,30 @@ def toTraditional(cls, text): >>> from hanziconv import HanziConv >>> print(HanziConv.toTraditional('繁简转换器')) 繁簡轉換器 + >>> print(HanziConv.toTraditional('祢是我的荣耀', custom_mapping={'祢': '祢'})) + 祢是我的榮耀 """ - return cls.__convert(text, toTraditional=True) + return cls.__convert(text, toTraditional=True, custom_mapping=custom_mapping) @classmethod - def same(cls, text1, text2): + def same(cls, text1, text2, custom_mapping=None): """Return True if text1 and text2 meant literally the same, False otherwise :param text1: string to compare to ``text2`` :param text2: string to compare to ``text1`` - :returns: **True** -- ``text1`` and ``text2`` are the same in meaning, + :returns: **True** -- ``text1`` and ``text2`` are the same in \ + meaning, **False** -- otherwise >>> from hanziconv import HanziConv >>> print(HanziConv.same('繁简转换器', '繁簡轉換器')) True + >>> print(HanziConv.same('祢是我的荣耀', '祢是我的榮耀', custom_mapping={'祢': '祢'})) + True """ - t1 = cls.toSimplified(text1) - t2 = cls.toSimplified(text2) + t1 = cls.toSimplified(text1, custom_mapping=custom_mapping) + t2 = cls.toSimplified(text2, custom_mapping=custom_mapping) return t1 == t2 diff --git a/hanziconv/specialnoun.py b/hanziconv/specialnoun.py index 5ce8dab..6960984 100644 --- a/hanziconv/specialnoun.py +++ b/hanziconv/specialnoun.py @@ -66,4 +66,3 @@ for data in data1, data2: for d in textwrap.dedent(data).strip().split(' '): s2t_exceptions[key].append(d) - diff --git a/requirements.txt b/requirements.txt index e69de29..dd0c136 100644 --- a/requirements.txt +++ b/requirements.txt @@ -0,0 +1,3 @@ +requests==2.18.4 +beautifulsoup4==4.6.0 +setuptools==36.3.0 diff --git a/tests/test_convert.py b/tests/test_convert.py new file mode 100644 index 0000000..c5b132d --- /dev/null +++ b/tests/test_convert.py @@ -0,0 +1,19 @@ +import pytest +from hanziconv import HanziConv + + +convert = HanziConv.toSimplified + + +def test_convert_custom_mapping(): + custom_mapping = { + '祢': '祢', + '面': '面', + '里': '裡', + '傢': '家', + } + + text = '住在祢傢里面' + expected = '住在祢家裡面' + + assert convert(text, custom_mapping=custom_mapping) == expected