Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added optional parameter to API to allow for preservation of certain characters. #9

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# Custom
.mypy_cache/*
.vscode/*

.coverage
*.sw[op]
*.~
Expand All @@ -9,3 +13,5 @@ docs/_build
hanziconv.egg-info
htmlcov
MANIFEST

.cache/v/cache/lastfailed
3 changes: 1 addition & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ Python API
Example
*******

.. code-block:: pycon
.. code-block:: python

>>> from hanziconv import HanziConv
>>> print(HanziConv.toSimplified('繁簡轉換器'))
Expand Down Expand Up @@ -138,4 +138,3 @@ License

The character map used in this module is based on the Multi-function
Chinese Character Database developed by Chinese University of Hong Kong.

3 changes: 3 additions & 0 deletions hanziconv/charmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,3 +168,6 @@

simplified_charmap = cuhk_simplified + extra_simplified
traditional_charmap = cuhk_traditional + extra_traditional

def _first_match_table(keys, values):
    """Pair `keys` with `values` by position, keeping the first value seen
    for a repeated key

    The string-based lookup these tables replace used ``str.find``, which
    resolves a repeated character to its first occurrence.  A plain dict
    comprehension would let the last occurrence win instead, changing the
    conversion result for any character listed more than once.
    """
    table = {}
    for key, value in zip(keys, values):
        table.setdefault(key, value)
    return table


# Per-character lookup tables; position i of one charmap corresponds to
# position i of the other.
simp_to_trad = _first_match_table(simplified_charmap, traditional_charmap)
trad_to_simp = _first_match_table(traditional_charmap, simplified_charmap)
51 changes: 31 additions & 20 deletions hanziconv/hanziconv.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,10 @@ class by doing:
>>> from hanziconv import HanziConv
"""

import os
from .charmap import simplified_charmap, traditional_charmap
from .charmap import (simplified_charmap,
traditional_charmap,
simp_to_trad,
trad_to_simp)


class HanziConv(object):
Expand All @@ -39,7 +41,7 @@ class HanziConv(object):
__simplified_charmap = simplified_charmap

@classmethod
def __convert(cls, text, toTraditional=True, custom_mapping=None):
    """Convert `text` to Traditional characters if `toTraditional` is
    True, else convert to simplified characters

    :param text:            text to convert; ``bytes`` input is decoded
                            as UTF-8 first
    :param toTraditional:   True -- convert to traditional text,
                            False -- convert to simplified text
    :param custom_mapping:  optional dictionary of per-character
                            overrides; they take precedence over the
                            built-in table for this call only
    :returns:               converted text
    :raises TypeError:      if `custom_mapping` is non-empty and is not
                            a dictionary
    """
    if isinstance(text, bytes):
        text = text.decode('utf-8')

    mapper = simp_to_trad if toTraditional else trad_to_simp

    if custom_mapping:
        # Raise explicitly rather than assert: assertions are stripped
        # when Python runs with -O, which would disable the check.
        if not isinstance(custom_mapping, dict):
            raise TypeError('custom_mapping should be a dictionary')
        # Merge into a copy.  `simp_to_trad` / `trad_to_simp` are shared
        # module-level tables; updating them in place would leak this
        # call's overrides into every later conversion.
        mapper = dict(mapper)
        mapper.update(custom_mapping)

    # Characters without an entry pass through unchanged.
    return ''.join(mapper.get(c, c) for c in text)

@classmethod
def toSimplified(cls, text, custom_mapping=None):
    """Return `text` as a simplified character string; the input is
    assumed to be a traditional character string

    :param text: text to convert
    :param custom_mapping: A dictionary of custom mappings to override
                           hanziconv's defaults.
    :returns: converted UTF-8 characters

    >>> from hanziconv import HanziConv
    >>> print(HanziConv.toSimplified('繁簡轉換器'))
    繁简转换器
    """
    return cls.__convert(
        text,
        toTraditional=False,
        custom_mapping=custom_mapping,
    )

@classmethod
def toTraditional(cls, text, custom_mapping=None):
    """Return `text` as a traditional character string; the input is
    assumed to be a simplified character string

    :param text: text to convert
    :param custom_mapping: A dictionary of custom mappings to override
                           hanziconv's defaults.
    :returns: converted UTF-8 characters

    >>> from hanziconv import HanziConv
    >>> print(HanziConv.toTraditional('繁简转换器'))
    繁簡轉換器
    >>> print(HanziConv.toTraditional('祢是我的荣耀', custom_mapping={'祢': '祢'}))
    祢是我的榮耀
    """
    return cls.__convert(
        text,
        toTraditional=True,
        custom_mapping=custom_mapping,
    )

@classmethod
def same(cls, text1, text2, custom_mapping=None):
    """Return True if text1 and text2 mean literally the same, False
    otherwise

    Both strings are converted to simplified characters and the results
    are compared.

    :param text1: string to compare to ``text2``
    :param text2: string to compare to ``text1``
    :param custom_mapping: optional dictionary of custom mappings, passed
                           to ``toSimplified`` for both strings
    :returns: **True** -- ``text1`` and ``text2`` are the same in
              meaning,
              **False** -- otherwise

    >>> from hanziconv import HanziConv
    >>> print(HanziConv.same('繁简转换器', '繁簡轉換器'))
    True
    >>> print(HanziConv.same('祢是我的荣耀', '祢是我的榮耀', custom_mapping={'祢': '祢'}))
    True
    """
    return (
        cls.toSimplified(text1, custom_mapping=custom_mapping)
        == cls.toSimplified(text2, custom_mapping=custom_mapping)
    )


Expand Down
1 change: 0 additions & 1 deletion hanziconv/specialnoun.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,4 +66,3 @@
for data in data1, data2:
for d in textwrap.dedent(data).strip().split(' '):
s2t_exceptions[key].append(d)

3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
requests==2.18.4
beautifulsoup4==4.6.0
setuptools==36.3.0
19 changes: 19 additions & 0 deletions tests/test_convert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import pytest
from hanziconv import HanziConv


convert = HanziConv.toSimplified


def test_convert_custom_mapping():
    """Entries in `custom_mapping` are applied during conversion."""
    overrides = {
        '祢': '祢',
        '面': '面',
        '里': '裡',
        '傢': '家',
    }
    source = '住在祢傢里面'

    converted = convert(source, custom_mapping=overrides)

    assert converted == '住在祢家裡面'