Skip to content

Commit

Permalink
Auto-detects when the text is latin1 or ascii and decodes it as UTF-8 …
Browse files Browse the repository at this point in the history
…so no strange chars are stored or used for the entry version comparisons
  • Loading branch information
nahuelhds committed Jun 2, 2020
1 parent a15b052 commit acaee8e
Show file tree
Hide file tree
Showing 5 changed files with 32 additions and 4 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ Pip*
.env
venv
.idea
*.ignore.py
4 changes: 2 additions & 2 deletions diffengine/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import unicodedata

from diffengine.sendgrid import SendgridHandler
from diffengine.text import to_utf8
from diffengine.twitter import TwitterHandler

from exceptions.webdriver import UnknownWebdriverError
Expand Down Expand Up @@ -173,8 +174,7 @@ def get_latest(self):
if resp.status_code != 200:
logging.warn("Got %s when fetching %s", resp.status_code, self.url)
return None

doc = readability.Document(resp.text)
doc = readability.Document(to_utf8(resp.text))
title = doc.title()
summary = doc.summary(html_partial=True)
summary = bleach.clean(summary, tags=["p"], strip=True)
Expand Down
14 changes: 14 additions & 0 deletions diffengine/text_builder.py → diffengine/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,17 @@ def build_with_default_content(diff):
text = text[0:225] + "…"
text += " " + diff.url
return text


def to_utf8(text):
    """Repair text whose UTF-8 bytes were mis-decoded as latin1.

    The string is encoded back to bytes as latin1 and those bytes are
    strictly decoded as UTF-8. When both steps succeed the decoded string
    is returned; otherwise the input is returned unchanged.

    Args:
        text: The string to inspect.

    Returns:
        The re-decoded string when the latin1 -> UTF-8 round trip works,
        else ``text`` itself.
    """
    # A single latin1 attempt is enough: ASCII is a subset of latin1, so an
    # additional "ascii" attempt can never succeed where latin1 has failed.
    try:
        return text.encode("latin1").decode("utf8", "strict")
    except (UnicodeEncodeError, UnicodeDecodeError):
        # UnicodeEncodeError: text has characters outside latin1.
        # UnicodeDecodeError: the latin1 bytes are not valid UTF-8, which is
        # the case for text with non-ASCII latin1 characters that was
        # decoded correctly in the first place.
        return text
2 changes: 1 addition & 1 deletion diffengine/twitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from datetime import datetime

from diffengine.text_builder import build_text
from diffengine.text import build_text
from exceptions.twitter import (
AlreadyTweetedError,
TwitterConfigNotFoundError,
Expand Down
15 changes: 14 additions & 1 deletion test_diffengine.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
SendgridHandler,
_fingerprint,
)
from diffengine.text_builder import build_text
from diffengine.text import build_text, to_utf8
from diffengine.utils import generate_config
from exceptions.sendgrid import (
SendgridConfigNotFoundError,
Expand Down Expand Up @@ -804,3 +804,16 @@ def test_lang_content_text(self):
self.assertEqual(
text, "change in the URL, the title and the summary\n%s" % diff.url
)


class EncodingTest(TestCase):
    """Tests for ``to_utf8`` on correct text and on latin1 mojibake."""

    def test_utf8_do_nothing(self):
        """Text that is already correct must come back unchanged."""
        text_utf8 = "Me preocupa más la parte futbolística"
        result = to_utf8(text_utf8)
        self.assertEqual(result, text_utf8)

    def test_latin1_to_utf8(self):
        """UTF-8 bytes mis-decoded as latin1 must be repaired."""
        # Written with escapes because the mojibake for "í" (U+00C3 U+00AD)
        # contains an invisible soft hyphen that is easily lost when the
        # file is copied or re-encoded.
        text_latin = (
            "Me preocupa m\u00c3\u00a1s la parte futbol\u00c3\u00adstica"
        )
        text_utf8 = "Me preocupa más la parte futbolística"
        result = to_utf8(text_latin)
        self.assertEqual(result, text_utf8)

0 comments on commit acaee8e

Please sign in to comment.