Skip to content

Commit

Permalink
Unicode BOM and CRLF handling in check_encoding
Browse files (browse the repository at this point in the history)
  • Loading branch information
fginter committed Sep 19, 2014
1 parent 9cb7162 commit 3e6e867
Showing 1 changed file with 6 additions and 5 deletions.
11 changes: 6 additions & 5 deletions check_encoding.py
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,14 @@
import sys
import codecs

import os

if __name__==u"__main__":

try:
f=codecs.getreader(u"utf-8")(sys.stdin)
for line in f:
print line.encode(u"utf-8")
#utf-8-sig interprets BOM as BOM not as space
inp=codecs.getreader("utf-8-sig")(os.fdopen(0,"U")) #Switches universal newlines on, so all newlines are now simply "\n"
for line in inp:
line=line.rstrip(u"\n")
print line.encode(u"utf-8") #
except UnicodeDecodeError:
print >> sys.stderr, "Error: Input file encoding is not utf-8, terminate parsing."
sys.exit(1)
Expand Down

1 comment on commit 3e6e867

@fginter
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixes #5

Please sign in to comment.