Unicode BOM and CRLF handling in check_encoding

TurkuNLP · fginter · Sep 19, 2014 · 3e6e867
1 parent 9cb7162
commit 3e6e867
Showing 1 changed file with 6 additions and 5 deletions.
diff --git a/check_encoding.py b/check_encoding.py
@@ -1,13 +1,14 @@
 import sys
 import codecs
-
+import os
 
 if __name__==u"__main__":
-
     try:
-        f=codecs.getreader(u"utf-8")(sys.stdin)
-        for line in f:
-            print line.encode(u"utf-8")
+        #utf-8-sig interprets BOM as BOM not as space
+        inp=codecs.getreader("utf-8-sig")(os.fdopen(0,"U")) #Switches universal newlines on, so all newlines are now simply "\n"
+        for line in inp:
+            line=line.rstrip(u"\n")
+            print line.encode(u"utf-8") #
     except UnicodeDecodeError:
         print >> sys.stderr, "Error: Input file encoding is not utf-8, terminate parsing."
         sys.exit(1)