From 93d530234a4f5533aa99c3b897bb56d375c2ae60 Mon Sep 17 00:00:00 2001
From: Sebastian Thiel
Date: Sun, 16 Oct 2016 14:34:03 +0200
Subject: fix(unicode): use surrogateescape in bytes.decode

That way, we will try to decode as default encoding (usually utf-8),
but allow ourselves to simply keep bytes that don't match within the
resulting unicode string. That way, we allow for lossless decode/encode
cycles while still assuring that decoding never fails.

NOTE: I was too lazy to create a test that would verify it, but
manually executed https://github.com/petertodd/gitpython-unicode-error.

fixes #532
---
 git/objects/fun.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'git/objects/fun.py')

diff --git a/git/objects/fun.py b/git/objects/fun.py
index 5c0f4819..a144ba7e 100644
--- a/git/objects/fun.py
+++ b/git/objects/fun.py
@@ -76,11 +76,7 @@ def tree_entries_from_data(data):
         # default encoding for strings in git is utf8
         # Only use the respective unicode object if the byte stream was encoded
         name = data[ns:i]
-        try:
-            name = name.decode(defenc)
-        except UnicodeDecodeError:
-            pass
-        # END handle encoding
+        name = name.decode(defenc, 'surrogateescape')
 
         # byte is NULL, get next 20
         i += 1
--
cgit v1.2.3