From 93d530234a4f5533aa99c3b897bb56d375c2ae60 Mon Sep 17 00:00:00 2001
From: Sebastian Thiel <byronimo@gmail.com>
Date: Sun, 16 Oct 2016 14:34:03 +0200
Subject: fix(unicode): use surrogateescape in bytes.decode

With this change, we try to decode using the default encoding (usually
utf-8), but keep any bytes that cannot be decoded as surrogate escapes
within the resulting unicode string instead of raising an error.

This allows for lossless decode/encode cycles while still
ensuring that decoding never fails.

NOTE: No automated test was added to verify this; the change was checked
manually against https://github.com/petertodd/gitpython-unicode-error.

fixes #532
---
 git/objects/fun.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'git/objects/fun.py')

diff --git a/git/objects/fun.py b/git/objects/fun.py
index 5c0f4819..a144ba7e 100644
--- a/git/objects/fun.py
+++ b/git/objects/fun.py
@@ -76,11 +76,7 @@ def tree_entries_from_data(data):
         # default encoding for strings in git is utf8
         # Only use the respective unicode object if the byte stream was encoded
         name = data[ns:i]
-        try:
-            name = name.decode(defenc)
-        except UnicodeDecodeError:
-            pass
-        # END handle encoding
+        name = name.decode(defenc, 'surrogateescape')
 
         # byte is NULL, get next 20
         i += 1
-- 
cgit v1.2.3