Attempt UTF-8 first when decoding/encoding

This fixes a nagging issue on Windows where the detected locale encoding
(cp1252 when running from the command prompt) decodes UTF-8 bytes without
raising an error, but produces incorrect results. For example:

>>> u'Ψ'
u'\u03a8'
>>> u'Ψ'.encode('utf-8').decode('utf-8')
u'\u03a8'
>>> u'Ψ'.encode('utf-8').decode('cp1252')
u'\xce\xa8'
>>> u'Д'
u'\u0414'
>>> u'Д'.encode('utf-8').decode('utf-8')
u'\u0414'
>>> u'Д'.encode('utf-8').decode('cp1252')
u'\xd0\u201d'

Attempting UTF-8 first keeps Windows from producing incorrectly
decoded/encoded results, while still letting the code fall back to the
detected system encoding (e.g. cp1252) if the UTF-8 decode/encode fails.
This commit is contained in:
Erik Johnson 2018-04-04 13:07:01 -05:00
parent a0e168ccee
commit 22ff48518f
No known key found for this signature in database
GPG key ID: 5E5583C437808F3F

View file

@@ -41,10 +41,11 @@ def to_bytes(s, encoding=None, errors='strict'):
return s.encode(encoding, errors)
else:
try:
return s.encode(__salt_system_encoding__, errors)
except UnicodeEncodeError:
# Fall back to UTF-8
# Try UTF-8 first
return s.encode('utf-8', errors)
except UnicodeEncodeError:
# Fall back to detected encoding
return s.encode(__salt_system_encoding__, errors)
raise TypeError('expected bytes, bytearray, or str')
else:
return to_str(s, encoding, errors)
@@ -64,10 +65,11 @@ def to_str(s, encoding=None, errors='strict'):
return s.decode(encoding, errors)
else:
try:
return s.decode(__salt_system_encoding__, errors)
except UnicodeDecodeError:
# Fall back to UTF-8
# Try UTF-8 first
return s.decode('utf-8', errors)
except UnicodeDecodeError:
# Fall back to detected encoding
return s.decode(__salt_system_encoding__, errors)
raise TypeError('expected str, bytes, or bytearray not {}'.format(type(s)))
else:
if isinstance(s, bytearray):
@@ -77,10 +79,11 @@ def to_str(s, encoding=None, errors='strict'):
return s.encode(encoding, errors)
else:
try:
return s.encode(__salt_system_encoding__, errors)
except UnicodeEncodeError:
# Fall back to UTF-8
# Try UTF-8 first
return s.encode('utf-8', errors)
except UnicodeEncodeError:
# Fall back to detected encoding
return s.encode(__salt_system_encoding__, errors)
raise TypeError('expected str, bytearray, or unicode')
@@ -108,10 +111,11 @@ def to_unicode(s, encoding=None, errors='strict', normalize=False):
return _normalize(s.decode(encoding, errors))
else:
try:
return _normalize(s.decode(__salt_system_encoding__, errors))
except UnicodeDecodeError:
# Fall back to UTF-8
# Try UTF-8 first
return _normalize(s.decode('utf-8', errors))
except UnicodeDecodeError:
# Fall back to detected encoding
return _normalize(s.decode(__salt_system_encoding__, errors))
raise TypeError('expected str or bytearray')