Truncate UTF-8 in a way that produces valid UTF-8.

This commit is contained in:
Sybren A. Stüvel 2018-03-08 14:47:41 +01:00
parent 898f79a951
commit 0dab872289
2 changed files with 43 additions and 3 deletions

View File

@ -73,9 +73,25 @@ class EndianIO:
:returns: the number of bytes written. :returns: the number of bytes written.
""" """
assert isinstance(astring, str) assert isinstance(astring, str)
# TODO: truncate the string on a UTF-8 character boundary to avoid creating invalid UTF-8. encoded = astring.encode('utf-8')
encoded = astring.encode('utf-8')[:fieldlen-1] + b'\0'
return fileobj.write(encoded) # Take into account we also need space for a trailing 0-byte.
maxlen = fieldlen - 1
if len(encoded) >= maxlen:
encoded = encoded[:maxlen]
# Keep stripping off the last byte until the string
# is valid UTF-8 again.
while True:
try:
encoded.decode('utf8')
except UnicodeDecodeError:
encoded = encoded[:-1]
else:
break
return fileobj.write(encoded + b'\0')
@classmethod @classmethod
def write_bytes(cls, fileobj: typing.BinaryIO, data: bytes, fieldlen: int) -> int: def write_bytes(cls, fileobj: typing.BinaryIO, data: bytes, fieldlen: int) -> int:

View File

@ -0,0 +1,24 @@
import unittest
from unittest import mock
from blender_asset_tracer.blendfile import dna, dna_io
class StringTest(unittest.TestCase):
    """Tests for how ``BigEndianTypes.write_string`` fits UTF-8 text into a fixed-size field."""

    def test_trim_utf8(self):
        """Text one byte too long for the field loses its whole last character."""
        # Sinhala for 'beer'. This is exactly 15 bytes in UTF-8, so the last
        # character won't fit in the field (due to the 0-byte required).
        text = 'බියර්'
        field_size = 15
        expect_bytes = text[:-1].encode('utf8') + b'\0'

        sink = mock.Mock()
        dna_io.BigEndianTypes.write_string(sink, text, field_size)

        # The mock records what was handed to write(); compare it to the
        # encoded text minus its final character, plus the terminator.
        sink.write.assert_called_with(expect_bytes)

    def test_utf8(self):
        """Text that fits together with the 0-byte is written unmodified."""
        # Sinhala for 'beer'. This is exactly 15 bytes in UTF-8,
        # so with the 0-byte it just fits.
        text = 'බියර්'
        field_size = 16
        expect_bytes = text.encode('utf8') + b'\0'

        sink = mock.Mock()
        dna_io.BigEndianTypes.write_string(sink, text, field_size)

        sink.write.assert_called_with(expect_bytes)