Truncate UTF-8 in a way that produces valid UTF-8.

2018-03-08 14:47:41 +01:00 · 2018-03-08 14:47:41 +01:00 · 0dab872289
commit 0dab872289
parent 898f79a951
2 changed files with 43 additions and 3 deletions
--- a/blender_asset_tracer/blendfile/dna_io.py
+++ b/blender_asset_tracer/blendfile/dna_io.py
@ -73,9 +73,25 @@ class EndianIO:
        :returns: the number of bytes written.
        """
        assert isinstance(astring, str)
-        # TODO: truncate the string on a UTF-8 character boundary to avoid creating invalid UTF-8.
+        encoded = astring.encode('utf-8')
-        encoded = astring.encode('utf-8')[:fieldlen-1] + b'\0'
+
-        return fileobj.write(encoded)
+        # Take into account we also need space for a trailing 0-byte.
        maxlen = fieldlen - 1
        if len(encoded) >= maxlen:
            encoded = encoded[:maxlen]
            # Keep stripping off the last byte until the string
            # is valid UTF-8 again.
            while True:
                try:
                    encoded.decode('utf8')
                except UnicodeDecodeError:
                    encoded = encoded[:-1]
                else:
                    break
        return fileobj.write(encoded + b'\0')
    @classmethod
    def write_bytes(cls, fileobj: typing.BinaryIO, data: bytes, fieldlen: int) -> int:
--- a/tests/test_blendfile_dna_io.py
+++ b/tests/test_blendfile_dna_io.py
@ -0,0 +1,24 @@
 import unittest
 from unittest import mock
 from blender_asset_tracer.blendfile import dna, dna_io
 class StringTest(unittest.TestCase):
    """Tests for BigEndianTypes.write_string() with multi-byte UTF-8 text."""

    def test_trim_utf8(self):
        """A field one byte too small drops the whole last character."""
        # Sinhala for 'beer'. Its UTF-8 encoding is exactly 15 bytes, so in
        # a 15-byte field the final character has to go to leave room for
        # the terminating 0-byte.
        beer = 'බියර්'
        sink = mock.Mock()

        dna_io.BigEndianTypes.write_string(sink, beer, 15)

        trimmed = beer[:-1]
        sink.write.assert_called_with(trimmed.encode('utf8') + b'\0')

    def test_utf8(self):
        """A field with room for the 0-byte keeps the text intact."""
        # Sinhala for 'beer'. Its UTF-8 encoding is exactly 15 bytes, so
        # together with the terminating 0-byte it just fits in 16 bytes.
        beer = 'බියර්'
        sink = mock.Mock()

        dna_io.BigEndianTypes.write_string(sink, beer, 16)

        sink.write.assert_called_with(beer.encode('utf8') + b'\0')