# Patch summary (free text ahead of the first "diff --git" header).
#
# Purpose: when a string is written into a fixed-size field, truncate it on
# a UTF-8 character boundary instead of at an arbitrary byte, so the stored
# bytes are still valid UTF-8. This resolves the TODO that the first hunk
# removes.
#
# blender_asset_tracer/blendfile/dna_io.py (hunk inside class EndianIO):
#   * The string is encoded as UTF-8; maxlen = fieldlen - 1 reserves one
#     byte for the trailing 0-byte, which is always appended on write.
#   * When the encoded string is at least maxlen bytes long it is cut to
#     maxlen bytes, then trailing bytes are dropped one at a time until the
#     remainder decodes as UTF-8 again.
#   * The hunk shows only the tail of the patched method's docstring; the
#     new tests call it as BigEndianTypes.write_string(fileobj, astring,
#     fieldlen) -- presumably the same method, confirm against the full file.
#
# tests/test_blendfile_dna_io.py (new file):
#   * test_trim_utf8: a 15-byte string (5 code points of 3 bytes each) in a
#     15-byte field loses its last code point, because one byte is needed
#     for the 0-byte.
#   * test_utf8: the same string in a 16-byte field is written unchanged.
#
# NOTE(review): "len(encoded) >= maxlen" also takes the branch when the
#   string fits exactly; the slice is then a no-op and the first decode
#   succeeds, so this is harmless.
# NOTE(review): for fieldlen < 1, maxlen is negative and "encoded[:maxlen]"
#   slices relative to the end of the string rather than emptying it --
#   confirm callers never pass such a field length.
# NOTE(review): the codec is spelled 'utf-8' for encoding and 'utf8' for
#   decoding; both name the same codec.
# NOTE(review): "dna" is imported by the new test module but not referenced
#   in it.
diff --git a/blender_asset_tracer/blendfile/dna_io.py b/blender_asset_tracer/blendfile/dna_io.py
index 16e561c..5ffbfc0 100644
--- a/blender_asset_tracer/blendfile/dna_io.py
+++ b/blender_asset_tracer/blendfile/dna_io.py
@@ -73,9 +73,25 @@ class EndianIO:
         :returns: the number of bytes written.
         """
         assert isinstance(astring, str)
-        # TODO: truncate the string on a UTF-8 character boundary to avoid creating invalid UTF-8.
-        encoded = astring.encode('utf-8')[:fieldlen-1] + b'\0'
-        return fileobj.write(encoded)
+        encoded = astring.encode('utf-8')
+
+        # Take into account we also need space for a trailing 0-byte.
+        maxlen = fieldlen - 1
+
+        if len(encoded) >= maxlen:
+            encoded = encoded[:maxlen]
+
+            # Keep stripping off the last byte until the string
+            # is valid UTF-8 again.
+            while True:
+                try:
+                    encoded.decode('utf8')
+                except UnicodeDecodeError:
+                    encoded = encoded[:-1]
+                else:
+                    break
+
+        return fileobj.write(encoded + b'\0')
 
     @classmethod
     def write_bytes(cls, fileobj: typing.BinaryIO, data: bytes, fieldlen: int) -> int:
diff --git a/tests/test_blendfile_dna_io.py b/tests/test_blendfile_dna_io.py
new file mode 100644
index 0000000..0662315
--- /dev/null
+++ b/tests/test_blendfile_dna_io.py
@@ -0,0 +1,24 @@
+import unittest
+from unittest import mock
+
+from blender_asset_tracer.blendfile import dna, dna_io
+
+
+class StringTest(unittest.TestCase):
+    def test_trim_utf8(self):
+        fileobj = mock.Mock()
+        # Sinhala for 'beer'. This is exactly 15 bytes in UTF-8, so the last
+        # character won't fit in the field (due to the 0-byte required).
+        dna_io.BigEndianTypes.write_string(fileobj, 'බියර්', 15)
+
+        expect_bytes = ('බියර්'[:-1]).encode('utf8') + b'\0'
+        fileobj.write.assert_called_with(expect_bytes)
+
+    def test_utf8(self):
+        fileobj = mock.Mock()
+        # Sinhala for 'beer'. This is exactly 15 bytes in UTF-8,
+        # so with the 0-byte it just fits.
+        dna_io.BigEndianTypes.write_string(fileobj, 'බියර්', 16)
+
+        expect_bytes = 'බියර්'.encode('utf8') + b'\0'
+        fileobj.write.assert_called_with(expect_bytes)