Truncate UTF-8 in a way that produces valid UTF-8.
This commit is contained in:
parent
898f79a951
commit
0dab872289
@ -73,9 +73,25 @@ class EndianIO:
|
||||
:returns: the number of bytes written.
|
||||
"""
|
||||
assert isinstance(astring, str)
|
||||
# TODO: truncate the string on a UTF-8 character boundary to avoid creating invalid UTF-8.
|
||||
encoded = astring.encode('utf-8')[:fieldlen-1] + b'\0'
|
||||
return fileobj.write(encoded)
|
||||
encoded = astring.encode('utf-8')
|
||||
|
||||
# Take into account we also need space for a trailing 0-byte.
|
||||
maxlen = fieldlen - 1
|
||||
|
||||
if len(encoded) >= maxlen:
|
||||
encoded = encoded[:maxlen]
|
||||
|
||||
# Keep stripping off the last byte until the string
|
||||
# is valid UTF-8 again.
|
||||
while True:
|
||||
try:
|
||||
encoded.decode('utf8')
|
||||
except UnicodeDecodeError:
|
||||
encoded = encoded[:-1]
|
||||
else:
|
||||
break
|
||||
|
||||
return fileobj.write(encoded + b'\0')
|
||||
|
||||
@classmethod
|
||||
def write_bytes(cls, fileobj: typing.BinaryIO, data: bytes, fieldlen: int) -> int:
|
||||
|
||||
24
tests/test_blendfile_dna_io.py
Normal file
24
tests/test_blendfile_dna_io.py
Normal file
@ -0,0 +1,24 @@
|
||||
import unittest
|
||||
from unittest import mock
|
||||
|
||||
from blender_asset_tracer.blendfile import dna, dna_io
|
||||
|
||||
|
||||
class StringTest(unittest.TestCase):
    """Checks how ``write_string`` fits UTF-8 text into a fixed-size field."""

    def test_trim_utf8(self):
        """Text that is one byte too long is written without its last character."""
        # Sinhala for 'beer'; this test relies on it being exactly 15 bytes
        # of UTF-8.
        beer = 'බියර්'
        sink = mock.Mock()

        # The field is 15 bytes and one of those is needed for the 0-byte,
        # so the final character cannot fit.
        dna_io.BigEndianTypes.write_string(sink, beer, 15)

        trimmed = beer[:-1].encode('utf8')
        sink.write.assert_called_with(trimmed + b'\0')

    def test_utf8(self):
        """Text that fits exactly, 0-byte included, is written untouched."""
        # Sinhala for 'beer'; exactly 15 bytes of UTF-8, so a 16-byte field
        # holds all of it plus the 0-byte.
        beer = 'බියර්'
        sink = mock.Mock()

        dna_io.BigEndianTypes.write_string(sink, beer, 16)

        untouched = beer.encode('utf8')
        sink.write.assert_called_with(untouched + b'\0')
|
||||
Loading…
x
Reference in New Issue
Block a user