Truncate UTF-8 in a way that produces valid UTF-8.
This commit is contained in:
parent
898f79a951
commit
0dab872289
@ -73,9 +73,25 @@ class EndianIO:
|
|||||||
:returns: the number of bytes written.
|
:returns: the number of bytes written.
|
||||||
"""
|
"""
|
||||||
assert isinstance(astring, str)
|
assert isinstance(astring, str)
|
||||||
# TODO: truncate the string on a UTF-8 character boundary to avoid creating invalid UTF-8.
|
encoded = astring.encode('utf-8')
|
||||||
encoded = astring.encode('utf-8')[:fieldlen-1] + b'\0'
|
|
||||||
return fileobj.write(encoded)
|
# Take into account we also need space for a trailing 0-byte.
|
||||||
|
maxlen = fieldlen - 1
|
||||||
|
|
||||||
|
if len(encoded) >= maxlen:
|
||||||
|
encoded = encoded[:maxlen]
|
||||||
|
|
||||||
|
# Keep stripping off the last byte until the string
|
||||||
|
# is valid UTF-8 again.
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
encoded.decode('utf8')
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
encoded = encoded[:-1]
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
|
||||||
|
return fileobj.write(encoded + b'\0')
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def write_bytes(cls, fileobj: typing.BinaryIO, data: bytes, fieldlen: int) -> int:
|
def write_bytes(cls, fileobj: typing.BinaryIO, data: bytes, fieldlen: int) -> int:
|
||||||
|
|||||||
24
tests/test_blendfile_dna_io.py
Normal file
24
tests/test_blendfile_dna_io.py
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
import unittest
|
||||||
|
from unittest import mock
|
||||||
|
|
||||||
|
from blender_asset_tracer.blendfile import dna, dna_io
|
||||||
|
|
||||||
|
|
||||||
|
class StringTest(unittest.TestCase):
    """Tests for writing UTF-8 strings into fixed-length, 0-terminated fields."""

    # Sinhala for 'beer'. This is exactly 15 bytes in UTF-8.
    BEER = 'බියර්'

    def test_trim_utf8(self):
        """A string that does not fit loses its last character as a whole."""
        fileobj = mock.Mock()

        # The string is exactly as long as the field, so the last character
        # won't fit (due to the 0-byte required).
        dna_io.BigEndianTypes.write_string(fileobj, self.BEER, 15)

        expect_bytes = self.BEER[:-1].encode('utf8') + b'\0'
        # assert_called_once_with() also fails when write() is called more
        # than once; assert_called_with() only inspects the last call.
        fileobj.write.assert_called_once_with(expect_bytes)

    def test_utf8(self):
        """A string that just fits is written unchanged plus the 0-byte."""
        fileobj = mock.Mock()

        # With the 0-byte the 15-byte string just fits in a 16-byte field.
        dna_io.BigEndianTypes.write_string(fileobj, self.BEER, 16)

        expect_bytes = self.BEER.encode('utf8') + b'\0'
        fileobj.write.assert_called_once_with(expect_bytes)
|
||||||
Loading…
x
Reference in New Issue
Block a user