← 返回首页
fix: decode non-utf-8 blob content (#10729) · ietf-tools/datatracker@629ffb1 · GitHub
Skip to content

Navigation Menu

Toggle navigation
Sign in
Appearance settings
Search or jump to...

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Include my email address so I can be contacted

Saved searches

Use saved searches to filter your results more quickly

Appearance settings
Resetting focus

Commit 629ffb1

Browse files
fix: decode non-utf-8 blob content (#10729)
* refactor: decode_document_content() utility method
* fix: fall back to latin-1 in retrieve_str()
* refactor: match structure with retrieve_bytes()
* refactor: separate tests_text.py module
* test: test_decode_document_content + ruff
* fix: revert misguided refactor
* test: assert to guarantee test is valid
1 parent c4cb8b9 commit 629ffb1

5 files changed

Lines changed: 114 additions & 56 deletions

File tree

‎ietf/doc/models.py‎

Lines changed: 2 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@
5252
from ietf.person.utils import get_active_balloters
5353
from ietf.utils import log
5454
from ietf.utils.decorators import memoize
55+
from ietf.utils.text import decode_document_content
5556
from ietf.utils.validators import validate_no_control_chars
5657
from ietf.utils.mail import formataddr
5758
from ietf.utils.models import ForeignKey
@@ -640,19 +641,7 @@ def text(self, size = -1):
640641
except IOError as e:
641642
log.log(f"Error reading text for {path}: {e}")
642643
return None
643-
text = None
644-
try:
645-
text = raw.decode('utf-8')
646-
except UnicodeDecodeError:
647-
for back in range(1,4):
648-
try:
649-
text = raw[:-back].decode('utf-8')
650-
break
651-
except UnicodeDecodeError:
652-
pass
653-
if text is None:
654-
text = raw.decode('latin-1')
655-
return text
644+
return decode_document_content(raw)
656645

657646
def text_or_error(self):
658647
return self.text() or "Error; cannot read '%s'"%self.get_base_name()

‎ietf/doc/storage_utils.py‎

Lines changed: 23 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from django.core.files.storage import storages, Storage
1111

1212
from ietf.utils.log import log
13+
from ietf.utils.text import decode_document_content
1314

1415

1516
class StorageUtilsError(Exception):
@@ -164,32 +165,30 @@ def store_str(
164165

165166
def retrieve_bytes(kind: str, name: str) -> bytes:
166167
from ietf.doc.storage import maybe_log_timing
167-
content = b""
168-
if settings.ENABLE_BLOBSTORAGE:
169-
try:
170-
store = _get_storage(kind)
171-
with store.open(name) as f:
172-
with maybe_log_timing(
173-
hasattr(store, "ietf_log_blob_timing") and store.ietf_log_blob_timing,
174-
"read",
175-
bucket_name=store.bucket_name if hasattr(store, "bucket_name") else "",
176-
name=name,
177-
):
178-
content = f.read()
179-
except Exception as err:
180-
log(f"Blobstore Error: Failed to read bytes from {kind}:{name}: {repr(err)}")
181-
raise
168+
if not settings.ENABLE_BLOBSTORAGE:
169+
return b""
170+
try:
171+
store = _get_storage(kind)
172+
with store.open(name) as f:
173+
with maybe_log_timing(
174+
hasattr(store, "ietf_log_blob_timing") and store.ietf_log_blob_timing,
175+
"read",
176+
bucket_name=store.bucket_name if hasattr(store, "bucket_name") else "",
177+
name=name,
178+
):
179+
content = f.read()
180+
except Exception as err:
181+
log(f"Blobstore Error: Failed to read bytes from {kind}:{name}: {repr(err)}")
182+
raise
182183
return content
183184

184185

185186
def retrieve_str(kind: str, name: str) -> str:
186-
content = ""
187-
if settings.ENABLE_BLOBSTORAGE:
188-
try:
189-
content_bytes = retrieve_bytes(kind, name)
190-
# TODO-BLOBSTORE: try to decode all the different ways doc.text() does
191-
content = content_bytes.decode("utf-8")
192-
except Exception as err:
193-
log(f"Blobstore Error: Failed to read string from {kind}:{name}: {repr(err)}")
194-
raise
187+
if not settings.ENABLE_BLOBSTORAGE:
188+
return ""
189+
try:
190+
content = decode_document_content(retrieve_bytes(kind, name))
191+
except Exception as err:
192+
log(f"Blobstore Error: Failed to read string from {kind}:{name}: {repr(err)}")
193+
raise
195194
return content

‎ietf/utils/tests.py‎

Lines changed: 0 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,6 @@
6060
set_url_coverage,
6161
)
6262
from ietf.utils.test_utils import TestCase, unicontent
63-
from ietf.utils.text import parse_unicode
6463
from ietf.utils.timezone import timezone_not_near_midnight
6564
from ietf.utils.xmldraft import XMLDraft, InvalidMetadataError, capture_xml2rfc_output
6665

@@ -864,24 +863,6 @@ def test_assertion(self):
864863
assertion('False')
865864
settings.SERVER_MODE = 'test'
866865

867-
class TestRFC2047Strings(TestCase):
868-
def test_parse_unicode(self):
869-
names = (
870-
('=?utf-8?b?4Yuz4YuK4Ym1IOGJoOGJgOGIiA==?=', 'ዳዊት በቀለ'),
871-
('=?utf-8?b?5Li9IOmDnA==?=', '丽 郜'),
872-
('=?utf-8?b?4KSV4KSu4KWN4KSs4KWL4KScIOCkoeCkvuCksA==?=', 'कम्बोज डार'),
873-
('=?utf-8?b?zpfPgc6szrrOu861zrnOsSDOm865z4zOvc+Ezrc=?=', 'Ηράκλεια Λιόντη'),
874-
('=?utf-8?b?15nXqdeo15DXnCDXqNeV15bXoNek15zXkw==?=', 'ישראל רוזנפלד'),
875-
('=?utf-8?b?5Li95Y2OIOeahw==?=', '丽华 皇'),
876-
('=?utf-8?b?77ul77qu766V77qzIO+tlu+7ru+vvu+6ju+7pw==?=', 'ﻥﺮﮕﺳ ﭖﻮﯾﺎﻧ'),
877-
('=?utf-8?b?77uh77uu77qz77uu76++IO+6su+7tO+7p++6jSDvurDvu6Pvuo7vu6jvr74=?=', 'ﻡﻮﺳﻮﯾ ﺲﻴﻧﺍ ﺰﻣﺎﻨﯾ'),
878-
('=?utf-8?b?ScOxaWdvIFNhbsOnIEliw6HDsWV6IGRlIGxhIFBlw7Fh?=', 'Iñigo Sanç Ibáñez de la Peña'),
879-
('Mart van Oostendorp', 'Mart van Oostendorp'),
880-
('', ''),
881-
)
882-
for encoded_str, unicode in names:
883-
self.assertEqual(unicode, parse_unicode(encoded_str))
884-
885866
class TestAndroidSiteManifest(TestCase):
886867
def test_manifest(self):
887868
r = self.client.get(urlreverse('site.webmanifest'))

‎ietf/utils/tests_text.py‎

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
# Copyright The IETF Trust 2021-2026, All Rights Reserved
2+
from ietf.utils.test_utils import TestCase
3+
from ietf.utils.text import parse_unicode, decode_document_content
4+
5+
6+
class TestDecoders(TestCase):
    """Tests for the decoding helpers imported from ietf.utils.text."""

    def test_parse_unicode(self):
        """parse_unicode() turns each encoded sample into the expected text."""
        # (input, expected) pairs; the last two inputs are plain / empty
        # strings that are expected to come back unchanged.
        cases = (
            ("=?utf-8?b?4Yuz4YuK4Ym1IOGJoOGJgOGIiA==?=", "ዳዊት በቀለ"),
            ("=?utf-8?b?5Li9IOmDnA==?=", "丽 郜"),
            ("=?utf-8?b?4KSV4KSu4KWN4KSs4KWL4KScIOCkoeCkvuCksA==?=", "कम्बोज डार"),
            ("=?utf-8?b?zpfPgc6szrrOu861zrnOsSDOm865z4zOvc+Ezrc=?=", "Ηράκλεια Λιόντη"),
            ("=?utf-8?b?15nXqdeo15DXnCDXqNeV15bXoNek15zXkw==?=", "ישראל רוזנפלד"),
            ("=?utf-8?b?5Li95Y2OIOeahw==?=", "丽华 皇"),
            ("=?utf-8?b?77ul77qu766V77qzIO+tlu+7ru+vvu+6ju+7pw==?=", "ﻥﺮﮕﺳ ﭖﻮﯾﺎﻧ"),
            (
                "=?utf-8?b?77uh77uu77qz77uu76++IO+6su+7tO+7p++6jSDvurDvu6Pvuo7vu6jvr74=?=",
                "ﻡﻮﺳﻮﯾ ﺲﻴﻧﺍ ﺰﻣﺎﻨﯾ",
            ),
            (
                "=?utf-8?b?ScOxaWdvIFNhbsOnIEliw6HDsWV6IGRlIGxhIFBlw7Fh?=",
                "Iñigo Sanç Ibáñez de la Peña",
            ),
            ("Mart van Oostendorp", "Mart van Oostendorp"),
            ("", ""),
        )
        for encoded, expected in cases:
            self.assertEqual(expected, parse_unicode(encoded))

    def test_decode_document_content(self):
        """decode_document_content() copes with UTF-8, cut-off UTF-8 and other bytes."""
        accented = "àéîøü"
        utf8_bytes = "𒀭𒊩𒌆𒄈𒋢".encode("utf-8")  # final character is 4 bytes long
        latin1_bytes = accented.encode("latin-1")
        other_bytes = accented.encode("macintosh")  # differs from the latin-1 encoding
        # Guard the test itself: the "other" sample is only meaningful if reading
        # it as latin-1 yields different text than its real encoding does.
        assert other_bytes.decode("macintosh") != other_bytes.decode("latin-1"), \
            "test broken: other_bytes must decode differently as latin-1"

        utf8_text = utf8_bytes.decode()

        # Intact UTF-8 round-trips unchanged.
        self.assertEqual(decode_document_content(utf8_bytes), utf8_text)

        # Dropping 1-3 trailing bytes leaves the final character incomplete and
        # dropping 4 removes it entirely; in every case the decoder should
        # return everything except that final character.
        for dropped in range(1, 5):
            self.assertEqual(
                decode_document_content(utf8_bytes[:-dropped]),
                utf8_text[:-1],
            )

        # Bytes that are not UTF-8 are read as latin-1.
        self.assertEqual(
            decode_document_content(latin1_bytes),
            latin1_bytes.decode("latin-1"),
        )

        # Any other character set is also read as latin-1
        # (bug? feature? you decide).
        self.assertEqual(
            decode_document_content(other_bytes),
            other_bytes.decode("latin-1"),
        )

‎ietf/utils/text.py‎

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,3 +263,21 @@ def parse_unicode(text):
263263
else:
264264
text = decoded_string
265265
return text
266+
267+
268+
def decode_document_content(content: bytes) -> str:
    """Decode document contents as utf-8 or latin1

    Method was developed in DocumentInfo.text() where it gave acceptable results
    for existing documents / RFCs.

    The decoding strategy is, in order:

    1. Decode the whole buffer as UTF-8.
    2. If the first undecodable byte lies within the last 3 bytes of the buffer
       (for example, a multi-byte character cut off at the end), drop everything
       from that byte on and return the UTF-8 text that precedes it.
    3. Otherwise decode the whole buffer as latin-1, which cannot fail.

    :param content: raw bytes to decode
    :return: the decoded text
    """
    max_trailing_bytes_dropped = 3
    try:
        return content.decode("utf-8")
    except UnicodeDecodeError as err:
        # err.start is the offset of the first undecodable byte, so everything
        # before it is already known to be valid UTF-8. Keep only the offset:
        # the exception name is unbound once the except block ends.
        valid_length = err.start
    # Dropping 1..3 trailing bytes and retrying the decode succeeds exactly when
    # the first bad byte is among those bytes, so checking the offset gives the
    # same result without re-decoding the whole buffer for each candidate length.
    if len(content) - valid_length <= max_trailing_bytes_dropped:
        return content[:valid_length].decode("utf-8")
    return content.decode("latin-1")  # everything is legal in latin-1

0 commit comments

Comments
 (0)

Footer

© 2026 GitHub, Inc.