5 files changed
@@ -52,6 +52,7 @@ | |||
| 52 | 52 | from ietf.person.utils import get_active_balloters | |
| 53 | 53 | from ietf.utils import log | |
| 54 | 54 | from ietf.utils.decorators import memoize | |
| 55 | + from ietf.utils.text import decode_document_content | ||
| 55 | 56 | from ietf.utils.validators import validate_no_control_chars | |
| 56 | 57 | from ietf.utils.mail import formataddr | |
| 57 | 58 | from ietf.utils.models import ForeignKey | |
@@ -640,19 +641,7 @@ def text(self, size = -1): | |||
| 640 | 641 | except IOError as e: | |
| 641 | 642 | log.log(f"Error reading text for {path}: {e}") | |
| 642 | 643 | return None | |
| 643 | - text = None | ||
| 644 | - try: | ||
| 645 | - text = raw.decode('utf-8') | ||
| 646 | - except UnicodeDecodeError: | ||
| 647 | - for back in range(1,4): | ||
| 648 | - try: | ||
| 649 | - text = raw[:-back].decode('utf-8') | ||
| 650 | - break | ||
| 651 | - except UnicodeDecodeError: | ||
| 652 | - pass | ||
| 653 | - if text is None: | ||
| 654 | - text = raw.decode('latin-1') | ||
| 655 | - return text | ||
| 644 | + return decode_document_content(raw) | ||
| 656 | 645 | ||
def text_or_error(self):
    """Return the document text, or an error message when no text is available.

    Falls back to the error message whenever ``self.text()`` returns a falsy
    value (for example ``None``); the message names the document via
    ``self.get_base_name()``.
    """
    content = self.text()
    if content:
        return content
    return "Error; cannot read '%s'" % self.get_base_name()
@@ -10,6 +10,7 @@ | |||
| 10 | 10 | from django.core.files.storage import storages, Storage | |
| 11 | 11 | ||
| 12 | 12 | from ietf.utils.log import log | |
| 13 | + from ietf.utils.text import decode_document_content | ||
| 13 | 14 | ||
| 14 | 15 | ||
| 15 | 16 | class StorageUtilsError(Exception): | |
@@ -164,32 +165,30 @@ def store_str( | |||
| 164 | 165 | ||
def retrieve_bytes(kind: str, name: str) -> bytes:
    """Read the blob ``name`` from the storage selected by ``kind``.

    Returns ``b""`` without touching any storage when
    ``settings.ENABLE_BLOBSTORAGE`` is falsy. Any exception raised while
    looking up the storage, opening the blob or reading it is logged and then
    re-raised unchanged.
    """
    # Imported here rather than at module level, as in the original code.
    from ietf.doc.storage import maybe_log_timing

    if not settings.ENABLE_BLOBSTORAGE:
        return b""
    try:
        store = _get_storage(kind)
        # Timing is only logged for stores that opt in via ietf_log_blob_timing;
        # stores without a bucket_name report an empty bucket.
        with store.open(name) as blob, maybe_log_timing(
            getattr(store, "ietf_log_blob_timing", False),
            "read",
            bucket_name=getattr(store, "bucket_name", ""),
            name=name,
        ):
            data = blob.read()
    except Exception as err:
        log(f"Blobstore Error: Failed to read bytes from {kind}:{name}: {repr(err)}")
        raise
    else:
        return data
| 183 | 184 | ||
| 184 | 185 | ||
def retrieve_str(kind: str, name: str) -> str:
    """Read the blob ``name`` from the ``kind`` storage and return it as text.

    The bytes come from ``retrieve_bytes`` and are decoded with
    ``decode_document_content``. Returns ``""`` when
    ``settings.ENABLE_BLOBSTORAGE`` is falsy. Any exception from reading or
    decoding is logged and then re-raised unchanged.
    """
    if not settings.ENABLE_BLOBSTORAGE:
        return ""
    try:
        raw = retrieve_bytes(kind, name)
        text = decode_document_content(raw)
    except Exception as err:
        log(f"Blobstore Error: Failed to read string from {kind}:{name}: {repr(err)}")
        raise
    return text
@@ -60,7 +60,6 @@ | |||
| 60 | 60 | set_url_coverage, | |
| 61 | 61 | ) | |
| 62 | 62 | from ietf.utils.test_utils import TestCase, unicontent | |
| 63 | - from ietf.utils.text import parse_unicode | ||
| 64 | 63 | from ietf.utils.timezone import timezone_not_near_midnight | |
| 65 | 64 | from ietf.utils.xmldraft import XMLDraft, InvalidMetadataError, capture_xml2rfc_output | |
| 66 | 65 | ||
@@ -864,24 +863,6 @@ def test_assertion(self): | |||
| 864 | 863 | assertion('False') | |
| 865 | 864 | settings.SERVER_MODE = 'test' | |
| 866 | 865 | ||
| 867 | - class TestRFC2047Strings(TestCase): | ||
| 868 | - def test_parse_unicode(self): | ||
| 869 | - names = ( | ||
| 870 | - ('=?utf-8?b?4Yuz4YuK4Ym1IOGJoOGJgOGIiA==?=', 'ዳዊት በቀለ'), | ||
| 871 | - ('=?utf-8?b?5Li9IOmDnA==?=', '丽 郜'), | ||
| 872 | - ('=?utf-8?b?4KSV4KSu4KWN4KSs4KWL4KScIOCkoeCkvuCksA==?=', 'कम्बोज डार'), | ||
| 873 | - ('=?utf-8?b?zpfPgc6szrrOu861zrnOsSDOm865z4zOvc+Ezrc=?=', 'Ηράκλεια Λιόντη'), | ||
| 874 | - ('=?utf-8?b?15nXqdeo15DXnCDXqNeV15bXoNek15zXkw==?=', 'ישראל רוזנפלד'), | ||
| 875 | - ('=?utf-8?b?5Li95Y2OIOeahw==?=', '丽华 皇'), | ||
| 876 | - ('=?utf-8?b?77ul77qu766V77qzIO+tlu+7ru+vvu+6ju+7pw==?=', 'ﻥﺮﮕﺳ ﭖﻮﯾﺎﻧ'), | ||
| 877 | - ('=?utf-8?b?77uh77uu77qz77uu76++IO+6su+7tO+7p++6jSDvurDvu6Pvuo7vu6jvr74=?=', 'ﻡﻮﺳﻮﯾ ﺲﻴﻧﺍ ﺰﻣﺎﻨﯾ'), | ||
| 878 | - ('=?utf-8?b?ScOxaWdvIFNhbsOnIEliw6HDsWV6IGRlIGxhIFBlw7Fh?=', 'Iñigo Sanç Ibáñez de la Peña'), | ||
| 879 | - ('Mart van Oostendorp', 'Mart van Oostendorp'), | ||
| 880 | - ('', ''), | ||
| 881 | - ) | ||
| 882 | - for encoded_str, unicode in names: | ||
| 883 | - self.assertEqual(unicode, parse_unicode(encoded_str)) | ||
| 884 | - | ||
| 885 | 866 | class TestAndroidSiteManifest(TestCase): | |
| 886 | 867 | def test_manifest(self): | |
| 887 | 868 | r = self.client.get(urlreverse('site.webmanifest')) | |
@@ -0,0 +1,71 @@ | |||
| 1 | + # Copyright The IETF Trust 2021-2026, All Rights Reserved | ||
| 2 | + from ietf.utils.test_utils import TestCase | ||
| 3 | + from ietf.utils.text import parse_unicode, decode_document_content | ||
| 4 | + | ||
| 5 | + | ||
class TestDecoders(TestCase):
    """Tests for the decoding helpers parse_unicode and decode_document_content."""

    def test_parse_unicode(self):
        """Encoded-word strings (``=?utf-8?b?...?=``) decode; plain text is unchanged."""
        names = (
            ("=?utf-8?b?4Yuz4YuK4Ym1IOGJoOGJgOGIiA==?=", "ዳዊት በቀለ"),
            ("=?utf-8?b?5Li9IOmDnA==?=", "丽 郜"),
            ("=?utf-8?b?4KSV4KSu4KWN4KSs4KWL4KScIOCkoeCkvuCksA==?=", "कम्बोज डार"),
            ("=?utf-8?b?zpfPgc6szrrOu861zrnOsSDOm865z4zOvc+Ezrc=?=", "Ηράκλεια Λιόντη"),
            ("=?utf-8?b?15nXqdeo15DXnCDXqNeV15bXoNek15zXkw==?=", "ישראל רוזנפלד"),
            ("=?utf-8?b?5Li95Y2OIOeahw==?=", "丽华 皇"),
            ("=?utf-8?b?77ul77qu766V77qzIO+tlu+7ru+vvu+6ju+7pw==?=", "ﻥﺮﮕﺳ ﭖﻮﯾﺎﻧ"),
            (
                "=?utf-8?b?77uh77uu77qz77uu76++IO+6su+7tO+7p++6jSDvurDvu6Pvuo7vu6jvr74=?=",
                "ﻡﻮﺳﻮﯾ ﺲﻴﻧﺍ ﺰﻣﺎﻨﯾ",
            ),
            (
                "=?utf-8?b?ScOxaWdvIFNhbsOnIEliw6HDsWV6IGRlIGxhIFBlw7Fh?=",
                "Iñigo Sanç Ibáñez de la Peña",
            ),
            ("Mart van Oostendorp", "Mart van Oostendorp"),
            ("", ""),
        )
        for encoded_str, expected in names:
            self.assertEqual(expected, parse_unicode(encoded_str))

    def test_decode_document_content(self):
        """UTF-8 (complete or cut short at the end) and latin-1 input both decode."""
        utf8_bytes = "𒀭𒊩𒌆𒄈𒋢".encode("utf-8")  # ends with 4-byte character
        latin1_bytes = "àéîøü".encode("latin-1")
        other_bytes = "àéîøü".encode("macintosh")  # different from its latin-1 encoding
        assert other_bytes.decode("macintosh") != other_bytes.decode("latin-1"),\
            "test broken: other_bytes must decode differently as latin-1"

        full_text = utf8_bytes.decode()

        # simplest case: complete, valid utf-8
        self.assertEqual(decode_document_content(utf8_bytes), full_text)

        # Losing 1-3 bytes from the end leaves the last character incomplete and
        # losing 4 bytes removes it entirely; in every case the decoder should
        # return everything except that last character.
        for lost in (1, 2, 3, 4):
            self.assertEqual(
                decode_document_content(utf8_bytes[:-lost]),
                full_text[:-1],
            )

        # latin-1 is also simple
        self.assertEqual(
            decode_document_content(latin1_bytes),
            latin1_bytes.decode("latin-1"),
        )

        # other character sets are just treated as latin1 (bug? feature? you decide)
        self.assertEqual(
            decode_document_content(other_bytes),
            other_bytes.decode("latin-1"),
        )
@@ -263,3 +263,21 @@ def parse_unicode(text): | |||
| 263 | 263 | else: | |
| 264 | 264 | text = decoded_string | |
| 265 | 265 | return text | |
| 266 | + | ||
| 267 | + | ||
def decode_document_content(content: bytes) -> str:
    """Decode document contents as utf-8 or latin1

    Method was developed in DocumentInfo.text() where it gave acceptable results
    for existing documents / RFCs.

    Decoding rules:

    * Valid UTF-8 is decoded as UTF-8.
    * UTF-8 that stops part-way through its final multi-byte character (for
      example because only a prefix of the file was read) is decoded up to,
      but not including, that incomplete character.
    * Anything else is decoded as latin-1, which never fails.

    Trailing bytes are dropped only when they really are the start of an
    unfinished UTF-8 sequence. Simply retrying with 1-3 bytes removed would
    also "succeed" for non-UTF-8 input whose only offending bytes are near the
    end (e.g. b"caf\\xe9\\n") and silently lose that content.

    Note that a final byte which could begin a multi-byte UTF-8 sequence cannot
    be told apart from a truncated read, so it is still dropped.

    :param content: raw bytes of the document
    :return: the decoded text
    """
    # Local import so this function does not depend on the module's import block.
    import codecs

    # With final=False (the default) an incremental decoder buffers an
    # incomplete trailing sequence instead of raising, but still raises
    # UnicodeDecodeError for bytes that can never be valid UTF-8.
    decoder = codecs.getincrementaldecoder("utf-8")()
    try:
        return decoder.decode(content)
    except UnicodeDecodeError:
        return content.decode("latin-1")  # everything is legal in latin-1
0 commit comments