From 716bff0d7f8fe728baf35584a4742700a8aa04d8 Mon Sep 17 00:00:00 2001 From: Dave Shawley Date: Thu, 7 Oct 2021 06:44:38 -0400 Subject: [PATCH 01/11] Add application/x-www-formurlencoded transcoder. I ended up not using urllib.parse functions since they do not implement the specification. The only difference is that the specification requires that "~" is encoded. NB - this commit is incomplete since it does not handle calling the transcoder on simple objects. mypy will quite correctly fail. --- docs/api.rst | 6 + docs/history.rst | 3 + setup.cfg | 3 + sprockets/mixins/mediatype/transcoders.py | 165 ++++++++++++++++++++++ tests.py | 103 +++++++++++++- 5 files changed, 279 insertions(+), 1 deletion(-) diff --git a/docs/api.rst b/docs/api.rst index b1b0f87..c324972 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -54,6 +54,12 @@ Bundled Transcoders .. autoclass:: MsgPackTranscoder :members: +.. autoclass:: FormUrlEncodedTranscoder + :members: + +.. autoclass:: FormUrlEncodingOptions + :members: + .. _type-info: Python Type Information diff --git a/docs/history.rst b/docs/history.rst index 818ea42..f02c664 100644 --- a/docs/history.rst +++ b/docs/history.rst @@ -3,12 +3,15 @@ Version History :compare:`Next <3.0.4...master>` -------------------------------- +- Add a transcoder for `application/x-www-formurlencoded`_ - Add type annotations (see :ref:`type-info`) - Return a "406 Not Acceptable" if the :http:header:`Accept` header values cannot be matched and there is no default content type configured - Deprecate not having a default content type configured - Fail gracefully when a transcoder does not exist for the default content type +.. 
_application/x-www-formurlencoded: https://url.spec.whatwg.org/#application/x-www-form-urlencoded + :compare:`3.0.4 <3.0.3...3.0.4>` (2 Nov 2020) --------------------------------------------- - Return a "400 Bad Request" when an invalid Content-Type header is received diff --git a/setup.cfg b/setup.cfg index 7086654..bdd866b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -87,3 +87,6 @@ exclude = build,env,.eggs [mypy] mypy_path = typestubs strict = True + +[yapf] +allow_split_before_dict_value = False diff --git a/sprockets/mixins/mediatype/transcoders.py b/sprockets/mixins/mediatype/transcoders.py index 1f8d2aa..c86891a 100644 --- a/sprockets/mixins/mediatype/transcoders.py +++ b/sprockets/mixins/mediatype/transcoders.py @@ -3,13 +3,17 @@ Bundled media type transcoders. - :class:`.JSONTranscoder` implements JSON encoding/decoding - :class:`.MsgPackTranscoder` implements msgpack encoding/decoding +- :class:`.FormUrlEncodedTranscoder` implements the venerable form encoding """ from __future__ import annotations import base64 +import dataclasses import json +import string import typing +import urllib.parse import uuid import collections.abc @@ -21,6 +25,14 @@ except ImportError: # pragma: no cover from sprockets.mixins.mediatype import handlers, type_info +_FORM_URLENCODING = {c: '%{:02X}'.format(c) for c in range(0, 255)} +_FORM_URLENCODING.update({ord(c): c for c in string.ascii_letters}) +_FORM_URLENCODING.update({ord(c): c for c in string.digits}) +_FORM_URLENCODING.update({ord(c): c for c in '*-_.'}) + +_FORM_URLENCODING_PLUS = _FORM_URLENCODING.copy() +_FORM_URLENCODING_PLUS[ord(' ')] = '+' + class JSONTranscoder(handlers.TextContentHandler): """ @@ -238,3 +250,156 @@ class MsgPackTranscoder(handlers.BinaryContentHandler): raise TypeError('{} is not msgpackable'.format( datum.__class__.__name__)) + + +@dataclasses.dataclass +class FormUrlEncodingOptions: + """Configuration knobs for :class:`.FormUrlEncodedTranscoder`""" + encoding: str = 'utf-8' + """Encoding use 
when generating the byte stream from character data.""" + + literal_mapping: dict[typing.Literal[None, True, False], + str] = dataclasses.field(default_factory=lambda: { + None: '', + True: 'true', + False: 'false' + }) + """Mapping from supported literal values to strings.""" + + space_as_plus: bool = False + """Quote spaces as ``%20`` or ``+``.""" + + +class FormUrlEncodedTranscoder: + """Opinionated transcoder for the venerable x-www-formurlencoded. + + This transcoder implements transcoding according to the current + W3C documentation. + + * character strings are encoded as UTF-8 codepoints before + percent-encoding the resulting bytes + * the space character is represented as ``%20`` + * :data:`False` is represented as ``false`` + * :data:`True` is represented as ``true`` + * :data:`None` is represented as the empty string + + Some of the opinions can be changed by modifying ``self.options``. + + https://url.spec.whatwg.org/#application/x-www-form-urlencoded + + .. attribute:: options + :type: FormUrlEncodingOptions + + Controls the behavior of the transcoder + + """ + content_type = 'application/x-www-formurlencoded' + + def __init__(self) -> None: + self.options = FormUrlEncodingOptions() + + def to_bytes( + self, + inst_data: type_info.Serializable, + encoding: typing.Optional[str] = None) -> typing.Tuple[str, bytes]: + """Serialize `inst_data` into a byte stream and content type spec. + + :param inst_data: the data to serialize + :param encoding: optional encoding override + + Serialization is implemented as described in the W3C + `urlencoded serialization`_ algorithm. The :attr:`.options` + attribute controls the configurable details of the encoding + process. + + The character encoding can be further overridden by specifying the + `encoding` parameter. + + :returns: tuple of the content type and the resulting bytes + :raises: :exc:`TypeError` if a supplied value cannot be serialized + + .. 
_urlencoded serialization: https://url.spec.whatwg.org/ + #urlencoded-serializing + + """ + # Generate a sequence of name+value tuples to encode + if isinstance(inst_data, collections.abc.Mapping): + tuples = ((self._normalize(a), self._normalize(b)) + for a, b in inst_data.items()) + else: + tuples = ((self._normalize(a), self._normalize(b)) + for a, b in inst_data) + + # Encode each pair and run the encoded form through the + # appropriate octet to string mapping table + chr_map: typing.Mapping[int, str] + chr_map = (_FORM_URLENCODING_PLUS + if self.options.space_as_plus else _FORM_URLENCODING) + if encoding is None: + encoding = self.options.encoding + prefix = '' # micro-optimization removes if statement from inner loop + buf = [] + for name, value in tuples: + buf.append(prefix) + buf.extend(chr_map[c] for c in name.encode(encoding)) + buf.append('=') + buf.extend(chr_map[c] for c in value.encode(encoding)) + prefix = '&' + + return self.content_type, ''.join(buf).encode('ascii') + + def from_bytes( + self, + data_bytes: bytes, + encoding: typing.Optional[str] = None) -> type_info.Deserialized: + """Deserialize `bytes` into a Python object instance. + + :param data_bytes: byte string to deserialize + :param encoding: optional encoding override + + Deserialization is implemented according to the W3C + `urlencoded deserialization`_ algorithm. The :attr:`.options` + attribute controls the configurable details of the encoding + process. + + :returns: the decoded Python object + + .. 
_urlencoded deserialization: https://url.spec.whatwg.org/ + #urlencoded-parsing + + """ + dequote = (urllib.parse.unquote_plus + if self.options.space_as_plus else urllib.parse.unquote) + if encoding is None: + encoding = self.options.encoding + + output = [] + for part in data_bytes.decode('ascii').split('&'): + if not part: + continue + name, eq_present, value = part.partition('=') + name = dequote(name, encoding=encoding) + if eq_present: + output.append((name, dequote(value, encoding=encoding))) + else: + output.append((name, '')) + + return dict(output) + + def _normalize( + self, datum: typing.Union[bool, None, float, int, str, + type_info.DefinesIsoFormat] + ) -> str: + try: + datum = self.options.literal_mapping[datum] # type: ignore + except (KeyError, TypeError): + if isinstance(datum, (float, int, str)): + datum = str(datum) + elif hasattr(datum, 'isoformat'): + datum = datum.isoformat() + else: + raise TypeError( + f'{datum.__class__.__name__} is not serializable' + ) from None + + return datum diff --git a/tests.py b/tests.py index f2ea89a..4ad5eca 100644 --- a/tests.py +++ b/tests.py @@ -1,6 +1,7 @@ import base64 import datetime import json +import math import os import pickle import struct @@ -12,7 +13,8 @@ from ietfparse import algorithms from tornado import httputil, testing, web import umsgpack -from sprockets.mixins.mediatype import content, handlers, transcoders +from sprockets.mixins.mediatype import (content, handlers, transcoders, + type_info) import examples @@ -520,3 +522,102 @@ class MsgPackTranscoderTests(unittest.TestCase): new_callable=lambda: None): with self.assertRaises(RuntimeError): transcoders.MsgPackTranscoder() + + +class FormUrlEncodingTranscoderTests(unittest.TestCase): + transcoder: type_info.Transcoder + + def setUp(self): + super().setUp() + self.transcoder = transcoders.FormUrlEncodedTranscoder() + + def test_simple_deserialization(self): + body = self.transcoder.from_bytes( + 
b'number=12&boolean=true&null=null&string=anything%20really&empty=' + ) + self.assertEqual(body['number'], '12') + self.assertEqual(body['boolean'], 'true') + self.assertEqual(body['empty'], '') + self.assertEqual(body['null'], 'null') + self.assertEqual(body['string'], 'anything really') + + def test_deserialization_edge_cases(self): + body = self.transcoder.from_bytes(b'') + self.assertEqual({}, body) + + body = self.transcoder.from_bytes(b'&') + self.assertEqual({}, body) + + body = self.transcoder.from_bytes(b'empty&&=no-name&no-value=') + self.assertEqual({'empty': '', '': 'no-name', 'no-value': ''}, body) + + body = self.transcoder.from_bytes(b'repeated=1&repeated=2') + self.assertEqual({'repeated': '2'}, body) + + def test_that_deserialization_encoding_can_be_overridden(self): + body = self.transcoder.from_bytes(b'kolor=%bf%F3%b3ty', + encoding='iso-8859-2') + self.assertEqual({'kolor': 'żółty'}, body) + + def test_simple_serialization(self): + now = datetime.datetime.now() + content_type, result = self.transcoder.to_bytes({ + 'integer': 12, + 'float': math.pi, + 'string': 'percent quoted', + 'datetime': now, + }) + self.assertEqual(content_type, 'application/x-www-formurlencoded') + self.assertEqual( + result.decode(), '&'.join([ + 'integer=12', + f'float={math.pi}', + 'string=percent%20quoted', + 'datetime=' + now.isoformat().replace(':', '%3A'), + ])) + + def test_that_serialization_encoding_can_be_overridden(self): + _, result = self.transcoder.to_bytes([('kolor', 'żółty')], + encoding='iso-8859-2') + self.assertEqual(b'kolor=%bf%f3%b3ty', result.lower()) + + def test_serialization_edge_cases(self): + _, result = self.transcoder.to_bytes([ + ('', ''), + ('', True), + ('', False), + ('', None), + ('name', None), + ]) + self.assertEqual(b'=&=true&=false&=&name=', result) + + def test_serialization_using_plusses(self): + self.transcoder: transcoders.FormUrlEncodedTranscoder + + self.transcoder.options.space_as_plus = True + _, result = 
self.transcoder.to_bytes({'value': 'with space'}) + self.assertEqual(b'value=with+space', result) + + self.transcoder.options.space_as_plus = False + _, result = self.transcoder.to_bytes({'value': 'with space'}) + self.assertEqual(b'value=with%20space', result) + + def test_that_serializing_unsupported_types_fails(self): + with self.assertRaises(TypeError): + self.transcoder.to_bytes({'unsupported': object()}) + + def test_that_required_octets_are_encoded(self): + # build the set of all characters required to be encoded by + # https://url.spec.whatwg.org/#percent-encoded-bytes + pct_chrs = typing.cast(typing.Set[str], set()) + pct_chrs.update({c for c in ' "#<>'}) # query set + pct_chrs.update({c for c in '?`{}'}) # path set + pct_chrs.update({c for c in '/:;=@[^|'}) # userinfo set + pct_chrs.update({c for c in '$%&+,'}) # component set + pct_chrs.update({c for c in "!'()~"}) # formurlencoding set + + test_string = ''.join(pct_chrs) + expected = ''.join('%{:02X}'.format(ord(c)) for c in test_string) + expected = f'test_string={expected}'.encode() + _, result = self.transcoder.to_bytes({'test_string': test_string}) + self.assertEqual(expected, result) From 3aa08b74354b414ec0f9adfa6dcf31499cb735aa Mon Sep 17 00:00:00 2001 From: Dave Shawley Date: Fri, 8 Oct 2021 06:53:41 -0400 Subject: [PATCH 02/11] Implement serialization of primitive values. This isn't covered by the serialization specification but is accounted for in the deserialization side. 
--- sprockets/mixins/mediatype/transcoders.py | 41 +++++++++++++---------- sprockets/mixins/mediatype/type_info.py | 4 +++ tests.py | 15 +++++++++ 3 files changed, 43 insertions(+), 17 deletions(-) diff --git a/sprockets/mixins/mediatype/transcoders.py b/sprockets/mixins/mediatype/transcoders.py index c86891a..c28042c 100644 --- a/sprockets/mixins/mediatype/transcoders.py +++ b/sprockets/mixins/mediatype/transcoders.py @@ -322,28 +322,34 @@ class FormUrlEncodedTranscoder: #urlencoded-serializing """ - # Generate a sequence of name+value tuples to encode - if isinstance(inst_data, collections.abc.Mapping): - tuples = ((self._normalize(a), self._normalize(b)) - for a, b in inst_data.items()) - else: - tuples = ((self._normalize(a), self._normalize(b)) - for a, b in inst_data) - - # Encode each pair and run the encoded form through the - # appropriate octet to string mapping table + # Select the appropriate encoding table and use the default + # character encoding if necessary. Binding these to locals + # removes branches from the inner loop. 
chr_map: typing.Mapping[int, str] chr_map = (_FORM_URLENCODING_PLUS if self.options.space_as_plus else _FORM_URLENCODING) if encoding is None: encoding = self.options.encoding + + # Generate a sequence of name+value tuples to encode + if isinstance(inst_data, type_info.SerializablePrimitives): + encoded = self._encode(inst_data, chr_map, encoding) + return self.content_type, encoded.encode('ascii') + + if isinstance(inst_data, collections.abc.Mapping): + tuples = inst_data.items() + else: + tuples = inst_data + + # Encode each pair and run the encoded form through the + # appropriate octet to string mapping table prefix = '' # micro-optimization removes if statement from inner loop buf = [] for name, value in tuples: buf.append(prefix) - buf.extend(chr_map[c] for c in name.encode(encoding)) + buf.extend(self._encode(name, chr_map, encoding)) buf.append('=') - buf.extend(chr_map[c] for c in value.encode(encoding)) + buf.extend(self._encode(value, chr_map, encoding)) prefix = '&' return self.content_type, ''.join(buf).encode('ascii') @@ -386,10 +392,9 @@ class FormUrlEncodedTranscoder: return dict(output) - def _normalize( - self, datum: typing.Union[bool, None, float, int, str, - type_info.DefinesIsoFormat] - ) -> str: + def _encode(self, datum: typing.Union[bool, None, float, int, str, + type_info.DefinesIsoFormat], + char_map: typing.Mapping[int, str], encoding: str) -> str: try: datum = self.options.literal_mapping[datum] # type: ignore except (KeyError, TypeError): @@ -397,9 +402,11 @@ class FormUrlEncodedTranscoder: datum = str(datum) elif hasattr(datum, 'isoformat'): datum = datum.isoformat() + elif isinstance(datum, (bytearray, bytes, memoryview)): + return ''.join(char_map[c] for c in datum) else: raise TypeError( f'{datum.__class__.__name__} is not serializable' ) from None - return datum + return ''.join(char_map[c] for c in datum.encode(encoding)) diff --git a/sprockets/mixins/mediatype/type_info.py b/sprockets/mixins/mediatype/type_info.py index 
06e0e72..4b63a3a 100644 --- a/sprockets/mixins/mediatype/type_info.py +++ b/sprockets/mixins/mediatype/type_info.py @@ -24,6 +24,10 @@ class HasSettings(Protocol): """Application settings.""" +SerializablePrimitives = (type(None), bool, bytearray, bytes, float, int, + memoryview, str, uuid.UUID) +"""Use this with isinstance to identify simple values.""" + Serializable = typing.Union[DefinesIsoFormat, None, bool, bytearray, bytes, float, int, memoryview, str, typing.Mapping, typing.Sequence, typing.Set, uuid.UUID] diff --git a/tests.py b/tests.py index 4ad5eca..dd6d846 100644 --- a/tests.py +++ b/tests.py @@ -621,3 +621,18 @@ class FormUrlEncodingTranscoderTests(unittest.TestCase): expected = f'test_string={expected}'.encode() _, result = self.transcoder.to_bytes({'test_string': test_string}) self.assertEqual(expected, result) + + def test_serialization_of_primitives(self): + expectations = { + None: b'', + 'a string': b'a%20string', + 10: b'10', + 2.3: str(2.3).encode(), + True: b'true', + False: b'false', + b'\xfe\xed\xfa\xce': b'%FE%ED%FA%CE', + memoryview(b'\xfe\xed\xfa\xce'): b'%FE%ED%FA%CE', + } + for value, expected in expectations.items(): + _, result = self.transcoder.to_bytes(value) + self.assertEqual(expected, result) From e99d41a8b4141d2bc6faf157bd6f8f0cd8784a4d Mon Sep 17 00:00:00 2001 From: Dave Shawley Date: Fri, 8 Oct 2021 07:03:14 -0400 Subject: [PATCH 03/11] Refuse to serialize None, True, False. If someone explicitly removes the literal mappings, then refuse to serialize None, True, and False instead of guessing. 
Note that I explicitly included True and False in the branch logic since bool is a subclass of int but str(True) != str(int(True)) :/ --- sprockets/mixins/mediatype/transcoders.py | 4 ++++ tests.py | 7 +++++++ 2 files changed, 11 insertions(+) diff --git a/sprockets/mixins/mediatype/transcoders.py b/sprockets/mixins/mediatype/transcoders.py index c28042c..54428c1 100644 --- a/sprockets/mixins/mediatype/transcoders.py +++ b/sprockets/mixins/mediatype/transcoders.py @@ -398,6 +398,10 @@ class FormUrlEncodedTranscoder: try: datum = self.options.literal_mapping[datum] # type: ignore except (KeyError, TypeError): + if datum in {None, True, False}: + raise TypeError( + f'{datum.__class__.__name__} is not serializable' + ) from None if isinstance(datum, (float, int, str)): datum = str(datum) elif hasattr(datum, 'isoformat'): diff --git a/tests.py b/tests.py index dd6d846..2a797e0 100644 --- a/tests.py +++ b/tests.py @@ -636,3 +636,10 @@ class FormUrlEncodingTranscoderTests(unittest.TestCase): for value, expected in expectations.items(): _, result = self.transcoder.to_bytes(value) self.assertEqual(expected, result) + + def test_serialization_with_empty_literal_map(self): + self.transcoder: transcoders.FormUrlEncodedTranscoder + self.transcoder.options.literal_mapping.clear() + for value in {None, True, False}: + with self.assertRaises(TypeError): + self.transcoder.to_bytes(value) From 4fd3864c04e337053141149469cade566188c162 Mon Sep 17 00:00:00 2001 From: Dave Shawley Date: Fri, 8 Oct 2021 07:50:06 -0400 Subject: [PATCH 04/11] Rejected serialization of non-pair sequences. This required a bit of change in _encode to detect non-pair sequences that are also not strings or byte strings. 
--- sprockets/mixins/mediatype/transcoders.py | 54 ++++++++++++++--------- tests.py | 8 +++- 2 files changed, 40 insertions(+), 22 deletions(-) diff --git a/sprockets/mixins/mediatype/transcoders.py b/sprockets/mixins/mediatype/transcoders.py index 54428c1..07a4d78 100644 --- a/sprockets/mixins/mediatype/transcoders.py +++ b/sprockets/mixins/mediatype/transcoders.py @@ -323,36 +323,35 @@ class FormUrlEncodedTranscoder: """ # Select the appropriate encoding table and use the default - # character encoding if necessary. Binding these to locals - # removes branches from the inner loop. + # character encoding if necessary. Binding these to local + # names removes branches from the inner loop. chr_map: typing.Mapping[int, str] chr_map = (_FORM_URLENCODING_PLUS if self.options.space_as_plus else _FORM_URLENCODING) if encoding is None: encoding = self.options.encoding - # Generate a sequence of name+value tuples to encode - if isinstance(inst_data, type_info.SerializablePrimitives): - encoded = self._encode(inst_data, chr_map, encoding) - return self.content_type, encoded.encode('ascii') + # Generate a sequence of name+value tuples to encode or + # directly encode primitives + try: + tuples = self._convert_to_tuple_sequence(inst_data) + except TypeError: + # hopefully this is a primitive ... 
if not then the + # call to _encode will fail below + tuples = [(inst_data, None)] - if isinstance(inst_data, collections.abc.Mapping): - tuples = inst_data.items() - else: - tuples = inst_data - - # Encode each pair and run the encoded form through the - # appropriate octet to string mapping table - prefix = '' # micro-optimization removes if statement from inner loop + prefix = '' # another micro-optimization buf = [] for name, value in tuples: buf.append(prefix) buf.extend(self._encode(name, chr_map, encoding)) - buf.append('=') - buf.extend(self._encode(value, chr_map, encoding)) + if value is not None: + buf.append('=') + buf.extend(self._encode(value, chr_map, encoding)) prefix = '&' + encoded = ''.join(buf) - return self.content_type, ''.join(buf).encode('ascii') + return self.content_type, encoded.encode('ascii') def from_bytes( self, @@ -398,19 +397,32 @@ class FormUrlEncodedTranscoder: try: datum = self.options.literal_mapping[datum] # type: ignore except (KeyError, TypeError): - if datum in {None, True, False}: + if isinstance(datum, (bytearray, bytes, memoryview)): + return ''.join(char_map[c] for c in datum) + + if datum is None or isinstance(datum, bool): raise TypeError( f'{datum.__class__.__name__} is not serializable' ) from None + if isinstance(datum, (float, int, str)): datum = str(datum) - elif hasattr(datum, 'isoformat'): + elif datum is not None and hasattr(datum, 'isoformat'): datum = datum.isoformat() - elif isinstance(datum, (bytearray, bytes, memoryview)): - return ''.join(char_map[c] for c in datum) else: raise TypeError( f'{datum.__class__.__name__} is not serializable' ) from None return ''.join(char_map[c] for c in datum.encode(encoding)) + + @staticmethod + def _convert_to_tuple_sequence( + value: type_info.Serializable + ) -> typing.Iterable[typing.Tuple[typing.Any, typing.Any]]: + if isinstance(value, collections.abc.Mapping): + return value.items() + try: + return [(a, b) for a, b in value] # type: ignore + except (TypeError, 
ValueError): + raise TypeError diff --git a/tests.py b/tests.py index 2a797e0..553d3f1 100644 --- a/tests.py +++ b/tests.py @@ -589,7 +589,7 @@ class FormUrlEncodingTranscoderTests(unittest.TestCase): ('', None), ('name', None), ]) - self.assertEqual(b'=&=true&=false&=&name=', result) + self.assertEqual(b'=&=true&=false&&name', result) def test_serialization_using_plusses(self): self.transcoder: transcoders.FormUrlEncodedTranscoder @@ -643,3 +643,9 @@ class FormUrlEncodingTranscoderTests(unittest.TestCase): for value in {None, True, False}: with self.assertRaises(TypeError): self.transcoder.to_bytes(value) + + def test_serialization_of_sequences(self): + sequence = [[1, 2, 3], {1, 2, 3}, (1, 2, 3)] + for value in sequence: + with self.assertRaises(TypeError): + self.transcoder.to_bytes(value) From 25ec09972c9bb2b79870b4ec9621b99b80603a71 Mon Sep 17 00:00:00 2001 From: Dave Shawley Date: Sat, 9 Oct 2021 07:48:44 -0400 Subject: [PATCH 05/11] Add uuid support to FormUrlEncodedTranscoder. --- sprockets/mixins/mediatype/transcoders.py | 2 +- tests.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/sprockets/mixins/mediatype/transcoders.py b/sprockets/mixins/mediatype/transcoders.py index 07a4d78..866445e 100644 --- a/sprockets/mixins/mediatype/transcoders.py +++ b/sprockets/mixins/mediatype/transcoders.py @@ -405,7 +405,7 @@ class FormUrlEncodedTranscoder: f'{datum.__class__.__name__} is not serializable' ) from None - if isinstance(datum, (float, int, str)): + if isinstance(datum, (float, int, str, uuid.UUID)): datum = str(datum) elif datum is not None and hasattr(datum, 'isoformat'): datum = datum.isoformat() diff --git a/tests.py b/tests.py index 553d3f1..9f5a28a 100644 --- a/tests.py +++ b/tests.py @@ -561,11 +561,13 @@ class FormUrlEncodingTranscoderTests(unittest.TestCase): def test_simple_serialization(self): now = datetime.datetime.now() + id_val = uuid.uuid4() content_type, result = self.transcoder.to_bytes({ 'integer': 12, 'float': 
math.pi, 'string': 'percent quoted', 'datetime': now, + 'id': id_val, }) self.assertEqual(content_type, 'application/x-www-formurlencoded') self.assertEqual( @@ -574,6 +576,7 @@ class FormUrlEncodingTranscoderTests(unittest.TestCase): f'float={math.pi}', 'string=percent%20quoted', 'datetime=' + now.isoformat().replace(':', '%3A'), + f'id={id_val}', ])) def test_that_serialization_encoding_can_be_overridden(self): @@ -623,6 +626,7 @@ class FormUrlEncodingTranscoderTests(unittest.TestCase): self.assertEqual(expected, result) def test_serialization_of_primitives(self): + id_val = uuid.uuid4() expectations = { None: b'', 'a string': b'a%20string', @@ -632,6 +636,7 @@ class FormUrlEncodingTranscoderTests(unittest.TestCase): False: b'false', b'\xfe\xed\xfa\xce': b'%FE%ED%FA%CE', memoryview(b'\xfe\xed\xfa\xce'): b'%FE%ED%FA%CE', + id_val: str(id_val).encode(), } for value, expected in expectations.items(): _, result = self.transcoder.to_bytes(value) From ee66c5cadb3fc260ad82c5b4ae03fa04c6ddcbb8 Mon Sep 17 00:00:00 2001 From: Dave Shawley Date: Wed, 13 Oct 2021 06:49:07 -0400 Subject: [PATCH 06/11] Optimize the URL encoding inner loop. I rearranged the _encode method branching a bit. This resulted in a little more than a 12% gain. Still slower than JSON or msgpack but it's in python instead of C. 
--- sprockets/mixins/mediatype/transcoders.py | 36 +++++++++++------------ 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/sprockets/mixins/mediatype/transcoders.py b/sprockets/mixins/mediatype/transcoders.py index 866445e..6869a66 100644 --- a/sprockets/mixins/mediatype/transcoders.py +++ b/sprockets/mixins/mediatype/transcoders.py @@ -394,25 +394,23 @@ class FormUrlEncodedTranscoder: def _encode(self, datum: typing.Union[bool, None, float, int, str, type_info.DefinesIsoFormat], char_map: typing.Mapping[int, str], encoding: str) -> str: - try: - datum = self.options.literal_mapping[datum] # type: ignore - except (KeyError, TypeError): - if isinstance(datum, (bytearray, bytes, memoryview)): - return ''.join(char_map[c] for c in datum) - - if datum is None or isinstance(datum, bool): - raise TypeError( - f'{datum.__class__.__name__} is not serializable' - ) from None - - if isinstance(datum, (float, int, str, uuid.UUID)): - datum = str(datum) - elif datum is not None and hasattr(datum, 'isoformat'): - datum = datum.isoformat() - else: - raise TypeError( - f'{datum.__class__.__name__} is not serializable' - ) from None + if isinstance(datum, str): + pass # optimization: skip additional checks for strings + elif datum in self.options.literal_mapping: + datum = self.options.literal_mapping[datum] + elif isinstance(datum, (bytearray, bytes, memoryview)): + return ''.join(char_map[c] for c in datum) + elif datum is None or isinstance(datum, bool): + # This could happen if the user modifies the literal mapping + # and MUST be before the isinstance(datum, int) check since + # Boolean literals are integers instances + raise TypeError(f'{datum.__class__.__name__} is not serializable') + elif isinstance(datum, (float, int, str, uuid.UUID)): + datum = str(datum) + elif hasattr(datum, 'isoformat'): + datum = datum.isoformat() + else: + raise TypeError(f'{datum.__class__.__name__} is not serializable') return ''.join(char_map[c] for c in datum.encode(encoding)) 
From 2df3aade3ca5f2bf89d9b2a115bc2840975e1907 Mon Sep 17 00:00:00 2001 From: Dave Shawley Date: Thu, 14 Oct 2021 07:03:59 -0400 Subject: [PATCH 07/11] Improve documentation for FormUrlEncodedTranscoder. --- sprockets/mixins/mediatype/transcoders.py | 46 ++++++++++++++++++----- 1 file changed, 36 insertions(+), 10 deletions(-) diff --git a/sprockets/mixins/mediatype/transcoders.py b/sprockets/mixins/mediatype/transcoders.py index 6869a66..90b79df 100644 --- a/sprockets/mixins/mediatype/transcoders.py +++ b/sprockets/mixins/mediatype/transcoders.py @@ -274,19 +274,45 @@ class FormUrlEncodedTranscoder: """Opinionated transcoder for the venerable x-www-formurlencoded. This transcoder implements transcoding according to the current - W3C documentation. + W3C documentation. The encoding interface takes mappings or + sequences of pairs and encodes both the name and value. The + following list describes how each supported type is encoded. + Any value type that is not on the list will result in a + :exc:`TypeError`. - * character strings are encoded as UTF-8 codepoints before - percent-encoding the resulting bytes - * the space character is represented as ``%20`` - * :data:`False` is represented as ``false`` - * :data:`True` is represented as ``true`` - * :data:`None` is represented as the empty string - - Some of the opinions can be changed by modifying ``self.options``. 
+ +----------------------------+---------------------------------------+ + | Value / Type | Encoding | + +============================+=======================================+ + | character strings | UTF-8 codepoints before percent- | + | | encoding the resulting bytes | + +----------------------------+---------------------------------------+ + | space character | ``%20`` or ``+`` | + +----------------------------+---------------------------------------+ + | :data:`False` | ``false`` | + +----------------------------+---------------------------------------+ + | :data:`True` | ``true`` | + +----------------------------+---------------------------------------+ + | :data:`None` | the empty string | + +----------------------------+---------------------------------------+ + | numbers | ``str(n)`` | + +----------------------------+---------------------------------------+ + | byte sequences | percent-encoded bytes | + +----------------------------+---------------------------------------+ + | :class:`uuid.UUID` | ``str(u)`` | + +----------------------------+---------------------------------------+ + | :class:`datetime.datetime` | result of calling | + | | :meth:`~datetime.datetime.isoformat` | + +----------------------------+---------------------------------------+ https://url.spec.whatwg.org/#application/x-www-form-urlencoded + .. warning:: + + Types that are not explicitly mentioned above will result in + :meth:`to_bytes` raising a :exc:`TypeError`. This transcoder + differs slightly from others in that it does not include + support for encoding values that are nested collections. + .. 
attribute:: options :type: FormUrlEncodingOptions @@ -423,4 +449,4 @@ class FormUrlEncodedTranscoder: try: return [(a, b) for a, b in value] # type: ignore except (TypeError, ValueError): - raise TypeError + raise TypeError('Cannot convert value to sequence of tuples') From 1fe22df71944ff8763df930d807e634002355c45 Mon Sep 17 00:00:00 2001 From: Dave Shawley Date: Thu, 14 Oct 2021 07:06:07 -0400 Subject: [PATCH 08/11] Gracefully handle transcoder encoding failures. Since the form encoder refuses to handle nested sequences, the content mixin explicitly handles this case instead of letting the unhandled exception bubble up. --- docs/history.rst | 2 ++ sprockets/mixins/mediatype/content.py | 17 +++++++++++----- tests.py | 29 +++++++++++++++++++++++++++ 3 files changed, 43 insertions(+), 5 deletions(-) diff --git a/docs/history.rst b/docs/history.rst index f02c664..4dea9bc 100644 --- a/docs/history.rst +++ b/docs/history.rst @@ -9,6 +9,8 @@ Version History and there is no default content type configured - Deprecate not having a default content type configured - Fail gracefully when a transcoder does not exist for the default content type +- Fail gracefully when a transcoder raises a :exc:`TypeError` or :exc:`ValueError` when encoding + the response .. 
_application/x-www-formurlencoded: https://url.spec.whatwg.org/#application/x-www-form-urlencoded diff --git a/sprockets/mixins/mediatype/content.py b/sprockets/mixins/mediatype/content.py index 2ba01a1..2a27d3f 100644 --- a/sprockets/mixins/mediatype/content.py +++ b/sprockets/mixins/mediatype/content.py @@ -423,8 +423,15 @@ class ContentMixin(web.RequestHandler): settings.default_content_type) raise web.HTTPError(500) else: - content_type, data_bytes = handler.to_bytes(body) - if set_content_type: - self.set_header('Content-Type', content_type) - self.add_header('Vary', 'Accept') - self.write(data_bytes) + try: + content_type, data_bytes = handler.to_bytes(body) + except (TypeError, ValueError) as e: + self._logger.error( + 'selected transcoder (%s) failed to encode response ' + 'body: %s', handler.__class__.__name__, e) + raise web.HTTPError(500, reason='Response Encoding Failure') + else: + if set_content_type: + self.set_header('Content-Type', content_type) + self.add_header('Vary', 'Accept') + self.write(data_bytes) diff --git a/tests.py b/tests.py index 9f5a28a..330afd6 100644 --- a/tests.py +++ b/tests.py @@ -177,6 +177,35 @@ class SendResponseTests(testing.AsyncHTTPTestCase): self.assertEqual('application/foo+json', response.headers.get('Content-Type')) + def test_that_transcoder_failures_result_in_500(self): + class FailingTranscoder: + content_type = 'application/vnd.com.example.bad' + + def __init__(self): + self.exc_class = TypeError + + def to_bytes(self, inst_data, encoding=None): + raise self.exc_class('I always fail at this') + + def from_bytes(self, data_bytes, encoding=None): + return {} + + transcoder = FailingTranscoder() + content.add_transcoder(self.application, transcoder) + for _ in range(2): + response = self.fetch( + '/', + method='POST', + body=b'{}', + headers={ + 'Accept': 'application/vnd.com.example.bad', + 'Content-Type': 'application/json', + }, + ) + self.assertEqual(500, response.code) + self.assertEqual('Response Encoding 
Failure', response.reason) + transcoder.exc_class = ValueError + class GetRequestBodyTests(testing.AsyncHTTPTestCase): def setUp(self): From 198e73b6eff29538ac433feda12d603da462bd7a Mon Sep 17 00:00:00 2001 From: Dave Shawley Date: Thu, 14 Oct 2021 07:51:32 -0400 Subject: [PATCH 09/11] Implement form encoding of sequence values. This is off by default to match the `doseq` parameter of urllib.parse.urlencode. --- sprockets/mixins/mediatype/transcoders.py | 41 +++++++++++++++++------ tests.py | 19 +++++++++-- 2 files changed, 48 insertions(+), 12 deletions(-) diff --git a/sprockets/mixins/mediatype/transcoders.py b/sprockets/mixins/mediatype/transcoders.py index 90b79df..ce24f54 100644 --- a/sprockets/mixins/mediatype/transcoders.py +++ b/sprockets/mixins/mediatype/transcoders.py @@ -258,6 +258,9 @@ class FormUrlEncodingOptions: encoding: str = 'utf-8' """Encoding use when generating the byte stream from character data.""" + encode_sequences: bool = False + """Encode sequence values as multiple name=value instances.""" + literal_mapping: dict[typing.Literal[None, True, False], str] = dataclasses.field(default_factory=lambda: { None: '', @@ -311,7 +314,12 @@ class FormUrlEncodedTranscoder: Types that are not explicitly mentioned above will result in :meth:`to_bytes` raising a :exc:`TypeError`. This transcoder differs slightly from others in that it does not include - support for encoding values that are nested collections. + support for encoding values that are nested collections without + explicit configuration. + + Support for sequence values can be enabled by setting the + :attr:`~FormUrlEncodingOptions.encode_sequences` attribute of + :attr:`.options`. .. 
attribute:: options :type: FormUrlEncodingOptions @@ -422,6 +430,9 @@ class FormUrlEncodedTranscoder: char_map: typing.Mapping[int, str], encoding: str) -> str: if isinstance(datum, str): pass # optimization: skip additional checks for strings + elif (isinstance(datum, (float, int, str, uuid.UUID)) + and not isinstance(datum, bool)): + datum = str(datum) elif datum in self.options.literal_mapping: datum = self.options.literal_mapping[datum] elif isinstance(datum, (bytearray, bytes, memoryview)): @@ -431,8 +442,6 @@ class FormUrlEncodedTranscoder: # and MUST be before the isinstance(datum, int) check since # Boolean literals are integers instances raise TypeError(f'{datum.__class__.__name__} is not serializable') - elif isinstance(datum, (float, int, str, uuid.UUID)): - datum = str(datum) elif hasattr(datum, 'isoformat'): datum = datum.isoformat() else: @@ -440,13 +449,25 @@ class FormUrlEncodedTranscoder: return ''.join(char_map[c] for c in datum.encode(encoding)) - @staticmethod def _convert_to_tuple_sequence( - value: type_info.Serializable + self, value: type_info.Serializable ) -> typing.Iterable[typing.Tuple[typing.Any, typing.Any]]: if isinstance(value, collections.abc.Mapping): - return value.items() - try: - return [(a, b) for a, b in value] # type: ignore - except (TypeError, ValueError): - raise TypeError('Cannot convert value to sequence of tuples') + tuples = value.items() + else: + try: + tuples = [(a, b) for a, b in value] # type: ignore + except (TypeError, ValueError): + raise TypeError('Cannot convert value to sequence of tuples') + + if self.options.encode_sequences: + tuples, in_tuples = [], tuples + for a, b in in_tuples: + if (not isinstance(b, (bytes, bytearray, memoryview, str)) + and isinstance(b, collections.abc.Iterable)): + for value in b: + tuples.append((a, value)) + else: + tuples.append((a, b)) + + return tuples diff --git a/tests.py b/tests.py index 330afd6..33dd0f5 100644 --- a/tests.py +++ b/tests.py @@ -679,7 +679,22 @@ class 
FormUrlEncodingTranscoderTests(unittest.TestCase): self.transcoder.to_bytes(value) def test_serialization_of_sequences(self): - sequence = [[1, 2, 3], {1, 2, 3}, (1, 2, 3)] - for value in sequence: + self.transcoder: transcoders.FormUrlEncodedTranscoder + + always_illegal = [[1, 2, 3], {1, 2, 3}, (1, 2, 3)] + + self.transcoder.options.encode_sequences = False + for value in always_illegal: with self.assertRaises(TypeError): self.transcoder.to_bytes(value) + + self.transcoder.options.encode_sequences = True + for value in always_illegal: + with self.assertRaises(TypeError): + self.transcoder.to_bytes(value) + + self.transcoder.options.encode_sequences = True + value = {'list': [1, 2], 'tuple': (1, 2), 'set': {1, 2}, 'str': 'val'} + _, result = self.transcoder.to_bytes(value) + self.assertEqual(b'list=1&list=2&tuple=1&tuple=2&set=1&set=2&str=val', + result) From 7f03f29175ce83aca1982b2f507b88fdc27c264d Mon Sep 17 00:00:00 2001 From: Dave Shawley Date: Thu, 14 Oct 2021 07:58:04 -0400 Subject: [PATCH 10/11] Accept encoding options in FormUrlEncodingOptions. --- sprockets/mixins/mediatype/transcoders.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sprockets/mixins/mediatype/transcoders.py b/sprockets/mixins/mediatype/transcoders.py index ce24f54..cd66df5 100644 --- a/sprockets/mixins/mediatype/transcoders.py +++ b/sprockets/mixins/mediatype/transcoders.py @@ -276,6 +276,9 @@ class FormUrlEncodingOptions: class FormUrlEncodedTranscoder: """Opinionated transcoder for the venerable x-www-formurlencoded. + :param encoding_options: keyword parameters are used to initialize + :class:`FormUrlEncodingOptions` + This transcoder implements transcoding according to the current W3C documentation. The encoding interface takes mappings or sequences of pairs and encodes both the name and value. 
The @@ -329,8 +332,8 @@ class FormUrlEncodedTranscoder: """ content_type = 'application/x-www-formurlencoded' - def __init__(self) -> None: - self.options = FormUrlEncodingOptions() + def __init__(self, **encoding_options) -> None: + self.options = FormUrlEncodingOptions(**encoding_options) def to_bytes( self, From 675ffbdf985484574a9a4ffb6ab23909b6bfca84 Mon Sep 17 00:00:00 2001 From: Dave Shawley Date: Fri, 15 Oct 2021 07:18:01 -0400 Subject: [PATCH 11/11] Change form encoder to default to stringify. This matches what urlencode would do and will remove surprises where a value is supported by the other transcoders but not the form encoder. --- sprockets/mixins/mediatype/transcoders.py | 47 +++++++++++------------ sprockets/mixins/mediatype/type_info.py | 5 ++- tests.py | 28 +++++++------- 3 files changed, 39 insertions(+), 41 deletions(-) diff --git a/sprockets/mixins/mediatype/transcoders.py b/sprockets/mixins/mediatype/transcoders.py index cd66df5..879b705 100644 --- a/sprockets/mixins/mediatype/transcoders.py +++ b/sprockets/mixins/mediatype/transcoders.py @@ -282,9 +282,7 @@ class FormUrlEncodedTranscoder: This transcoder implements transcoding according to the current W3C documentation. The encoding interface takes mappings or sequences of pairs and encodes both the name and value. The - following list describes how each supported type is encoded. - Any value type that is not on the list will result in a - :exc:`TypeError`. + following table describes how each supported type is encoded. +----------------------------+---------------------------------------+ | Value / Type | Encoding | @@ -315,14 +313,16 @@ class FormUrlEncodedTranscoder: .. warning:: Types that are not explicitly mentioned above will result in - :meth:`to_bytes` raising a :exc:`TypeError`. This transcoder - differs slightly from others in that it does not include - support for encoding values that are nested collections without - explicit configuration.
+ :meth:`to_bytes` simply calling ``str(value)`` and encoding + the result. This causes nested sequences to be encoded as + their ``repr``. For example, encoding ``{'a': [1, 2]}`` will + result in ``a=%5B1%2C%202%5D``. This matches what + :func:`urllib.parse.urlencode` does by default. - Support for sequence values can be enabled by setting the - :attr:`~FormUrlEncodingOptions.encode_sequences` attribute of - :attr:`.options`. + Better support for sequence values can be enabled by setting + the :attr:`~FormUrlEncodingOptions.encode_sequences` attribute + of :attr:`.options`. This mimics the ``doseq`` parameter of + :func:`urllib.parse.urlencode`. .. attribute:: options :type: FormUrlEncodingOptions @@ -332,7 +332,7 @@ class FormUrlEncodedTranscoder: """ content_type = 'application/x-www-formurlencoded' - def __init__(self, **encoding_options) -> None: + def __init__(self, **encoding_options: typing.Any) -> None: self.options = FormUrlEncodingOptions(**encoding_options) def to_bytes( @@ -436,25 +436,23 @@ class FormUrlEncodedTranscoder: elif (isinstance(datum, (float, int, str, uuid.UUID)) and not isinstance(datum, bool)): datum = str(datum) - elif datum in self.options.literal_mapping: - datum = self.options.literal_mapping[datum] + elif (isinstance(datum, collections.abc.Hashable) + and datum in self.options.literal_mapping): + # the isinstance Hashable check confuses mypy + datum = self.options.literal_mapping[datum] # type: ignore elif isinstance(datum, (bytearray, bytes, memoryview)): return ''.join(char_map[c] for c in datum) - elif datum is None or isinstance(datum, bool): - # This could happen if the user modifies the literal mapping - # and MUST be before the isinstance(datum, int) check since - # Boolean literals are integers instances - raise TypeError(f'{datum.__class__.__name__} is not serializable') - elif hasattr(datum, 'isoformat'): + elif isinstance(datum, type_info.DefinesIsoFormat): datum = datum.isoformat() else: - raise
TypeError(f'{datum.__class__.__name__} is not serializable') + datum = str(datum) return ''.join(char_map[c] for c in datum.encode(encoding)) def _convert_to_tuple_sequence( self, value: type_info.Serializable ) -> typing.Iterable[typing.Tuple[typing.Any, typing.Any]]: + tuples: typing.Iterable[typing.Tuple[typing.Any, typing.Any]] if isinstance(value, collections.abc.Mapping): tuples = value.items() else: @@ -464,13 +462,14 @@ class FormUrlEncodedTranscoder: raise TypeError('Cannot convert value to sequence of tuples') if self.options.encode_sequences: - tuples, in_tuples = [], tuples - for a, b in in_tuples: + out_tuples = [] + for a, b in tuples: if (not isinstance(b, (bytes, bytearray, memoryview, str)) and isinstance(b, collections.abc.Iterable)): for value in b: - tuples.append((a, value)) + out_tuples.append((a, value)) else: - tuples.append((a, b)) + out_tuples.append((a, b)) + tuples = out_tuples return tuples diff --git a/sprockets/mixins/mediatype/type_info.py b/sprockets/mixins/mediatype/type_info.py index 4b63a3a..53e6f22 100644 --- a/sprockets/mixins/mediatype/type_info.py +++ b/sprockets/mixins/mediatype/type_info.py @@ -4,13 +4,14 @@ import typing import uuid try: - from typing import Protocol + from typing import Protocol, runtime_checkable except ImportError: # "ignore" is required to avoid an incompatible import # error due to different bindings of _SpecialForm - from typing_extensions import Protocol # type: ignore + from typing_extensions import Protocol, runtime_checkable # type: ignore +@runtime_checkable class DefinesIsoFormat(Protocol): """An object that has an isoformat method.""" def isoformat(self) -> str: diff --git a/tests.py b/tests.py index 33dd0f5..550bb2d 100644 --- a/tests.py +++ b/tests.py @@ -634,9 +634,13 @@ class FormUrlEncodingTranscoderTests(unittest.TestCase): _, result = self.transcoder.to_bytes({'value': 'with space'}) self.assertEqual(b'value=with%20space', result) - def 
test_that_serializing_unsupported_types_fails(self): - with self.assertRaises(TypeError): - self.transcoder.to_bytes({'unsupported': object()}) + def test_that_serializing_unsupported_types_stringifies(self): + obj = object() + # quick & dirty URL encoding + expected = str(obj).translate({0x20: '%20', 0x3C: '%3C', 0x3E: '%3E'}) + + _, result = self.transcoder.to_bytes({'unsupported': obj}) + self.assertEqual(f'unsupported={expected}'.encode(), result) def test_that_required_octets_are_encoded(self): # build the set of all characters required to be encoded by @@ -675,26 +679,20 @@ class FormUrlEncodingTranscoderTests(unittest.TestCase): self.transcoder: transcoders.FormUrlEncodedTranscoder self.transcoder.options.literal_mapping.clear() for value in {None, True, False}: - with self.assertRaises(TypeError): - self.transcoder.to_bytes(value) + _, result = self.transcoder.to_bytes(value) + self.assertEqual(str(value).encode(), result) def test_serialization_of_sequences(self): self.transcoder: transcoders.FormUrlEncodedTranscoder - always_illegal = [[1, 2, 3], {1, 2, 3}, (1, 2, 3)] + value = {'list': [1, 2], 'tuple': (1, 2), 'set': {1, 2}, 'str': 'val'} self.transcoder.options.encode_sequences = False - for value in always_illegal: - with self.assertRaises(TypeError): - self.transcoder.to_bytes(value) + _, result = self.transcoder.to_bytes(value) + self.assertEqual((b'list=%5B1%2C%202%5D&tuple=%281%2C%202%29' + b'&set=%7B1%2C%202%7D&str=val'), result) self.transcoder.options.encode_sequences = True - for value in always_illegal: - with self.assertRaises(TypeError): - self.transcoder.to_bytes(value) - - self.transcoder.options.encode_sequences = True - value = {'list': [1, 2], 'tuple': (1, 2), 'set': {1, 2}, 'str': 'val'} _, result = self.transcoder.to_bytes(value) self.assertEqual(b'list=1&list=2&tuple=1&tuple=2&set=1&set=2&str=val', result)