From 266184eaf14c9e9b3d8cd40a0ac6595f9694815f Mon Sep 17 00:00:00 2001 From: Elvis Pranskevichus Date: Mon, 2 Jan 2017 17:43:43 -0500 Subject: [PATCH 1/2] Add infrastructure for multiformat data codecs Current asyncpg codec architecture allows for only one codec per data type. This arrangement works most of the time, but fails utterly to handle situations where text format I/O is forced by the lack of binary I/O support for one of the types in a composite type. This commit prepares for the introduction of text format parsing and makes it possible to associate multiple format-specific codecs with each type. --- asyncpg/introspection.py | 49 ++++++---- asyncpg/protocol/codecs/base.pxd | 10 +- asyncpg/protocol/codecs/base.pyx | 148 +++++++++++++++++++---------- asyncpg/protocol/codecs/record.pyx | 3 +- asyncpg/protocol/settings.pxd | 2 +- asyncpg/protocol/settings.pyx | 11 ++- 6 files changed, 149 insertions(+), 74 deletions(-) diff --git a/asyncpg/introspection.py b/asyncpg/introspection.py index 4554629a..a9050d99 100644 --- a/asyncpg/introspection.py +++ b/asyncpg/introspection.py @@ -7,8 +7,8 @@ INTRO_LOOKUP_TYPES = '''\ WITH RECURSIVE typeinfo_tree( - oid, ns, name, kind, basetype, elemtype, range_subtype, - elem_has_bin_input, elem_has_bin_output, attrtypoids, attrnames, depth) + oid, ns, name, kind, basetype, has_bin_io, elemtype, elemdelim, + range_subtype, elem_has_bin_io, attrtypoids, attrnames, depth) AS ( WITH composite_attrs AS ( @@ -58,10 +58,23 @@ ELSE NULL END) AS basetype, + t.typreceive::oid != 0 AND t.typsend::oid != 0 + AS has_bin_io, t.typelem AS elemtype, + elem_t.typdelim AS elemdelim, range_t.rngsubtype AS range_subtype, - elem_t.typreceive::oid != 0 AS elem_has_bin_input, - elem_t.typsend::oid != 0 AS elem_has_bin_output, + (CASE WHEN t.typtype = 'r' THEN + (SELECT + range_elem_t.typreceive::oid != 0 AND + range_elem_t.typsend::oid != 0 + FROM + pg_catalog.pg_type AS range_elem_t + WHERE + range_elem_t.oid = range_t.rngsubtype) + ELSE + 
elem_t.typreceive::oid != 0 AND + elem_t.typsend::oid != 0 + END) AS elem_has_bin_io, (CASE WHEN t.typtype = 'c' THEN (SELECT ca.typoids FROM composite_attrs AS ca @@ -91,8 +104,8 @@ ) SELECT - ti.oid, ti.ns, ti.name, ti.kind, ti.basetype, ti.elemtype, - ti.range_subtype, ti.elem_has_bin_input, ti.elem_has_bin_output, + ti.oid, ti.ns, ti.name, ti.kind, ti.basetype, ti.has_bin_io, + ti.elemtype, ti.elemdelim, ti.range_subtype, ti.elem_has_bin_io, ti.attrtypoids, ti.attrnames, 0 FROM typeinfo AS ti @@ -102,8 +115,8 @@ UNION ALL SELECT - ti.oid, ti.ns, ti.name, ti.kind, ti.basetype, ti.elemtype, - ti.range_subtype, ti.elem_has_bin_input, ti.elem_has_bin_output, + ti.oid, ti.ns, ti.name, ti.kind, ti.basetype, ti.has_bin_io, + ti.elemtype, ti.elemdelim, ti.range_subtype, ti.elem_has_bin_io, ti.attrtypoids, ti.attrnames, tt.depth + 1 FROM typeinfo ti, @@ -126,8 +139,8 @@ # Prior to 9.2 PostgreSQL did not have range types. INTRO_LOOKUP_TYPES_91 = '''\ WITH RECURSIVE typeinfo_tree( - oid, ns, name, kind, basetype, elemtype, range_subtype, - elem_has_bin_input, elem_has_bin_output, attrtypoids, attrnames, depth) + oid, ns, name, kind, basetype, has_bin_io, elemtype, elemdelim, + range_subtype, elem_has_bin_io, attrtypoids, attrnames, depth) AS ( WITH composite_attrs AS ( @@ -177,10 +190,14 @@ ELSE NULL END) AS basetype, + t.typreceive::oid != 0 AND t.typsend::oid != 0 + AS has_bin_io, t.typelem AS elemtype, + elem_t.typdelim AS elemdelim, NULL::oid AS range_subtype, - elem_t.typreceive::oid != 0 AS elem_has_bin_input, - elem_t.typsend::oid != 0 AS elem_has_bin_output, + elem_t.typreceive::oid != 0 AND + elem_t.typsend::oid != 0 + AS elem_has_bin_io, (CASE WHEN t.typtype = 'c' THEN (SELECT ca.typoids FROM composite_attrs AS ca @@ -207,8 +224,8 @@ ) SELECT - ti.oid, ti.ns, ti.name, ti.kind, ti.basetype, ti.elemtype, - ti.range_subtype, ti.elem_has_bin_input, ti.elem_has_bin_output, + ti.oid, ti.ns, ti.name, ti.kind, ti.basetype, ti.has_bin_io, + ti.elemtype, ti.elemdelim, 
ti.range_subtype, ti.elem_has_bin_io, ti.attrtypoids, ti.attrnames, 0 FROM typeinfo AS ti @@ -218,8 +235,8 @@ UNION ALL SELECT - ti.oid, ti.ns, ti.name, ti.kind, ti.basetype, ti.elemtype, - ti.range_subtype, ti.elem_has_bin_input, ti.elem_has_bin_output, + ti.oid, ti.ns, ti.name, ti.kind, ti.basetype, ti.has_bin_io, + ti.elemtype, ti.elemdelim, ti.range_subtype, ti.elem_has_bin_io, ti.attrtypoids, ti.attrnames, tt.depth + 1 FROM typeinfo ti, diff --git a/asyncpg/protocol/codecs/base.pxd b/asyncpg/protocol/codecs/base.pxd index a650b0b8..02d2d595 100644 --- a/asyncpg/protocol/codecs/base.pxd +++ b/asyncpg/protocol/codecs/base.pxd @@ -32,6 +32,7 @@ cdef enum CodecType: cdef enum CodecFormat: + PG_FORMAT_ANY = -1 PG_FORMAT_TEXT = 0 PG_FORMAT_BINARY = 1 @@ -55,6 +56,7 @@ cdef class Codec: # arrays Codec element_codec + Py_UCS4 element_delimiter # composite types tuple element_type_oids @@ -70,7 +72,8 @@ cdef class Codec: encode_func c_encoder, decode_func c_decoder, object py_encoder, object py_decoder, Codec element_codec, tuple element_type_oids, - object element_names, list element_codecs) + object element_names, list element_codecs, + Py_UCS4 element_delimiter) cdef encode_scalar(self, ConnectionSettings settings, WriteBuffer buf, object obj) @@ -118,7 +121,8 @@ cdef class Codec: cdef Codec new_array_codec(uint32_t oid, str name, str schema, - Codec element_codec) + Codec element_codec, + Py_UCS4 element_delimiter) @staticmethod cdef Codec new_range_codec(uint32_t oid, @@ -149,4 +153,4 @@ cdef class DataCodecConfig: dict _type_codecs_cache dict _local_type_codecs - cdef inline Codec get_codec(self, uint32_t oid) + cdef inline Codec get_codec(self, uint32_t oid, CodecFormat format) diff --git a/asyncpg/protocol/codecs/base.pyx b/asyncpg/protocol/codecs/base.pyx index 9122faf4..207c05e5 100644 --- a/asyncpg/protocol/codecs/base.pyx +++ b/asyncpg/protocol/codecs/base.pyx @@ -5,7 +5,8 @@ # the Apache 2.0 License: http://www.apache.org/licenses/LICENSE-2.0 -cdef void* 
codec_map[MAXSUPPORTEDOID + 1] +cdef void* binary_codec_map[MAXSUPPORTEDOID + 1] +cdef void* text_codec_map[MAXSUPPORTEDOID + 1] cdef dict TYPE_CODECS_CACHE = {} cdef dict EXTRA_CODECS = {} @@ -22,7 +23,8 @@ cdef class Codec: encode_func c_encoder, decode_func c_decoder, object py_encoder, object py_decoder, Codec element_codec, tuple element_type_oids, - object element_names, list element_codecs): + object element_names, list element_codecs, + Py_UCS4 element_delimiter): self.name = name self.schema = schema @@ -36,6 +38,7 @@ cdef class Codec: self.element_codec = element_codec self.element_type_oids = element_type_oids self.element_codecs = element_codecs + self.element_delimiter = element_delimiter if element_names is not None: self.element_names = record.ApgRecordDesc_New( @@ -71,7 +74,7 @@ cdef class Codec: self.py_encoder, self.py_decoder, self.element_codec, self.element_type_oids, self.element_names, - self.element_codecs) + self.element_codecs, self.element_delimiter) return codec @@ -241,11 +244,13 @@ cdef class Codec: cdef Codec new_array_codec(uint32_t oid, str name, str schema, - Codec element_codec): + Codec element_codec, + Py_UCS4 element_delimiter): cdef Codec codec codec = Codec(oid) - codec.init(name, schema, 'array', CODEC_ARRAY, PG_FORMAT_BINARY, - NULL, NULL, None, None, element_codec, None, None, None) + codec.init(name, schema, 'array', CODEC_ARRAY, element_codec.format, + NULL, NULL, None, None, element_codec, None, None, None, + element_delimiter) return codec @staticmethod @@ -256,7 +261,8 @@ cdef class Codec: cdef Codec codec codec = Codec(oid) codec.init(name, schema, 'range', CODEC_RANGE, PG_FORMAT_BINARY, - NULL, NULL, None, None, element_codec, None, None, None) + NULL, NULL, None, None, element_codec, None, None, None, + 0) return codec @staticmethod @@ -270,7 +276,7 @@ cdef class Codec: codec = Codec(oid) codec.init(name, schema, 'composite', CODEC_COMPOSITE, PG_FORMAT_BINARY, NULL, NULL, None, None, None, - element_type_oids, 
element_names, element_codecs) + element_type_oids, element_names, element_codecs, 0) return codec @staticmethod @@ -284,7 +290,7 @@ cdef class Codec: cdef Codec codec codec = Codec(oid) codec.init(name, schema, kind, CODEC_PY, format, NULL, NULL, - encoder, decoder, None, None, None, None) + encoder, decoder, None, None, None, None, 0) return codec @@ -313,11 +319,22 @@ cdef class DataCodecConfig: cdef: Codec elem_codec list comp_elem_codecs + CodecFormat format + CodecFormat elem_format + bint has_text_elements + Py_UCS4 elem_delim for ti in types: oid = ti['oid'] - if self.get_codec(oid) is not None: + if not ti['has_bin_io']: + format = PG_FORMAT_TEXT + else: + format = PG_FORMAT_BINARY + + has_text_elements = False + + if self.get_codec(oid, format) is not None: continue name = ti['name'] @@ -338,14 +355,21 @@ cdef class DataCodecConfig: name = name[1:] name = '{}[]'.format(name) - elem_codec = self.get_codec(array_element_oid) + if ti['elem_has_bin_io']: + elem_format = PG_FORMAT_BINARY + else: + elem_format = PG_FORMAT_TEXT + elem_codec = self.get_codec(array_element_oid, elem_format) if elem_codec is None: raise RuntimeError( 'no codec for array element type {}'.format( array_element_oid)) - self._type_codecs_cache[oid] = \ - Codec.new_array_codec(oid, name, schema, elem_codec) + elem_delim = ti['elemdelim'][0] + + self._type_codecs_cache[oid, elem_format] = \ + Codec.new_array_codec( + oid, name, schema, elem_codec, elem_delim) elif ti['kind'] == b'c': if not comp_type_attrs: @@ -358,7 +382,10 @@ cdef class DataCodecConfig: comp_elem_codecs = [] for typoid in comp_type_attrs: - elem_codec = self.get_codec(typoid) + elem_codec = self.get_codec(typoid, PG_FORMAT_BINARY) + if elem_codec is None: + elem_codec = self.get_codec(typoid, PG_FORMAT_TEXT) + has_text_elements = True if elem_codec is None: raise RuntimeError( 'no codec for composite attribute type {}'.format( @@ -369,7 +396,10 @@ cdef class DataCodecConfig: for i, attrname in 
enumerate(ti['attrnames']): element_names[attrname] = i - self._type_codecs_cache[oid] = \ + if has_text_elements: + format = PG_FORMAT_TEXT + + self._type_codecs_cache[oid, format] = \ Codec.new_composite_codec( oid, name, schema, comp_elem_codecs, comp_type_attrs, @@ -383,12 +413,12 @@ cdef class DataCodecConfig: 'type record missing base type for domain {}'.format( oid)) - elem_codec = self.get_codec(base_type) + elem_codec = self.get_codec(base_type, format) if elem_codec is None: raise RuntimeError( 'no codec for domain base type {}'.format(base_type)) - self._type_codecs_cache[oid] = elem_codec + self._type_codecs_cache[oid, format] = elem_codec elif ti['kind'] == b'r': # Range type @@ -398,13 +428,17 @@ cdef class DataCodecConfig: 'type record missing base type for range {}'.format( oid)) - elem_codec = self.get_codec(range_subtype_oid) + if ti['elem_has_bin_io']: + elem_format = PG_FORMAT_BINARY + else: + elem_format = PG_FORMAT_TEXT + elem_codec = self.get_codec(range_subtype_oid, elem_format) if elem_codec is None: raise RuntimeError( 'no codec for range element type {}'.format( range_subtype_oid)) - self._type_codecs_cache[oid] = \ + self._type_codecs_cache[oid, elem_format] = \ Codec.new_range_codec(oid, name, schema, elem_codec) else: @@ -429,13 +463,13 @@ cdef class DataCodecConfig: def add_python_codec(self, typeoid, typename, typeschema, typekind, encoder, decoder, binary): - if self.get_codec(typeoid) is not None: + format = PG_FORMAT_BINARY if binary else PG_FORMAT_TEXT + + if self.get_codec(typeoid, format) is not None: raise ValueError('cannot override codec for type {}'.format( typeoid)) - format = PG_FORMAT_BINARY if binary else PG_FORMAT_TEXT - - self._local_type_codecs[typeoid] = \ + self._local_type_codecs[typeoid, format] = \ Codec.new_python_codec(typeoid, typename, typeschema, typekind, encoder, decoder, format) @@ -445,57 +479,65 @@ cdef class DataCodecConfig: Codec codec Codec target_codec - if self.get_codec(typeoid) is not None: - 
raise ValueError('cannot override codec for type {}'.format( - typeoid)) + for format in (PG_FORMAT_BINARY, PG_FORMAT_TEXT): + if self.get_codec(typeoid, format) is not None: + raise ValueError('cannot override codec for type {}'.format( + typeoid)) - if isinstance(alias_to, int): - target_codec = self.get_codec(alias_to) - else: - target_codec = get_extra_codec(alias_to) + if isinstance(alias_to, int): + target_codec = self.get_codec(alias_to, format) + else: + target_codec = get_extra_codec(alias_to, format) - if target_codec is None: - raise ValueError('unknown alias target: {}'.format(alias_to)) + if target_codec is None: + continue - codec = target_codec.copy() - codec.oid = typeoid - codec.name = typename - codec.schema = typeschema - codec.kind = typekind + codec = target_codec.copy() + codec.oid = typeoid + codec.name = typename + codec.schema = typeschema + codec.kind = typekind - self._local_type_codecs[typeoid] = codec + self._local_type_codecs[typeoid, format] = codec + + if ((typeoid, PG_FORMAT_BINARY) not in self._local_type_codecs and + (typeoid, PG_FORMAT_TEXT) not in self._local_type_codecs): + raise ValueError('unknown alias target: {}'.format(alias_to)) def clear_type_cache(self): self._type_codecs_cache.clear() - cdef inline Codec get_codec(self, uint32_t oid): + cdef inline Codec get_codec(self, uint32_t oid, CodecFormat format): cdef Codec codec - codec = get_core_codec(oid) + codec = get_core_codec(oid, format) if codec is not None: return codec try: - return self._type_codecs_cache[oid] + return self._type_codecs_cache[oid, format] except KeyError: try: - return self._local_type_codecs[oid] + return self._local_type_codecs[oid, format] except KeyError: return None -cdef inline Codec get_core_codec(uint32_t oid): +cdef inline Codec get_core_codec(uint32_t oid, CodecFormat format): cdef void *ptr if oid > MAXSUPPORTEDOID: return None - ptr = codec_map[oid] + if format == PG_FORMAT_BINARY: + ptr = binary_codec_map[oid] + else: + ptr = 
text_codec_map[oid] if ptr is NULL: return None return ptr cdef inline int has_core_codec(uint32_t oid): - return codec_map[oid] != NULL + return binary_codec_map[oid] != NULL or text_codec_map[oid] != NULL cdef register_core_codec(uint32_t oid, @@ -518,9 +560,13 @@ cdef register_core_codec(uint32_t oid, codec = Codec(oid) codec.init(name, 'pg_catalog', kind, CODEC_C, format, encode, - decode, None, None, None, None, None, None) + decode, None, None, None, None, None, None, 0) cpython.Py_INCREF(codec) # immortalize - codec_map[oid] = codec + + if format == PG_FORMAT_BINARY: + binary_codec_map[oid] = codec + else: + text_codec_map[oid] = codec cdef register_extra_codec(str name, @@ -535,9 +581,9 @@ cdef register_extra_codec(str name, codec = Codec(INVALIDOID) codec.init(name, None, kind, CODEC_C, format, encode, - decode, None, None, None, None, None, None) - EXTRA_CODECS[name] = codec + decode, None, None, None, None, None, None, 0) + EXTRA_CODECS[name, format] = codec -cdef inline Codec get_extra_codec(str name): - return EXTRA_CODECS.get(name) +cdef inline Codec get_extra_codec(str name, CodecFormat format): + return EXTRA_CODECS.get((name, format)) diff --git a/asyncpg/protocol/codecs/record.pyx b/asyncpg/protocol/codecs/record.pyx index 6912f930..291a0b13 100644 --- a/asyncpg/protocol/codecs/record.pyx +++ b/asyncpg/protocol/codecs/record.pyx @@ -37,7 +37,8 @@ cdef anonymous_record_decode(ConnectionSettings settings, FastReadBuffer buf): elem_codec = settings.get_data_codec(elem_typ) if elem_codec is None or not elem_codec.has_decoder(): raise RuntimeError( - 'no decoder for type OID {}'.format(elem_typ)) + 'no decoder for composite type element in ' + 'position {} of type OID {}'.format(i, elem_typ)) elem = elem_codec.decode(settings, elem_buf.slice_from(buf, elem_len)) diff --git a/asyncpg/protocol/settings.pxd b/asyncpg/protocol/settings.pxd index ca9624ee..b50fe14d 100644 --- a/asyncpg/protocol/settings.pxd +++ b/asyncpg/protocol/settings.pxd @@ -22,4 +22,4 
@@ cdef class ConnectionSettings: decoder, binary) cpdef inline set_builtin_type_codec( self, typeoid, typename, typeschema, typekind, alias_to) - cpdef inline Codec get_data_codec(self, uint32_t oid) + cpdef inline Codec get_data_codec(self, uint32_t oid, CodecFormat format=*) diff --git a/asyncpg/protocol/settings.pyx b/asyncpg/protocol/settings.pyx index aef492c3..9360c459 100644 --- a/asyncpg/protocol/settings.pyx +++ b/asyncpg/protocol/settings.pyx @@ -42,8 +42,15 @@ cdef class ConnectionSettings: self._data_codecs.set_builtin_type_codec(typeoid, typename, typeschema, typekind, alias_to) - cpdef inline Codec get_data_codec(self, uint32_t oid): - return self._data_codecs.get_codec(oid) + cpdef inline Codec get_data_codec(self, uint32_t oid, + CodecFormat format=PG_FORMAT_ANY): + if format == PG_FORMAT_ANY: + codec = self._data_codecs.get_codec(oid, PG_FORMAT_BINARY) + if codec is None: + codec = self._data_codecs.get_codec(oid, PG_FORMAT_TEXT) + return codec + else: + return self._data_codecs.get_codec(oid, format) def __getattr__(self, name): if not name.startswith('_'): From 78a300912a77c5efc068b9a6b3d696569249fd33 Mon Sep 17 00:00:00 2001 From: Elvis Pranskevichus Date: Mon, 2 Jan 2017 17:51:46 -0500 Subject: [PATCH 2/2] Implement text array decoder Asyncpg now supports decoding arrays of types that do not support binary I/O. 
--- asyncpg/protocol/codecs/array.pyx | 417 +++++++++++++++++++++++++- asyncpg/protocol/codecs/base.pxd | 3 + asyncpg/protocol/codecs/base.pyx | 11 +- asyncpg/protocol/codecs/textutils.pyx | 72 +++++ asyncpg/protocol/protocol.pyx | 4 +- asyncpg/protocol/python.pxd | 5 + tests/test_codecs.py | 1 - 7 files changed, 508 insertions(+), 5 deletions(-) create mode 100644 asyncpg/protocol/codecs/textutils.pyx diff --git a/asyncpg/protocol/codecs/array.pyx b/asyncpg/protocol/codecs/array.pyx index 1d4044fc..5c13b2e8 100644 --- a/asyncpg/protocol/codecs/array.pyx +++ b/asyncpg/protocol/codecs/array.pyx @@ -10,6 +10,9 @@ from collections.abc import Container as ContainerABC DEF ARRAY_MAXDIM = 6 # defined in postgresql/src/includes/c.h +# "NULL" +cdef Py_UCS4 *APG_NULL = [0x004E, 0x0055, 0x004C, 0x004C, 0x0000] + ctypedef object (*encode_func_ex)(ConnectionSettings settings, WriteBuffer buf, @@ -146,7 +149,7 @@ cdef inline array_decode(ConnectionSettings settings, FastReadBuffer buf, list result int i int32_t elem_len - int64_t elem_count = 1 + int32_t elem_count = 1 FastReadBuffer elem_buf = FastReadBuffer.new() int32_t dims[ARRAY_MAXDIM] Codec elem_codec @@ -173,12 +176,12 @@ cdef inline array_decode(ConnectionSettings settings, FastReadBuffer buf, for i in range(ndims): dims[i] = hton.unpack_int32(buf.read(4)) - elem_count *= dims[i] # Ignore the lower bound information buf.read(4) if ndims == 1: # Fast path for flat arrays + elem_count = dims[0] result = cpython.PyList_New(elem_count) for i in range(elem_count): @@ -312,6 +315,416 @@ cdef inline _nested_array_decode(ConnectionSettings settings, return result +cdef textarray_decode(ConnectionSettings settings, FastReadBuffer buf, + decode_func_ex decoder, const void *decoder_arg, + Py_UCS4 typdelim): + cdef: + Py_UCS4 *array_text + str s + + # Make a copy of array data since we will be mutating it for + # the purposes of element decoding. 
+ s = text_decode(settings, buf) + array_text = PyUnicode_AsUCS4Copy(s) + + try: + return _textarray_decode( + settings, array_text, decoder, decoder_arg, typdelim) + except ValueError as e: + raise ValueError( + 'malformed array literal {!r}: {}'.format(s, e.args[0])) + finally: + PyMem_Free(array_text) + + +cdef _textarray_decode(ConnectionSettings settings, + Py_UCS4 *array_text, + decode_func_ex decoder, + const void *decoder_arg, + Py_UCS4 typdelim): + + cdef: + bytearray array_bytes + list result + list new_stride + Py_UCS4 *ptr + int32_t ndims = 0 + int32_t ubound = 0 + int32_t lbound = 0 + int32_t dims[ARRAY_MAXDIM] + int32_t inferred_dims[ARRAY_MAXDIM] + int32_t inferred_ndims = 0 + void *strides[ARRAY_MAXDIM] + int32_t indexes[ARRAY_MAXDIM] + int32_t nest_level = 0 + int32_t item_level = 0 + bint end_of_array = False + + bint end_of_item = False + bint has_quoting = False + bint strip_spaces = False + bint in_quotes = False + Py_UCS4 *item_start + Py_UCS4 *item_ptr + Py_UCS4 *item_end + + int i + object item + str item_text + FastReadBuffer item_buf = FastReadBuffer.new() + char *pg_item_str + ssize_t pg_item_len + + ptr = array_text + + while True: + while apg_ascii_isspace(ptr[0]): + ptr += 1 + + if ptr[0] != '[': + # Finished parsing dimensions spec. + break + + ptr += 1 # '[' + + if ndims > ARRAY_MAXDIM: + raise ValueError( + 'number of array dimensions ({}) exceed the ' + 'maximum expected ({})'.format(ndims, ARRAY_MAXDIM)) + + ptr = apg_parse_int32(ptr, &ubound) + if ptr == NULL: + raise ValueError('missing array dimension value') + + if ptr[0] == ':': + ptr += 1 + lbound = ubound + + # [lower:upper] spec. We disregard the lbound for decoding. 
+ ptr = apg_parse_int32(ptr, &ubound) + if ptr == NULL: + raise ValueError('missing array dimension value') + else: + lbound = 1 + + if ptr[0] != ']': + raise ValueError('missing \']\' after array dimensions') + + ptr += 1 # ']' + + dims[ndims] = ubound - lbound + 1 + ndims += 1 + + if ndims != 0: + # If dimensions were given, the '=' token is expected. + if ptr[0] != '=': + raise ValueError('missing \'=\' after array dimensions') + + ptr += 1 # '=' + + # Skip any whitespace after the '=', whitespace + # before was consumed in the above loop. + while apg_ascii_isspace(ptr[0]): + ptr += 1 + + # Infer the dimensions from the brace structure in the + # array literal body, and check that it matches the explicit + # spec. This also validates that the array literal is sane. + _infer_array_dims(ptr, typdelim, inferred_dims, &inferred_ndims) + + if inferred_ndims != ndims: + raise ValueError( + 'specified array dimensions do not match array content') + + for i in range(ndims): + if inferred_dims[i] != dims[i]: + raise ValueError( + 'specified array dimensions do not match array content') + else: + # Infer the dimensions from the brace structure in the array literal + # body. This also validates that the array literal is sane. + _infer_array_dims(ptr, typdelim, dims, &ndims) + + while not end_of_array: + # We iterate over the literal character by character + # and modify the string in-place removing the array-specific + # quoting and determining the boundaries of each element. + end_of_item = has_quoting = in_quotes = False + strip_spaces = True + + # Pointers to array element start, end, and the current pointer + # tracking the position where characters are written when + # escaping is folded. 
+ item_start = item_end = item_ptr = ptr + item_level = 0 + + while not end_of_item: + if ptr[0] == '"': + in_quotes = not in_quotes + if in_quotes: + strip_spaces = False + else: + item_end = item_ptr + has_quoting = True + + elif ptr[0] == '\\': + # Quoted character, collapse the backslash. + ptr += 1 + has_quoting = True + item_ptr[0] = ptr[0] + item_ptr += 1 + strip_spaces = False + item_end = item_ptr + + elif in_quotes: + # Consume the string until we see the closing quote. + item_ptr[0] = ptr[0] + item_ptr += 1 + + elif ptr[0] == '{': + # Nesting level increase. + nest_level += 1 + + indexes[nest_level - 1] = 0 + new_stride = cpython.PyList_New(dims[nest_level - 1]) + strides[nest_level - 1] = \ + (new_stride) + + if nest_level > 1: + cpython.Py_INCREF(new_stride) + cpython.PyList_SET_ITEM( + strides[nest_level - 2], + indexes[nest_level - 2], + new_stride) + else: + result = new_stride + + elif ptr[0] == '}': + if item_level == 0: + # Make sure we keep track of which nesting + # level the item belongs to, as the loop + # will continue to consume closing braces + # until the delimiter or the end of input. + item_level = nest_level + + nest_level -= 1 + + if nest_level == 0: + end_of_array = end_of_item = True + + elif ptr[0] == typdelim: + # Array element delimiter, + end_of_item = True + if item_level == 0: + item_level = nest_level + + elif apg_ascii_isspace(ptr[0]): + if not strip_spaces: + item_ptr[0] = ptr[0] + item_ptr += 1 + # Ignore the leading literal whitespace. + + else: + item_ptr[0] = ptr[0] + item_ptr += 1 + strip_spaces = False + item_end = item_ptr + + ptr += 1 + + # end while not end_of_item + + if item_end == item_start: + # Empty array + continue + + item_end[0] = '\0' + + if not has_quoting and apg_strcasecmp(item_start, APG_NULL) == 0: + # NULL element. + item = None + else: + # XXX: find a way to avoid the redundant encode/decode + # cycle here. 
+ item_text = PyUnicode_FromKindAndData( + PyUnicode_4BYTE_KIND, + item_start, + item_end - item_start) + + # Prepare the element buffer and call the text decoder + # for the element type. + as_pg_string_and_size( + settings, item_text, &pg_item_str, &pg_item_len) + item_buf.buf = pg_item_str + item_buf.len = pg_item_len + item = decoder(settings, item_buf, decoder_arg) + + # Place the decoded element in the array. + cpython.Py_INCREF(item) + cpython.PyList_SET_ITEM( + strides[item_level - 1], + indexes[item_level - 1], + item) + + indexes[nest_level - 1] += 1 + + return result + + +cdef enum _ArrayParseState: + APS_START = 1 + APS_STRIDE_STARTED = 2 + APS_STRIDE_DONE = 3 + APS_STRIDE_DELIMITED = 4 + APS_ELEM_STARTED = 5 + APS_ELEM_DELIMITED = 6 + + +cdef _UnexpectedCharacter(const Py_UCS4 *array_text, const Py_UCS4 *ptr): + return ValueError('unexpected character {!r} at position {}'.format( + cpython.PyUnicode_FromOrdinal(ptr[0]), ptr - array_text + 1)) + + +cdef _infer_array_dims(const Py_UCS4 *array_text, + Py_UCS4 typdelim, + int32_t *dims, + int32_t *ndims): + cdef: + const Py_UCS4 *ptr = array_text + int i + int nest_level = 0 + bint end_of_array = False + bint end_of_item = False + bint in_quotes = False + bint array_is_empty = True + int stride_len[ARRAY_MAXDIM] + int prev_stride_len[ARRAY_MAXDIM] + _ArrayParseState parse_state = APS_START + + for i in range(ARRAY_MAXDIM): + dims[i] = prev_stride_len[i] = 0 + stride_len[i] = 1 + + while not end_of_array: + end_of_item = False + + while not end_of_item: + if ptr[0] == '\0': + raise ValueError('unexpected end of string') + + elif ptr[0] == '"': + if (parse_state not in (APS_STRIDE_STARTED, + APS_ELEM_DELIMITED) and + not (parse_state == APS_ELEM_STARTED and in_quotes)): + raise _UnexpectedCharacter(array_text, ptr) + + in_quotes = not in_quotes + if in_quotes: + parse_state = APS_ELEM_STARTED + array_is_empty = False + + elif ptr[0] == '\\': + if parse_state not in (APS_STRIDE_STARTED, + APS_ELEM_STARTED, + 
APS_ELEM_DELIMITED): + raise _UnexpectedCharacter(array_text, ptr) + + parse_state = APS_ELEM_STARTED + array_is_empty = False + + if ptr[1] != '\0': + ptr += 1 + else: + raise ValueError('unexpected end of string') + + elif in_quotes: + # Ignore everything inside the quotes. + pass + + elif ptr[0] == '{': + if parse_state not in (APS_START, + APS_STRIDE_STARTED, + APS_STRIDE_DELIMITED): + raise _UnexpectedCharacter(array_text, ptr) + + parse_state = APS_STRIDE_STARTED + if nest_level >= ARRAY_MAXDIM: + raise ValueError( + 'number of array dimensions ({}) exceed the ' + 'maximum expected ({})'.format( + nest_level, ARRAY_MAXDIM)) + + dims[nest_level] = 0 + nest_level += 1 + if ndims[0] < nest_level: + ndims[0] = nest_level + + elif ptr[0] == '}': + if (parse_state not in (APS_ELEM_STARTED, APS_STRIDE_DONE) and + not (nest_level == 1 and + parse_state == APS_STRIDE_STARTED)): + raise _UnexpectedCharacter(array_text, ptr) + + parse_state = APS_STRIDE_DONE + + if nest_level == 0: + raise _UnexpectedCharacter(array_text, ptr) + + nest_level -= 1 + + if (prev_stride_len[nest_level] != 0 and + stride_len[nest_level] != prev_stride_len[nest_level]): + raise ValueError( + 'inconsistent sub-array dimensions' + ' at position {}'.format( + ptr - array_text + 1)) + + prev_stride_len[nest_level] = stride_len[nest_level] + stride_len[nest_level] = 1 + if nest_level == 0: + end_of_array = end_of_item = True + else: + dims[nest_level - 1] += 1 + + elif ptr[0] == typdelim: + if parse_state not in (APS_ELEM_STARTED, APS_STRIDE_DONE): + raise _UnexpectedCharacter(array_text, ptr) + + if parse_state == APS_STRIDE_DONE: + parse_state = APS_STRIDE_DELIMITED + else: + parse_state = APS_ELEM_DELIMITED + end_of_item = True + stride_len[nest_level - 1] += 1 + + elif not apg_ascii_isspace(ptr[0]): + if parse_state not in (APS_STRIDE_STARTED, + APS_ELEM_STARTED, + APS_ELEM_DELIMITED): + raise _UnexpectedCharacter(array_text, ptr) + + parse_state = APS_ELEM_STARTED + array_is_empty = False + + 
if not end_of_item: + ptr += 1 + + if not array_is_empty: + dims[ndims[0] - 1] += 1 + + ptr += 1 + + # only whitespace is allowed after the closing brace + while ptr[0] != '\0': + if not apg_ascii_isspace(ptr[0]): + raise _UnexpectedCharacter(array_text, ptr) + + ptr += 1 + + if array_is_empty: + ndims[0] = 0 + + cdef int4_encode_ex(ConnectionSettings settings, WriteBuffer buf, object obj, const void *arg): return int4_encode(settings, buf, obj) diff --git a/asyncpg/protocol/codecs/base.pxd b/asyncpg/protocol/codecs/base.pxd index 02d2d595..fe5d7b01 100644 --- a/asyncpg/protocol/codecs/base.pxd +++ b/asyncpg/protocol/codecs/base.pxd @@ -94,6 +94,9 @@ cdef class Codec: cdef decode_array(self, ConnectionSettings settings, FastReadBuffer buf) + cdef decode_array_text(self, ConnectionSettings settings, + FastReadBuffer buf) + cdef decode_range(self, ConnectionSettings settings, FastReadBuffer buf) cdef decode_composite(self, ConnectionSettings settings, diff --git a/asyncpg/protocol/codecs/base.pyx b/asyncpg/protocol/codecs/base.pyx index 207c05e5..2da74249 100644 --- a/asyncpg/protocol/codecs/base.pyx +++ b/asyncpg/protocol/codecs/base.pyx @@ -51,7 +51,10 @@ cdef class Codec: self.decoder = &self.decode_scalar elif type == CODEC_ARRAY: self.encoder = &self.encode_array - self.decoder = &self.decode_array + if format == PG_FORMAT_BINARY: + self.decoder = &self.decode_array + else: + self.decoder = &self.decode_array_text elif type == CODEC_RANGE: self.encoder = &self.encode_range self.decoder = &self.decode_range @@ -137,6 +140,12 @@ cdef class Codec: return array_decode(settings, buf, codec_decode_func_ex, (self.element_codec)) + cdef decode_array_text(self, ConnectionSettings settings, + FastReadBuffer buf): + return textarray_decode(settings, buf, codec_decode_func_ex, + (self.element_codec), + self.element_delimiter) + cdef decode_range(self, ConnectionSettings settings, FastReadBuffer buf): return range_decode(settings, buf, codec_decode_func_ex, 
(self.element_codec)) diff --git a/asyncpg/protocol/codecs/textutils.pyx b/asyncpg/protocol/codecs/textutils.pyx new file mode 100644 index 00000000..1a09c179 --- /dev/null +++ b/asyncpg/protocol/codecs/textutils.pyx @@ -0,0 +1,72 @@ +# Copyright (C) 2016-present the asyncpg authors and contributors +# +# +# This module is part of asyncpg and is released under +# the Apache 2.0 License: http://www.apache.org/licenses/LICENSE-2.0 + + + +cdef int apg_strcasecmp(const Py_UCS4 *s1, const Py_UCS4 *s2): + cdef: + uint32_t c1 + uint32_t c2 + int i = 0 + + while True: + c1 = s1[i] + c2 = s2[i] + + if c1 != c2: + if c1 >= 'A' and c1 <= 'Z': + c1 += 'a' - 'A' + if c2 >= 'A' and c2 <= 'Z': + c2 += 'a' - 'A' + + if c1 != c2: + return c1 - c2 + + if c1 == 0 or c2 == 0: + break + + i += 1 + + return 0 + + +cdef inline bint apg_ascii_isspace(Py_UCS4 ch): + return ( + ch == ' ' or + ch == '\n' or + ch == '\r' or + ch == '\t' or + ch == '\v' or + ch == '\f' + ) + + +cdef Py_UCS4 *apg_parse_int32(Py_UCS4 *buf, int32_t *num): + cdef: + Py_UCS4 *p + int32_t n = 0 + int32_t neg = 0 + + if buf[0] == '-': + neg = 1 + buf += 1 + elif buf[0] == '+': + buf += 1 + + p = buf + while p[0] >= '0' and p[0] <= '9': + n = 10 * n - (p[0] - '0') + p += 1 + + if p == buf: + return NULL + + if not neg: + n = -n + + num[0] = n + + return p diff --git a/asyncpg/protocol/protocol.pyx b/asyncpg/protocol/protocol.pyx index 51bbf9d9..6fdad835 100644 --- a/asyncpg/protocol/protocol.pyx +++ b/asyncpg/protocol/protocol.pyx @@ -24,7 +24,8 @@ from asyncpg.protocol.python cimport ( PyMem_Malloc, PyMem_Realloc, PyMem_Calloc, PyMem_Free, PyMemoryView_GET_BUFFER, PyMemoryView_Check, PyUnicode_AsUTF8AndSize, PyByteArray_AsString, - PyByteArray_Check) + PyByteArray_Check, PyUnicode_AsUCS4Copy, + PyUnicode_FromKindAndData, PyUnicode_4BYTE_KIND) from cpython cimport PyBuffer_FillInfo, PyBytes_AsString @@ -42,6 +43,7 @@ include "settings.pyx" include "buffer.pyx" include "codecs/base.pyx" +include "codecs/textutils.pyx" # 
String types. Need to go first, as other codecs may rely on # text decoding/encoding. diff --git a/asyncpg/protocol/python.pxd b/asyncpg/protocol/python.pxd index ea71b8fe..869eed3e 100644 --- a/asyncpg/protocol/python.pxd +++ b/asyncpg/protocol/python.pxd @@ -20,3 +20,8 @@ cdef extern from "Python.h": char* PyUnicode_AsUTF8AndSize(object unicode, ssize_t *size) except NULL char* PyByteArray_AsString(object) + Py_UCS4* PyUnicode_AsUCS4Copy(object) + object PyUnicode_FromKindAndData( + int kind, const void *buffer, Py_ssize_t size) + + int PyUnicode_4BYTE_KIND diff --git a/tests/test_codecs.py b/tests/test_codecs.py index aa7e9028..070bec8d 100644 --- a/tests/test_codecs.py +++ b/tests/test_codecs.py @@ -917,7 +917,6 @@ async def test_table_as_composite(self): DROP TABLE tab; ''') - @unittest.expectedFailure async def test_relacl_array_type(self): await self.con.execute(r''' CREATE USER """u1'";