From b6847a7e71a05a366f8227af7c14b3c9427b4c8c Mon Sep 17 00:00:00 2001 From: Alex DeMeo Date: Fri, 22 Dec 2023 15:27:54 -0800 Subject: [PATCH 1/7] support PG array dimensionality --- recap/clients/postgresql.py | 23 ++++++++ recap/converters/postgresql.py | 48 +++++---------- tests/integration/clients/test_postgresql.py | 61 ++++++++++++++------ tests/unit/converters/test_postgresql.py | 25 ++++++-- 4 files changed, 101 insertions(+), 56 deletions(-) diff --git a/recap/clients/postgresql.py b/recap/clients/postgresql.py index 0427193..9b079d3 100644 --- a/recap/clients/postgresql.py +++ b/recap/clients/postgresql.py @@ -5,6 +5,7 @@ from recap.clients.dbapi import Connection, DbapiClient from recap.converters.postgresql import PostgresqlConverter +from recap.types import StructType PSYCOPG2_CONNECT_ARGS = { "host", @@ -78,3 +79,25 @@ def ls_catalogs(self) -> list[str]: """ ) return [row[0] for row in cursor.fetchall()] + + def schema(self, catalog: str, schema: str, table: str) -> StructType: + cursor = self.connection.cursor() + cursor.execute( + f""" + SELECT + information_schema.columns.*, + pg_attribute.attndims + FROM information_schema.columns + JOIN pg_attribute on information_schema.columns.column_name = pg_attribute.attname + WHERE table_name = {self.param_style} + AND table_schema = {self.param_style} + AND table_catalog = {self.param_style} + ORDER BY ordinal_position ASC + """, + (table, schema, catalog), + ) + names = [name[0].upper() for name in cursor.description] + return self.converter.to_recap( + # Make each row be a dict with the column names as keys + [dict(zip(names, row)) for row in cursor.fetchall()] + ) diff --git a/recap/converters/postgresql.py b/recap/converters/postgresql.py index 70d1dbc..a09da9f 100644 --- a/recap/converters/postgresql.py +++ b/recap/converters/postgresql.py @@ -8,32 +8,22 @@ FloatType, IntType, ListType, - ProxyType, + NullType, RecapType, - RecapTypeRegistry, StringType, UnionType, ) MAX_FIELD_SIZE = 1073741824 -DEFAULT_NAMESPACE = "_root" -""" -Namespace to use when no namespace is specified in the schema. -""" - class PostgresqlConverter(DbapiConverter): - def __init__(self, namespace: str = DEFAULT_NAMESPACE) -> None: - self.namespace = namespace - self.registry = RecapTypeRegistry() - def _parse_type(self, column_props: dict[str, Any]) -> RecapType: - column_name = column_props["COLUMN_NAME"] data_type = column_props["DATA_TYPE"].lower() octet_length = column_props["CHARACTER_OCTET_LENGTH"] max_length = column_props["CHARACTER_MAXIMUM_LENGTH"] udt_name = (column_props["UDT_NAME"] or "").lower() + ndims = column_props["ATTNDIMS"] if data_type in ["bigint", "int8", "bigserial", "serial8"]: base_type = IntType(bits=64, signed=True) @@ -90,7 +80,6 @@ def _parse_type(self, column_props: dict[str, Any]) -> RecapType: # lengths, etc. Thus, we only set DATA_TYPE here. Sigh. value_type = self._parse_type( { - "COLUMN_NAME": None, "DATA_TYPE": nested_data_type, # Default strings, bits, etc. to the max field size since # information_schema doesn't contain lengths for array @@ -102,29 +91,22 @@ def _parse_type(self, column_props: dict[str, Any]) -> RecapType: # * 8 because bit columns use bits not bytes. "CHARACTER_MAXIMUM_LENGTH": MAX_FIELD_SIZE * 8, "UDT_NAME": None, + "ATTNDIMS": 0, } ) - column_name_without_periods = column_name.replace(".", "_") - base_type_alias = f"{self.namespace}.{column_name_without_periods}" - # Construct a self-referencing list comprised of the array's value - # type and a proxy to the list itself. This allows arrays to be an - # arbitrary number of dimensions, which is how PostgreSQL treats - # lists. See https://github.com/recap-build/recap/issues/264 for - # more details. - base_type = ListType( - alias=base_type_alias, - values=UnionType( - types=[ - value_type, - ProxyType( - alias=base_type_alias, - registry=self.registry, - ), - ], - ), - ) - self.registry.register_alias(base_type) + base_type = self._create_n_dimension_list(value_type, ndims) else: raise ValueError(f"Unknown data type: {data_type}") return base_type + + def _create_n_dimension_list(self, base_type: RecapType, ndims: int) -> RecapType: + """ + Build a list type with `ndims` dimensions containing nullable `base_type` as the innermost value type. + """ + if ndims == 0: + return UnionType(types=[NullType(), base_type]) + else: + return ListType( + values=self._create_n_dimension_list(base_type, ndims - 1), + ) diff --git a/tests/integration/clients/test_postgresql.py b/tests/integration/clients/test_postgresql.py index 6668e3c..7e2be5e 100644 --- a/tests/integration/clients/test_postgresql.py +++ b/tests/integration/clients/test_postgresql.py @@ -10,7 +10,6 @@ IntType, ListType, NullType, - ProxyType, StringType, StructType, UnionType, @@ -51,7 +50,9 @@ def setup_class(cls): test_default INTEGER DEFAULT 2, test_int_array INTEGER[], test_varchar_array VARCHAR(255)[] DEFAULT '{"Hello", "World"}', - test_bit_array BIT(8)[] + test_bit_array BIT(8)[], + test_int_array_2d INTEGER[][], + test_text_array_3d TEXT[][][] ); """ ) @@ -164,14 +165,10 @@ def test_struct_method(self): types=[ NullType(), ListType( - alias="_root.test_int_array", values=UnionType( types=[ + NullType(), IntType(bits=32), - ProxyType( - alias="_root.test_int_array", - registry=client.converter.registry, # type: ignore - ), ] ), ), @@ -183,14 +180,10 @@ def test_struct_method(self): types=[ NullType(), ListType( - alias="_root.test_varchar_array", values=UnionType( types=[ + NullType(), StringType(bytes_=MAX_FIELD_SIZE), - ProxyType( - alias="_root.test_varchar_array", - registry=client.converter.registry, # type: ignore - ), ] ), ), @@ -202,19 +195,53 @@ def test_struct_method(self): types=[ NullType(), ListType( - alias="_root.test_bit_array", values=UnionType( types=[ + NullType(), BytesType(bytes_=MAX_FIELD_SIZE, variable=False), - ProxyType( - alias="_root.test_bit_array", - registry=client.converter.registry, # type: ignore - ), ] ), ), ], ), + UnionType( + default=None, + name="test_int_array_2d", + types=[ + NullType(), + ListType( + values=ListType( + values=UnionType( + types=[ + NullType(), + IntType(bits=32), + ] + ) + ), + ), + ], + ), + UnionType( + default=None, + name="test_text_array_3d", + types=[ + NullType(), + ListType( + values=ListType( + values=ListType( + values=UnionType( + types=[ + NullType(), + StringType( + bytes_=MAX_FIELD_SIZE, variable=True + ), + ] + ) + ) + ), + ), + ], + ), ] # Going field by field to make debugging easier when test fails diff --git a/tests/unit/converters/test_postgresql.py b/tests/unit/converters/test_postgresql.py index 5e7bf3b..db55238 100644 --- a/tests/unit/converters/test_postgresql.py +++ b/tests/unit/converters/test_postgresql.py @@ -7,7 +7,7 @@ FloatType, IntType, ListType, - ProxyType, + NullType, StringType, UnionType, ) @@ -25,6 +25,7 @@ "NUMERIC_PRECISION": None, "NUMERIC_SCALE": None, "UDT_NAME": None, + "ATTNDIMS": 0, }, IntType(bits=64, signed=True), ), @@ -37,6 +38,7 @@ "NUMERIC_PRECISION": None, "NUMERIC_SCALE": None, "UDT_NAME": None, + "ATTNDIMS": 0, }, IntType(bits=32, signed=True), ), @@ -49,6 +51,7 @@ "NUMERIC_PRECISION": None, "NUMERIC_SCALE": None, "UDT_NAME": None, + "ATTNDIMS": 0, }, IntType(bits=16, signed=True), ), @@ -61,6 +64,7 @@ "NUMERIC_PRECISION": None, "NUMERIC_SCALE": None, "UDT_NAME": None, + "ATTNDIMS": 0, }, FloatType(bits=64), ), @@ -73,6 +77,7 @@ "NUMERIC_PRECISION": None, "NUMERIC_SCALE": None, "UDT_NAME": None, + "ATTNDIMS": 0, }, FloatType(bits=32), ), @@ -85,6 +90,7 @@ "NUMERIC_PRECISION": None, "NUMERIC_SCALE": None, "UDT_NAME": None, + "ATTNDIMS": 0, }, BoolType(), ), @@ -97,6 +103,7 @@ "NUMERIC_PRECISION": None, "NUMERIC_SCALE": None, "UDT_NAME": None, + "ATTNDIMS": 0, }, StringType(bytes_=65536, variable=True), ), @@ -109,6 +116,7 @@ "NUMERIC_PRECISION": None, "NUMERIC_SCALE": None, "UDT_NAME": None, + "ATTNDIMS": 0, }, StringType(bytes_=255, variable=True), ), @@ -121,6 +129,7 @@ "NUMERIC_PRECISION": None, "NUMERIC_SCALE": None, "UDT_NAME": None, + "ATTNDIMS": 0, }, StringType(bytes_=255, variable=False), ), @@ -133,6 +142,7 @@ "NUMERIC_PRECISION": None, "NUMERIC_SCALE": None, "UDT_NAME": None, + "ATTNDIMS": 0, }, BytesType(bytes_=MAX_FIELD_SIZE), ), @@ -145,6 +155,7 @@ "NUMERIC_PRECISION": None, "NUMERIC_SCALE": None, "UDT_NAME": None, + "ATTNDIMS": 0, }, BytesType(bytes_=1, variable=False), ), @@ -157,6 +168,7 @@ "NUMERIC_PRECISION": None, "NUMERIC_SCALE": None, "UDT_NAME": None, + "ATTNDIMS": 0, }, BytesType(bytes_=3, variable=False), ), @@ -170,6 +182,7 @@ "NUMERIC_SCALE": None, "UDT_NAME": None, "DATETIME_PRECISION": 3, + "ATTNDIMS": 0, }, IntType(bits=64, logical="build.recap.Timestamp", unit="millisecond"), ), @@ -183,6 +196,7 @@ "NUMERIC_SCALE": None, "UDT_NAME": None, "DATETIME_PRECISION": 3, + "ATTNDIMS": 0, }, IntType(bits=64, logical="build.recap.Timestamp", unit="millisecond"), ), @@ -195,6 +209,7 @@ "NUMERIC_PRECISION": 10, "NUMERIC_SCALE": 2, "UDT_NAME": None, + "ATTNDIMS": 0, }, BytesType( logical="build.recap.Decimal", @@ -213,6 +228,7 @@ "NUMERIC_PRECISION": 5, "NUMERIC_SCALE": 0, "UDT_NAME": None, + "ATTNDIMS": 0, }, BytesType( logical="build.recap.Decimal", @@ -239,16 +255,13 @@ def test_postgresql_converter_array(): "NUMERIC_PRECISION": 5, "NUMERIC_SCALE": 0, "UDT_NAME": "_int4", + "ATTNDIMS": 1, } expected = ListType( - alias="_root.test_column", values=UnionType( types=[ + NullType(), IntType(bits=32, signed=True), - ProxyType( - alias="_root.test_column", - registry=converter.registry, - ), ], ), ) From dbf2a914191550d339be61a318e42f613ead030b Mon Sep 17 00:00:00 2001 From: Alex DeMeo Date: Sun, 24 Dec 2023 14:31:51 -0800 Subject: [PATCH 2/7] add in other necessary joins --- recap/clients/postgresql.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/recap/clients/postgresql.py b/recap/clients/postgresql.py index 9b079d3..35fa68a 100644 --- a/recap/clients/postgresql.py +++ b/recap/clients/postgresql.py @@ -88,7 +88,14 @@ def schema(self, catalog: str, schema: str, table: str) -> StructType: information_schema.columns.*, pg_attribute.attndims FROM information_schema.columns - JOIN pg_attribute on information_schema.columns.column_name = pg_attribute.attname + JOIN pg_catalog.pg_namespace + ON pg_catalog.pg_namespace.nspname = information_schema.columns.table_schema + JOIN pg_catalog.pg_class + ON pg_catalog.pg_class.relname = information_schema.columns.table_name + AND pg_catalog.pg_class.relnamespace = pg_catalog.pg_namespace.oid + JOIN pg_catalog.pg_attribute + ON pg_catalog.pg_attribute.attrelid = pg_catalog.pg_class.oid + AND pg_catalog.pg_attribute.attname = information_schema.columns.column_name WHERE table_name = {self.param_style} AND table_schema = {self.param_style} AND table_catalog = {self.param_style} From e9e13e81be0df5bb74148878ed106801506237f5 Mon Sep 17 00:00:00 2001 From: Alex DeMeo Date: Sun, 24 Dec 2023 14:37:30 -0800 Subject: [PATCH 3/7] add not-null array test --- tests/integration/clients/test_postgresql.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/integration/clients/test_postgresql.py b/tests/integration/clients/test_postgresql.py index 7e2be5e..3fd3e8a 100644 --- a/tests/integration/clients/test_postgresql.py +++ b/tests/integration/clients/test_postgresql.py @@ -51,6 +51,7 @@ def setup_class(cls): test_int_array INTEGER[], test_varchar_array VARCHAR(255)[] DEFAULT '{"Hello", "World"}', test_bit_array BIT(8)[], + test_not_null_array INTEGER[] NOT NULL, test_int_array_2d INTEGER[][], test_text_array_3d TEXT[][][] ); @@ -204,6 +205,15 @@ def test_struct_method(self): ), ], ), + ListType( + name="test_not_null_array", + values=UnionType( + types=[ + NullType(), + IntType(bits=32), + ] + ), + ), UnionType( default=None, name="test_int_array_2d", From 183ee64bc626931a9d427059357617fd77e48261 Mon Sep 17 00:00:00 2001 From: Alex DeMeo Date: Mon, 25 Dec 2023 18:42:33 -0800 Subject: [PATCH 4/7] add psql converter init param ignore_array_dimensionality to configure how the reader interprets the number of dimensions in an array --- recap/clients/postgresql.py | 4 +- recap/converters/postgresql.py | 42 +++- tests/integration/clients/test_postgresql.py | 227 ++++++++++++++++++- tests/unit/converters/test_postgresql.py | 33 ++- 4 files changed, 287 insertions(+), 19 deletions(-) diff --git a/recap/clients/postgresql.py b/recap/clients/postgresql.py index 35fa68a..be6f950 100644 --- a/recap/clients/postgresql.py +++ b/recap/clients/postgresql.py @@ -49,8 +49,8 @@ class PostgresqlClient(DbapiClient): - def __init__(self, connection: Connection) -> None: - super().__init__(connection, PostgresqlConverter()) + def __init__(self, connection: Connection, converter = PostgresqlConverter()) -> None: + super().__init__(connection, converter) @staticmethod @contextmanager diff --git a/recap/converters/postgresql.py b/recap/converters/postgresql.py index a09da9f..2f0eced 100644 --- a/recap/converters/postgresql.py +++ b/recap/converters/postgresql.py @@ -1,5 +1,5 @@ from math import ceil -from typing import Any +from typing import Any, Union from recap.converters.dbapi import DbapiConverter from recap.types import ( @@ -9,16 +9,31 @@ IntType, ListType, NullType, + ProxyType, RecapType, + RecapTypeRegistry, StringType, UnionType, ) MAX_FIELD_SIZE = 1073741824 +DEFAULT_NAMESPACE = "_root" +""" +Namespace to use when no namespace is specified in the schema. +""" class PostgresqlConverter(DbapiConverter): + def __init__(self, ignore_array_dimensionality = True, namespace: str = DEFAULT_NAMESPACE): + # since array dimensionality is not enforced by PG schemas: + # if `ignore_array_dimensionality = True` then read arrays irrespective of how many dimensions they have + # if `ignore_array_dimensionality = False` then read arrays as nested lists + self.ignore_array_dimensionality = ignore_array_dimensionality + self.namespace = namespace + self.registry = RecapTypeRegistry() + def _parse_type(self, column_props: dict[str, Any]) -> RecapType: + column_name = column_props["COLUMN_NAME"] data_type = column_props["DATA_TYPE"].lower() octet_length = column_props["CHARACTER_OCTET_LENGTH"] max_length = column_props["CHARACTER_MAXIMUM_LENGTH"] @@ -80,6 +95,7 @@ def _parse_type(self, column_props: dict[str, Any]) -> RecapType: # lengths, etc. Thus, we only set DATA_TYPE here. Sigh. value_type = self._parse_type( { + "COLUMN_NAME": None, "DATA_TYPE": nested_data_type, # Default strings, bits, etc. to the max field size since # information_schema doesn't contain lengths for array @@ -94,7 +110,29 @@ def _parse_type(self, column_props: dict[str, Any]) -> RecapType: "ATTNDIMS": 0, } ) - base_type = self._create_n_dimension_list(value_type, ndims) + if self.ignore_array_dimensionality: + column_name_without_periods = column_name.replace(".", "_") + base_type_alias = f"{self.namespace}.{column_name_without_periods}" + # Construct a self-referencing list comprised of the array's value + # type and a proxy to the list itself. This allows arrays to be an + # arbitrary number of dimensions, which is how PostgreSQL treats + # lists. See https://github.com/recap-build/recap/issues/264 for + # more details. + base_type = ListType( + alias=base_type_alias, + values=UnionType( + types=[ + value_type, + ProxyType( + alias=base_type_alias, + registry=self.registry, + ), + ], + ), + ) + self.registry.register_alias(base_type) + else: + base_type = self._create_n_dimension_list(value_type, ndims) else: raise ValueError(f"Unknown data type: {data_type}") diff --git a/tests/integration/clients/test_postgresql.py b/tests/integration/clients/test_postgresql.py index 3fd3e8a..2c08ca5 100644 --- a/tests/integration/clients/test_postgresql.py +++ b/tests/integration/clients/test_postgresql.py @@ -2,7 +2,7 @@ from recap.clients import create_client from recap.clients.postgresql import PostgresqlClient -from recap.converters.postgresql import MAX_FIELD_SIZE +from recap.converters.postgresql import PostgresqlConverter, MAX_FIELD_SIZE from recap.types import ( BoolType, BytesType, @@ -10,6 +10,8 @@ IntType, ListType, NullType, + ProxyType, + RecapType, StringType, StructType, UnionType, @@ -69,14 +71,211 @@ def teardown_class(cls): # Close the connection cls.connection.close() - def test_struct_method(self): - # Initiate the PostgresqlClient class - client = PostgresqlClient(self.connection) # type: ignore - # Test 'test_types' table + def test_struct_method_arrays_ignore_dimensionality(self): + client = PostgresqlClient(self.connection, PostgresqlConverter(True)) + test_types_struct = client.schema("testdb", "public", "test_types") + + expected_fields = [ + UnionType( + default=None, + name="test_bigint", + types=[NullType(), IntType(bits=64, signed=True)], + ), + UnionType( + default=None, + name="test_integer", + types=[NullType(), IntType(bits=32, signed=True)], + ), + UnionType( + default=None, + name="test_smallint", + types=[NullType(), IntType(bits=16, signed=True)], + ), + UnionType( + default=None, + name="test_float", + types=[NullType(), FloatType(bits=64)], + ), + UnionType( + default=None, + name="test_real", + types=[NullType(), FloatType(bits=32)], + ), + UnionType( + default=None, + name="test_boolean", + types=[NullType(), BoolType()], + ), + UnionType( + default=None, + name="test_text", + types=[NullType(), StringType(bytes_=MAX_FIELD_SIZE, variable=True)], + ), + UnionType( + default=None, + name="test_char", + # 40 = max of 4 bytes in a UTF-8 encoded unicode character * 10 chars + types=[NullType(), StringType(bytes_=40, variable=False)], + ), + UnionType( + default=None, + name="test_bytea", + types=[NullType(), BytesType(bytes_=MAX_FIELD_SIZE, variable=True)], + ), + UnionType( + default=None, + name="test_bit", + types=[NullType(), BytesType(bytes_=2, variable=False)], + ), + UnionType( + default=None, + name="test_timestamp", + types=[ + NullType(), + IntType( + bits=64, logical="build.recap.Timestamp", unit="microsecond" + ), + ], + ), + UnionType( + default=None, + name="test_decimal", + types=[ + NullType(), + BytesType( + logical="build.recap.Decimal", + bytes_=32, + variable=False, + precision=10, + scale=2, + ), + ], + ), + IntType(bits=32, signed=True, name="test_not_null"), + IntType(bits=32, signed=True, name="test_not_null_default", default="1"), + UnionType( + default="2", + name="test_default", + types=[NullType(), IntType(bits=32, signed=True)], + ), + UnionType( + default=None, + name="test_int_array", + types=[ + NullType(), + ListType( + alias="_root.test_int_array", + values=UnionType( + types=[ + IntType(bits=32), + ProxyType( + alias="_root.test_int_array", + registry=client.converter.registry, # type: ignore + ), + ] + ), + ), + ], + ), + UnionType( + default="'{Hello,World}'::character varying[]", + name="test_varchar_array", + types=[ + NullType(), + ListType( + alias="_root.test_varchar_array", + values=UnionType( + types=[ + StringType(bytes_=MAX_FIELD_SIZE), + ProxyType( + alias="_root.test_varchar_array", + registry=client.converter.registry, # type: ignore + ), + ] + ), + ), + ], + ), + UnionType( + default=None, + name="test_bit_array", + types=[ + NullType(), + ListType( + alias="_root.test_bit_array", + values=UnionType( + types=[ + BytesType(bytes_=MAX_FIELD_SIZE, variable=False), + ProxyType( + alias="_root.test_bit_array", + registry=client.converter.registry, # type: ignore + ), + ] + ), + ), + ], + ), + ListType( + name="test_not_null_array", + alias="_root.test_not_null_array", + values=UnionType( + types=[ + IntType(bits=32), + ProxyType( + alias="_root.test_not_null_array", + registry=client.converter.registry, # type: ignore + ), + ] + ), + ), + UnionType( + default=None, + name="test_int_array_2d", + types=[ + NullType(), + ListType( + alias="_root.test_int_array_2d", + values=UnionType( + types=[ + IntType(bits=32), + ProxyType( + alias="_root.test_int_array_2d", + registry=client.converter.registry, # type: ignore + ), + ] + ), + ), + ], + ), + UnionType( + default=None, + name="test_text_array_3d", + types=[ + NullType(), + ListType( + alias="_root.test_text_array_3d", + values=UnionType( + types=[ + StringType( + bytes_=MAX_FIELD_SIZE, variable=True + ), + ProxyType( + alias="_root.test_text_array_3d", + registry=client.converter.registry, # type: ignore + ), + ] + ), + ), + ], + ), + ] + validate_results(test_types_struct, expected_fields) + + def test_struct_method_arrays_with_dimensionality(self): + client = PostgresqlClient(self.connection, PostgresqlConverter(False)) # type: ignore test_types_struct = client.schema("testdb", "public", "test_types") - # Define the expected output for 'test_types' table expected_fields = [ UnionType( default=None, @@ -253,12 +452,7 @@ def test_struct_method(self): ], ), ] - - # Going field by field to make debugging easier when test fails - for field, expected_field in zip(test_types_struct.fields, expected_fields): - assert field == expected_field - - assert test_types_struct == StructType(fields=expected_fields) + validate_results(test_types_struct, expected_fields) def test_create_client(self): postgresql_url = "postgresql://postgres:password@localhost:5432/testdb" @@ -272,3 +466,12 @@ def test_create_client(self): "information_schema", ] assert client.ls("testdb", "public") == ["test_types"] + + +def validate_results(test_types_struct: StructType, expected_fields: list[RecapType]) -> None: + # Going field by field to make debugging easier when test fails + for field, expected_field in zip(test_types_struct.fields, expected_fields): + print() + assert field == expected_field + + assert test_types_struct == StructType(fields=expected_fields) diff --git a/tests/unit/converters/test_postgresql.py b/tests/unit/converters/test_postgresql.py index db55238..f37de43 100644 --- a/tests/unit/converters/test_postgresql.py +++ b/tests/unit/converters/test_postgresql.py @@ -8,6 +8,7 @@ IntType, ListType, NullType, + ProxyType, StringType, UnionType, ) @@ -244,9 +245,8 @@ def test_postgresql_converter(column_props, expected): result = PostgresqlConverter()._parse_type(column_props) assert result == expected - -def test_postgresql_converter_array(): - converter = PostgresqlConverter() +def test_postgresql_converter_array_with_dimensionality(): + converter = PostgresqlConverter(False) column_props = { "COLUMN_NAME": "test_column", "DATA_TYPE": "array", @@ -267,3 +267,30 @@ def test_postgresql_converter_array(): ) result = converter._parse_type(column_props) assert result == expected + +def test_postgresql_converter_array_ignore_dimensionality(): + converter = PostgresqlConverter(True) + column_props = { + "COLUMN_NAME": "test_column", + "DATA_TYPE": "array", + "CHARACTER_MAXIMUM_LENGTH": None, + "CHARACTER_OCTET_LENGTH": None, + "NUMERIC_PRECISION": 5, + "NUMERIC_SCALE": 0, + "UDT_NAME": "_int4", + "ATTNDIMS": 1, + } + expected = ListType( + alias="_root.test_column", + values=UnionType( + types=[ + IntType(bits=32, signed=True), + ProxyType( + alias="_root.test_column", + registry=converter.registry, + ), + ], + ), + ) + result = converter._parse_type(column_props) + assert result == expected From 09dc94966fa0bb60dc007c765994a70006102cbb Mon Sep 17 00:00:00 2001 From: Alex DeMeo Date: Mon, 25 Dec 2023 18:48:45 -0800 Subject: [PATCH 5/7] style and typing --- recap/clients/postgresql.py | 6 +++++- recap/converters/postgresql.py | 9 +++++++-- tests/integration/clients/test_postgresql.py | 21 ++++++++++---------- tests/unit/converters/test_postgresql.py | 2 ++ 4 files changed, 24 insertions(+), 14 deletions(-) diff --git a/recap/clients/postgresql.py b/recap/clients/postgresql.py index be6f950..d35a8d0 100644 --- a/recap/clients/postgresql.py +++ b/recap/clients/postgresql.py @@ -49,7 +49,11 @@ class PostgresqlClient(DbapiClient): - def __init__(self, connection: Connection, converter = PostgresqlConverter()) -> None: + def __init__( + self, + connection: Connection, + converter: PostgresqlConverter = PostgresqlConverter(), + ) -> None: super().__init__(connection, converter) @staticmethod diff --git a/recap/converters/postgresql.py b/recap/converters/postgresql.py index 2f0eced..607a73b 100644 --- a/recap/converters/postgresql.py +++ b/recap/converters/postgresql.py @@ -1,5 +1,5 @@ from math import ceil -from typing import Any, Union +from typing import Any from recap.converters.dbapi import DbapiConverter from recap.types import ( @@ -23,8 +23,13 @@ Namespace to use when no namespace is specified in the schema. """ + class PostgresqlConverter(DbapiConverter): - def __init__(self, ignore_array_dimensionality = True, namespace: str = DEFAULT_NAMESPACE): + def __init__( + self, + ignore_array_dimensionality: bool = True, + namespace: str = DEFAULT_NAMESPACE, + ): # since array dimensionality is not enforced by PG schemas: # if `ignore_array_dimensionality = True` then read arrays irrespective of how many dimensions they have # if `ignore_array_dimensionality = False` then read arrays as nested lists diff --git a/tests/integration/clients/test_postgresql.py b/tests/integration/clients/test_postgresql.py index 2c08ca5..a33d47a 100644 --- a/tests/integration/clients/test_postgresql.py +++ b/tests/integration/clients/test_postgresql.py @@ -2,7 +2,7 @@ from recap.clients import create_client from recap.clients.postgresql import PostgresqlClient -from recap.converters.postgresql import PostgresqlConverter, MAX_FIELD_SIZE +from recap.converters.postgresql import MAX_FIELD_SIZE, PostgresqlConverter from recap.types import ( BoolType, BytesType, @@ -71,7 +71,6 @@ def teardown_class(cls): # Close the connection cls.connection.close() - def test_struct_method_arrays_ignore_dimensionality(self): client = PostgresqlClient(self.connection, PostgresqlConverter(True)) test_types_struct = client.schema("testdb", "public", "test_types") @@ -248,7 +247,7 @@ def test_struct_method_arrays_ignore_dimensionality(self): ), ], ), - UnionType( + UnionType( default=None, name="test_text_array_3d", types=[ @@ -257,9 +256,7 @@ def test_struct_method_arrays_ignore_dimensionality(self): alias="_root.test_text_array_3d", values=UnionType( types=[ - StringType( - bytes_=MAX_FIELD_SIZE, variable=True - ), + StringType(bytes_=MAX_FIELD_SIZE, variable=True), ProxyType( alias="_root.test_text_array_3d", registry=client.converter.registry, # type: ignore @@ -468,10 +465,12 @@ def test_create_client(self): assert client.ls("testdb", "public") == ["test_types"] -def validate_results(test_types_struct: StructType, expected_fields: list[RecapType]) -> None: +def validate_results( + test_types_struct: StructType, expected_fields: list[RecapType] +) -> None: # Going field by field to make debugging easier when test fails - for field, expected_field in zip(test_types_struct.fields, expected_fields): - print() - assert field == expected_field + for field, expected_field in zip(test_types_struct.fields, expected_fields): + print() + assert field == expected_field - assert test_types_struct == StructType(fields=expected_fields) + assert test_types_struct == StructType(fields=expected_fields) diff --git a/tests/unit/converters/test_postgresql.py b/tests/unit/converters/test_postgresql.py index f37de43..fc234ab 100644 --- a/tests/unit/converters/test_postgresql.py +++ b/tests/unit/converters/test_postgresql.py @@ -245,6 +245,7 @@ def test_postgresql_converter(column_props, expected): result = PostgresqlConverter()._parse_type(column_props) assert result == expected + def test_postgresql_converter_array_with_dimensionality(): converter = PostgresqlConverter(False) column_props = { @@ -268,6 +269,7 @@ def test_postgresql_converter_array_with_dimensionality(): result = converter._parse_type(column_props) assert result == expected + def test_postgresql_converter_array_ignore_dimensionality(): converter = PostgresqlConverter(True) column_props = { From 2cc2753c3891b6d3e9d4144f0f44b707e1d13555 Mon Sep 17 00:00:00 2001 From: Alex DeMeo Date: Mon, 25 Dec 2023 18:51:36 -0800 Subject: [PATCH 6/7] remove superfluous print() I missed --- tests/integration/clients/test_postgresql.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/clients/test_postgresql.py b/tests/integration/clients/test_postgresql.py index a33d47a..e22d181 100644 --- a/tests/integration/clients/test_postgresql.py +++ b/tests/integration/clients/test_postgresql.py @@ -470,7 +470,6 @@ def validate_results( ) -> None: # Going field by field to make debugging easier when test fails for field, expected_field in zip(test_types_struct.fields, expected_fields): - print() assert field == expected_field assert test_types_struct == StructType(fields=expected_fields) From 1c97a1e6c7da092998896ca8327ec41411d640a0 Mon Sep 17 00:00:00 2001 From: Alex DeMeo Date: Sat, 30 Dec 2023 11:28:10 -0800 Subject: [PATCH 7/7] rename ignore_array_dimensionality -> enforce_array_dimensions --- recap/converters/postgresql.py | 14 +++++++------- tests/integration/clients/test_postgresql.py | 8 ++++---- tests/unit/converters/test_postgresql.py | 8 ++++---- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/recap/converters/postgresql.py b/recap/converters/postgresql.py index 607a73b..e5f11f5 100644 --- a/recap/converters/postgresql.py +++ b/recap/converters/postgresql.py @@ -27,13 +27,13 @@ class PostgresqlConverter(DbapiConverter): def __init__( self, - ignore_array_dimensionality: bool = True, + enforce_array_dimensions: bool = False, namespace: str = DEFAULT_NAMESPACE, ): # since array dimensionality is not enforced by PG schemas: - # if `ignore_array_dimensionality = True` then read arrays irrespective of how many dimensions they have - # if `ignore_array_dimensionality = False` then read arrays as nested lists - self.ignore_array_dimensionality = ignore_array_dimensionality + # if `enforce_array_dimensions = False` then read arrays irrespective of how many dimensions they have + # if `enforce_array_dimensions = True` then read arrays as nested lists + self.enforce_array_dimensions = enforce_array_dimensions self.namespace = namespace self.registry = RecapTypeRegistry() @@ -115,7 +115,9 @@ def _parse_type(self, column_props: dict[str, Any]) -> RecapType: "ATTNDIMS": 0, } ) - if self.ignore_array_dimensionality: + if self.enforce_array_dimensions: + base_type = self._create_n_dimension_list(value_type, ndims) + else: column_name_without_periods = column_name.replace(".", "_") base_type_alias = f"{self.namespace}.{column_name_without_periods}" # Construct a self-referencing list comprised of the array's value @@ -136,8 +138,6 @@ def _parse_type(self, column_props: dict[str, Any]) -> RecapType: ), ) self.registry.register_alias(base_type) - else: - base_type = self._create_n_dimension_list(value_type, ndims) else: raise ValueError(f"Unknown data type: {data_type}") diff --git a/tests/integration/clients/test_postgresql.py b/tests/integration/clients/test_postgresql.py index e22d181..27c70e0 100644 --- a/tests/integration/clients/test_postgresql.py +++ b/tests/integration/clients/test_postgresql.py @@ -71,8 +71,8 @@ def teardown_class(cls): # Close the connection cls.connection.close() - def test_struct_method_arrays_ignore_dimensionality(self): - client = PostgresqlClient(self.connection, PostgresqlConverter(True)) + def test_struct_method_arrays_no_enforce_dimensions(self): + client = PostgresqlClient(self.connection, PostgresqlConverter(False)) test_types_struct = client.schema("testdb", "public", "test_types") expected_fields = [ @@ -269,8 +269,8 @@ def test_struct_method_arrays_ignore_dimensionality(self): ] validate_results(test_types_struct, expected_fields) - def test_struct_method_arrays_with_dimensionality(self): - client = PostgresqlClient(self.connection, PostgresqlConverter(False)) # type: ignore + def test_struct_method_arrays_enforce_dimensions(self): + client = PostgresqlClient(self.connection, PostgresqlConverter(True)) # type: ignore test_types_struct = client.schema("testdb", "public", "test_types") expected_fields = [ diff --git a/tests/unit/converters/test_postgresql.py b/tests/unit/converters/test_postgresql.py index fc234ab..4970cf8 100644 --- a/tests/unit/converters/test_postgresql.py +++ b/tests/unit/converters/test_postgresql.py @@ -246,8 +246,8 @@ def test_postgresql_converter(column_props, expected): assert result == expected -def test_postgresql_converter_array_with_dimensionality(): - converter = PostgresqlConverter(False) +def test_postgresql_converter_array_enforce_dimensions(): + converter = PostgresqlConverter(True) column_props = { "COLUMN_NAME": "test_column", "DATA_TYPE": "array", @@ -270,8 +270,8 @@ def test_postgresql_converter_array_with_dimensionality(): assert result == expected -def test_postgresql_converter_array_ignore_dimensionality(): - converter = PostgresqlConverter(True) +def test_postgresql_converter_array_no_enforce_dimensions(): + converter = PostgresqlConverter(False) column_props = { "COLUMN_NAME": "test_column", "DATA_TYPE": "array",