From e2a91c9cb05d4492686ab380bcc4eaadc4d304a5 Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Thu, 27 Jun 2024 11:40:36 -0800 Subject: [PATCH] feat(api): make struct() idempotent, add `type` option Part of https://github.com/ibis-project/ibis/issues/8289 Depends on the array API getting updated --- ibis/backends/exasol/compiler.py | 1 + ibis/backends/sqlite/compiler.py | 1 + ibis/backends/tests/test_struct.py | 77 ++++++++++++++++++++++++------ ibis/expr/types/structs.py | 75 ++++++++++++++++++++--------- ibis/tests/expr/test_literal.py | 7 +-- 5 files changed, 118 insertions(+), 43 deletions(-) diff --git a/ibis/backends/exasol/compiler.py b/ibis/backends/exasol/compiler.py index c752f9f9068d..10307b1a7a28 100644 --- a/ibis/backends/exasol/compiler.py +++ b/ibis/backends/exasol/compiler.py @@ -73,6 +73,7 @@ class ExasolCompiler(SQLGlotCompiler): ops.StringSplit, ops.StringToDate, ops.StringToTimestamp, + ops.StructColumn, ops.TimeDelta, ops.TimestampAdd, ops.TimestampBucket, diff --git a/ibis/backends/sqlite/compiler.py b/ibis/backends/sqlite/compiler.py index 805c7dfb5b47..549f769ae102 100644 --- a/ibis/backends/sqlite/compiler.py +++ b/ibis/backends/sqlite/compiler.py @@ -62,6 +62,7 @@ class SQLiteCompiler(SQLGlotCompiler): ops.TimestampDiff, ops.StringToDate, ops.StringToTimestamp, + ops.StructColumn, ops.TimeDelta, ops.DateDelta, ops.TimestampDelta, diff --git a/ibis/backends/tests/test_struct.py b/ibis/backends/tests/test_struct.py index 6a7429a6c2ff..de0c904cfcba 100644 --- a/ibis/backends/tests/test_struct.py +++ b/ibis/backends/tests/test_struct.py @@ -14,12 +14,14 @@ import ibis.expr.datatypes as dt from ibis import util from ibis.backends.tests.errors import ( + ClickHouseDatabaseError, PolarsColumnNotFoundError, PsycoPg2InternalError, PsycoPg2SyntaxError, Py4JJavaError, PySparkAnalysisException, ) +from ibis.common.annotations import ValidationError from ibis.common.exceptions import IbisError pytestmark = [ @@ -28,6 +30,62 @@ pytest.mark.notimpl(["datafusion", "druid", "oracle", "exasol"]), ] +mark_notimpl_postgres_literals = pytest.mark.notimpl( + "postgres", reason="struct literals not implemented", raises=PsycoPg2SyntaxError +) + + +@pytest.mark.notimpl("risingwave") +@pytest.mark.broken("postgres", reason="JSON handling is buggy") +@pytest.mark.notimpl( + "flink", + raises=Py4JJavaError, + reason="Unexpected error in type inference logic of function 'COALESCE'", +) +def test_struct_factory(con): + s = ibis.struct({"a": 1, "b": 2}) + assert con.execute(s) == {"a": 1, "b": 2} + + s2 = ibis.struct(s) + assert con.execute(s2) == {"a": 1, "b": 2} + + typed = ibis.struct({"a": 1, "b": 2}, type="struct") + assert con.execute(typed) == {"a": "1", "b": "2"} + + typed2 = ibis.struct(s, type="struct") + assert con.execute(typed2) == {"a": "1", "b": "2"} + + items = ibis.struct([("a", 1), ("b", 2)]) + assert con.execute(items) == {"a": 1, "b": 2} + + +@pytest.mark.parametrize("type", ["struct<>", "struct"]) +@pytest.mark.parametrize("val", [{}, []]) +def test_struct_factory_empty(val, type): + with pytest.raises(ValidationError): + ibis.struct(val, type=type) + + +@pytest.mark.notimpl("risingwave") +@mark_notimpl_postgres_literals +@pytest.mark.notyet( + "clickhouse", raises=ClickHouseDatabaseError, reason="nested types can't be NULL" +) +@pytest.mark.broken( + "polars", + reason=r"pl.lit(None, type='struct') gives {'a': None}: https://github.com/pola-rs/polars/issues/3462", +) +def test_struct_factory_null(con): + with pytest.raises(ValidationError): + ibis.struct(None) + none_typed = ibis.struct(None, type="struct") + assert none_typed.type() == dt.Struct(fields={"a": dt.float64, "b": dt.float64}) + assert con.execute(none_typed) is None + # Execute a real value here, so the backends that don't support structs + # actually xfail as we expect them to. + # Otherwise would have to @mark.xfail every test in this file besides this one. + assert con.execute(ibis.struct({"a": 1, "b": 2})) == {"a": 1, "b": 2} + @pytest.mark.notimpl(["dask"]) @pytest.mark.parametrize( @@ -78,6 +136,9 @@ def test_all_fields(struct, struct_df): @pytest.mark.notimpl(["postgres", "risingwave"]) @pytest.mark.parametrize("field", ["a", "b", "c"]) +@pytest.mark.notyet( + ["flink"], reason="flink doesn't support creating struct columns from literals" +) def test_literal(backend, con, field): query = _STRUCT_LITERAL[field] dtype = query.type().to_pandas() @@ -87,7 +148,7 @@ def test_literal(backend, con, field): backend.assert_series_equal(result, expected.astype(dtype)) -@pytest.mark.notimpl(["postgres"]) +@mark_notimpl_postgres_literals @pytest.mark.parametrize("field", ["a", "b", "c"]) @pytest.mark.notyet( ["clickhouse"], reason="clickhouse doesn't support nullable nested types" @@ -137,14 +198,6 @@ def test_collect_into_struct(alltypes): assert len(val.loc[result.group == "1"].iat[0]["key"]) == 730 -@pytest.mark.notimpl( - ["postgres"], reason="struct literals not implemented", raises=PsycoPg2SyntaxError -) -@pytest.mark.notimpl( - ["risingwave"], - reason="struct literals not implemented", - raises=PsycoPg2InternalError, -) @pytest.mark.notimpl(["flink"], raises=Py4JJavaError, reason="not implemented in ibis") def test_field_access_after_case(con): s = ibis.struct({"a": 3}) @@ -240,12 +293,6 @@ def test_keyword_fields(con, nullable): raises=PolarsColumnNotFoundError, reason="doesn't seem to support IN-style subqueries on structs", ) -@pytest.mark.notimpl( - # https://github.com/pandas-dev/pandas/issues/58909 - ["pandas", "dask"], - raises=TypeError, - reason="unhashable type: 'dict'", -) @pytest.mark.xfail_version( pyspark=["pyspark<3.5"], reason="requires pyspark 3.5", diff --git a/ibis/expr/types/structs.py b/ibis/expr/types/structs.py index 7de4c2845190..59d0d00cd669 100644 --- a/ibis/expr/types/structs.py +++ b/ibis/expr/types/structs.py @@ -1,28 +1,33 @@ from __future__ import annotations -import collections from keyword import iskeyword from typing import TYPE_CHECKING from public import public +import ibis.expr.datatypes as dt import ibis.expr.operations as ops +import ibis.expr.types as ir +from ibis.common.annotations import ValidationError from ibis.common.deferred import deferrable from ibis.common.exceptions import IbisError -from ibis.expr.types.generic import Column, Scalar, Value, literal +from ibis.expr.types.generic import Column, Scalar, Value if TYPE_CHECKING: from collections.abc import Iterable, Mapping, Sequence - import ibis.expr.datatypes as dt - import ibis.expr.types as ir from ibis.expr.types.typing import V @public @deferrable def struct( - value: Iterable[tuple[str, V]] | Mapping[str, V], + value: Iterable[tuple[str, V]] + | Mapping[str, V] + | StructValue + | ir.NullValue + | None, + *, type: str | dt.DataType | None = None, ) -> StructValue: """Create a struct expression. @@ -37,8 +42,7 @@ def struct( `(str, Value)`. type An instance of `ibis.expr.datatypes.DataType` or a string indicating - the Ibis type of `value`. This is only used if all of the input values - are Python literals. eg `struct`. + the Ibis type of `value`. eg `struct`. Returns ------- @@ -66,26 +70,51 @@ def struct( Create a struct column from a column and a scalar literal >>> t = ibis.memtable({"a": [1, 2, 3]}) - >>> ibis.struct([("a", t.a), ("b", "foo")]) - ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ - ┃ StructColumn() ┃ - ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ - │ struct │ - ├─────────────────────────────┤ - │ {'a': 1, 'b': 'foo'} │ - │ {'a': 2, 'b': 'foo'} │ - │ {'a': 3, 'b': 'foo'} │ - └─────────────────────────────┘ + >>> ibis.struct([("a", t.a), ("b", "foo")], type="struct") + ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ + ┃ Cast(StructColumn(), struct) ┃ + ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ + │ struct │ + ├─────────────────────────────────────────────────────┤ + │ {'a': 1.0, 'b': 'foo'} │ + │ {'a': 2.0, 'b': 'foo'} │ + │ {'a': 3.0, 'b': 'foo'} │ + └─────────────────────────────────────────────────────┘ """ import ibis.expr.operations as ops + type = dt.dtype(type) if type is not None else None + if type is not None and not isinstance(type, dt.Struct): + raise ValidationError(f"type must be an struct, got {type}") + + if isinstance(value, ir.Value): + if type is not None: + return value.cast(type) + elif isinstance(value, StructValue): + return value + else: + raise ValidationError( + f"If no type passed, value must be a struct, got {value.type()}" + ) + + if value is None: + if type is None: + raise ValidationError("If values is None/NULL, type must be provided") + return ir.null(type) + fields = dict(value) - if any(isinstance(value, Value) for value in fields.values()): - names = tuple(fields.keys()) - values = tuple(fields.values()) - return ops.StructColumn(names=names, values=values).to_expr() - else: - return literal(collections.OrderedDict(fields), type=type) + if not fields: + raise ValidationError("Struct must have at least one field") + names = fields.keys() + result = ops.StructColumn(names=names, values=fields.values()).to_expr() + if type is not None: + if not set(names).issuperset(type.names): + raise ValidationError( + f"The passed type requires fields {type.names}", + f" but only found fields {names}", + ) + result = result.cast(type) + return result @public diff --git a/ibis/tests/expr/test_literal.py b/ibis/tests/expr/test_literal.py index 0893a7eec4bc..cdec03fd3221 100644 --- a/ibis/tests/expr/test_literal.py +++ b/ibis/tests/expr/test_literal.py @@ -8,7 +8,7 @@ import ibis import ibis.expr.datatypes as dt -from ibis.common.collections import frozendict +from ibis.common.annotations import ValidationError from ibis.expr.operations import Literal from ibis.tests.util import assert_pickle_roundtrip @@ -109,9 +109,6 @@ def test_normalized_underlying_value(userinput, literal_type, expected_type): def test_struct_literal(value): typestr = "struct" a = ibis.struct(value, type=typestr) - assert a.op().value == frozendict( - field1=str(value["field1"]), field2=float(value["field2"]) - ) assert a.type() == dt.dtype(typestr) @@ -123,7 +120,7 @@ def test_struct_literal(value): ], ) def test_struct_literal_non_castable(value): - with pytest.raises(TypeError, match="Unable to normalize"): + with pytest.raises(ValidationError): ibis.struct(value, type="struct")