diff --git a/ibis/backends/pandas/executor.py b/ibis/backends/pandas/executor.py index a3153d17b8b47..868a08a98dfc8 100644 --- a/ibis/backends/pandas/executor.py +++ b/ibis/backends/pandas/executor.py @@ -49,7 +49,9 @@ def visit(cls, op: ops.Node, **kwargs): @classmethod def visit(cls, op: ops.Literal, value, dtype): - if dtype.is_interval(): + if value is None: + value = None + elif dtype.is_interval(): value = pd.Timedelta(value, dtype.unit.short) elif dtype.is_array(): value = np.array(value) @@ -219,6 +221,11 @@ def visit(cls, op: ops.FindInSet, needle, values): result = np.select(condlist, choicelist, default=-1) return pd.Series(result, name=op.name) + @classmethod + def visit(cls, op: ops.EmptyArray, dtype): + pdt = PandasType.from_ibis(dtype) + return np.array([], dtype=pdt) + @classmethod def visit(cls, op: ops.Array, exprs): return cls.rowwise(lambda row: np.array(row, dtype=object), exprs) diff --git a/ibis/backends/polars/compiler.py b/ibis/backends/polars/compiler.py index 4d9a497191b4a..08f4b46ffb3ea 100644 --- a/ibis/backends/polars/compiler.py +++ b/ibis/backends/polars/compiler.py @@ -87,12 +87,7 @@ def literal(op, **_): value = op.value dtype = op.dtype - if dtype.is_array(): - value = pl.Series("", value) - typ = PolarsType.from_ibis(dtype) - val = pl.lit(value, dtype=typ) - return val.implode() - elif dtype.is_struct(): + if dtype.is_struct(): values = [ pl.lit(v, dtype=PolarsType.from_ibis(dtype[k])).alias(k) for k, v in value.items() @@ -106,7 +101,7 @@ def literal(op, **_): return pl.lit(value) else: typ = PolarsType.from_ibis(dtype) - return pl.lit(op.value, dtype=typ) + return pl.lit(value, dtype=typ) _TIMESTAMP_SCALE_TO_UNITS = { @@ -973,6 +968,12 @@ def array_concat(op, **kw): return result +@translate.register(ops.EmptyArray) +def empty_array(op, **kw): + pdt = PolarsType.from_ibis(op.dtype) + return pl.lit([], dtype=pdt) + + @translate.register(ops.Array) def array_column(op, **kw): cols = [translate(col, **kw) for col in op.exprs] diff --git a/ibis/backends/sql/compiler.py b/ibis/backends/sql/compiler.py index 09f26b80d62fe..18699aa560e54 100644 --- a/ibis/backends/sql/compiler.py +++ b/ibis/backends/sql/compiler.py @@ -1019,6 +1019,9 @@ def visit_InSubquery(self, op, *, rel, needle): query = sg.select(STAR).from_(query) return needle.isin(query=query) + def visit_EmptyArray(self, op, *, dtype): + return self.cast(self.f.array(), dtype) + def visit_Array(self, op, *, exprs): return self.f.array(*exprs) diff --git a/ibis/backends/tests/test_array.py b/ibis/backends/tests/test_array.py index 8b55c189e9d6b..fb46d5357d244 100644 --- a/ibis/backends/tests/test_array.py +++ b/ibis/backends/tests/test_array.py @@ -31,6 +31,7 @@ PySparkAnalysisException, TrinoUserError, ) +from ibis.common.annotations import ValidationError from ibis.common.collections import frozendict pytestmark = [ @@ -72,6 +73,71 @@ # list. +def test_array_factory(con): + a = ibis.array([1, 2, 3]) + assert a.type() == dt.Array(value_type=dt.Int8) + assert con.execute(a) == [1, 2, 3] + + a2 = ibis.array(a) + assert a.type() == dt.Array(value_type=dt.Int8) + assert con.execute(a2) == [1, 2, 3] + + +def test_array_factory_typed(con): + typed = ibis.array([1, 2, 3], type="array") + assert con.execute(typed) == ["1", "2", "3"] + + typed2 = ibis.array(ibis.array([1, 2, 3]), type="array") + assert con.execute(typed2) == ["1", "2", "3"] + + +@pytest.mark.notimpl("flink", raises=Py4JJavaError) +def test_array_factory_empty(con): + with pytest.raises(ValidationError): + ibis.array([]) + + empty_typed = ibis.array([], type="array") + assert empty_typed.type() == dt.Array(value_type=dt.string) + assert con.execute(empty_typed) == [] + + +@pytest.mark.notyet( + "clickhouse", raises=ClickHouseDatabaseError, reason="nested types can't be NULL" +) +def test_array_factory_null(con): + with pytest.raises(ValidationError): + ibis.array(None) + with pytest.raises(ValidationError): + ibis.array(None, type="int64") + none_typed = ibis.array(None, type="array") + assert none_typed.type() == dt.Array(value_type=dt.string) + assert con.execute(none_typed) is None + + nones = ibis.array([None, None], type="array") + assert nones.type() == dt.Array(value_type=dt.string) + assert con.execute(nones) == [None, None] + + # Execute a real value here, so the backends that don't support arrays + # actually xfail as we expect them to. + # Otherwise would have to @mark.xfail every test in this file besides this one. + assert con.execute(ibis.array([1, 2])) == [1, 2] + + +@pytest.mark.broken( + ["datafusion", "flink", "polars"], + raises=AssertionError, + reason="[None, 1] executes to [np.nan, 1.0]", +) +def test_array_factory_null_mixed(con): + none_and_val = ibis.array([None, 1]) + assert none_and_val.type() == dt.Array(value_type=dt.Int8) + assert con.execute(none_and_val) == [None, 1] + + none_and_val_typed = ibis.array([None, 1], type="array") + assert none_and_val_typed.type() == dt.Array(value_type=dt.String) + assert con.execute(none_and_val_typed) == [None, "1"] + + def test_array_column(backend, alltypes, df): expr = ibis.array( [alltypes["double_col"], alltypes["double_col"], 5.0, ibis.literal(6.0)] @@ -1356,7 +1422,7 @@ def test_unnest_range(con): pytest.mark.notyet(["bigquery"], raises=GoogleBadRequest), pytest.mark.broken( ["polars"], - reason="expression input not supported with nested arrays", + reason="upstream polars bug: https://github.com/pola-rs/polars/issues/17294", raises=TypeError, ), ], diff --git a/ibis/backends/tests/test_generic.py b/ibis/backends/tests/test_generic.py index 3ed4a9db8cc5e..9f9a5ce5e4c96 100644 --- a/ibis/backends/tests/test_generic.py +++ b/ibis/backends/tests/test_generic.py @@ -1431,13 +1431,12 @@ def query(t, group_cols): snapshot.assert_match(str(ibis.to_sql(t3, dialect=con.name)), "out.sql") -@pytest.mark.notimpl(["oracle", "exasol"], raises=com.OperationNotDefinedError) -@pytest.mark.notimpl(["druid"], raises=AssertionError) @pytest.mark.notyet( - ["datafusion", "impala", "mssql", "mysql", "sqlite"], + ["datafusion", "exasol", "impala", "mssql", "mysql", "oracle", "sqlite"], reason="backend doesn't support arrays and we don't implement pivot_longer with unions yet", - raises=com.OperationNotDefinedError, + raises=(com.OperationNotDefinedError, com.UnsupportedBackendType), ) +@pytest.mark.notimpl(["druid"], raises=AssertionError) @pytest.mark.broken( ["trino"], reason="invalid code generated for unnesting a struct", diff --git a/ibis/backends/tests/test_sql.py b/ibis/backends/tests/test_sql.py index 777cfa3db8bb3..1f3f8cb69ad5a 100644 --- a/ibis/backends/tests/test_sql.py +++ b/ibis/backends/tests/test_sql.py @@ -30,8 +30,18 @@ ibis.struct(dict(abc=432)), marks=[ pytest.mark.never( - ["impala", "mysql", "sqlite", "mssql", "exasol"], - raises=(NotImplementedError, exc.UnsupportedBackendType), + [ + "exasol", + "impala", + "mysql", + "sqlite", + "mssql", + ], + raises=( + exc.OperationNotDefinedError, + NotImplementedError, + exc.UnsupportedBackendType, + ), reason="structs not supported in the backend", ), pytest.mark.notimpl( @@ -104,7 +114,7 @@ def test_isin_bug(con, snapshot): @pytest.mark.notyet( ["datafusion", "exasol", "oracle", "flink", "risingwave"], reason="no unnest support", - raises=exc.OperationNotDefinedError, + raises=(exc.OperationNotDefinedError, exc.UnsupportedBackendType), ) @pytest.mark.notyet( ["sqlite", "mysql", "druid", "impala", "mssql"], reason="no unnest support upstream" diff --git a/ibis/backends/tests/test_string.py b/ibis/backends/tests/test_string.py index ceb9fdc77711b..d56f5f934024c 100644 --- a/ibis/backends/tests/test_string.py +++ b/ibis/backends/tests/test_string.py @@ -835,6 +835,11 @@ def test_capitalize(con, inp, expected): assert pd.isnull(result) +@pytest.mark.never( + ["exasol", "impala", "mssql", "mysql", "sqlite"], + reason="Backend doesn't support arrays", + raises=(com.OperationNotDefinedError, com.UnsupportedBackendType), +) @pytest.mark.notimpl( [ "dask", @@ -842,11 +847,6 @@ def test_capitalize(con, inp, expected): "polars", "oracle", "flink", - "sqlite", - "mssql", - "mysql", - "exasol", - "impala", ], raises=com.OperationNotDefinedError, ) diff --git a/ibis/expr/operations/arrays.py b/ibis/expr/operations/arrays.py index 6d68baab94c32..c075c27dfd200 100644 --- a/ibis/expr/operations/arrays.py +++ b/ibis/expr/operations/arrays.py @@ -2,7 +2,7 @@ from __future__ import annotations -from typing import Optional +from typing import TYPE_CHECKING, Annotated, Optional from public import public @@ -13,16 +13,25 @@ from ibis.common.typing import VarTuple # noqa: TCH001 from ibis.expr.operations.core import Unary, Value +if TYPE_CHECKING: + from ibis.common.patterns import Length + + +@public +class EmptyArray(Value): + """Construct an array with 0 elements.""" + + dtype: dt.Array + shape = ds.scalar + @public class Array(Value): - """Construct an array.""" + """Construct an array with 1+ elements. Use `EmptyArray` for empty arrays.""" - exprs: VarTuple[Value] + exprs: Annotated[VarTuple[Value], Length(at_least=1)] - @attribute - def shape(self): - return rlz.highest_precedence_shape(self.exprs) + shape = rlz.shape_like("exprs") @attribute def dtype(self): diff --git a/ibis/expr/types/arrays.py b/ibis/expr/types/arrays.py index 2d9e5a8f5b3a3..c4116ed188e0a 100644 --- a/ibis/expr/types/arrays.py +++ b/ibis/expr/types/arrays.py @@ -5,14 +5,17 @@ from public import public +import ibis +import ibis.expr.datatypes as dt import ibis.expr.operations as ops +import ibis.expr.types as ir +from ibis.common.annotations import ValidationError from ibis.common.deferred import Deferred, deferrable from ibis.expr.types.generic import Column, Scalar, Value if TYPE_CHECKING: from collections.abc import Callable, Iterable - import ibis.expr.types as ir from ibis.expr.types.typing import V import ibis.common.exceptions as com @@ -1067,7 +1070,11 @@ def __getitem__(self, index: int | ir.IntegerValue | slice) -> ir.Column: @public @deferrable -def array(values: Iterable[V]) -> ArrayValue: +def array( + values: ArrayValue | Iterable[V] | ir.NullValue | None, + *, + type: str | dt.DataType | None = None, +) -> ArrayValue: """Create an array expression. If any values are [column expressions](../concepts/datatypes.qmd) the @@ -1078,6 +1085,9 @@ def array(values: Iterable[V]) -> ArrayValue: ---------- values An iterable of Ibis expressions or Python literals + type + An instance of `ibis.expr.datatypes.DataType` or a string indicating + the Ibis type of `value`. eg `array`. Returns ------- @@ -1108,15 +1118,49 @@ def array(values: Iterable[V]) -> ArrayValue: │ [3, 42, ... +1] │ └──────────────────────┘ - >>> ibis.array([t.a, 42 + ibis.literal(5)]) + >>> ibis.array([t.a, 42 + ibis.literal(5)], type="array") ┏━━━━━━━━━━━━━━━━━━━━━━┓ - ┃ Array() ┃ + ┃ Array( ) ┃ ┡━━━━━━━━━━━━━━━━━━━━━━┩ - │ array │ + │ array │ ├──────────────────────┤ - │ [1, 47] │ - │ [2, 47] │ - │ [3, 47] │ + │ [1.0, 47.0] │ + │ [2.0, 47.0] │ + │ [3.0, 47.0] │ └──────────────────────┘ """ - return ops.Array(tuple(values)).to_expr() + type = dt.dtype(type) if type is not None else None + if type is not None and not isinstance(type, dt.Array): + raise ValidationError(f"type must be an array, got {type}") + + if isinstance(values, ir.Value): + if type is not None: + return values.cast(type) + elif isinstance(values, ArrayValue): + return values + else: + raise ValidationError( + f"If no type passed, values must be an array, got {values.type()}" + ) + + if values is None: + if type is None: + raise ValidationError("If values is None/NULL, type must be provided") + return ir.null(type) + + values = tuple(values) + if len(values) == 0: + if type is None: + raise ValidationError("If values is empty, type must be provided") + return ops.EmptyArray(type).to_expr() + else: + value_type = type.value_type if type is not None else None + values = [_value(v, value_type) for v in values] + return ops.Array(values).to_expr() + + +def _value(x, type) -> ir.Value: + if isinstance(x, (ir.Value, Deferred)): + return x.cast(type) if type is not None else x + else: + return ibis.literal(x, type=type)