From df7bab84df20575c60f669fc176945b24ef57476 Mon Sep 17 00:00:00 2001 From: Venkat Allam Date: Mon, 18 Nov 2024 12:09:46 -0600 Subject: [PATCH 1/6] feat: add datafusion map methods to datafusion compiler fmt --- ibis/backends/sql/compilers/datafusion.py | 26 +++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/ibis/backends/sql/compilers/datafusion.py b/ibis/backends/sql/compilers/datafusion.py index 449bfd78aef9..ef2ffd3d6024 100644 --- a/ibis/backends/sql/compilers/datafusion.py +++ b/ibis/backends/sql/compilers/datafusion.py @@ -67,6 +67,10 @@ class DataFusionCompiler(SQLGlotCompiler): ops.EndsWith: "ends_with", ops.ArrayIntersect: "array_intersect", ops.ArrayUnion: "array_union", + ops.MapKeys: "map_keys", + ops.MapValues: "map_values", + ops.MapLength: "cardinality", + ops.IsNull: "ifnull", } def _to_timestamp(self, value, target_dtype, literal=False): @@ -533,5 +537,27 @@ def visit_ArrayFlatten(self, op, *, arg): def visit_RandomUUID(self, op, **kw): return self.f.anon.uuid() + def visit_MapContains(self, op, *, arg, key): + return self.if_( + sg.or_(arg.is_(NULL), key.is_(NULL)), + NULL, + sge.NEQ(self.f.cardinality(self.f.map_extract(arg, key)) != 0), + ) + + def visit_MapGet(self, op, *, arg, key, default): + return self.if_( + sg.or_(arg.is_(NULL), key.is_(NULL)), + NULL, + self.f.ifnull( + self.f.list_extract(self.f.map_extract(arg, key), 1), + default, + ), + ) + + +# def visit_MapKeys(self, op, *, arg): +# return self.if_(arg.is_(NULL), NULL, self.f.map_keys(arg)) +# ops.MapMerge: "mapUpdate", ## need to implement this as a visitor node + compiler = DataFusionCompiler() From f1890fff1e50955edc97b7176cad5c84261c147b Mon Sep 17 00:00:00 2001 From: Venkat Allam Date: Wed, 20 Nov 2024 02:43:31 -0600 Subject: [PATCH 2/6] feat(datafusion): remove pytest not yet annotations, fix fmt --- ibis/backends/sql/compilers/datafusion.py | 7 +++--- ibis/backends/tests/test_map.py | 27 ----------------------- 2 files changed, 4 insertions(+), 30 deletions(-) diff --git a/ibis/backends/sql/compilers/datafusion.py b/ibis/backends/sql/compilers/datafusion.py index 0872581195a7..768e70738edd 100644 --- a/ibis/backends/sql/compilers/datafusion.py +++ b/ibis/backends/sql/compilers/datafusion.py @@ -544,7 +544,7 @@ def visit_ArrayConcat(self, op, *, arg): ), map(partial(self.cast, to=op.dtype), arg), ) - + def visit_MapGet(self, op, *, arg, key, default): return self.if_( sg.or_(arg.is_(NULL), key.is_(NULL)), @@ -559,9 +559,10 @@ def visit_MapContains(self, op, *, arg, key): return self.if_( sg.or_(arg.is_(NULL), key.is_(NULL)), NULL, - sge.NEQ(self.f.cardinality(self.f.map_extract(arg, key)) != 0), + self.f.list_contains(self.f.map_keys(arg), key), ) - + # ops.MapMerge: "mapUpdate", ## need to implement this as a visitor node + compiler = DataFusionCompiler() diff --git a/ibis/backends/tests/test_map.py b/ibis/backends/tests/test_map.py index f3f961d386b6..d4604ef8763b 100644 --- a/ibis/backends/tests/test_map.py +++ b/ibis/backends/tests/test_map.py @@ -58,7 +58,6 @@ param(None, None, id="null_both"), ], ) -@mark_notyet_datafusion def test_map_nulls(con, k, v): k = ibis.literal(k, type="array") v = ibis.literal(v, type="array") @@ -79,7 +78,6 @@ def test_map_nulls(con, k, v): param(None, None, id="null_both"), ], ) -@mark_notyet_datafusion def test_map_keys_nulls(con, k, v): k = ibis.literal(k, type="array") v = ibis.literal(v, type="array") @@ -112,7 +110,6 @@ def test_map_keys_nulls(con, k, v): param(ibis.literal(None, type="map"), id="null_map"), ], ) -@mark_notyet_datafusion def test_map_values_nulls(con, map): assert con.execute(map.values()) is None @@ -181,7 +178,6 @@ def test_map_values_nulls(con, map): ], ) @pytest.mark.parametrize("method", ["get", "contains"]) -@mark_notyet_datafusion def test_map_get_contains_nulls(con, map, key, method): expr = getattr(map, method) assert con.execute(expr(key)) is None @@ -219,7 +215,6 @@ def test_map_merge_nulls(con, m1, m2): assert con.execute(concatted) is None -@mark_notyet_datafusion def test_map_table(backend): table = backend.map assert table.kv.type().is_map() @@ -227,7 +222,6 @@ def test_map_table(backend): @mark_notimpl_risingwave_hstore -@mark_notyet_datafusion def test_column_map_values(backend): table = backend.map expr = table.select("idx", vals=table.kv.values()).order_by("idx") @@ -254,7 +248,6 @@ def test_column_map_merge(backend): @mark_notimpl_risingwave_hstore -@mark_notyet_datafusion def test_literal_map_keys(con): mapping = ibis.literal({"1": "a", "2": "b"}) expr = mapping.keys().name("tmp") @@ -266,7 +259,6 @@ def test_literal_map_keys(con): @mark_notimpl_risingwave_hstore -@mark_notyet_datafusion def test_literal_map_values(con): mapping = ibis.literal({"1": "a", "2": "b"}) expr = mapping.values().name("tmp") @@ -277,7 +269,6 @@ def test_literal_map_values(con): @mark_notimpl_risingwave_hstore @mark_notyet_postgres -@mark_notyet_datafusion def test_scalar_isin_literal_map_keys(con): mapping = ibis.literal({"a": 1, "b": 2}) a = ibis.literal("a") @@ -290,7 +281,6 @@ def test_scalar_isin_literal_map_keys(con): @mark_notimpl_risingwave_hstore @mark_notyet_postgres -@mark_notyet_datafusion def test_map_scalar_contains_key_scalar(con): mapping = ibis.literal({"a": 1, "b": 2}) a = ibis.literal("a") @@ -302,7 +292,6 @@ def test_map_scalar_contains_key_scalar(con): @mark_notimpl_risingwave_hstore -@mark_notyet_datafusion def test_map_scalar_contains_key_column(backend, alltypes, df): value = {"1": "a", "3": "c"} mapping = ibis.literal(value) @@ -314,7 +303,6 @@ def test_map_scalar_contains_key_column(backend, alltypes, df): @mark_notimpl_risingwave_hstore @mark_notyet_postgres -@mark_notyet_datafusion def test_map_column_contains_key_scalar(backend, alltypes, df): expr = ibis.map(ibis.array([alltypes.string_col]), ibis.array([alltypes.int_col])) series = df.apply(lambda row: {row["string_col"]: row["int_col"]}, axis=1) @@ -327,7 +315,6 @@ def test_map_column_contains_key_scalar(backend, alltypes, df): @mark_notimpl_risingwave_hstore @mark_notyet_postgres -@mark_notyet_datafusion def test_map_column_contains_key_column(alltypes): map_expr = ibis.map( ibis.array([alltypes.string_col]), ibis.array([alltypes.int_col]) @@ -352,7 +339,6 @@ def test_literal_map_merge(con): @mark_notimpl_risingwave_hstore -@mark_notyet_datafusion def test_literal_map_getitem_broadcast(backend, alltypes, df): value = {"1": "a", "2": "b"} @@ -499,7 +485,6 @@ def test_literal_map_getitem_broadcast(backend, alltypes, df): @values @keys @mark_notimpl_risingwave_hstore -@mark_notyet_datafusion def test_map_get_all_types(con, keys, values): m = ibis.map(ibis.array(keys), ibis.array(values)) for key, val in zip(keys, values): @@ -510,7 +495,6 @@ def test_map_get_all_types(con, keys, values): @keys @mark_notimpl_risingwave_hstore -@mark_notyet_datafusion def test_map_contains_all_types(con, keys): a = ibis.array(keys) m = ibis.map(a, a) @@ -519,7 +503,6 @@ def test_map_contains_all_types(con, keys): @mark_notimpl_risingwave_hstore -@mark_notyet_datafusion def test_literal_map_get_broadcast(backend, alltypes, df): value = {"1": "a", "2": "b"} @@ -571,7 +554,6 @@ def test_map_construct_array_column(con, alltypes, df): @mark_notimpl_risingwave_hstore @mark_notyet_postgres -@mark_notyet_datafusion def test_map_get_with_compatible_value_smaller(con): value = ibis.literal({"A": 1000, "B": 2000}) expr = value.get("C", 3) @@ -580,7 +562,6 @@ def test_map_get_with_compatible_value_smaller(con): @mark_notimpl_risingwave_hstore @mark_notyet_postgres -@mark_notyet_datafusion def test_map_get_with_compatible_value_bigger(con): value = ibis.literal({"A": 1, "B": 2}) expr = value.get("C", 3000) @@ -589,7 +570,6 @@ def test_map_get_with_compatible_value_bigger(con): @mark_notimpl_risingwave_hstore @mark_notyet_postgres -@mark_notyet_datafusion def test_map_get_with_incompatible_value_different_kind(con): value = ibis.literal({"A": 1000, "B": 2000}) expr = value.get("C", 3.0) @@ -598,7 +578,6 @@ def test_map_get_with_incompatible_value_different_kind(con): @mark_notimpl_risingwave_hstore @mark_notyet_postgres -@mark_notyet_datafusion @pytest.mark.parametrize("null_value", [None, ibis.null()]) def test_map_get_with_null_on_not_nullable(con, null_value): map_type = dt.Map(dt.string, dt.Int16(nullable=False)) @@ -613,7 +592,6 @@ def test_map_get_with_null_on_not_nullable(con, null_value): ["flink"], raises=Py4JJavaError, reason="Flink cannot handle typeless nulls" ) @mark_notimpl_risingwave_hstore -@mark_notyet_datafusion def test_map_get_with_null_on_null_type_with_null(con, null_value): value = ibis.literal({"A": None, "B": None}) expr = value.get("C", null_value) @@ -626,7 +604,6 @@ def test_map_get_with_null_on_null_type_with_null(con, null_value): ) @mark_notimpl_risingwave_hstore @mark_notyet_postgres -@mark_notyet_datafusion def test_map_get_with_null_on_null_type_with_non_null(con): value = ibis.literal({"A": None, "B": None}) expr = value.get("C", 1) @@ -639,7 +616,6 @@ def test_map_get_with_null_on_null_type_with_non_null(con): reason="`tbl_properties` is required when creating table with schema", ) @mark_notimpl_risingwave_hstore -@mark_notyet_datafusion def test_map_create_table(con, temp_table): t = con.create_table( temp_table, @@ -654,13 +630,11 @@ def test_map_create_table(con, temp_table): reason="No translation rule for ", ) @mark_notimpl_risingwave_hstore -@mark_notyet_datafusion def test_map_length(con): expr = ibis.literal(dict(a="A", b="B")).length() assert con.execute(expr) == 2 -@mark_notyet_datafusion def test_map_keys_unnest(backend): expr = backend.map.kv.keys().unnest() result = expr.to_pandas() @@ -668,7 +642,6 @@ def test_map_keys_unnest(backend): @mark_notimpl_risingwave_hstore -@mark_notyet_datafusion def test_map_contains_null(con): expr = ibis.map(["a"], ibis.literal([None], type="array")) assert con.execute(expr.contains("a")) From 976e52017f175534ed258a2475f456c98c946661 Mon Sep 17 00:00:00 2001 From: Venkat Allam Date: Thu, 26 Dec 2024 03:10:00 -0600 Subject: [PATCH 3/6] merge upstream changes --- .devcontainer/Dockerfile | 4 ++ .devcontainer/devcontainer.json | 7 ++- .devcontainer/postCreate.sh | 6 ++- ibis/backends/sql/compilers/datafusion.py | 65 ++++++++++++++++------- 4 files changed, 60 insertions(+), 22 deletions(-) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index a94dbd05b551..06fb3e6e51d2 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -2,6 +2,10 @@ FROM mcr.microsoft.com/vscode/devcontainers/python:3.13 COPY --from=ghcr.io/astral-sh/uv:0.5.9 /uv /uvx /bin/ ARG USERNAME=vscode +RUN curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \ + install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \ + echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | sudo tee -a /etc/apt/sources.list.d/conda.list + RUN apt-get update && \ apt-get install -y --no-install-recommends libgdal-dev && \ rm -rf /var/lib/apt/lists/* diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index bb2c07c9d749..1ef372004f49 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -9,7 +9,12 @@ "openFiles": ["docs/tutorials/getting_started.qmd"] }, "vscode": { - "extensions": ["ms-toolsai.jupyter", "ms-python.python", "quarto.quarto"] + "extensions": [ + "ms-toolsai.jupyter", + "ms-python.python", + "quarto.quarto", + "charliermarsh.ruff" + ] } }, "features": { diff --git a/.devcontainer/postCreate.sh b/.devcontainer/postCreate.sh index 849d0d71da62..c7106b57a7d7 100755 --- a/.devcontainer/postCreate.sh +++ b/.devcontainer/postCreate.sh @@ -1,8 +1,10 @@ #!/bin/sh +source /opt/conda/etc/profile.d/conda.sh + # install ibis -python3 -m pip install ipython +python3 -m pip install ipyflow ipython # avoid using dynamic versioning by grabbing the version from pyproject.toml POETRY_DYNAMIC_VERSIONING_BYPASS="$(yq '.tool.poetry.version' pyproject.toml)" \ - python3 -m pip install -e '.[duckdb,clickhouse,examples,geospatial]' + python3 -m pip install -e '.[clickhouse,datafusion,duckdb,examples]' diff --git a/ibis/backends/sql/compilers/datafusion.py b/ibis/backends/sql/compilers/datafusion.py index 768e70738edd..afe9776bc4d8 100644 --- a/ibis/backends/sql/compilers/datafusion.py +++ b/ibis/backends/sql/compilers/datafusion.py @@ -43,7 +43,7 @@ class DataFusionCompiler(SQLGlotCompiler): ops.RowID, ops.Strftime, ops.TimeDelta, - ops.TimestampBucket, + # ops.TimestampBucket, ops.TimestampDelta, ops.TypeOf, ops.StringToDate, @@ -91,7 +91,8 @@ def visit_NonNullLiteral(self, op, *, value, dtype): if dtype.is_decimal(): return self.cast( sg.exp.convert(str(value)), - dt.Decimal(precision=dtype.precision or 38, scale=dtype.scale or 9), + dt.Decimal(precision=dtype.precision or 38, + scale=dtype.scale or 9), ) elif dtype.is_numeric(): if isinstance(value, float): @@ -131,7 +132,8 @@ def visit_Cast(self, op, *, arg, to): if to.is_interval(): unit = to.unit.name.lower() return sg.cast( - self.f.concat(self.cast(arg, dt.string), f" {unit}"), "interval" + self.f.concat(self.cast(arg, dt.string), + f" {unit}"), "interval" ) if to.is_timestamp(): return self._to_timestamp(arg, to) @@ -214,14 +216,16 @@ def visit_LPad(self, op, *, arg, length, pad): return self.if_( length <= self.f.length(arg), arg, - self.f.concat(self.f.repeat(pad, length - self.f.length(arg)), arg), + self.f.concat(self.f.repeat( + pad, length - self.f.length(arg)), arg), ) def visit_RPad(self, op, *, arg, length, pad): return self.if_( length <= self.f.length(arg), arg, - self.f.concat(arg, self.f.repeat(pad, length - self.f.length(arg))), + self.f.concat(arg, self.f.repeat( + pad, length - self.f.length(arg))), ) def visit_ExtractFragment(self, op, *, arg): @@ -351,7 +355,8 @@ def visit_ArrayCollect(self, op, *, arg, where, order_by, include_null, distinct ) if not include_null: cond = arg.is_(sg.not_(NULL, copy=False)) - where = cond if where is None else sge.And(this=cond, expression=where) + where = cond if where is None else sge.And( + this=cond, expression=where) return self.agg.array_agg(arg, where=where, order_by=order_by) def visit_Covariance(self, op, *, left, right, how, where): @@ -407,11 +412,14 @@ def visit_TimestampFromUNIX(self, op, *, arg, unit): def visit_DateFromYMD(self, op, *, year, month, day): return self.cast( self.f.concat( - self.f.lpad(self.cast(self.cast(year, dt.int64), dt.string), 4, "0"), + self.f.lpad( + self.cast(self.cast(year, dt.int64), dt.string), 4, "0"), "-", - self.f.lpad(self.cast(self.cast(month, dt.int64), dt.string), 2, "0"), + self.f.lpad( + self.cast(self.cast(month, dt.int64), dt.string), 2, "0"), "-", - self.f.lpad(self.cast(self.cast(day, dt.int64), dt.string), 2, "0"), + self.f.lpad( + self.cast(self.cast(day, dt.int64), dt.string), 2, "0"), ), dt.date, ) @@ -421,17 +429,23 @@ def visit_TimestampFromYMDHMS( ): return self.f.to_timestamp_micros( self.f.concat( - self.f.lpad(self.cast(self.cast(year, dt.int64), dt.string), 4, "0"), + self.f.lpad( + self.cast(self.cast(year, dt.int64), dt.string), 4, "0"), "-", - self.f.lpad(self.cast(self.cast(month, dt.int64), dt.string), 2, "0"), + self.f.lpad( + self.cast(self.cast(month, dt.int64), dt.string), 2, "0"), "-", - self.f.lpad(self.cast(self.cast(day, dt.int64), dt.string), 2, "0"), + self.f.lpad( + self.cast(self.cast(day, dt.int64), dt.string), 2, "0"), "T", - self.f.lpad(self.cast(self.cast(hours, dt.int64), dt.string), 2, "0"), + self.f.lpad( + self.cast(self.cast(hours, dt.int64), dt.string), 2, "0"), ":", - self.f.lpad(self.cast(self.cast(minutes, dt.int64), dt.string), 2, "0"), + self.f.lpad( + self.cast(self.cast(minutes, dt.int64), dt.string), 2, "0"), ":", - self.f.lpad(self.cast(self.cast(seconds, dt.int64), dt.string), 2, "0"), + self.f.lpad( + self.cast(self.cast(seconds, dt.int64), dt.string), 2, "0"), ".000000Z", ) ) @@ -445,19 +459,22 @@ def visit_ArrayIndex(self, op, *, arg, index): def visit_StringConcat(self, op, *, arg): any_args_null = (a.is_(NULL) for a in arg) return self.if_( - sg.or_(*any_args_null), self.cast(NULL, dt.string), self.f.concat(*arg) + sg.or_(*any_args_null), self.cast(NULL, + dt.string), self.f.concat(*arg) ) def visit_First(self, op, *, arg, where, order_by, include_null): if not include_null: cond = arg.is_(sg.not_(NULL, copy=False)) - where = cond if where is None else sge.And(this=cond, expression=where) + where = cond if where is None else sge.And( + this=cond, expression=where) return self.agg.first_value(arg, where=where, order_by=order_by) def visit_Last(self, op, *, arg, where, order_by, include_null): if not include_null: cond = arg.is_(sg.not_(NULL, copy=False)) - where = cond if where is None else sge.And(this=cond, expression=where) + where = cond if where is None else sge.And( + this=cond, expression=where) return self.agg.last_value(arg, where=where, order_by=order_by) def visit_ArgMin(self, op, *, arg, key, where): @@ -500,7 +517,8 @@ def visit_Aggregate(self, op, *, parent, groups, metrics): # datafusion lower cases all column names internally unless quoted so # quoted=True is required here for correctness by_names_quoted = tuple( - sg.column(key, table=getattr(value, "table", None), quoted=quoted) + sg.column(key, table=getattr( + value, "table", None), quoted=quoted) for key, value in groups.items() ) selections = by_names_quoted + metrics @@ -564,5 +582,14 @@ def visit_MapContains(self, op, *, arg, key): # ops.MapMerge: "mapUpdate", ## need to implement this as a visitor node + def visit_TimestampBucket(self, op, *, arg, interval, offset): + # https://datafusion.apache.org/user-guide/sql/scalar_functions.html#date-bin + origin = self.f.cast( + "1970-01-01T00:00:00Z", self.type_mapper.from_ibis(dt.timestamp) + ) + if offset is not None: + origin += offset + return self.f.date_bin(interval, arg, origin) + compiler = DataFusionCompiler() From 40a4937c2550390c30d05c5a7eb093ecd62b740e Mon Sep 17 00:00:00 2001 From: Venkat Allam Date: Thu, 26 Dec 2024 03:18:58 -0600 Subject: [PATCH 4/6] revert changes to .devcontainer config --- .devcontainer/Dockerfile | 4 ---- .devcontainer/devcontainer.json | 11 ++++++++--- .devcontainer/postCreate.sh | 4 ++-- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 267144b82af3..dcd6098b071a 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -2,10 +2,6 @@ FROM mcr.microsoft.com/vscode/devcontainers/python:3.13 COPY --from=ghcr.io/astral-sh/uv:0.5.11 /uv /uvx /bin/ ARG USERNAME=vscode -RUN curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \ - install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \ - echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | sudo tee -a /etc/apt/sources.list.d/conda.list - RUN apt-get update && \ apt-get install -y --no-install-recommends libgdal-dev && \ rm -rf /var/lib/apt/lists/* diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 1ef372004f49..7b7d11ff2d06 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -1,12 +1,17 @@ { - "build": { "dockerfile": "Dockerfile", "context": ".." }, + "build": { + "dockerfile": "Dockerfile", + "context": ".." + }, "containerUser": "vscode", "remoteUser": "vscode", "postStartCommand": "git config --global --add safe.directory ${containerWorkspaceFolder}", "workspaceFolder": "/app", "customizations": { "codespaces": { - "openFiles": ["docs/tutorials/getting_started.qmd"] + "openFiles": [ + "docs/tutorials/getting_started.qmd" + ] }, "vscode": { "extensions": [ @@ -29,4 +34,4 @@ "yqVersion": "latest" } } -} +} \ No newline at end of file diff --git a/.devcontainer/postCreate.sh b/.devcontainer/postCreate.sh index c7106b57a7d7..4e1654f480b5 100755 --- a/.devcontainer/postCreate.sh +++ b/.devcontainer/postCreate.sh @@ -3,8 +3,8 @@ source /opt/conda/etc/profile.d/conda.sh # install ibis -python3 -m pip install ipyflow ipython +python3 -m pip install ipython # avoid using dynamic versioning by grabbing the version from pyproject.toml POETRY_DYNAMIC_VERSIONING_BYPASS="$(yq '.tool.poetry.version' pyproject.toml)" \ - python3 -m pip install -e '.[clickhouse,datafusion,duckdb,examples]' + python3 -m pip install -e '.[duckdb,clickhouse,examples,geospatial,datafusion]' From dcb63ac0a9e837e0b949664bd5faac03fca8c7ae Mon Sep 17 00:00:00 2001 From: Venkat Allam Date: Thu, 26 Dec 2024 03:38:03 -0600 Subject: [PATCH 5/6] revert changes to devcontainer --- .devcontainer/devcontainer.json | 3 +-- .devcontainer/postCreate.sh | 2 -- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 7b7d11ff2d06..a7582519e803 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -17,8 +17,7 @@ "extensions": [ "ms-toolsai.jupyter", "ms-python.python", - "quarto.quarto", - "charliermarsh.ruff" + "quarto.quarto" ] } }, diff --git a/.devcontainer/postCreate.sh b/.devcontainer/postCreate.sh index 4e1654f480b5..b3526cb1742d 100755 --- a/.devcontainer/postCreate.sh +++ b/.devcontainer/postCreate.sh @@ -1,7 +1,5 @@ #!/bin/sh -source /opt/conda/etc/profile.d/conda.sh - # install ibis python3 -m pip install ipython From 759a4c48c01d4c72d4198a751450211f56f862b7 Mon Sep 17 00:00:00 2001 From: Venkat Allam Date: Thu, 26 Dec 2024 09:53:37 +0000 Subject: [PATCH 6/6] reformat datafusion file properly --- ibis/backends/sql/compilers/datafusion.py | 54 ++++++++--------------- 1 file changed, 18 insertions(+), 36 deletions(-) diff --git a/ibis/backends/sql/compilers/datafusion.py b/ibis/backends/sql/compilers/datafusion.py index afe9776bc4d8..86a730eb4a1f 100644 --- a/ibis/backends/sql/compilers/datafusion.py +++ b/ibis/backends/sql/compilers/datafusion.py @@ -91,8 +91,7 @@ def visit_NonNullLiteral(self, op, *, value, dtype): if dtype.is_decimal(): return self.cast( sg.exp.convert(str(value)), - dt.Decimal(precision=dtype.precision or 38, - scale=dtype.scale or 9), + dt.Decimal(precision=dtype.precision or 38, scale=dtype.scale or 9), ) elif dtype.is_numeric(): if isinstance(value, float): @@ -132,8 +131,7 @@ def visit_Cast(self, op, *, arg, to): if to.is_interval(): unit = to.unit.name.lower() return sg.cast( - self.f.concat(self.cast(arg, dt.string), - f" {unit}"), "interval" + self.f.concat(self.cast(arg, dt.string), f" {unit}"), "interval" ) if to.is_timestamp(): return self._to_timestamp(arg, to) @@ -216,16 +214,14 @@ def visit_LPad(self, op, *, arg, length, pad): return self.if_( length <= self.f.length(arg), arg, - self.f.concat(self.f.repeat( - pad, length - self.f.length(arg)), arg), + self.f.concat(self.f.repeat(pad, length - self.f.length(arg)), arg), ) def visit_RPad(self, op, *, arg, length, pad): return self.if_( length <= self.f.length(arg), arg, - self.f.concat(arg, self.f.repeat( - pad, length - self.f.length(arg))), + self.f.concat(arg, self.f.repeat(pad, length - self.f.length(arg))), ) def visit_ExtractFragment(self, op, *, arg): @@ -355,8 +351,7 @@ def visit_ArrayCollect(self, op, *, arg, where, order_by, include_null, distinct ) if not include_null: cond = arg.is_(sg.not_(NULL, copy=False)) - where = cond if where is None else sge.And( - this=cond, expression=where) + where = cond if where is None else sge.And(this=cond, expression=where) return self.agg.array_agg(arg, where=where, order_by=order_by) def visit_Covariance(self, op, *, left, right, how, where): @@ -412,14 +407,11 @@ def visit_TimestampFromUNIX(self, op, *, arg, unit): def visit_DateFromYMD(self, op, *, year, month, day): return self.cast( self.f.concat( - self.f.lpad( - self.cast(self.cast(year, dt.int64), dt.string), 4, "0"), + self.f.lpad(self.cast(self.cast(year, dt.int64), dt.string), 4, "0"), "-", - self.f.lpad( - self.cast(self.cast(month, dt.int64), dt.string), 2, "0"), + self.f.lpad(self.cast(self.cast(month, dt.int64), dt.string), 2, "0"), "-", - self.f.lpad( - self.cast(self.cast(day, dt.int64), dt.string), 2, "0"), + self.f.lpad(self.cast(self.cast(day, dt.int64), dt.string), 2, "0"), ), dt.date, ) @@ -429,23 +421,17 @@ def visit_TimestampFromYMDHMS( ): return self.f.to_timestamp_micros( self.f.concat( - self.f.lpad( - self.cast(self.cast(year, dt.int64), dt.string), 4, "0"), + self.f.lpad(self.cast(self.cast(year, dt.int64), dt.string), 4, "0"), "-", - self.f.lpad( - self.cast(self.cast(month, dt.int64), dt.string), 2, "0"), + self.f.lpad(self.cast(self.cast(month, dt.int64), dt.string), 2, "0"), "-", - self.f.lpad( - self.cast(self.cast(day, dt.int64), dt.string), 2, "0"), + self.f.lpad(self.cast(self.cast(day, dt.int64), dt.string), 2, "0"), "T", - self.f.lpad( - self.cast(self.cast(hours, dt.int64), dt.string), 2, "0"), + self.f.lpad(self.cast(self.cast(hours, dt.int64), dt.string), 2, "0"), ":", - self.f.lpad( - self.cast(self.cast(minutes, dt.int64), dt.string), 2, "0"), + self.f.lpad(self.cast(self.cast(minutes, dt.int64), dt.string), 2, "0"), ":", - self.f.lpad( - self.cast(self.cast(seconds, dt.int64), dt.string), 2, "0"), + self.f.lpad(self.cast(self.cast(seconds, dt.int64), dt.string), 2, "0"), ".000000Z", ) ) @@ -459,22 +445,19 @@ def visit_ArrayIndex(self, op, *, arg, index): def visit_StringConcat(self, op, *, arg): any_args_null = (a.is_(NULL) for a in arg) return self.if_( - sg.or_(*any_args_null), self.cast(NULL, - dt.string), self.f.concat(*arg) + sg.or_(*any_args_null), self.cast(NULL, dt.string), self.f.concat(*arg) ) def visit_First(self, op, *, arg, where, order_by, include_null): if not include_null: cond = arg.is_(sg.not_(NULL, copy=False)) - where = cond if where is None else sge.And( - this=cond, expression=where) + where = cond if where is None else sge.And(this=cond, expression=where) return self.agg.first_value(arg, where=where, order_by=order_by) def visit_Last(self, op, *, arg, where, order_by, include_null): if not include_null: cond = arg.is_(sg.not_(NULL, copy=False)) - where = cond if where is None else sge.And( - this=cond, expression=where) + where = cond if where is None else sge.And(this=cond, expression=where) return self.agg.last_value(arg, where=where, order_by=order_by) def visit_ArgMin(self, op, *, arg, key, where): @@ -517,8 +500,7 @@ def visit_Aggregate(self, op, *, parent, groups, metrics): # datafusion lower cases all column names internally unless quoted so # quoted=True is required here for correctness by_names_quoted = tuple( - sg.column(key, table=getattr( - value, "table", None), quoted=quoted) + sg.column(key, table=getattr(value, "table", None), quoted=quoted) for key, value in groups.items() ) selections = by_names_quoted + metrics