Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[10.0] fix: return pd.Timestamp or pd.Series[datetime64] for date.to_pandas() #8784

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,7 @@ services:
- druid

oracle:
platform: linux/amd64
image: gvenzl/oracle-free:23.4-slim
environment:
ORACLE_PASSWORD: ibis
Expand Down
8 changes: 8 additions & 0 deletions docs/contribute/02_workflow.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -251,3 +251,11 @@ you are going only up).
```bash
$ colima delete
```

### `x86_64` or `amd64` based containers

When starting containers that are based on `x86_64` / `amd64`, the architecture flag needs to be set in two places:
1. Add `platform: linux/amd64` for the service in `compose.yaml`.
2. Set the `--arch` flag when starting the VM: `colima start --arch x86_64`

For instance, this step is necessary for the `oracle` service in `compose.yaml`. Otherwise, the container will fail shortly after it starts.
2 changes: 1 addition & 1 deletion ibis/backends/dask/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def convert_Date(cls, s, dtype, pandas_type):
else:
s = dd.to_datetime(s)

return s.dt.normalize()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we might want to keep this, but I'm not 100% sure. In theory, once you're in this function you shouldn't need to normalize ... in theory :)

return s

@classmethod
def convert_String(cls, s, dtype, pandas_type):
Expand Down
13 changes: 6 additions & 7 deletions ibis/backends/dask/tests/test_cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,13 +146,12 @@ def test_timestamp_with_timezone_is_inferred_correctly(t):
def test_cast_date(t, df, column):
expr = t[column].cast("date")
result = expr.execute()
expected = (
df[column]
.dt.normalize()
.map(lambda x: x.date())
.compute()
.rename(expr.get_name())
)

expected = df[column].compute().rename(expr.get_name())
if expected.dt.tz:
expected = expected.dt.tz_convert("UTC")
expected = expected.dt.tz_localize(None).astype(result.dtype)

tm.assert_series_equal(result, expected, check_index=False)


Expand Down
10 changes: 7 additions & 3 deletions ibis/backends/dask/tests/test_temporal.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,12 @@ def test_timestamp_functions(con, case_func, expected_func):
def test_cast_datetime_strings_to_date(t, df, column):
expr = t[column].cast("date")
result = expr.execute()

df_computed = df.compute()
expected = pd.to_datetime(df_computed[column]).map(lambda x: x.date())
expected = pd.to_datetime(df_computed[column])
if expected.dt.tz:
expected = expected.dt.tz_convert("UTC")
expected = expected.dt.tz_localize(None).astype(result.dtype)

tm.assert_series_equal(
result.reset_index(drop=True).rename("tmp"),
Expand Down Expand Up @@ -114,10 +118,10 @@ def test_cast_integer_to_date(t, pandas_df):
expr = t.plain_int64.cast("date")
result = expr.execute()
expected = pd.Series(
pd.to_datetime(pandas_df.plain_int64.values, unit="D").date,
pd.to_datetime(pandas_df.plain_int64.values, unit="D"),
index=pandas_df.index,
name="plain_int64",
)
).astype(result.dtype)
tm.assert_series_equal(result, expected, check_names=False)


Expand Down
8 changes: 4 additions & 4 deletions ibis/backends/oracle/converter.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@
from __future__ import annotations

import datetime
import pandas as pd

from ibis.formats.pandas import PandasData


class OraclePandasData(PandasData):
@classmethod
def convert_Timestamp_element(cls, dtype):
return datetime.datetime.fromisoformat
return pd.Timestamp.fromisoformat

@classmethod
def convert_Date_element(cls, dtype):
return datetime.date.fromisoformat
return pd.Timestamp.fromisoformat
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is sort of up for grabs given that pandas doesn't have a standard way to represent an array of dates (the _element suffix implies [perhaps not in an obvious way] that this function is being called once per element of an array).

I think it's fine to also change this to using pandas timestamps.


@classmethod
def convert_Time_element(cls, dtype):
return datetime.time.fromisoformat
return pd.Timestamp.fromisoformat
2 changes: 1 addition & 1 deletion ibis/backends/pandas/tests/test_cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def test_timestamp_with_timezone_is_inferred_correctly(t, df):
def test_cast_date(t, df, column):
expr = t[column].cast("date")
result = expr.execute()
expected = df[column].dt.normalize().dt.tz_localize(None).dt.date
expected = df[column].dt.normalize().dt.tz_localize(None).astype(result.dtype)
tm.assert_series_equal(result, expected)


Expand Down
7 changes: 5 additions & 2 deletions ibis/backends/pandas/tests/test_temporal.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,10 @@ def test_timestamp_functions(case_func, expected_func):
def test_cast_datetime_strings_to_date(t, df, column):
expr = t[column].cast("date")
result = expr.execute()
expected = pd.to_datetime(df[column]).dt.normalize().dt.tz_localize(None).dt.date
expected = pd.to_datetime(df[column]).dt.normalize()
if expected.dt.tz:
expected = expected.dt.tz_convert("UTC")
expected = expected.dt.tz_localize(None).astype(result.dtype)
tm.assert_series_equal(result, expected)


Expand Down Expand Up @@ -104,7 +107,7 @@ def test_cast_integer_to_date(t, df):
expr = t.plain_int64.cast("date")
result = expr.execute()
expected = pd.Series(
pd.to_datetime(df.plain_int64.values, unit="D").date,
pd.to_datetime(df.plain_int64.values, unit="D").astype(result.dtype),
index=df.index,
name="plain_int64",
)
Expand Down
8 changes: 4 additions & 4 deletions ibis/backends/snowflake/converter.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from __future__ import annotations

import datetime
import json
from typing import TYPE_CHECKING

import pandas as pd
import pyarrow as pa

from ibis.formats.pandas import PandasData
Expand Down Expand Up @@ -52,15 +52,15 @@ def __arrow_ext_scalar_class__(self):
class SnowflakePandasData(PandasData):
@classmethod
def convert_Timestamp_element(cls, dtype):
return datetime.datetime.fromisoformat
return pd.Timestamp.fromisoformat

@classmethod
def convert_Date_element(cls, dtype):
return datetime.date.fromisoformat
return pd.Timestamp.fromisoformat

@classmethod
def convert_Time_element(cls, dtype):
return datetime.time.fromisoformat
return pd.Timestamp.fromisoformat

@classmethod
def convert_JSON(cls, s, dtype, pandas_type):
Expand Down
2 changes: 2 additions & 0 deletions ibis/backends/sqlite/tests/test_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,8 @@ def test_type_map(db):
sol = pd.DataFrame(
{"str_col": ["a"], "date_col": pd.Series([date(2022, 1, 1)], dtype="object")}
)
sol["date_col"] = sol["date_col"].astype(res["date_col"].dtype)

assert res.equals(sol)


Expand Down
2 changes: 1 addition & 1 deletion ibis/backends/tests/test_aggregation.py
Original file line number Diff line number Diff line change
Expand Up @@ -1172,7 +1172,7 @@ def test_string_quantile(alltypes, func):
)
def test_date_quantile(alltypes, func):
expr = func(alltypes.timestamp_col.date())
result = expr.execute()
result = expr.execute().to_pydatetime().date()
assert result == date(2009, 12, 31)


Expand Down
56 changes: 33 additions & 23 deletions ibis/backends/tests/test_temporal.py
Original file line number Diff line number Diff line change
Expand Up @@ -729,9 +729,7 @@ def convert_to_offset(x):
"ignore", category=(UserWarning, pd.errors.PerformanceWarning)
)
expected = (
pd.to_datetime(df.date_string_col)
.add(offset)
.map(lambda ts: ts.normalize().date(), na_action="ignore")
pd.to_datetime(df.date_string_col).add(offset).astype("datetime64[s]")
)

expected = backend.default_series_rename(expected)
Expand Down Expand Up @@ -817,12 +815,7 @@ def convert_to_offset(x):
),
param(
lambda t, _: t.timestamp_col.date() + ibis.interval(days=4),
lambda t, _: (
t.timestamp_col.dt.floor("d")
.add(pd.Timedelta(days=4))
.dt.normalize()
.dt.date
),
lambda t, _: t.timestamp_col.dt.floor("d").add(pd.Timedelta(days=4)),
id="date-add-interval",
marks=[
pytest.mark.notimpl(
Expand All @@ -831,16 +824,19 @@ def convert_to_offset(x):
reason="'StringColumn' object has no attribute 'date'",
),
pytest.mark.notimpl(["exasol"], raises=com.OperationNotDefinedError),
pytest.mark.broken(
["oracle"],
raises=AssertionError,
reason=(
"Oracle includes hour:min:sec in the result for "
"CAST(t0.timestamp_col AS DATE), while other backends don't."
),
),
],
),
param(
lambda t, _: t.timestamp_col.date() - ibis.interval(days=14),
lambda t, _: (
t.timestamp_col.dt.floor("d")
.sub(pd.Timedelta(days=14))
.dt.normalize()
.dt.date
),
lambda t, _: t.timestamp_col.dt.floor("d").sub(pd.Timedelta(days=14)),
id="date-subtract-interval",
marks=[
pytest.mark.notimpl(
Expand All @@ -849,6 +845,14 @@ def convert_to_offset(x):
reason="'StringColumn' object has no attribute 'date'",
),
pytest.mark.notimpl(["exasol"], raises=com.OperationNotDefinedError),
pytest.mark.broken(
["oracle"],
raises=AssertionError,
reason=(
"Oracle includes hour:min:sec in the result for "
"CAST(t0.timestamp_col AS DATE), while other backends don't."
),
),
],
),
param(
Expand Down Expand Up @@ -1253,19 +1257,28 @@ def test_interval_add_cast_scalar(backend, alltypes):
reason="'StringColumn' object has no attribute 'date'",
)
@pytest.mark.broken(["flink"], raises=AssertionError, reason="incorrect results")
@pytest.mark.broken(
["oracle"],
raises=AssertionError,
reason=(
"Oracle includes hour:min:sec in the result for "
"CAST(t0.timestamp_col AS DATE), while other backends don't."
),
)
def test_interval_add_cast_column(backend, alltypes, df):
timestamp_date = alltypes.timestamp_col.date()
delta = alltypes.bigint_col.cast("interval('D')")
expr = alltypes["id", (timestamp_date + delta).name("tmp")]
result = expr.execute().sort_values("id").reset_index().tmp

df = df.sort_values("id").reset_index(drop=True)
expected = (
df["timestamp_col"]
.dt.normalize()
.add(df.bigint_col.astype("timedelta64[D]"))
.rename("tmp")
.dt.date
)

backend.assert_series_equal(result, expected.astype(result.dtype))


Expand Down Expand Up @@ -2538,7 +2551,7 @@ def test_time_literal_sql(dialect, snapshot, micros):
),
pytest.mark.notyet(["datafusion"], raises=Exception),
pytest.mark.broken(
["pandas", "dask"],
["dask", "pandas", "pyspark"],
condition=is_older_than("pandas", "2.0.0"),
raises=ValueError,
reason="Out of bounds nanosecond timestamp: 9999-01-02 00:00:00",
Expand All @@ -2557,7 +2570,7 @@ def test_time_literal_sql(dialect, snapshot, micros):
),
pytest.mark.notyet(["datafusion"], raises=Exception),
pytest.mark.broken(
["pandas", "dask"],
["dask", "pandas", "pyspark"],
condition=is_older_than("pandas", "2.0.0"),
raises=ValueError,
reason="Out of bounds nanosecond timestamp: 1-07-17 00:00:00",
Expand All @@ -2580,10 +2593,7 @@ def test_time_literal_sql(dialect, snapshot, micros):
)
def test_date_scalar(con, value, func):
expr = ibis.date(func(value)).name("tmp")

result = con.execute(expr)

assert not isinstance(result, datetime.datetime)
assert isinstance(result, datetime.date)

assert result == datetime.date.fromisoformat(value)
assert isinstance(result, pd.Timestamp)
assert result == pd.Timestamp.fromisoformat(value)
12 changes: 6 additions & 6 deletions ibis/expr/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -937,16 +937,16 @@ def date(value_or_year, month=None, day=None, /):
Create a date scalar from a string

>>> ibis.date("2023-01-02")
┌───────────────────────────┐
datetime.date(2023, 1, 2) │
└───────────────────────────┘
┌──────────────────────────────────
Timestamp('2023-01-02 00:00:00') │
└──────────────────────────────────

Create a date scalar from year, month, and day

>>> ibis.date(2023, 1, 2)
┌───────────────────────────┐
datetime.date(2023, 1, 2) │
└───────────────────────────┘
┌──────────────────────────────────
Timestamp('2023-01-02 00:00:00') │
└──────────────────────────────────

Create a date column from year, month, and day

Expand Down
13 changes: 8 additions & 5 deletions ibis/formats/pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,17 +224,20 @@ def convert_Timestamp(cls, s, dtype, pandas_type):
def convert_Date(cls, s, dtype, pandas_type):
if isinstance(s.dtype, pd.DatetimeTZDtype):
s = s.dt.tz_convert("UTC").dt.tz_localize(None)

try:
return s.astype(pandas_type).dt.date
return s.astype(pandas_type)
except (ValueError, TypeError, pd._libs.tslibs.OutOfBoundsDatetime):

def try_date(v):
if isinstance(v, datetime.datetime):
return v.date()
if isinstance(v, datetime.date):
return pd.Timestamp(v)
elif isinstance(v, str):
if v.endswith("Z"):
return datetime.datetime.fromisoformat(v[:-1]).date()
return datetime.date.fromisoformat(v)
datetime_obj = datetime.datetime.fromisoformat(v[:-1])
else:
datetime_obj = datetime.datetime.fromisoformat(v)
return pd.Timestamp(datetime_obj)
else:
return v

Expand Down
Loading