From 44f81067aa961eb1a6191c09007548dfd09cc1cc Mon Sep 17 00:00:00 2001 From: Dongdong Tian Date: Tue, 3 Dec 2024 21:20:17 +0800 Subject: [PATCH 01/11] Deal with timezone-aware datetime dtypes --- pygmt/clib/conversion.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pygmt/clib/conversion.py b/pygmt/clib/conversion.py index 5a1d1cf51b9..d907e1ab80b 100644 --- a/pygmt/clib/conversion.py +++ b/pygmt/clib/conversion.py @@ -193,6 +193,13 @@ def _to_numpy(data: Any) -> np.ndarray: numpy_dtype = np.float64 data = data.to_numpy(na_value=np.nan) + # Deal with timezone-aware datetime dtypes. + if getattr(dtype, "tz", None): # pandas.DatetimeTZDtype + numpy_dtype = getattr(dtype, "base", None) + elif getattr(getattr(dtype, "pyarrow_dtype", None), "tz", None): + # pd.ArrayDtype[pa.Timestamp] + numpy_dtype = getattr(dtype, "numpy_dtype", None) + array = np.ascontiguousarray(data, dtype=numpy_dtype) # Check if a np.object_ array can be converted to np.str_. From 586799945ab7d9af04d7e45ed97a8f7f799c9ff5 Mon Sep 17 00:00:00 2001 From: Dongdong Tian Date: Tue, 3 Dec 2024 21:20:46 +0800 Subject: [PATCH 02/11] Add tests for pandas datetime dtypes --- pygmt/tests/test_clib_to_numpy.py | 86 +++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/pygmt/tests/test_clib_to_numpy.py b/pygmt/tests/test_clib_to_numpy.py index 29fc50826ab..e7063f6474e 100644 --- a/pygmt/tests/test_clib_to_numpy.py +++ b/pygmt/tests/test_clib_to_numpy.py @@ -331,6 +331,92 @@ def test_to_numpy_pandas_date(dtype, expected_dtype): ) +@pytest.mark.parametrize( + ("dtype", "expected_dtype"), + [ + # NumPy datetime64 types. Only unit 's'/'ms'/'us'/'ns' are supported. + pytest.param("datetime64[s]", "datetime64[s]", id="datetime64[s]"), + pytest.param("datetime64[ms]", "datetime64[ms]", id="datetime64[ms]"), + pytest.param("datetime64[us]", "datetime64[us]", id="datetime64[us]"), + pytest.param("datetime64[ns]", "datetime64[ns]", id="datetime64[ns]"), + # pandas.DatetimeTZDtype can be given in two ways [tz is required]: + # 1. pandas.DatetimeTZDtype(unit, tz) + # 2. String aliases: "datetime64[unit, tz]" + pytest.param("datetime64[s, UTC]", "datetime64[s]", id="datetime64[s, tz=UTC]"), + pytest.param( + "datetime64[s, America/New_York]", + "datetime64[s]", + id="datetime64[s, tz=America/New_York]", + ), + pytest.param( + "datetime64[s, +07:30]", "datetime64[s]", id="datetime64[s, +07:30]" + ), + # PyArrow timestamp types can be given in two ways [tz is optional]: + # 1. pd.ArrowDtype(pyarrow.Timestamp(unit, tz=tz)) + # 2. String aliases: "timestamp[unit, tz][pyarrow]" + pytest.param( + "timestamp[s][pyarrow]", + "datetime64[s]", + id="timestamp[s][pyarrow]", + marks=skip_if_no(package="pyarrow"), + ), + pytest.param( + "timestamp[ms][pyarrow]", + "datetime64[ms]", + id="timestamp[ms][pyarrow]", + marks=skip_if_no(package="pyarrow"), + ), + pytest.param( + "timestamp[us][pyarrow]", + "datetime64[us]", + id="timestamp[us][pyarrow]", + marks=skip_if_no(package="pyarrow"), + ), + pytest.param( + "timestamp[ns][pyarrow]", + "datetime64[ns]", + id="timestamp[ns][pyarrow]", + marks=skip_if_no(package="pyarrow"), + ), + pytest.param( + "timestamp[s, UTC][pyarrow]", + "datetime64[s]", + id="timestamp[s, UTC][pyarrow]", + marks=skip_if_no(package="pyarrow"), + ), + pytest.param( + "timestamp[s, America/New_York][pyarrow]", + "datetime64[s]", + id="timestamp[s, America/New_York][pyarrow]", + marks=skip_if_no(package="pyarrow"), + ), + pytest.param( + "timestamp[s, +08:00][pyarrow]", + "datetime64[s]", + id="timestamp[s, +08:00][pyarrow]", + marks=skip_if_no(package="pyarrow"), + ), + ], +) +def test_to_numpy_pandas_datetime(dtype, expected_dtype): + """ + Test the _to_numpy function with pandas.Series of datetime types. + """ + series = pd.Series( + [pd.Timestamp("2024-01-02T03:04:05"), pd.Timestamp("2024-01-02T03:04:06")], + dtype=dtype, + ) + result = _to_numpy(series) + _check_result(result, np.datetime64) + assert result.dtype == expected_dtype + + if "," in str(dtype): # A hacky solution to decide if the dtype is timezone-aware. + series = series.dt.tz_convert("UTC") # Convert to UTC if timezone-aware. + expected_series = series.dt.strftime("%Y-%m-%dT%H:%M:%S").to_list() + + npt.assert_array_equal(result, np.array(expected_series, dtype=expected_dtype)) + + ######################################################################################## # Test the _to_numpy function with PyArrow arrays. # From 56a266dcf2b6fb06a2794f9ae7f82f2f042ab8c1 Mon Sep 17 00:00:00 2001 From: Dongdong Tian Date: Thu, 12 Dec 2024 15:42:58 +0800 Subject: [PATCH 03/11] Add workaround for pandas 2.0 --- pygmt/clib/conversion.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pygmt/clib/conversion.py b/pygmt/clib/conversion.py index d907e1ab80b..e39a131f8fd 100644 --- a/pygmt/clib/conversion.py +++ b/pygmt/clib/conversion.py @@ -195,7 +195,12 @@ def _to_numpy(data: Any) -> np.ndarray: # Deal with timezone-aware datetime dtypes. if getattr(dtype, "tz", None): # pandas.DatetimeTZDtype - numpy_dtype = getattr(dtype, "base", None) + if Version(pd.__version__) < Version("2.1"): + # Workaround for bug https://github.com/pandas-dev/pandas/issues/52705. + # Solution from https://github.com/pandas-dev/pandas/pull/52706. + numpy_dtype = np.dtype(f"M8[{dtype.unit}]") + else: + numpy_dtype = getattr(dtype, "base", None) elif getattr(getattr(dtype, "pyarrow_dtype", None), "tz", None): # pd.ArrayDtype[pa.Timestamp] numpy_dtype = getattr(dtype, "numpy_dtype", None) From a47e9eecf469a8eae46ae1a1601415c62afb092f Mon Sep 17 00:00:00 2001 From: Dongdong Tian Date: Thu, 12 Dec 2024 16:39:23 +0800 Subject: [PATCH 04/11] Revert "Add workaround for pandas 2.0" This reverts commit 56a266dcf2b6fb06a2794f9ae7f82f2f042ab8c1. --- pygmt/clib/conversion.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pygmt/clib/conversion.py b/pygmt/clib/conversion.py index e39a131f8fd..d907e1ab80b 100644 --- a/pygmt/clib/conversion.py +++ b/pygmt/clib/conversion.py @@ -195,12 +195,7 @@ def _to_numpy(data: Any) -> np.ndarray: # Deal with timezone-aware datetime dtypes. if getattr(dtype, "tz", None): # pandas.DatetimeTZDtype - if Version(pd.__version__) < Version("2.1"): - # Workaround for bug https://github.com/pandas-dev/pandas/issues/52705. - # Solution from https://github.com/pandas-dev/pandas/pull/52706. - numpy_dtype = np.dtype(f"M8[{dtype.unit}]") - else: - numpy_dtype = getattr(dtype, "base", None) + numpy_dtype = getattr(dtype, "base", None) elif getattr(getattr(dtype, "pyarrow_dtype", None), "tz", None): # pd.ArrayDtype[pa.Timestamp] numpy_dtype = getattr(dtype, "numpy_dtype", None) From b7047c0a81685fd46ca998f85b04e41880712de8 Mon Sep 17 00:00:00 2001 From: Dongdong Tian Date: Thu, 12 Dec 2024 17:34:31 +0800 Subject: [PATCH 05/11] Add workaround for pandas 2.0 with pyarrow.Timestamp dtypes --- pygmt/clib/conversion.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pygmt/clib/conversion.py b/pygmt/clib/conversion.py index d907e1ab80b..b86ce7016b7 100644 --- a/pygmt/clib/conversion.py +++ b/pygmt/clib/conversion.py @@ -196,9 +196,12 @@ def _to_numpy(data: Any) -> np.ndarray: # Deal with timezone-aware datetime dtypes. if getattr(dtype, "tz", None): # pandas.DatetimeTZDtype numpy_dtype = getattr(dtype, "base", None) - elif getattr(getattr(dtype, "pyarrow_dtype", None), "tz", None): + elif getattr(dtype, "pyarrow_dtype", None) and hasattr(dtype.pyarrow_dtype, "tz"): # pd.ArrayDtype[pa.Timestamp] numpy_dtype = getattr(dtype, "numpy_dtype", None) + if Version(pd.__version__) < Version("2.1"): + # In pandas 2.0, dtype.numpy_type is dtype("O"). + numpy_dtype = np.dtype(f"M8[{dtype.pyarrow_dtype.unit}]") array = np.ascontiguousarray(data, dtype=numpy_dtype) From c18877932a8f19b1f64634b1ff7af790be2e8a52 Mon Sep 17 00:00:00 2001 From: Dongdong Tian Date: Thu, 12 Dec 2024 17:35:50 +0800 Subject: [PATCH 06/11] For pandas 2.0, skip tests for pandas dtypes and add workarounds for pyarrow dtypes --- pygmt/tests/test_clib_to_numpy.py | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/pygmt/tests/test_clib_to_numpy.py b/pygmt/tests/test_clib_to_numpy.py index e7063f6474e..066a7a2845f 100644 --- a/pygmt/tests/test_clib_to_numpy.py +++ b/pygmt/tests/test_clib_to_numpy.py @@ -331,6 +331,12 @@ def test_to_numpy_pandas_date(dtype, expected_dtype): ) +pandas_old_version = pytest.mark.xfail( + condition=Version(pd.__version__) < Version("2.1"), + reason="pandas 2.0 bug reported in https://github.com/pandas-dev/pandas/issues/52705", +) + + @pytest.mark.parametrize( ("dtype", "expected_dtype"), [ @@ -342,14 +348,23 @@ def test_to_numpy_pandas_date(dtype, expected_dtype): # pandas.DatetimeTZDtype can be given in two ways [tz is required]: # 1. pandas.DatetimeTZDtype(unit, tz) # 2. String aliases: "datetime64[unit, tz]" - pytest.param("datetime64[s, UTC]", "datetime64[s]", id="datetime64[s, tz=UTC]"), + pytest.param( + "datetime64[s, UTC]", + "datetime64[s]", + id="datetime64[s, tz=UTC]", + marks=pandas_old_version, + ), pytest.param( "datetime64[s, America/New_York]", "datetime64[s]", id="datetime64[s, tz=America/New_York]", + marks=pandas_old_version, ), pytest.param( - "datetime64[s, +07:30]", "datetime64[s]", id="datetime64[s, +07:30]" + "datetime64[s, +07:30]", + "datetime64[s]", + id="datetime64[s, +07:30]", + marks=pandas_old_version, ), # PyArrow timestamp types can be given in two ways [tz is optional]: # 1. pd.ArrowDtype(pyarrow.Timestamp(unit, tz=tz)) @@ -364,13 +379,13 @@ def test_to_numpy_pandas_date(dtype, expected_dtype): "timestamp[ms][pyarrow]", "datetime64[ms]", id="timestamp[ms][pyarrow]", - marks=skip_if_no(package="pyarrow"), + marks=[skip_if_no(package="pyarrow"), pandas_old_version], ), pytest.param( "timestamp[us][pyarrow]", "datetime64[us]", id="timestamp[us][pyarrow]", - marks=skip_if_no(package="pyarrow"), + marks=[skip_if_no(package="pyarrow"), pandas_old_version], ), pytest.param( "timestamp[ns][pyarrow]", @@ -411,7 +426,11 @@ def test_to_numpy_pandas_datetime(dtype, expected_dtype): assert result.dtype == expected_dtype if "," in str(dtype): # A hacky solution to decide if the dtype is timezone-aware. - series = series.dt.tz_convert("UTC") # Convert to UTC if timezone-aware. + if Version(pd.__version__) < Version("2.1") and dtype.startswith("timestamp"): + # pandas 2.0 doesn't have the dt.tz_convert method for pyarrow.Timestamp. + series = pd.to_datetime(series, utc=True) + else: + series = series.dt.tz_convert("UTC") # Convert to UTC if timezone-aware. expected_series = series.dt.strftime("%Y-%m-%dT%H:%M:%S").to_list() npt.assert_array_equal(result, np.array(expected_series, dtype=expected_dtype)) From 18057c1cbcbb32a17a081910e8bd6545e743ee6e Mon Sep 17 00:00:00 2001 From: Dongdong Tian Date: Thu, 12 Dec 2024 17:49:24 +0800 Subject: [PATCH 07/11] Improve the checking of dtypes --- pygmt/clib/conversion.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pygmt/clib/conversion.py b/pygmt/clib/conversion.py index b86ce7016b7..e3531d04709 100644 --- a/pygmt/clib/conversion.py +++ b/pygmt/clib/conversion.py @@ -194,10 +194,10 @@ def _to_numpy(data: Any) -> np.ndarray: data = data.to_numpy(na_value=np.nan) # Deal with timezone-aware datetime dtypes. - if getattr(dtype, "tz", None): # pandas.DatetimeTZDtype + if isinstance(dtype, pd.DatetimeTZDtype): # pandas.DatetimeTZDtype numpy_dtype = getattr(dtype, "base", None) - elif getattr(dtype, "pyarrow_dtype", None) and hasattr(dtype.pyarrow_dtype, "tz"): - # pd.ArrayDtype[pa.Timestamp] + elif isinstance(dtype, pd.ArrowDtype) and hasattr(dtype.pyarrow_dtype, "tz"): + # pd.ArrowDtype[pa.Timestamp] numpy_dtype = getattr(dtype, "numpy_dtype", None) if Version(pd.__version__) < Version("2.1"): # In pandas 2.0, dtype.numpy_type is dtype("O"). From fb1450978a621f365e19dc8c08243c3ff012e0e7 Mon Sep 17 00:00:00 2001 From: Dongdong Tian Date: Thu, 12 Dec 2024 17:50:58 +0800 Subject: [PATCH 08/11] Fix a type hint issue --- pygmt/clib/conversion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pygmt/clib/conversion.py b/pygmt/clib/conversion.py index e3531d04709..6fbebcd8383 100644 --- a/pygmt/clib/conversion.py +++ b/pygmt/clib/conversion.py @@ -201,7 +201,7 @@ def _to_numpy(data: Any) -> np.ndarray: numpy_dtype = getattr(dtype, "numpy_dtype", None) if Version(pd.__version__) < Version("2.1"): # In pandas 2.0, dtype.numpy_type is dtype("O"). - numpy_dtype = np.dtype(f"M8[{dtype.pyarrow_dtype.unit}]") + numpy_dtype = np.dtype(f"M8[{dtype.pyarrow_dtype.unit}]") # type: ignore[assignment, attr-defined] array = np.ascontiguousarray(data, dtype=numpy_dtype) From ac01592f55f79b92db940daee0be62caf8306f4d Mon Sep 17 00:00:00 2001 From: Dongdong Tian Date: Thu, 12 Dec 2024 18:25:13 +0800 Subject: [PATCH 09/11] Try if dt.tz_localize works for Windows --- pygmt/tests/test_clib_to_numpy.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pygmt/tests/test_clib_to_numpy.py b/pygmt/tests/test_clib_to_numpy.py index 066a7a2845f..eaa086a50de 100644 --- a/pygmt/tests/test_clib_to_numpy.py +++ b/pygmt/tests/test_clib_to_numpy.py @@ -425,14 +425,15 @@ def test_to_numpy_pandas_datetime(dtype, expected_dtype): _check_result(result, np.datetime64) assert result.dtype == expected_dtype - if "," in str(dtype): # A hacky solution to decide if the dtype is timezone-aware. + # Convert to UTC if the dtype is timezone-aware + if "," in str(dtype): # A hacky way to decide if the dtype is timezone-aware. if Version(pd.__version__) < Version("2.1") and dtype.startswith("timestamp"): # pandas 2.0 doesn't have the dt.tz_convert method for pyarrow.Timestamp. series = pd.to_datetime(series, utc=True) else: - series = series.dt.tz_convert("UTC") # Convert to UTC if timezone-aware. - expected_series = series.dt.strftime("%Y-%m-%dT%H:%M:%S").to_list() - + series = series.dt.tz_convert("UTC") + # Remove time zone information and preserve local time. + expected_series = series.dt.tz_localize(tz=None) npt.assert_array_equal(result, np.array(expected_series, dtype=expected_dtype)) From 5be936b424e2ee3d296d463432c95635b533f2b4 Mon Sep 17 00:00:00 2001 From: Dongdong Tian Date: Thu, 9 Jan 2025 10:21:53 +0800 Subject: [PATCH 10/11] Add a TODO comment for pandas 2.0 workaround --- pygmt/clib/conversion.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pygmt/clib/conversion.py b/pygmt/clib/conversion.py index 8c3e1784ad2..52eec0d2479 100644 --- a/pygmt/clib/conversion.py +++ b/pygmt/clib/conversion.py @@ -198,6 +198,7 @@ def _to_numpy(data: Any) -> np.ndarray: elif isinstance(dtype, pd.ArrowDtype) and hasattr(dtype.pyarrow_dtype, "tz"): # pd.ArrowDtype[pa.Timestamp] numpy_dtype = getattr(dtype, "numpy_dtype", None) + # TODO(pandas>=2.1): Remove the workaround for pandas<2.1. if Version(pd.__version__) < Version("2.1"): # In pandas 2.0, dtype.numpy_type is dtype("O"). numpy_dtype = np.dtype(f"M8[{dtype.pyarrow_dtype.unit}]") # type: ignore[assignment, attr-defined] From c179f1c9b2c8e9af7377204547c0667c06ed37c2 Mon Sep 17 00:00:00 2001 From: Dongdong Tian Date: Thu, 9 Jan 2025 10:24:52 +0800 Subject: [PATCH 11/11] Add one more TODO comment in tests --- pygmt/tests/test_clib_to_numpy.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pygmt/tests/test_clib_to_numpy.py b/pygmt/tests/test_clib_to_numpy.py index f5feb14a6b4..31b6c2421e8 100644 --- a/pygmt/tests/test_clib_to_numpy.py +++ b/pygmt/tests/test_clib_to_numpy.py @@ -461,6 +461,7 @@ def test_to_numpy_pandas_datetime(dtype, expected_dtype): # Convert to UTC if the dtype is timezone-aware if "," in str(dtype): # A hacky way to decide if the dtype is timezone-aware. + # TODO(pandas>=2.1): Simplify the if-else statement. if Version(pd.__version__) < Version("2.1") and dtype.startswith("timestamp"): # pandas 2.0 doesn't have the dt.tz_convert method for pyarrow.Timestamp. series = pd.to_datetime(series, utc=True)