Skip to content

fix: parse timezone-aware datetime strings as UTC consistently across backends #2166

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Mar 7, 2025
4 changes: 2 additions & 2 deletions .github/workflows/extremes.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ jobs:
cache-suffix: ${{ matrix.python-version }}
cache-dependency-glob: "pyproject.toml"
- name: install-minimum-versions
run: uv pip install pipdeptree tox virtualenv setuptools pandas==0.25.3 polars==0.20.3 numpy==1.17.5 pyarrow==11.0.0 "pyarrow-stubs<17" scipy==1.5.0 scikit-learn==1.1.0 duckdb==1.0 tzdata --system
run: uv pip install pipdeptree tox virtualenv setuptools pandas==0.25.3 polars==0.20.3 numpy==1.17.5 pyarrow==11.0.0 "pyarrow-stubs<17" scipy==1.5.0 scikit-learn==1.1.0 duckdb==1.0 tzdata backports.zoneinfo --system
- name: install-reqs
run: |
uv pip install -e ".[tests]" --system
Expand Down Expand Up @@ -62,7 +62,7 @@ jobs:
cache-suffix: ${{ matrix.python-version }}
cache-dependency-glob: "pyproject.toml"
- name: install-pretty-old-versions
run: uv pip install pipdeptree tox virtualenv setuptools pandas==1.1.5 polars==0.20.3 numpy==1.17.5 pyarrow==11.0.0 "pyarrow-stubs<17" pyspark==3.5.0 scipy==1.5.0 scikit-learn==1.1.0 duckdb==1.0 tzdata --system
run: uv pip install pipdeptree tox virtualenv setuptools pandas==1.1.5 polars==0.20.3 numpy==1.17.5 pyarrow==11.0.0 "pyarrow-stubs<17" pyspark==3.5.0 scipy==1.5.0 scikit-learn==1.1.0 duckdb==1.0 tzdata backports.zoneinfo --system
- name: install-reqs
run: uv pip install -e ".[tests]" --system
- name: show-deps
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/pytest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
cache-dependency-glob: "pyproject.toml"
- name: install-reqs
# Python3.8 is technically at end-of-life, so we don't test everything
run: uv pip install -e ".[tests, core]" --system
run: uv pip install -e ".[tests, core]" backports.zoneinfo --system
- name: show-deps
run: uv pip freeze
- name: Run pytest
Expand Down
2 changes: 1 addition & 1 deletion narwhals/_duckdb/expr_str.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def replace(self: Self, pattern: str, value: str, *, literal: bool, n: int) -> N

def to_datetime(self: Self, format: str | None) -> DuckDBExpr: # noqa: A002
if format is None:
msg = "Cannot infer format with DuckDB backend"
msg = "Cannot infer format with DuckDB backend, please specify `format` explicitly."
raise NotImplementedError(msg)

return self._compliant_expr._from_call(
Expand Down
4 changes: 4 additions & 0 deletions narwhals/_duckdb/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,10 @@ def native_to_narwhals_dtype(duckdb_dtype: str, version: Version) -> DType:
return dtypes.Date()
if duckdb_dtype == "TIMESTAMP":
return dtypes.Datetime()
if duckdb_dtype == "TIMESTAMP WITH TIME ZONE":
# TODO(marco): is UTC correct, or should we be getting the connection timezone?
# https://github.com/narwhals-dev/narwhals/issues/2165
return dtypes.Datetime(time_zone="UTC")
if duckdb_dtype == "BOOLEAN":
return dtypes.Boolean()
if duckdb_dtype == "INTERVAL":
Expand Down
16 changes: 14 additions & 2 deletions narwhals/_pandas_like/series_str.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,11 +85,23 @@ def split(self: Self, by: str) -> PandasLikeSeries:
)

def to_datetime(self: Self, format: str | None) -> PandasLikeSeries:  # noqa: A002
    """Parse the string series into datetimes, normalising aware results to UTC.

    Arguments:
        format: `strftime`-style format forwarded to the native `to_datetime`,
            or `None` to let the backend infer it.

    Returns:
        A new series; if the parsed dtype carries a time zone other than
        "UTC", the result is converted to "UTC".
    """
    series = self._compliant_series
    if format is not None and ("%z" in format or "Z" in format):
        # The format itself tells us the inputs are timezone-aware, so ask
        # the native parser for UTC directly (better performance than
        # parsing first and converting afterwards).
        parse_as_utc = to_datetime(series._implementation, utc=True)
        return series._from_native_series(
            parse_as_utc(series._native_series, format=format)
        )

    # Otherwise parse without forcing UTC and inspect what came back.
    parse = to_datetime(series._implementation, utc=False)
    parsed = series._from_native_series(parse(series._native_series, format=format))
    parsed_time_zone = parsed.dtype.time_zone  # type: ignore[attr-defined]
    if parsed_time_zone is None or parsed_time_zone == "UTC":
        return parsed
    return parsed.dt.convert_time_zone("UTC")

def to_uppercase(self: Self) -> PandasLikeSeries:
return self._compliant_series._from_native_series(
Expand Down
10 changes: 6 additions & 4 deletions narwhals/_pandas_like/utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from __future__ import annotations

import functools
import re
import warnings
from contextlib import suppress
from functools import lru_cache
from typing import TYPE_CHECKING
from typing import Any
from typing import Iterable
Expand Down Expand Up @@ -356,7 +356,7 @@ def rename(
return obj.rename(*args, **kwargs, copy=False) # type: ignore[attr-defined]


@lru_cache(maxsize=16)
@functools.lru_cache(maxsize=16)
def non_object_native_to_narwhals_dtype(dtype: str, version: Version) -> DType:
dtypes = import_dtypes_module(version)
if dtype in {"int64", "Int64", "Int64[pyarrow]", "int64[pyarrow]"}:
Expand Down Expand Up @@ -679,9 +679,11 @@ def align_series_full_broadcast(
return reindexed


def to_datetime(implementation: Implementation) -> Any:
def to_datetime(implementation: Implementation, *, utc: bool) -> Any:
if implementation in PANDAS_LIKE_IMPLEMENTATION:
return implementation.to_native_namespace().to_datetime
return functools.partial(
implementation.to_native_namespace().to_datetime, utc=utc
)

else: # pragma: no cover
msg = f"Expected pandas-like implementation ({PANDAS_LIKE_IMPLEMENTATION}), found {implementation}"
Expand Down
2 changes: 2 additions & 0 deletions narwhals/_spark_like/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ def native_to_narwhals_dtype(
if isinstance(dtype, native.TimestampNTZType):
return dtypes.Datetime()
if isinstance(dtype, native.TimestampType):
# TODO(marco): is UTC correct, or should we be getting the connection timezone?
# https://github.com/narwhals-dev/narwhals/issues/2165
return dtypes.Datetime(time_zone="UTC")
if isinstance(dtype, native.DecimalType):
return dtypes.Decimal()
Expand Down
13 changes: 7 additions & 6 deletions narwhals/expr_str.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,13 @@ def tail(self: Self, n: int = 5) -> ExprT:
def to_datetime(self: Self, format: str | None = None) -> ExprT: # noqa: A002
"""Convert to Datetime dtype.

Notes:
- pandas defaults to the nanosecond time unit, Polars to microsecond.
Prior to pandas 2.0, nanosecond was the only time unit pandas
supported, with no way to choose another. Support for setting the
time unit in pandas, where the installed version allows it, is planned.
- Timezone-aware strings are parsed and then converted to UTC.

Warning:
As different backends auto-infer format in different ways, if `format=None`
there is no guarantee that the result will be equal.
Expand All @@ -381,12 +388,6 @@ def to_datetime(self: Self, format: str | None = None) -> ExprT: # noqa: A002
Returns:
A new expression.

Notes:
pandas defaults to nanosecond time unit, Polars to microsecond.
Prior to pandas 2.0, nanoseconds were the only time unit supported
in pandas, with no ability to set any other one. The ability to
set the time unit in pandas, if the version permits, will arrive.

Examples:
>>> import polars as pl
>>> import narwhals as nw
Expand Down
9 changes: 5 additions & 4 deletions narwhals/series_str.py
Original file line number Diff line number Diff line change
Expand Up @@ -377,10 +377,11 @@ def to_datetime(self: Self, format: str | None = None) -> SeriesT: # noqa: A002
"""Parse Series with strings to a Series with Datetime dtype.

Notes:
pandas defaults to nanosecond time unit, Polars to microsecond.
Prior to pandas 2.0, nanoseconds were the only time unit supported
in pandas, with no ability to set any other one. The ability to
set the time unit in pandas, if the version permits, will arrive.
- pandas defaults to the nanosecond time unit, Polars to microsecond.
Prior to pandas 2.0, nanosecond was the only time unit pandas
supported, with no way to choose another. Support for setting the
time unit in pandas, where the installed version allows it, is planned.
- Timezone-aware strings are parsed and then converted to UTC.

Warning:
As different backends auto-infer format in different ways, if `format=None`
Expand Down
52 changes: 49 additions & 3 deletions tests/expr_and_series/str/to_datetime_test.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

from contextlib import nullcontext as does_not_raise
from datetime import datetime
from datetime import timezone
from typing import TYPE_CHECKING
Expand All @@ -9,7 +10,10 @@

import narwhals.stable.v1 as nw
from narwhals._arrow.utils import parse_datetime_format
from tests.utils import PANDAS_VERSION
from tests.utils import PYARROW_VERSION
from tests.utils import assert_equal_data
from tests.utils import is_pyarrow_windows_no_tzdata

if TYPE_CHECKING:
from tests.utils import Constructor
Expand All @@ -28,10 +32,16 @@ def test_to_datetime(constructor: Constructor) -> None:
nw.from_native(constructor(data))
.lazy()
.select(b=nw.col("a").str.to_datetime(format="%Y-%m-%dT%H:%M:%S"))
.collect()
.item(row=0, column="b")
)
assert str(result) == expected
result_schema = result.collect_schema()
assert isinstance(result_schema["b"], nw.Datetime)
if "sqlframe" in str(constructor):
# https://github.com/eakmanrq/sqlframe/issues/326
assert result_schema["b"].time_zone == "UTC" # pyright: ignore[reportAttributeAccessIssue]
else:
assert result_schema["b"].time_zone is None # pyright: ignore[reportAttributeAccessIssue]
result_item = result.collect().item(row=0, column="b")
assert str(result_item) == expected


def test_to_datetime_series(constructor_eager: ConstructorEager) -> None:
Expand Down Expand Up @@ -190,3 +200,39 @@ def test_pyarrow_infer_datetime_raise_inconsistent_date_fmt(
) -> None:
with pytest.raises(ValueError, match="Unable to infer datetime format. "):
parse_datetime_format(pa.chunked_array([data]))


@pytest.mark.parametrize("format", [None, "%Y-%m-%dT%H:%M:%S%z"])
def test_to_datetime_tz_aware(
    constructor: Constructor,
    request: pytest.FixtureRequest,
    format: str | None,  # noqa: A002
) -> None:
    """Check that a string with a UTC offset parses to a UTC `Datetime`.

    The input "2020-01-01T01:02:03+0100" is expected to come back as
    2020-01-01 00:02:03 with `timezone.utc`. For the duckdb and sqlframe
    constructors with `format=None`, `NotImplementedError` is expected instead.
    """
    backend = str(constructor)
    if "pyarrow_table" in backend and PYARROW_VERSION < (13,):
        # bugged
        pytest.skip()
    if "pandas" in backend and PANDAS_VERSION < (1,):
        # "Cannot pass a tz argument when parsing strings with timezone information."
        pytest.skip()
    if is_pyarrow_windows_no_tzdata(constructor):
        pytest.skip()
    if "sqlframe" in backend:
        # https://github.com/eakmanrq/sqlframe/issues/325
        request.applymarker(pytest.mark.xfail)

    needs_explicit_format = "duckdb" in backend or "sqlframe" in backend
    if needs_explicit_format and format is None:
        expectation = pytest.raises(NotImplementedError)
    else:
        expectation = does_not_raise()

    raw = "2020-01-01T01:02:03+0100"
    frame = nw.from_native(constructor({"a": [raw]}))
    # Everything below stays inside the context manager: on the raising path
    # `parsed` is never bound, so the follow-up assertions must be skipped.
    with expectation:
        parsed = frame.with_columns(b=nw.col("a").str.to_datetime(format))
        assert isinstance(parsed.collect_schema()["b"], nw.Datetime)
        collected_schema = parsed.lazy().collect().schema
        assert collected_schema["a"] == nw.String
        assert isinstance(collected_schema["b"], nw.Datetime)
        expected = {
            "a": [raw],
            "b": [datetime(2020, 1, 1, 0, 2, 3, tzinfo=timezone.utc)],
        }
        assert_equal_data(parsed, expected)
Loading