# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import datetime
import io
import warnings

try:
    import numpy as np
except ImportError:
    np = None
import pytest

import pyarrow as pa
from pyarrow.tests.parquet.common import _check_roundtrip

try:
    import pyarrow.parquet as pq
    from pyarrow.tests.parquet.common import _read_table, _write_table
except ImportError:
    pq = None


try:
    import pandas as pd
    import pandas.testing as tm

    from pyarrow.tests.parquet.common import _roundtrip_pandas_dataframe
except ImportError:
    pd = tm = None


# Marks all of the tests in this module
# Ignore these with pytest ... -m 'not parquet'
pytestmark = pytest.mark.parquet


@pytest.mark.pandas
def test_pandas_parquet_datetime_tz():
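    """Roundtrip timezone-aware timestamps through Parquet, both as a
    column and as the index, and check they survive unchanged."""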
    # Pandas v2 defaults to [ns], but Arrow defaults to [us] time units
    # so we need to cast the pandas dtype. Pandas v1 will always silently
    # coerce to [ns] due to lack of non-[ns] support.
    s = pd.Series([datetime.datetime(2017, 9, 6)], dtype='datetime64[us]')
    s = s.dt.tz_localize('utc')
    s.index = s

    # Both a column and an index to hit both use cases
    df = pd.DataFrame({'tz_aware': s,
                       'tz_eastern': s.dt.tz_convert('US/Eastern')},
                      index=s)

    f = io.BytesIO()

    arrow_table = pa.Table.from_pandas(df)

    _write_table(arrow_table, f)
    f.seek(0)

    table_read = pq.read_pandas(f)

    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df, df_read)


@pytest.mark.pandas
def test_datetime_timezone_tzinfo():
    value = datetime.datetime(2018, 1, 1, 1, 23, 45,
                              tzinfo=datetime.timezone.utc)
    df = pd.DataFrame({'foo': [value]})

    _roundtrip_pandas_dataframe(df, write_kwargs={})


@pytest.mark.pandas
def test_coerce_timestamps(tempdir):
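    """Write a list-of-timestamp[ms] column with coerce_timestamps='us'
    and check the values are upcast in the roundtripped frame; an
    unknown coercion unit should raise ValueError."""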
    from collections import OrderedDict

    # ARROW-622
    arrays = OrderedDict()
    fields = [pa.field('datetime64',
                       pa.list_(pa.timestamp('ms')))]
    arrays['datetime64'] = [
        np.array(['2007-07-13T01:23:34.123456789',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
        None,
        None,
        np.array(['2007-07-13T02',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
    ]

    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)

    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df, schema=schema)

    _write_table(arrow_table, filename, version='2.6', coerce_timestamps='us')
    table_read = _read_table(filename)
    df_read = table_read.to_pandas()

    df_expected = df.copy()
    for i, x in enumerate(df_expected['datetime64']):
        if isinstance(x, np.ndarray):
            df_expected.loc[i, 'datetime64'] = x.astype('M8[us]')

    tm.assert_frame_equal(df_expected, df_read)

    with pytest.raises(ValueError):
        _write_table(arrow_table, filename, version='2.6',
                     coerce_timestamps='unknown')


@pytest.mark.pandas
def test_coerce_timestamps_truncated(tempdir):
    """
    ARROW-2555: Test that we can truncate timestamps when coercing if
    explicitly allowed.
    """
    dt_us = datetime.datetime(year=2017, month=1, day=1, hour=1, minute=1,
                              second=1, microsecond=1)
    dt_ms = datetime.datetime(year=2017, month=1, day=1, hour=1, minute=1,
                              second=1)

    fields_us = [pa.field('datetime64', pa.timestamp('us'))]
    arrays_us = {'datetime64': [dt_us, dt_ms]}

    df_us = pd.DataFrame(arrays_us)
    schema_us = pa.schema(fields_us)

    filename = tempdir / 'pandas_truncated.parquet'
    table_us = pa.Table.from_pandas(df_us, schema=schema_us)

    _write_table(table_us, filename, version='2.6', coerce_timestamps='ms',
                 allow_truncated_timestamps=True)
    table_ms = _read_table(filename)
    df_ms = table_ms.to_pandas()

    arrays_expected = {'datetime64': [dt_ms, dt_ms]}
    df_expected = pd.DataFrame(arrays_expected, dtype='datetime64[ms]')
    tm.assert_frame_equal(df_expected, df_ms)


@pytest.mark.pandas
def test_date_time_types(tempdir):
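    """Roundtrip each Arrow date/time/timestamp type. Note that date32
    values are days since the Unix epoch and date64 values are
    milliseconds (hence the 86400000 ms-per-day factor below)."""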
    t1 = pa.date32()
    data1 = np.array([17259, 17260, 17261], dtype='int32')
    a1 = pa.array(data1, type=t1)

    t2 = pa.date64()
    data2 = data1.astype('int64') * 86400000
    a2 = pa.array(data2, type=t2)

    t3 = pa.timestamp('us')
    start = pd.Timestamp('2001-01-01').value / 1000
    data3 = np.array([start, start + 1, start + 2], dtype='int64')
    a3 = pa.array(data3, type=t3)

    t4 = pa.time32('ms')
    data4 = np.arange(3, dtype='i4')
    a4 = pa.array(data4, type=t4)

    t5 = pa.time64('us')
    a5 = pa.array(data4.astype('int64'), type=t5)

    t6 = pa.time32('s')
    a6 = pa.array(data4, type=t6)

    ex_t6 = pa.time32('ms')
    ex_a6 = pa.array(data4 * 1000, type=ex_t6)

    t7 = pa.timestamp('ns')
    start = pd.Timestamp('2001-01-01').value
    data7 = np.array([start, start + 1000, start + 2000],
                     dtype='int64')
    a7 = pa.array(data7, type=t7)

    table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7],
                                 ['date32', 'date64', 'timestamp[us]',
                                  'time32[s]', 'time64[us]',
                                  'time32_from64[s]',
                                  'timestamp[ns]'])

    # date64 as date32
    # time32[s] to time32[ms]
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7],
                                    ['date32', 'date64', 'timestamp[us]',
                                     'time32[s]', 'time64[us]',
                                     'time32_from64[s]',
                                     'timestamp[ns]'])

    _check_roundtrip(table, expected=expected, version='2.6')

    t0 = pa.timestamp('ms')
    data0 = np.arange(4, dtype='int64')
    a0 = pa.array(data0, type=t0)

    t1 = pa.timestamp('us')
    data1 = np.arange(4, dtype='int64')
    a1 = pa.array(data1, type=t1)

    t2 = pa.timestamp('ns')
    data2 = np.arange(4, dtype='int64')
    a2 = pa.array(data2, type=t2)

    table = pa.Table.from_arrays([a0, a1, a2],
                                 ['ts[ms]', 'ts[us]', 'ts[ns]'])
    expected = pa.Table.from_arrays([a0, a1, a2],
                                    ['ts[ms]', 'ts[us]', 'ts[ns]'])

    # int64 for all timestamps supported by default
    filename = tempdir / 'int64_timestamps.parquet'
    _write_table(table, filename, version='2.6')
    parquet_schema = pq.ParquetFile(filename).schema
    for i in range(3):
        assert parquet_schema.column(i).physical_type == 'INT64'
    read_table = _read_table(filename)
    assert read_table.equals(expected)

    t0_ns = pa.timestamp('ns')
    data0_ns = np.array(data0 * 1000000, dtype='int64')
    a0_ns = pa.array(data0_ns, type=t0_ns)

    t1_ns = pa.timestamp('ns')
    data1_ns = np.array(data1 * 1000, dtype='int64')
    a1_ns = pa.array(data1_ns, type=t1_ns)

    expected = pa.Table.from_arrays([a0_ns, a1_ns, a2],
                                    ['ts[ms]', 'ts[us]', 'ts[ns]'])

    # int96 nanosecond timestamps produced upon request
    filename = tempdir / 'explicit_int96_timestamps.parquet'
    _write_table(table, filename, version='2.6',
                 use_deprecated_int96_timestamps=True)
    parquet_schema = pq.ParquetFile(filename).schema
    for i in range(3):
        assert parquet_schema.column(i).physical_type == 'INT96'
    read_table = _read_table(filename)
    assert read_table.equals(expected)

    # int96 nanosecond timestamps implied by flavor 'spark'
    filename = tempdir / 'spark_int96_timestamps.parquet'
    _write_table(table, filename, version='2.6',
                 flavor='spark')
    parquet_schema = pq.ParquetFile(filename).schema
    for i in range(3):
        assert parquet_schema.column(i).physical_type == 'INT96'
    read_table = _read_table(filename)
    assert read_table.equals(expected)


@pytest.mark.pandas
@pytest.mark.parametrize('unit', ['s', 'ms', 'us', 'ns'])
def test_coerce_int96_timestamp_unit(unit):
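    """INT96 timestamps are always stored as nanoseconds; on read,
    coerce_int96_timestamp_unit reinterprets them at the requested
    resolution instead of the default 'ns'."""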
    i_s = pd.Timestamp('2010-01-01').value / 1000000000  # := 1262304000

    d_s = np.arange(i_s, i_s + 10, 1, dtype='int64')
    d_ms = d_s * 1000
    d_us = d_ms * 1000
    d_ns = d_us * 1000

    a_s = pa.array(d_s, type=pa.timestamp('s'))
    a_ms = pa.array(d_ms, type=pa.timestamp('ms'))
    a_us = pa.array(d_us, type=pa.timestamp('us'))
    a_ns = pa.array(d_ns, type=pa.timestamp('ns'))

    arrays = {"s": a_s, "ms": a_ms, "us": a_us, "ns": a_ns}
    names = ['ts_s', 'ts_ms', 'ts_us', 'ts_ns']
    table = pa.Table.from_arrays([a_s, a_ms, a_us, a_ns], names)

    # INT96 always stores nanoseconds on disk, so every column reads
    # back at the single requested unit regardless of its original unit
    expected = pa.Table.from_arrays([arrays.get(unit)] * 4, names)
    read_table_kwargs = {"coerce_int96_timestamp_unit": unit}
    _check_roundtrip(table, expected,
                     read_table_kwargs=read_table_kwargs,
                     use_deprecated_int96_timestamps=True)
    _check_roundtrip(table, expected, version='2.6',
                     read_table_kwargs=read_table_kwargs,
                     use_deprecated_int96_timestamps=True)


@pytest.mark.pandas
@pytest.mark.parametrize('pq_reader_method', ['ParquetFile', 'read_table'])
def test_coerce_int96_timestamp_overflow(pq_reader_method, tempdir):
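    """Reading INT96 values outside the range representable by int64
    nanoseconds (roughly years 1677-2262) overflows unless a coarser
    coerce_int96_timestamp_unit is requested."""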

    def get_table(pq_reader_method, filename, **kwargs):
        if pq_reader_method == "ParquetFile":
            return pq.ParquetFile(filename, **kwargs).read()
        elif pq_reader_method == "read_table":
            return pq.read_table(filename, **kwargs)

    # Recreating the initial JIRA issue referenced in ARROW-12096
    oob_dts = [
        datetime.datetime(1000, 1, 1),
        datetime.datetime(2000, 1, 1),
        datetime.datetime(3000, 1, 1)
    ]
    df = pd.DataFrame({"a": oob_dts})
    table = pa.table(df)

    filename = tempdir / "test_round_trip_overflow.parquet"
    pq.write_table(table, filename, use_deprecated_int96_timestamps=True,
                   version="1.0")

    # With the default resolution of ns, we get wrong values for INT96
    # timestamps that are out of bounds for the nanosecond range
    tab_error = get_table(pq_reader_method, filename)
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore",
                                "Discarding nonzero nanoseconds in conversion",
                                UserWarning)
        assert tab_error["a"].to_pylist() != oob_dts

    # Avoid this overflow by specifying the resolution to use for INT96 values
    tab_correct = get_table(
        pq_reader_method, filename, coerce_int96_timestamp_unit="s"
    )
    df_correct = tab_correct.to_pandas(timestamp_as_object=True)
    df["a"] = df["a"].astype(object)
    tm.assert_frame_equal(df, df_correct)


@pytest.mark.parametrize('unit', ['ms', 'us', 'ns'])
def test_timestamp_restore_timezone(unit):
    # ARROW-5888, restore timezone from serialized metadata
    ty = pa.timestamp(unit, tz='America/New_York')
    arr = pa.array([1, 2, 3], type=ty)
    t = pa.table([arr], names=['f0'])
    _check_roundtrip(t)


def test_timestamp_restore_timezone_nanosecond():
    # ARROW-9634: also restore the timezone for nanosecond data that gets
    # stored as microseconds in the Parquet file for version 2.4 and older
    ty = pa.timestamp('ns', tz='America/New_York')
    arr = pa.array([1000, 2000, 3000], type=ty)
    table = pa.table([arr], names=['f0'])
    ty_us = pa.timestamp('us', tz='America/New_York')
    expected = pa.table([arr.cast(ty_us)], names=['f0'])
    _check_roundtrip(table, expected=expected, version='2.4')


@pytest.mark.pandas
def test_list_of_datetime_time_roundtrip():
    # ARROW-4135
    times = pd.to_datetime(['09:00', '09:30', '10:00', '10:30', '11:00',
                            '11:30', '12:00'], format="%H:%M")
    df = pd.DataFrame({'time': [times.time]})
    _roundtrip_pandas_dataframe(df, write_kwargs={})


@pytest.mark.pandas
def test_parquet_version_timestamp_differences():
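    """Check the default timestamp coercions for each Parquet format
    version: '1.0' and '2.4' coerce seconds to ms and nanoseconds to us,
    while '2.6' keeps nanoseconds."""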
    i_s = pd.Timestamp('2010-01-01').value / 1000000000  # := 1262304000

    d_s = np.arange(i_s, i_s + 10, 1, dtype='int64')
    d_ms = d_s * 1000
    d_us = d_ms * 1000
    d_ns = d_us * 1000

    a_s = pa.array(d_s, type=pa.timestamp('s'))
    a_ms = pa.array(d_ms, type=pa.timestamp('ms'))
    a_us = pa.array(d_us, type=pa.timestamp('us'))
    a_ns = pa.array(d_ns, type=pa.timestamp('ns'))

    all_versions = ['1.0', '2.4', '2.6']

    names = ['ts:s', 'ts:ms', 'ts:us', 'ts:ns']
    table = pa.Table.from_arrays([a_s, a_ms, a_us, a_ns], names)

    # Using Parquet versions 1.0 and 2.4, seconds should be coerced to
    # milliseconds and nanoseconds should be coerced to microseconds
    # by default
    expected = pa.Table.from_arrays([a_ms, a_ms, a_us, a_us], names)
    _check_roundtrip(table, expected, version='1.0')
    _check_roundtrip(table, expected, version='2.4')

    # Using Parquet version 2.6, seconds should be coerced to milliseconds
    # and nanoseconds should be retained by default
    expected = pa.Table.from_arrays([a_ms, a_ms, a_us, a_ns], names)
    _check_roundtrip(table, expected, version='2.6')

    # For any Parquet version, coercing to milliseconds or microseconds
    # is allowed
    expected = pa.Table.from_arrays([a_ms, a_ms, a_ms, a_ms], names)
    for ver in all_versions:
        _check_roundtrip(table, expected, coerce_timestamps='ms', version=ver)

    expected = pa.Table.from_arrays([a_us, a_us, a_us, a_us], names)
    for ver in all_versions:
        _check_roundtrip(table, expected, version=ver, coerce_timestamps='us')

    # TODO: after pyarrow allows coerce_timestamps='ns', tests like the
    # following should pass ...

    # Using Parquet version 1.0, coercing to nanoseconds is not allowed
    # expected = None
    # with pytest.raises(NotImplementedError):
    #     _roundtrip_table(table, coerce_timestamps='ns')

    # Using Parquet version 2.6, coercing to nanoseconds is allowed
    # expected = pa.Table.from_arrays([a_ns, a_ns, a_ns, a_ns], names)
    # _check_roundtrip(table, expected, version='2.6', coerce_timestamps='ns')

    # For any Parquet version, coercing to nanoseconds is allowed
    # if INT96 storage is used
    expected = pa.Table.from_arrays([a_ns, a_ns, a_ns, a_ns], names)
    for ver in all_versions:
        _check_roundtrip(table, expected, version=ver,
                         use_deprecated_int96_timestamps=True)


@pytest.mark.pandas
def test_noncoerced_nanoseconds_written_without_exception(tempdir):
    # ARROW-1957: the Parquet version 2.6 writer preserves Arrow
    # nanosecond timestamps by default
    n = 9
    df = pd.DataFrame({'x': range(n)},
                      index=pd.date_range('2017-01-01', freq='ns', periods=n))
    tb = pa.Table.from_pandas(df)

    filename = tempdir / 'written.parquet'
    try:
        pq.write_table(tb, filename, version='2.6')
    except Exception:
        pass
    assert filename.exists()

    recovered_table = pq.read_table(filename)
    assert tb.equals(recovered_table)

    # Losing data through coercion (without an explicit override) is
    # still an error
    filename = tempdir / 'not_written.parquet'
    with pytest.raises(ValueError):
        pq.write_table(tb, filename, coerce_timestamps='ms', version='2.6')


def test_duration_type():
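    """Duration columns have no Parquet logical type; they roundtrip as
    plain INT64, with the Arrow type restored from the Arrow schema that
    pyarrow serializes into the file metadata."""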
    # ARROW-6780
    arrays = [pa.array([0, 1, 2, 3], type=pa.duration(unit))
              for unit in ["s", "ms", "us", "ns"]]
    table = pa.Table.from_arrays(arrays, ["d[s]", "d[ms]", "d[us]", "d[ns]"])

    _check_roundtrip(table)