Initial commit
@@ -0,0 +1,24 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import pytest


# Marks all of the tests in this module
# Ignore these with pytest ... -m 'not parquet'
pytestmark = [
    pytest.mark.parquet,
]
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,179 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import io

try:
    import numpy as np
except ImportError:
    np = None

import pyarrow as pa
from pyarrow.tests import util


def _write_table(table, path, **kwargs):
    # So we see the ImportError somewhere
    import pyarrow.parquet as pq
    from pyarrow.pandas_compat import _pandas_api

    if _pandas_api.is_data_frame(table):
        table = pa.Table.from_pandas(table)

    pq.write_table(table, path, **kwargs)
    return table


def _read_table(*args, **kwargs):
    import pyarrow.parquet as pq

    table = pq.read_table(*args, **kwargs)
    table.validate(full=True)
    return table


def _roundtrip_table(table, read_table_kwargs=None,
                     write_table_kwargs=None):
    read_table_kwargs = read_table_kwargs or {}
    write_table_kwargs = write_table_kwargs or {}

    writer = pa.BufferOutputStream()
    _write_table(table, writer, **write_table_kwargs)
    reader = pa.BufferReader(writer.getvalue())
    return _read_table(reader, **read_table_kwargs)


def _check_roundtrip(table, expected=None, read_table_kwargs=None,
                     **write_table_kwargs):
    if expected is None:
        expected = table

    read_table_kwargs = read_table_kwargs or {}

    # intentionally check twice
    result = _roundtrip_table(table, read_table_kwargs=read_table_kwargs,
                              write_table_kwargs=write_table_kwargs)
    assert result.schema == expected.schema
    assert result.equals(expected)
    result = _roundtrip_table(result, read_table_kwargs=read_table_kwargs,
                              write_table_kwargs=write_table_kwargs)
    assert result.schema == expected.schema
    assert result.equals(expected)


def _roundtrip_pandas_dataframe(df, write_kwargs):
    table = pa.Table.from_pandas(df)
    result = _roundtrip_table(
        table, write_table_kwargs=write_kwargs)
    return result.to_pandas()


def _random_integers(size, dtype):
    # We do not generate integers outside the int64 range
    platform_int_info = np.iinfo('int_')
    iinfo = np.iinfo(dtype)
    return np.random.randint(max(iinfo.min, platform_int_info.min),
                             min(iinfo.max, platform_int_info.max),
                             size=size, dtype=dtype)


def _range_integers(size, dtype):
    return pa.array(np.arange(size, dtype=dtype))


def _test_dict(size=10000, seed=0):
    np.random.seed(seed)
    return {
        'uint8': _random_integers(size, np.uint8),
        'uint16': _random_integers(size, np.uint16),
        'uint32': _random_integers(size, np.uint32),
        'uint64': _random_integers(size, np.uint64),
        'int8': _random_integers(size, np.int8),
        'int16': _random_integers(size, np.int16),
        'int32': _random_integers(size, np.int32),
        'int64': _random_integers(size, np.int64),
        'float32': np.random.randn(size).astype(np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0,
        'strings': [util.rands(10) for i in range(size)],
        'all_none': [None] * size,
        'all_none_category': [None] * size
    }


def _test_dataframe(size=10000, seed=0):
    import pandas as pd

    df = pd.DataFrame(_test_dict(size, seed))

    # TODO(PARQUET-1015)
    # df['all_none_category'] = df['all_none_category'].astype('category')
    return df


def _test_table(size=10000, seed=0):
    return pa.Table.from_pydict(_test_dict(size, seed))


def make_sample_file(table_or_df):
    import pyarrow.parquet as pq

    if isinstance(table_or_df, pa.Table):
        a_table = table_or_df
    else:
        a_table = pa.Table.from_pandas(table_or_df)

    buf = io.BytesIO()
    _write_table(a_table, buf, compression='SNAPPY', version='2.6')

    buf.seek(0)
    return pq.ParquetFile(buf)


def alltypes_sample(size=10000, seed=0, categorical=False):
    import pandas as pd

    np.random.seed(seed)
    arrays = {
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        'int8': np.arange(size, dtype=np.int16),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float16': np.arange(size, dtype=np.float16),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0,
        'datetime_ms': np.arange("2016-01-01T00:00:00.001", size,
                                 dtype='datetime64[ms]'),
        'datetime_us': np.arange("2016-01-01T00:00:00.000001", size,
                                 dtype='datetime64[us]'),
        'datetime_ns': np.arange("2016-01-01T00:00:00.000000001", size,
                                 dtype='datetime64[ns]'),
        'timedelta': np.arange(0, size, dtype="timedelta64[s]"),
        'str': pd.Series([str(x) for x in range(size)]),
        'empty_str': [''] * size,
        'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
        'null': [None] * size,
        'null_list': [None] * 2 + [[None] * (x % 4) for x in range(size - 2)],
    }
    if categorical:
        arrays['str_category'] = arrays['str'].astype('category')
    return pd.DataFrame(arrays)
@@ -0,0 +1,105 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import os
import pathlib
import sys

import pytest

from pyarrow.util import guid


@pytest.fixture(scope='module')
def datadir(base_datadir):
    return base_datadir / 'parquet'


@pytest.fixture(scope='module')
def parquet_test_datadir():
    if sys.platform == 'emscripten':
        pytest.skip("needs PARQUET_TEST_DATA files access")
    result = os.environ.get('PARQUET_TEST_DATA')
    if not result:
        raise RuntimeError('Please point the PARQUET_TEST_DATA environment '
                           'variable to the test data directory')
    return pathlib.Path(result)


@pytest.fixture
def s3_bucket(s3_server):
    boto3 = pytest.importorskip('boto3')
    botocore = pytest.importorskip('botocore')
    s3_bucket_name = 'test-s3fs'

    host, port, access_key, secret_key = s3_server['connection']
    s3_client = boto3.client(
        's3',
        endpoint_url=f'http://{host}:{port}',
        aws_access_key_id=access_key,
        aws_secret_access_key=secret_key,
        config=botocore.client.Config(signature_version='s3v4'),
        region_name='us-east-1'
    )

    try:
        s3_client.create_bucket(Bucket=s3_bucket_name)
    except Exception:
        pass  # we get BucketAlreadyOwnedByYou error with fsspec handler
    finally:
        s3_client.close()

    return s3_bucket_name


@pytest.fixture
def s3_example_s3fs(s3_server, s3_bucket):
    s3fs = pytest.importorskip('s3fs')

    host, port, access_key, secret_key = s3_server['connection']
    fs = s3fs.S3FileSystem(
        key=access_key,
        secret=secret_key,
        client_kwargs={
            'endpoint_url': f'http://{host}:{port}'
        }
    )

    test_path = f'{s3_bucket}/{guid()}'

    fs.mkdir(test_path)
    yield fs, test_path
    try:
        fs.rm(test_path, recursive=True)
    except FileNotFoundError:
        pass


@pytest.fixture
def s3_example_fs(s3_server):
    from pyarrow.fs import FileSystem

    host, port, access_key, secret_key = s3_server['connection']
    uri = (
        f"s3://{access_key}:{secret_key}@mybucket/data.parquet?scheme=http"
        f"&endpoint_override={host}:{port}&allow_bucket_creation=True"
    )
    fs, path = FileSystem.from_uri(uri)

    fs.create_dir("mybucket")

    yield fs, uri, path
@@ -0,0 +1,61 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import base64

import pyarrow.parquet.encryption as pe


class InMemoryKmsClient(pe.KmsClient):
    """This is a mock class implementation of KmsClient, built for testing
    only.
    """

    def __init__(self, config):
        """Create an InMemoryKmsClient instance."""
        pe.KmsClient.__init__(self)
        self.master_keys_map = config.custom_kms_conf

    def wrap_key(self, key_bytes, master_key_identifier):
        """Not a secure cipher - the wrapped key
        is just the master key concatenated with the key bytes"""
        master_key_bytes = self.master_keys_map[master_key_identifier].encode(
            'utf-8')
        wrapped_key = b"".join([master_key_bytes, key_bytes])
        result = base64.b64encode(wrapped_key)
        return result

    def unwrap_key(self, wrapped_key, master_key_identifier):
        """Not a secure cipher - just extract the key from
        the wrapped key"""
        expected_master_key = self.master_keys_map[master_key_identifier]
        decoded_wrapped_key = base64.b64decode(wrapped_key)
        master_key_bytes = decoded_wrapped_key[:16]
        decrypted_key = decoded_wrapped_key[16:]
        if (expected_master_key == master_key_bytes.decode('utf-8')):
            return decrypted_key
        raise ValueError("Incorrect master key used",
                         master_key_bytes, decrypted_key)


def verify_file_encrypted(path):
    """Verify that the file is encrypted by looking at its first 4 bytes.
    If they are the magic string PARE,
    then this is a Parquet file with an encrypted footer."""
    with open(path, "rb") as file:
        magic_str = file.read(4)
        # Verify magic string for parquet with encrypted footer is PARE
        assert magic_str == b'PARE'
@@ -0,0 +1,995 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import os
from collections import OrderedDict
import io
import warnings
from shutil import copytree
from decimal import Decimal

import pytest

import pyarrow as pa
from pyarrow import fs
from pyarrow.tests import util
from pyarrow.tests.parquet.common import (_check_roundtrip, _roundtrip_table,
                                          _test_table)

try:
    import pyarrow.parquet as pq
    from pyarrow.tests.parquet.common import _read_table, _write_table
except ImportError:
    pq = None


try:
    import pandas as pd
    import pandas.testing as tm

    from pyarrow.tests.pandas_examples import dataframe_with_lists
    from pyarrow.tests.parquet.common import alltypes_sample
except ImportError:
    pd = tm = None

try:
    import numpy as np
except ImportError:
    np = None

# Marks all of the tests in this module
# Ignore these with pytest ... -m 'not parquet'
pytestmark = pytest.mark.parquet


def test_parquet_invalid_version(tempdir):
    table = pa.table({'a': [1, 2, 3]})
    with pytest.raises(ValueError, match="Unsupported Parquet format version"):
        _write_table(table, tempdir / 'test_version.parquet', version="2.2")
    with pytest.raises(ValueError, match="Unsupported Parquet data page " +
                       "version"):
        _write_table(table, tempdir / 'test_version.parquet',
                     data_page_version="2.2")


def test_set_data_page_size():
    arr = pa.array([1, 2, 3] * 100000)
    t = pa.Table.from_arrays([arr], names=['f0'])

    # 128K, 512K
    page_sizes = [2 << 16, 2 << 18]
    for target_page_size in page_sizes:
        _check_roundtrip(t, data_page_size=target_page_size)


@pytest.mark.numpy
def test_set_write_batch_size():
    table = _test_table(100)

    _check_roundtrip(
        table, data_page_size=10, write_batch_size=1, version='2.4'
    )


@pytest.mark.numpy
def test_set_dictionary_pagesize_limit():
    table = _test_table(100)

    _check_roundtrip(table, dictionary_pagesize_limit=1,
                     data_page_size=10, version='2.4')

    with pytest.raises(TypeError):
        _check_roundtrip(table, dictionary_pagesize_limit="a",
                         data_page_size=10, version='2.4')


@pytest.mark.pandas
def test_chunked_table_write():
    # ARROW-232
    tables = []
    batch = pa.RecordBatch.from_pandas(alltypes_sample(size=10))
    tables.append(pa.Table.from_batches([batch] * 3))
    df, _ = dataframe_with_lists()
    batch = pa.RecordBatch.from_pandas(df)
    tables.append(pa.Table.from_batches([batch] * 3))

    for data_page_version in ['1.0', '2.0']:
        for use_dictionary in [True, False]:
            for table in tables:
                _check_roundtrip(
                    table, version='2.6',
                    data_page_version=data_page_version,
                    use_dictionary=use_dictionary)


@pytest.mark.pandas
def test_memory_map(tempdir):
    df = alltypes_sample(size=10)

    table = pa.Table.from_pandas(df)
    _check_roundtrip(table, read_table_kwargs={'memory_map': True},
                     version='2.6')

    filename = str(tempdir / 'tmp_file')
    with open(filename, 'wb') as f:
        _write_table(table, f, version='2.6')
    table_read = pq.read_pandas(filename, memory_map=True)
    assert table_read.equals(table)


@pytest.mark.pandas
def test_enable_buffered_stream(tempdir):
    df = alltypes_sample(size=10)

    table = pa.Table.from_pandas(df)
    _check_roundtrip(table, read_table_kwargs={'buffer_size': 1025},
                     version='2.6')

    filename = str(tempdir / 'tmp_file')
    with open(filename, 'wb') as f:
        _write_table(table, f, version='2.6')
    table_read = pq.read_pandas(filename, buffer_size=4096)
    assert table_read.equals(table)


def test_special_chars_filename(tempdir):
    table = pa.Table.from_arrays([pa.array([42])], ["ints"])
    filename = "foo # bar"
    path = tempdir / filename
    assert not path.exists()
    _write_table(table, str(path))
    assert path.exists()
    table_read = _read_table(str(path))
    assert table_read.equals(table)


def test_invalid_source():
    # Test that we provide a helpful error message pointing out
    # that None wasn't expected when trying to open a Parquet file from None.
    with pytest.raises(TypeError, match="None"):
        pq.read_table(None)

    with pytest.raises(TypeError, match="None"):
        pq.ParquetFile(None)


def test_read_table_without_dataset(tempdir):
    from unittest import mock

    class MockParquetDataset:
        def __init__(self, *args, **kwargs):
            raise ImportError("MockParquetDataset")

    path = tempdir / "test.parquet"
    table = pa.table({"a": [1, 2, 3]})
    _write_table(table, path)

    with mock.patch('pyarrow.parquet.core.ParquetDataset', new=MockParquetDataset):
        with pytest.raises(ValueError, match="the 'filters' keyword"):
            pq.read_table(path, filters=[('integer', '=', 1)])
        with pytest.raises(ValueError, match="the 'partitioning' keyword"):
            pq.read_table(path, partitioning=['week', 'color'])
        with pytest.raises(ValueError, match="the 'schema' argument"):
            pq.read_table(path, schema=table.schema)
        # Error message varies depending on OS
        with pytest.raises(OSError):
            pq.read_table(tempdir)
        result = pq.read_table(path)
    assert result == table


@pytest.mark.slow
def test_file_with_over_int16_max_row_groups():
    # PARQUET-1857: Parquet encryption support introduced an INT16_MAX upper
    # limit on the number of row groups, but this limit only impacts files with
    # encrypted row group metadata because of the int16 row group ordinal used
    # in the Parquet Thrift metadata. Unencrypted files are not impacted, so
    # this test checks that it works (even if it isn't a good idea)
    t = pa.table([list(range(40000))], names=['f0'])
    _check_roundtrip(t, row_group_size=1)


@pytest.mark.pandas
def test_empty_table_roundtrip():
    df = alltypes_sample(size=10)

    # Create a non-empty table to infer the types correctly, then slice to 0
    table = pa.Table.from_pandas(df)
    table = pa.Table.from_arrays(
        [col.chunk(0)[:0] for col in table.itercolumns()],
        names=table.schema.names)

    assert table.schema.field('null').type == pa.null()
    assert table.schema.field('null_list').type == pa.list_(pa.null())
    _check_roundtrip(
        table, version='2.6')


@pytest.mark.pandas
def test_empty_table_no_columns():
    df = pd.DataFrame()
    empty = pa.Table.from_pandas(df, preserve_index=False)
    _check_roundtrip(empty)


def test_write_nested_zero_length_array_chunk_failure():
    # Bug report in ARROW-3792
    cols = OrderedDict(
        int32=pa.int32(),
        list_string=pa.list_(pa.string())
    )
    data = [[], [OrderedDict(int32=1, list_string=('G',)), ]]

    # This produces a table with a column like
    # <Column name='list_string' type=ListType(list<item: string>)>
    # [
    #   [],
    #   [
    #     [
    #       "G"
    #     ]
    #   ]
    # ]
    #
    # Each column is a ChunkedArray with 2 elements
    my_arrays = [pa.array(batch, type=pa.struct(cols)).flatten()
                 for batch in data]
    my_batches = [pa.RecordBatch.from_arrays(batch, schema=pa.schema(cols))
                  for batch in my_arrays]
    tbl = pa.Table.from_batches(my_batches, pa.schema(cols))
    _check_roundtrip(tbl)


@pytest.mark.pandas
def test_multiple_path_types(tempdir):
    # Test compatibility with PEP 519 path-like objects
    path = tempdir / 'zzz.parquet'
    df = pd.DataFrame({'x': np.arange(10, dtype=np.int64)})
    _write_table(df, path)
    table_read = _read_table(path)
    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df, df_read)

    # Test compatibility with plain string paths
    path = str(tempdir) + 'zzz.parquet'
    df = pd.DataFrame({'x': np.arange(10, dtype=np.int64)})
    _write_table(df, path)
    table_read = _read_table(path)
    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df, df_read)


def test_fspath(tempdir):
    # ARROW-12472 support __fspath__ objects without using str()
    path = tempdir / "test.parquet"
    table = pa.table({"a": [1, 2, 3]})
    _write_table(table, path)

    fs_protocol_obj = util.FSProtocolClass(path)

    result = _read_table(fs_protocol_obj)
    assert result.equals(table)

    # combined with non-local filesystem raises
    with pytest.raises(TypeError):
        _read_table(fs_protocol_obj, filesystem=fs.FileSystem())


@pytest.mark.parametrize("filesystem", [
    None, fs.LocalFileSystem()
])
@pytest.mark.parametrize("name", ("data.parquet", "例.parquet"))
def test_relative_paths(tempdir, filesystem, name):
    # reading and writing from relative paths
    table = pa.table({"a": [1, 2, 3]})
    path = tempdir / name

    # reading
    pq.write_table(table, str(path))
    with util.change_cwd(tempdir):
        result = pq.read_table(name, filesystem=filesystem)
    assert result.equals(table)

    path.unlink()
    assert not path.exists()

    # writing
    with util.change_cwd(tempdir):
        pq.write_table(table, name, filesystem=filesystem)
    result = pq.read_table(path)
    assert result.equals(table)


def test_read_non_existing_file():
    # ensure we have a proper error message
    with pytest.raises(FileNotFoundError):
        pq.read_table('i-am-not-existing.parquet')


def test_file_error_python_exception():
    class BogusFile(io.BytesIO):
        def read(self, *args):
            raise ZeroDivisionError("zorglub")

        def seek(self, *args):
            raise ZeroDivisionError("zorglub")

    # ensure the Python exception is restored
    with pytest.raises(ZeroDivisionError, match="zorglub"):
        pq.read_table(BogusFile(b""))


def test_parquet_read_from_buffer(tempdir):
    # reading from a buffer from python's open()
    table = pa.table({"a": [1, 2, 3]})
    pq.write_table(table, str(tempdir / "data.parquet"))

    with open(str(tempdir / "data.parquet"), "rb") as f:
        result = pq.read_table(f)
    assert result.equals(table)

    with open(str(tempdir / "data.parquet"), "rb") as f:
        result = pq.read_table(pa.PythonFile(f))
    assert result.equals(table)


def test_byte_stream_split():
    # This is only a smoke test.
    arr_float = pa.array(list(map(float, range(100))))
    arr_int = pa.array(list(map(int, range(100))))
    arr_bool = pa.array([True, False] * 50)
    data_float = [arr_float, arr_float]
    table = pa.Table.from_arrays(data_float, names=['a', 'b'])

    # Check with byte_stream_split for both columns.
    _check_roundtrip(table, expected=table, compression="gzip",
                     use_dictionary=False, use_byte_stream_split=True)

    # Check with byte_stream_split for column 'b' and dictionary
    # for column 'a'.
    _check_roundtrip(table, expected=table, compression="gzip",
                     use_dictionary=['a'],
                     use_byte_stream_split=['b'])

    # Check with a collision for both columns.
    _check_roundtrip(table, expected=table, compression="gzip",
                     use_dictionary=['a', 'b'],
                     use_byte_stream_split=['a', 'b'])

    # Check with mixed column types.
    mixed_table = pa.Table.from_arrays([arr_float, arr_float, arr_int, arr_int],
                                       names=['a', 'b', 'c', 'd'])
    _check_roundtrip(mixed_table, expected=mixed_table,
                     use_dictionary=['b', 'd'],
                     use_byte_stream_split=['a', 'c'])

    # Try to use the wrong data type with the byte_stream_split encoding.
    # This should throw an exception.
    table = pa.Table.from_arrays([arr_bool], names=['tmp'])
    with pytest.raises(IOError, match='BYTE_STREAM_SPLIT only supports'):
        _check_roundtrip(table, expected=table, use_byte_stream_split=True,
                         use_dictionary=False)


def test_store_decimal_as_integer(tempdir):
    arr_decimal_1_9 = pa.array(list(map(Decimal, range(100))),
                               type=pa.decimal128(5, 2))
    arr_decimal_10_18 = pa.array(list(map(Decimal, range(100))),
                                 type=pa.decimal128(16, 9))
    arr_decimal_gt18 = pa.array(list(map(Decimal, range(100))),
                                type=pa.decimal128(22, 2))
    arr_bool = pa.array([True, False] * 50)
    data_decimal = [arr_decimal_1_9, arr_decimal_10_18, arr_decimal_gt18]
    table = pa.Table.from_arrays(data_decimal, names=['a', 'b', 'c'])

    # Check with store_decimal_as_integer.
    _check_roundtrip(table,
                     expected=table,
                     compression="gzip",
                     use_dictionary=False,
                     store_decimal_as_integer=True)

    # Check physical type in parquet schema
    pqtestfile_path = os.path.join(tempdir, 'test.parquet')
    pq.write_table(table, pqtestfile_path,
                   compression="gzip",
                   use_dictionary=False,
                   store_decimal_as_integer=True)

    pqtestfile = pq.ParquetFile(pqtestfile_path)
    pqcol_decimal_1_9 = pqtestfile.schema.column(0)
    pqcol_decimal_10_18 = pqtestfile.schema.column(1)

    assert pqcol_decimal_1_9.physical_type == 'INT32'
    assert pqcol_decimal_10_18.physical_type == 'INT64'

    # Check with store_decimal_as_integer and delta-int encoding.
    # DELTA_BINARY_PACKED requires parquet physical type to be INT64 or INT32
    _check_roundtrip(table,
                     expected=table,
                     compression="gzip",
                     use_dictionary=False,
                     store_decimal_as_integer=True,
                     column_encoding={
                         'a': 'DELTA_BINARY_PACKED',
                         'b': 'DELTA_BINARY_PACKED'
                     })

    # Check with mixed column types.
    mixed_table = pa.Table.from_arrays(
        [arr_decimal_1_9, arr_decimal_10_18, arr_decimal_gt18, arr_bool],
        names=['a', 'b', 'c', 'd'])
    _check_roundtrip(mixed_table,
                     expected=mixed_table,
                     use_dictionary=False,
                     store_decimal_as_integer=True)


def test_column_encoding():
    arr_float = pa.array(list(map(float, range(100))))
    arr_int = pa.array(list(map(int, range(100))))
    arr_bin = pa.array([str(x) for x in range(100)], type=pa.binary())
    arr_flba = pa.array(
        [str(x).zfill(10) for x in range(100)], type=pa.binary(10))
    arr_bool = pa.array([False, True, False, False] * 25)
    mixed_table = pa.Table.from_arrays(
        [arr_float, arr_int, arr_bin, arr_flba, arr_bool],
        names=['a', 'b', 'c', 'd', 'e'])

    # Check "BYTE_STREAM_SPLIT" for columns 'a', 'b', 'd'
    # and "PLAIN" column_encoding for column 'c'.
    _check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=False,
                     column_encoding={'a': "BYTE_STREAM_SPLIT",
                                      'b': "BYTE_STREAM_SPLIT",
                                      'c': "PLAIN",
                                      'd': "BYTE_STREAM_SPLIT"})

    # Check "PLAIN" for all columns.
    _check_roundtrip(mixed_table, expected=mixed_table,
                     use_dictionary=False,
                     column_encoding="PLAIN")

    # Check "DELTA_BINARY_PACKED" for integer columns.
    _check_roundtrip(mixed_table, expected=mixed_table,
                     use_dictionary=False,
                     column_encoding={'a': "PLAIN",
                                      'b': "DELTA_BINARY_PACKED",
                                      'c': "PLAIN"})

    # Check "DELTA_LENGTH_BYTE_ARRAY" for byte columns.
    _check_roundtrip(mixed_table, expected=mixed_table,
                     use_dictionary=False,
                     column_encoding={'a': "PLAIN",
                                      'b': "DELTA_BINARY_PACKED",
                                      'c': "DELTA_LENGTH_BYTE_ARRAY"})

    # Check "DELTA_BYTE_ARRAY" for byte columns.
    _check_roundtrip(mixed_table, expected=mixed_table,
                     use_dictionary=False,
                     column_encoding={'a': "PLAIN",
                                      'b': "DELTA_BINARY_PACKED",
                                      'c': "DELTA_BYTE_ARRAY",
                                      'd': "DELTA_BYTE_ARRAY"})

    # Check "RLE" for boolean columns.
    _check_roundtrip(mixed_table, expected=mixed_table,
                     use_dictionary=False,
                     column_encoding={'e': "RLE"})

    # Try to pass "BYTE_STREAM_SPLIT" column encoding for boolean column 'e'.
    # This should throw an error as it does not support BOOLEAN.
    with pytest.raises(IOError,
                       match="BYTE_STREAM_SPLIT only supports"):
        _check_roundtrip(mixed_table, expected=mixed_table,
                         use_dictionary=False,
                         column_encoding={'a': "PLAIN",
                                          'c': "PLAIN",
                                          'e': "BYTE_STREAM_SPLIT"})

    # Try to use "DELTA_BINARY_PACKED" encoding on a float column.
    # This should throw an error as only integers are supported.
    with pytest.raises(OSError,
                       match="DELTA_BINARY_PACKED encoder only supports"):
        _check_roundtrip(mixed_table, expected=mixed_table,
                         use_dictionary=False,
                         column_encoding={'a': "DELTA_BINARY_PACKED",
                                          'b': "PLAIN",
                                          'c': "PLAIN"})

    # Try to pass "RLE_DICTIONARY".
    # This should throw an error as dictionary encoding is already used by
    # default and cannot be specified as the "fallback" encoding.
    with pytest.raises(ValueError,
                       match="'RLE_DICTIONARY' is already used by default"):
        _check_roundtrip(mixed_table, expected=mixed_table,
                         use_dictionary=False,
                         column_encoding="RLE_DICTIONARY")

    # Try to pass an unsupported encoding.
    with pytest.raises(ValueError,
                       match="Unsupported column encoding: 'MADE_UP_ENCODING'"):
        _check_roundtrip(mixed_table, expected=mixed_table,
                         use_dictionary=False,
                         column_encoding={'a': "MADE_UP_ENCODING"})

    # Try to pass column_encoding and use_dictionary.
    # This should throw an error.
    with pytest.raises(ValueError):
        _check_roundtrip(mixed_table, expected=mixed_table,
                         use_dictionary=['b'],
                         column_encoding={'b': "PLAIN"})

    # Try to pass column_encoding and use_dictionary=True (default value).
    # This should throw an error.
    with pytest.raises(ValueError):
        _check_roundtrip(mixed_table, expected=mixed_table,
                         column_encoding={'b': "PLAIN"})

    # Try to pass column_encoding and use_byte_stream_split on same column.
    # This should throw an error.
    with pytest.raises(ValueError):
        _check_roundtrip(mixed_table, expected=mixed_table,
                         use_dictionary=False,
                         use_byte_stream_split=['a'],
                         column_encoding={'a': "RLE",
                                          'b': "BYTE_STREAM_SPLIT",
                                          'c': "PLAIN"})

    # Try to pass column_encoding and use_byte_stream_split=True.
    # This should throw an error.
    with pytest.raises(ValueError):
        _check_roundtrip(mixed_table, expected=mixed_table,
                         use_dictionary=False,
                         use_byte_stream_split=True,
                         column_encoding={'a': "RLE",
                                          'b': "BYTE_STREAM_SPLIT",
                                          'c': "PLAIN"})

    # Try to pass column_encoding=True.
    # This should throw an error.
    with pytest.raises(TypeError):
        _check_roundtrip(mixed_table, expected=mixed_table,
                         use_dictionary=False,
                         column_encoding=True)


def test_compression_level():
    arr = pa.array(list(map(int, range(1000))))
    data = [arr, arr]
    table = pa.Table.from_arrays(data, names=['a', 'b'])

    # Check one compression level.
    _check_roundtrip(table, expected=table, compression="gzip",
                     compression_level=1)

    # Check another one to make sure that compression_level=1 does not
    # coincide with the default one in Arrow.
    _check_roundtrip(table, expected=table, compression="gzip",
                     compression_level=5)

    # Check that the user can provide a compression per column
    _check_roundtrip(table, expected=table,
                     compression={'a': "gzip", 'b': "snappy"})

    # Check that the user can provide a compression level per column
    _check_roundtrip(table, expected=table, compression="gzip",
                     compression_level={'a': 2, 'b': 3})

    # Check if both LZ4 compressors are working
    # (level < 3 -> fast, level >= 3 -> HC)
    _check_roundtrip(table, expected=table, compression="lz4",
                     compression_level=1)

    _check_roundtrip(table, expected=table, compression="lz4",
                     compression_level=9)

    # Check that specifying a compression level for a codec which does not
    # allow specifying one results in an error.
    # Uncompressed, snappy and lzo do not support specifying a compression
    # level.
    # GZIP (zlib) allows for specifying a compression level but as of
    # version 1.2.11 the valid range is [-1, 9].
    invalid_combinations = [("snappy", 4), ("gzip", -1337),
                            ("None", 444), ("lzo", 14)]
    buf = io.BytesIO()
    for (codec, level) in invalid_combinations:
        with pytest.raises((ValueError, OSError)):
            _write_table(table, buf, compression=codec,
                         compression_level=level)


def test_sanitized_spark_field_names():
    a0 = pa.array([0, 1, 2, 3, 4])
    name = 'prohib; ,\t{}'
    table = pa.Table.from_arrays([a0], [name])

    result = _roundtrip_table(table, write_table_kwargs={'flavor': 'spark'})

    expected_name = 'prohib______'
    assert result.schema[0].name == expected_name


@pytest.mark.pandas
def test_multithreaded_read():
    df = alltypes_sample(size=10000)

    table = pa.Table.from_pandas(df)

    buf = io.BytesIO()
    _write_table(table, buf, compression='SNAPPY', version='2.6')

    buf.seek(0)
    table1 = _read_table(buf, use_threads=True)

    buf.seek(0)
    table2 = _read_table(buf, use_threads=False)

    assert table1.equals(table2)


@pytest.mark.pandas
def test_min_chunksize():
    data = pd.DataFrame([np.arange(4)], columns=['A', 'B', 'C', 'D'])
    table = pa.Table.from_pandas(data.reset_index())

    buf = io.BytesIO()
    _write_table(table, buf, chunk_size=-1)

    buf.seek(0)
    result = _read_table(buf)

    assert result.equals(table)

    with pytest.raises(ValueError):
        _write_table(table, buf, chunk_size=0)


@pytest.mark.pandas
def test_write_error_deletes_incomplete_file(tempdir):
    # ARROW-1285
    df = pd.DataFrame({'a': list('abc'),
                       'b': list(range(1, 4)),
                       'c': np.arange(3, 6).astype('u1'),
                       'd': np.arange(4.0, 7.0, dtype='float64'),
                       'e': [True, False, True],
                       'f': pd.Categorical(list('abc')),
                       'g': pd.date_range('20130101', periods=3),
                       'h': pd.date_range('20130101', periods=3,
                                          tz='US/Eastern'),
                       'i': pd.date_range('20130101', periods=3, freq='ns')})

    pdf = pa.Table.from_pandas(df)

    filename = tempdir / 'tmp_file'
    try:
        # Test relies on writing nanoseconds to raise an error
        # true for Parquet 2.4
        _write_table(pdf, filename, version="2.4")
    except pa.ArrowException:
        pass

    assert not filename.exists()


def test_read_non_existent_file(tempdir):
    path = 'nonexistent-file.parquet'
    try:
        pq.read_table(path)
    except Exception as e:
        assert path in e.args[0]


def test_read_table_doesnt_warn(datadir):
    with warnings.catch_warnings():
        warnings.simplefilter(action="error")
        pq.read_table(datadir / 'v0.7.1.parquet')


@pytest.mark.pandas
def test_zlib_compression_bug():
    # ARROW-3514: "zlib deflate failed, output buffer too small"
    table = pa.Table.from_arrays([pa.array(['abc', 'def'])], ['some_col'])
    f = io.BytesIO()
    pq.write_table(table, f, compression='gzip')

    f.seek(0)
    roundtrip = pq.read_table(f)
    tm.assert_frame_equal(roundtrip.to_pandas(), table.to_pandas())


def test_parquet_file_too_small(tempdir):
    path = str(tempdir / "test.parquet")
    # TODO(dataset) with datasets API it raises OSError instead
    with pytest.raises((pa.ArrowInvalid, OSError),
                       match='size is 0 bytes'):
        with open(path, 'wb') as f:
            pass
        pq.read_table(path)

    with pytest.raises((pa.ArrowInvalid, OSError),
                       match='size is 4 bytes'):
        with open(path, 'wb') as f:
            f.write(b'ffff')
        pq.read_table(path)


@pytest.mark.pandas
@pytest.mark.fastparquet
@pytest.mark.filterwarnings("ignore:RangeIndex:FutureWarning")
@pytest.mark.filterwarnings("ignore:tostring:DeprecationWarning:fastparquet")
def test_fastparquet_cross_compatibility(tempdir):
    fp = pytest.importorskip('fastparquet')

    df = pd.DataFrame(
        {
            "a": list("abc"),
            "b": list(range(1, 4)),
            "c": np.arange(4.0, 7.0, dtype="float64"),
            "d": [True, False, True],
            "e": pd.date_range("20130101", periods=3),
            "f": pd.Categorical(["a", "b", "a"]),
            # fastparquet writes list as BYTE_ARRAY JSON, so no roundtrip
            # "g": [[1, 2], None, [1, 2, 3]],
        }
    )
    table = pa.table(df)

    # Arrow -> fastparquet
    file_arrow = str(tempdir / "cross_compat_arrow.parquet")
    pq.write_table(table, file_arrow, compression=None)

    fp_file = fp.ParquetFile(file_arrow)
    df_fp = fp_file.to_pandas()
    tm.assert_frame_equal(df, df_fp)

    # Fastparquet -> arrow
    file_fastparquet = str(tempdir / "cross_compat_fastparquet.parquet")
    fp.write(file_fastparquet, df)

    table_fp = pq.read_pandas(file_fastparquet)
    # for a fastparquet-written file, categoricals come back as strings
    # (no arrow schema in parquet metadata)
    df['f'] = df['f'].astype(object)
    tm.assert_frame_equal(table_fp.to_pandas(), df)


@pytest.mark.parametrize('array_factory', [
    lambda: pa.array([0, None] * 10),
    lambda: pa.array([0, None] * 10).dictionary_encode(),
    lambda: pa.array(["", None] * 10),
    lambda: pa.array(["", None] * 10).dictionary_encode(),
])
@pytest.mark.parametrize('read_dictionary', [False, True])
def test_buffer_contents(
        array_factory, read_dictionary
):
    # Test that null values are deterministically initialized to zero
    # after a roundtrip through Parquet.
    # See ARROW-8006 and ARROW-8011.
    orig_table = pa.Table.from_pydict({"col": array_factory()})
    bio = io.BytesIO()
    pq.write_table(orig_table, bio, use_dictionary=True)
    bio.seek(0)
    read_dictionary = ['col'] if read_dictionary else None
    table = pq.read_table(bio, use_threads=False,
                          read_dictionary=read_dictionary)

    for col in table.columns:
        [chunk] = col.chunks
        buf = chunk.buffers()[1]
        assert buf.to_pybytes() == buf.size * b"\0"


def test_parquet_compression_roundtrip(tempdir):
    # ARROW-10480: ensure even with nonstandard Parquet file naming
    # conventions, writing and then reading a file works. In
    # particular, ensure that we don't automatically double-compress
    # the stream due to auto-detecting the extension in the filename
    table = pa.table([pa.array(range(4))], names=["ints"])
    path = tempdir / "arrow-10480.pyarrow.gz"
    pq.write_table(table, path, compression="GZIP")
    result = pq.read_table(path)
    assert result.equals(table)


def test_empty_row_groups(tempdir):
    # ARROW-3020
    table = pa.Table.from_arrays([pa.array([], type='int32')], ['f0'])

    path = tempdir / 'empty_row_groups.parquet'

    num_groups = 3
    with pq.ParquetWriter(path, table.schema) as writer:
        for i in range(num_groups):
            writer.write_table(table)

    reader = pq.ParquetFile(path)
    assert reader.metadata.num_row_groups == num_groups

    for i in range(num_groups):
        assert reader.read_row_group(i).equals(table)


def test_reads_over_batch(tempdir):
    data = [None] * (1 << 20)
    data.append([1])
    # Large list<int64> with mostly nones and one final
    # value. This should force batched reads when
    # reading back.
    table = pa.Table.from_arrays([data], ['column'])

    path = tempdir / 'arrow-11607.parquet'
    pq.write_table(table, path)
    table2 = pq.read_table(path)
    assert table == table2


def test_permutation_of_column_order(tempdir):
    # ARROW-2366
    case = tempdir / "dataset_column_order_permutation"
    case.mkdir(exist_ok=True)

    data1 = pa.table([[1, 2, 3], [.1, .2, .3]], names=['a', 'b'])
    pq.write_table(data1, case / "data1.parquet")

    data2 = pa.table([[.4, .5, .6], [4, 5, 6]], names=['b', 'a'])
    pq.write_table(data2, case / "data2.parquet")

    table = pq.read_table(str(case))
    table2 = pa.table([[1, 2, 3, 4, 5, 6],
                       [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]],
                      names=['a', 'b'])

    assert table == table2


def test_thrift_size_limits(tempdir):
    path = tempdir / 'largethrift.parquet'

    array = pa.array(list(range(10)))
    num_cols = 1000
    table = pa.table(
        [array] * num_cols,
        names=[f'some_long_column_name_{i}' for i in range(num_cols)])
    pq.write_table(table, path)

    with pytest.raises(
            OSError,
            match="Couldn't deserialize thrift:.*Exceeded size limit"):
        pq.read_table(path, thrift_string_size_limit=50 * num_cols)
    with pytest.raises(
            OSError,
            match="Couldn't deserialize thrift:.*Exceeded size limit"):
        pq.read_table(path, thrift_container_size_limit=num_cols)

    got = pq.read_table(path, thrift_string_size_limit=100 * num_cols)
    assert got == table
    got = pq.read_table(path, thrift_container_size_limit=2 * num_cols)
    assert got == table
    got = pq.read_table(path)
    assert got == table


def test_page_checksum_verification_write_table(tempdir):
    """Check that checksum verification works for datasets created with
    pq.write_table()"""

    # Write some sample data into a parquet file with page checksum enabled
    original_path = tempdir / 'correct.parquet'
    table_orig = pa.table({'a': [1, 2, 3, 4]})
    pq.write_table(table_orig, original_path, write_page_checksum=True)

    # Read file and verify that the data is correct
    table_check = pq.read_table(original_path, page_checksum_verification=True)
    assert table_orig == table_check

    # Read the original file as binary and swap the 31st and 36th bytes. This
    # should be equivalent to storing the following data:
    #    pa.table({'a': [1, 3, 2, 4]})
    bin_data = bytearray(original_path.read_bytes())

    # Swap two bytes to emulate corruption. Also, check that the two bytes are
    # different, otherwise no corruption occurs
    assert bin_data[31] != bin_data[36]
    bin_data[31], bin_data[36] = bin_data[36], bin_data[31]

    # Write the corrupted data to another parquet file
    corrupted_path = tempdir / 'corrupted.parquet'
    corrupted_path.write_bytes(bin_data)

    # Case 1: Reading the corrupted file with read_table() and without page
    # checksum verification succeeds but yields corrupted data
    table_corrupt = pq.read_table(corrupted_path,
                                  page_checksum_verification=False)
    # The read should complete without error, but the table has different
    # content than the original file!
    assert table_corrupt != table_orig
    assert table_corrupt == pa.table({'a': [1, 3, 2, 4]})

    # Case 2: Reading the corrupted file with read_table() and with page
    # checksum verification enabled raises an exception
    with pytest.raises(OSError, match="CRC checksum verification"):
        _ = pq.read_table(corrupted_path, page_checksum_verification=True)

    # Case 3: Reading the corrupted file with ParquetFile.read() and without
    # page checksum verification succeeds but yields corrupted data
    corrupted_pq_file = pq.ParquetFile(corrupted_path,
                                       page_checksum_verification=False)
    table_corrupt2 = corrupted_pq_file.read()
    assert table_corrupt2 != table_orig
    assert table_corrupt2 == pa.table({'a': [1, 3, 2, 4]})

    # Case 4: Reading the corrupted file with ParquetFile.read() and with page
    # checksum verification enabled raises an exception
    corrupted_pq_file = pq.ParquetFile(corrupted_path,
                                       page_checksum_verification=True)
    # Accessing the data should result in an error
    with pytest.raises(OSError, match="CRC checksum verification"):
        _ = corrupted_pq_file.read()


@pytest.mark.dataset
def test_checksum_write_to_dataset(tempdir):
    """Check that checksum verification works for datasets created with
    pq.write_to_dataset"""

    table_orig = pa.table({'a': [1, 2, 3, 4]})

    # Write a sample dataset with page checksum enabled
    original_dir_path = tempdir / 'correct_dir'
    pq.write_to_dataset(table_orig,
                        original_dir_path,
                        write_page_checksum=True)

    # Read file and verify that the data is correct
    original_file_path_list = list(original_dir_path.iterdir())
    assert len(original_file_path_list) == 1
    original_path = original_file_path_list[0]
    table_check = pq.read_table(original_path, page_checksum_verification=True)
    assert table_orig == table_check

    # Read the original file as binary and swap the 31st and 36th bytes. This
    # should be equivalent to storing the following data:
    #    pa.table({'a': [1, 3, 2, 4]})
    bin_data = bytearray(original_path.read_bytes())

    # Swap two bytes to emulate corruption. Also, check that the two bytes are
    # different, otherwise no corruption occurs
    assert bin_data[31] != bin_data[36]
    bin_data[31], bin_data[36] = bin_data[36], bin_data[31]

    # Write the corrupted data to another parquet dataset
    # Copy dataset dir (which should be just one file)
    corrupted_dir_path = tempdir / 'corrupted_dir'
    copytree(original_dir_path, corrupted_dir_path)
    # Corrupt just the one file with the dataset
    corrupted_file_path = corrupted_dir_path / original_path.name
    corrupted_file_path.write_bytes(bin_data)

    # Case 1: Reading the corrupted file with read_table() and without page
    # checksum verification succeeds but yields corrupted data
    table_corrupt = pq.read_table(corrupted_file_path,
                                  page_checksum_verification=False)
    # The read should complete without error, but the table has different
    # content than the original file!
    assert table_corrupt != table_orig
    assert table_corrupt == pa.table({'a': [1, 3, 2, 4]})

    # Case 2: Reading the corrupted file with read_table() and with page
    # checksum verification enabled raises an exception
    with pytest.raises(OSError, match="CRC checksum verification"):
        _ = pq.read_table(corrupted_file_path, page_checksum_verification=True)
@@ -0,0 +1,109 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import pytest

import pyarrow as pa

try:
    import pyarrow.parquet as pq
    from pyarrow.tests.parquet.common import (_read_table,
                                              _check_roundtrip)
except ImportError:
    pq = None

try:
    import pandas as pd
    import pandas.testing as tm

    from pyarrow.tests.parquet.common import _roundtrip_pandas_dataframe
except ImportError:
    pd = tm = None


# Marks all of the tests in this module
# Ignore these with pytest ... -m 'not parquet'
pytestmark = pytest.mark.parquet


# Tests for ARROW-11497
_test_data_simple = [
    {'items': [1, 2]},
    {'items': [0]},
]

_test_data_complex = [
    {'items': [{'name': 'elem1', 'value': '1'},
               {'name': 'elem2', 'value': '2'}]},
    {'items': [{'name': 'elem1', 'value': '0'}]},
]

parametrize_test_data = pytest.mark.parametrize(
    "test_data", [_test_data_simple, _test_data_complex])


@pytest.mark.pandas
@parametrize_test_data
def test_write_compliant_nested_type_enable(tempdir, test_data):
    # prepare dataframe for testing
    df = pd.DataFrame(data=test_data)
    # verify that we can read/write pandas df with new flag (default behaviour)
    _roundtrip_pandas_dataframe(df,
                                write_kwargs={})

    # Write to a parquet file with compliant nested type
    table = pa.Table.from_pandas(df, preserve_index=False)
    path = str(tempdir / 'data.parquet')
    with pq.ParquetWriter(path, table.schema,
                          version='2.6') as writer:
        writer.write_table(table)
    # Read back as a table
    new_table = _read_table(path)
    # Validate that the "items" column is compliant with the Parquet nested
    # format. It should look like this:
    #   list<element: struct<name: string, value: string>>
    assert isinstance(new_table.schema.types[0], pa.ListType)
    assert new_table.schema.types[0].value_field.name == 'element'

    # Verify that the new table can be read/written correctly
    _check_roundtrip(new_table)


@pytest.mark.pandas
@parametrize_test_data
def test_write_compliant_nested_type_disable(tempdir, test_data):
    # prepare dataframe for testing
    df = pd.DataFrame(data=test_data)
    # verify that we can read/write with new flag disabled
    _roundtrip_pandas_dataframe(df, write_kwargs={
        'use_compliant_nested_type': False})

    # Write to a parquet file while disabling compliant nested type
    table = pa.Table.from_pandas(df, preserve_index=False)
    path = str(tempdir / 'data.parquet')
    with pq.ParquetWriter(path, table.schema, version='2.6',
                          use_compliant_nested_type=False) as writer:
        writer.write_table(table)
    new_table = _read_table(path)

    # Validate that the "items" column is not compliant with the Parquet
    # nested format. It should look like this:
    #   list<item: struct<name: string, value: string>>
    assert isinstance(new_table.schema.types[0], pa.ListType)
    assert new_table.schema.types[0].value_field.name == 'item'

    # Verify that the new table can be read/written correctly
    _check_roundtrip(new_table,
                     use_compliant_nested_type=False)
@@ -0,0 +1,616 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
import decimal
|
||||
import io
|
||||
import random
|
||||
|
||||
try:
|
||||
import numpy as np
|
||||
except ImportError:
|
||||
np = None
|
||||
import pytest
|
||||
|
||||
import pyarrow as pa
|
||||
from pyarrow.tests import util
|
||||
from pyarrow.tests.parquet.common import _check_roundtrip, _roundtrip_table
|
||||
|
||||
try:
|
||||
import pyarrow.parquet as pq
|
||||
from pyarrow.tests.parquet.common import _read_table, _write_table
|
||||
except ImportError:
|
||||
pq = None
|
||||
|
||||
|
||||
try:
|
||||
import pandas as pd
|
||||
import pandas.testing as tm
|
||||
|
||||
from pyarrow.tests.pandas_examples import (dataframe_with_arrays,
|
||||
dataframe_with_lists)
|
||||
from pyarrow.tests.parquet.common import alltypes_sample
|
||||
except ImportError:
|
||||
pd = tm = None
|
||||
|
||||
|
||||
# Marks all of the tests in this module
|
||||
# Ignore these with pytest ... -m 'not parquet'
|
||||
pytestmark = pytest.mark.parquet
|
||||
|
||||
|
||||
# General roundtrip of data types
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
@pytest.mark.parametrize('chunk_size', [None, 1000])
|
||||
def test_parquet_2_6_roundtrip(tempdir, chunk_size):
|
||||
df = alltypes_sample(size=10000, categorical=True)
|
||||
|
||||
filename = tempdir / 'pandas_roundtrip.parquet'
|
||||
arrow_table = pa.Table.from_pandas(df)
|
||||
assert arrow_table.schema.pandas_metadata is not None
|
||||
|
||||
_write_table(arrow_table, filename, version='2.6',
|
||||
chunk_size=chunk_size)
|
||||
table_read = pq.read_pandas(filename)
|
||||
assert table_read.schema.pandas_metadata is not None
|
||||
|
||||
read_metadata = table_read.schema.metadata
|
||||
assert arrow_table.schema.metadata == read_metadata
|
||||
|
||||
df_read = table_read.to_pandas()
|
||||
tm.assert_frame_equal(df, df_read)
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_parquet_1_0_roundtrip(tempdir):
|
||||
size = 10000
|
||||
np.random.seed(0)
|
||||
df = pd.DataFrame({
|
||||
'uint8': np.arange(size, dtype=np.uint8),
|
||||
'uint16': np.arange(size, dtype=np.uint16),
|
||||
'uint32': np.arange(size, dtype=np.uint32),
|
||||
'uint64': np.arange(size, dtype=np.uint64),
|
||||
'int8': np.arange(size, dtype=np.int16),
|
||||
'int16': np.arange(size, dtype=np.int16),
|
||||
'int32': np.arange(size, dtype=np.int32),
|
||||
'int64': np.arange(size, dtype=np.int64),
|
||||
'float32': np.arange(size, dtype=np.float32),
|
||||
'float64': np.arange(size, dtype=np.float64),
|
||||
'bool': np.random.randn(size) > 0,
|
||||
'str': [str(x) for x in range(size)],
|
||||
'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
|
||||
'empty_str': [''] * size
|
||||
})
|
||||
filename = tempdir / 'pandas_roundtrip.parquet'
|
||||
arrow_table = pa.Table.from_pandas(df)
|
||||
_write_table(arrow_table, filename, version='1.0')
|
||||
table_read = _read_table(filename)
|
||||
df_read = table_read.to_pandas()
|
||||
|
||||
# uint32 values are stored as int64 when writing Parquet version 1.0
|
||||
df['uint32'] = df['uint32'].values.astype(np.int64)
|
||||
|
||||
tm.assert_frame_equal(df, df_read)
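
# Hedged sketch (an addition, not from the original suite) of the uint32
# behaviour relied on above: with version='1.0' pyarrow widens uint32 to a
# plain INT64 physical column, since that format revision lacks an unsigned
# 32-bit annotation, while version='2.6' keeps an annotated INT32 column.
def _sketch_uint32_physical_type(version='1.0'):
    table = pa.table({'u32': pa.array([1, 2, 3], type=pa.uint32())})
    sink = pa.BufferOutputStream()
    pq.write_table(table, sink, version=version)
    parquet_file = pq.ParquetFile(pa.BufferReader(sink.getvalue()))
    # Expected: 'INT64' for version='1.0', 'INT32' for version='2.6'
    return parquet_file.schema.column(0).physical_type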
|
||||
|
||||
|
||||
# Dictionary
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _simple_table_write_read(table):
|
||||
bio = pa.BufferOutputStream()
|
||||
pq.write_table(table, bio)
|
||||
contents = bio.getvalue()
|
||||
return pq.read_table(
|
||||
pa.BufferReader(contents)
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_direct_read_dictionary():
|
||||
# ARROW-3325
|
||||
repeats = 10
|
||||
nunique = 5
|
||||
|
||||
data = [
|
||||
[util.rands(10) for i in range(nunique)] * repeats,
|
||||
|
||||
]
|
||||
table = pa.table(data, names=['f0'])
|
||||
|
||||
bio = pa.BufferOutputStream()
|
||||
pq.write_table(table, bio)
|
||||
contents = bio.getvalue()
|
||||
|
||||
result = pq.read_table(pa.BufferReader(contents),
|
||||
read_dictionary=['f0'])
|
||||
|
||||
# Compute dictionary-encoded subfield
|
||||
expected = pa.table([table[0].dictionary_encode()], names=['f0'])
|
||||
assert result.equals(expected)
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_direct_read_dictionary_subfield():
|
||||
repeats = 10
|
||||
nunique = 5
|
||||
|
||||
data = [
|
||||
[[util.rands(10)] for i in range(nunique)] * repeats,
|
||||
]
|
||||
table = pa.table(data, names=['f0'])
|
||||
|
||||
bio = pa.BufferOutputStream()
|
||||
pq.write_table(table, bio)
|
||||
contents = bio.getvalue()
|
||||
result = pq.read_table(pa.BufferReader(contents),
|
||||
read_dictionary=['f0.list.element'])
|
||||
|
||||
arr = pa.array(data[0])
|
||||
values_as_dict = arr.values.dictionary_encode()
|
||||
|
||||
inner_indices = values_as_dict.indices.cast('int32')
|
||||
new_values = pa.DictionaryArray.from_arrays(inner_indices,
|
||||
values_as_dict.dictionary)
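# Rebuild the expected result by hand: 50 single-element lists
# (nunique * repeats) need 51 offsets 0..50, and the child values become
# a dictionary array with int32 indices.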
|
||||
|
||||
offsets = pa.array(range(51), type='int32')
|
||||
expected_arr = pa.ListArray.from_arrays(offsets, new_values)
|
||||
expected = pa.table([expected_arr], names=['f0'])
|
||||
|
||||
assert result.equals(expected)
|
||||
assert result[0].num_chunks == 1
|
||||
|
||||
|
||||
@pytest.mark.numpy
|
||||
def test_dictionary_array_automatically_read():
|
||||
# ARROW-3246
|
||||
|
||||
# Make a large dictionary, a little over 4MB of data
|
||||
dict_length = 4000
|
||||
dict_values = pa.array([('x' * 1000 + f'_{i}')
|
||||
for i in range(dict_length)])
|
||||
|
||||
num_chunks = 10
|
||||
chunk_size = 100
|
||||
chunks = []
|
||||
for i in range(num_chunks):
|
||||
indices = np.random.randint(0, dict_length,
|
||||
size=chunk_size).astype(np.int32)
|
||||
chunks.append(pa.DictionaryArray.from_arrays(pa.array(indices),
|
||||
dict_values))
|
||||
|
||||
table = pa.table([pa.chunked_array(chunks)], names=['f0'])
|
||||
result = _simple_table_write_read(table)
|
||||
|
||||
assert result.equals(table)
|
||||
|
||||
# The only key in the metadata was the Arrow schema key
|
||||
assert result.schema.metadata is None
|
||||
|
||||
|
||||
# Decimal
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_decimal_roundtrip(tempdir):
|
||||
num_values = 10
|
||||
|
||||
columns = {}
|
||||
for precision in range(1, 39):
|
||||
for scale in range(0, precision + 1):
|
||||
with util.random_seed(0):
|
||||
random_decimal_values = [
|
||||
util.randdecimal(precision, scale)
|
||||
for _ in range(num_values)
|
||||
]
|
||||
column_name = f'dec_precision_{precision}_scale_{scale}'
|
||||
columns[column_name] = random_decimal_values
|
||||
|
||||
expected = pd.DataFrame(columns)
|
||||
filename = tempdir / 'decimals.parquet'
|
||||
string_filename = str(filename)
|
||||
table = pa.Table.from_pandas(expected)
|
||||
_write_table(table, string_filename)
|
||||
result_table = _read_table(string_filename)
|
||||
result = result_table.to_pandas()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
@pytest.mark.xfail(
|
||||
raises=OSError, reason='Parquet does not support negative scale'
|
||||
)
|
||||
def test_decimal_roundtrip_negative_scale(tempdir):
|
||||
expected = pd.DataFrame({'decimal_num': [decimal.Decimal('1.23E4')]})
|
||||
filename = tempdir / 'decimals.parquet'
|
||||
string_filename = str(filename)
|
||||
t = pa.Table.from_pandas(expected)
|
||||
_write_table(t, string_filename)
|
||||
result_table = _read_table(string_filename)
|
||||
result = result_table.to_pandas()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
# List types
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.parametrize('dtype', [int, float])
|
||||
def test_single_pylist_column_roundtrip(tempdir, dtype):
|
||||
filename = tempdir / f'single_{dtype.__name__}_column.parquet'
|
||||
data = [pa.array(list(map(dtype, range(5))))]
|
||||
table = pa.Table.from_arrays(data, names=['a'])
|
||||
_write_table(table, filename)
|
||||
table_read = _read_table(filename)
|
||||
for i in range(table.num_columns):
|
||||
col_written = table[i]
|
||||
col_read = table_read[i]
|
||||
assert table.field(i).name == table_read.field(i).name
|
||||
assert col_read.num_chunks == 1
|
||||
data_written = col_written.chunk(0)
|
||||
data_read = col_read.chunk(0)
|
||||
assert data_written.equals(data_read)
|
||||
|
||||
|
||||
def test_empty_lists_table_roundtrip():
|
||||
# ARROW-2744: Shouldn't crash when writing an array of empty lists
|
||||
arr = pa.array([[], []], type=pa.list_(pa.int32()))
|
||||
table = pa.Table.from_arrays([arr], ["A"])
|
||||
_check_roundtrip(table)
|
||||
|
||||
|
||||
def test_nested_list_nonnullable_roundtrip_bug():
|
||||
# Reproduce failure in ARROW-5630
|
||||
typ = pa.list_(pa.field("item", pa.float32(), False))
|
||||
num_rows = 10000
|
||||
t = pa.table([
|
||||
pa.array(([[0] * ((i + 5) % 10) for i in range(0, 10)] *
|
||||
(num_rows // 10)), type=typ)
|
||||
], ['a'])
|
||||
_check_roundtrip(
|
||||
t, data_page_size=4096)
|
||||
|
||||
|
||||
def test_nested_list_struct_multiple_batches_roundtrip(tempdir):
|
||||
# Reproduce failure in ARROW-11024
|
||||
data = [[{'x': 'abc', 'y': 'abc'}]]*100 + [[{'x': 'abc', 'y': 'gcb'}]]*100
|
||||
table = pa.table([pa.array(data)], names=['column'])
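# 200 rows with row_group_size=20 forces the writer to split the data into
# multiple row groups, which is what triggered the original failure.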
|
||||
_check_roundtrip(
|
||||
table, row_group_size=20)
|
||||
|
||||
# Reproduce failure in ARROW-11069 (plain non-nested structs with strings)
|
||||
data = pa.array(
|
||||
[{'a': '1', 'b': '2'}, {'a': '3', 'b': '4'}, {'a': '5', 'b': '6'}]*10
|
||||
)
|
||||
table = pa.table({'column': data})
|
||||
_check_roundtrip(table, row_group_size=10)
|
||||
|
||||
|
||||
def test_writing_empty_lists():
|
||||
# ARROW-2591: [Python] Segmentation fault issue in pq.write_table
|
||||
arr1 = pa.array([[], []], pa.list_(pa.int32()))
|
||||
table = pa.Table.from_arrays([arr1], ['list(int32)'])
|
||||
_check_roundtrip(table)
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_column_of_arrays(tempdir):
|
||||
df, schema = dataframe_with_arrays()
|
||||
|
||||
filename = tempdir / 'pandas_roundtrip.parquet'
|
||||
arrow_table = pa.Table.from_pandas(df, schema=schema)
|
||||
_write_table(arrow_table, filename, version='2.6', coerce_timestamps='ms')
|
||||
table_read = _read_table(filename)
|
||||
df_read = table_read.to_pandas()
|
||||
tm.assert_frame_equal(df, df_read)
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_column_of_lists(tempdir):
|
||||
df, schema = dataframe_with_lists(parquet_compatible=True)
|
||||
|
||||
filename = tempdir / 'pandas_roundtrip.parquet'
|
||||
arrow_table = pa.Table.from_pandas(df, schema=schema)
|
||||
_write_table(arrow_table, filename, version='2.6')
|
||||
table_read = _read_table(filename)
|
||||
df_read = table_read.to_pandas()
|
||||
|
||||
tm.assert_frame_equal(df, df_read)
|
||||
|
||||
|
||||
def test_large_list_records():
|
||||
# This was fixed in PARQUET-1100
|
||||
|
||||
list_lengths = [random.randint(0, 500) for _ in range(50)]
|
||||
list_lengths[::10] = [0, 0, 0, 0, 0]
|
||||
|
||||
list_values = [list(map(int, [random.randint(0, 100) for _ in range(x)]))
|
||||
if i % 8 else None
|
||||
for i, x in enumerate(list_lengths)]
|
||||
|
||||
a1 = pa.array(list_values)
|
||||
|
||||
table = pa.Table.from_arrays([a1], ['int_lists'])
|
||||
_check_roundtrip(table)
|
||||
|
||||
|
||||
list_types = [
|
||||
(pa.ListType, pa.list_),
|
||||
(pa.LargeListType, pa.large_list),
|
||||
]
|
||||
|
||||
|
||||
def test_list_types():
|
||||
data = [[1, 2, None]] * 50
|
||||
for _, in_factory in list_types:
|
||||
array = pa.array(data, type=in_factory(pa.int32()))
|
||||
table = pa.Table.from_arrays([array], ['lists'])
|
||||
for out_type, out_factory in list_types:
|
||||
for store_schema in (True, False):
|
||||
if store_schema:
|
||||
expected_table = table
|
||||
else:
|
||||
expected_table = pa.Table.from_arrays(
|
||||
[pa.array(data, type=out_factory(pa.int32()))], ['lists'])
|
||||
result = _roundtrip_table(
|
||||
table, write_table_kwargs=dict(store_schema=store_schema),
|
||||
read_table_kwargs=dict(list_type=out_type))
|
||||
assert result == expected_table
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_parquet_nested_convenience(tempdir):
|
||||
# ARROW-1684
|
||||
df = pd.DataFrame({
|
||||
'a': [[1, 2, 3], None, [4, 5], []],
|
||||
'b': [[1.], None, None, [6., 7.]],
|
||||
})
|
||||
|
||||
path = str(tempdir / 'nested_convenience.parquet')
|
||||
|
||||
table = pa.Table.from_pandas(df, preserve_index=False)
|
||||
_write_table(table, path)
|
||||
|
||||
read = pq.read_table(
|
||||
path, columns=['a'])
|
||||
tm.assert_frame_equal(read.to_pandas(), df[['a']])
|
||||
|
||||
read = pq.read_table(
|
||||
path, columns=['a', 'b'])
|
||||
tm.assert_frame_equal(read.to_pandas(), df)
|
||||
|
||||
|
||||
# Binary
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_fixed_size_binary():
|
||||
t0 = pa.binary(10)
|
||||
data = [b'fooooooooo', None, b'barooooooo', b'quxooooooo']
|
||||
a0 = pa.array(data, type=t0)
|
||||
|
||||
table = pa.Table.from_arrays([a0],
|
||||
['binary[10]'])
|
||||
_check_roundtrip(table)
|
||||
|
||||
|
||||
def test_binary_types():
|
||||
types = [pa.binary(), pa.large_binary(), pa.binary_view()]
|
||||
data = [b'abc', None, b'defg', b'x' * 30]
|
||||
for in_type in types:
|
||||
array = pa.array(data, in_type)
|
||||
table = pa.Table.from_arrays([array], ['binary'])
|
||||
for out_type in types:
|
||||
for store_schema in (False, True):
|
||||
result = _roundtrip_table(
|
||||
table, write_table_kwargs=dict(store_schema=store_schema),
|
||||
read_table_kwargs=dict(binary_type=out_type))
|
||||
if store_schema:
|
||||
expected_table = table
|
||||
else:
|
||||
expected_table = pa.Table.from_arrays(
|
||||
[pa.array(data, out_type)], ['binary'])
|
||||
assert result == expected_table
|
||||
|
||||
|
||||
# Large types
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.large_memory
|
||||
def test_large_table_int32_overflow():
|
||||
size = np.iinfo('int32').max + 1
|
||||
|
||||
arr = np.ones(size, dtype='uint8')
|
||||
|
||||
parr = pa.array(arr, type=pa.uint8())
|
||||
|
||||
table = pa.Table.from_arrays([parr], names=['one'])
|
||||
f = io.BytesIO()
|
||||
_write_table(table, f)
|
||||
|
||||
|
||||
def _simple_table_roundtrip(table, **write_kwargs):
|
||||
stream = pa.BufferOutputStream()
|
||||
_write_table(table, stream, **write_kwargs)
|
||||
buf = stream.getvalue()
|
||||
return _read_table(buf)
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.large_memory
|
||||
def test_byte_array_exactly_2gb():
|
||||
# Test edge case reported in ARROW-3762
|
||||
val = b'x' * (1 << 10)
|
||||
|
||||
base = pa.array([val] * ((1 << 21) - 1))
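# base holds (2**21 - 1) values of 1024 bytes each, i.e. 2**31 - 1024 bytes,
# so the extra values below bring the column total to exactly 2**31 - 1,
# 2**31, and 2**31 + 1 bytes.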
|
||||
cases = [
|
||||
[b'x' * 1023], # 2^31 - 1
|
||||
[b'x' * 1024], # 2^31
|
||||
[b'x' * 1025] # 2^31 + 1
|
||||
]
|
||||
for case in cases:
|
||||
values = pa.chunked_array([base, pa.array(case)])
|
||||
t = pa.table([values], names=['f0'])
|
||||
result = _simple_table_roundtrip(
|
||||
t, use_dictionary=False)
|
||||
assert t.equals(result)
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.pandas
|
||||
@pytest.mark.large_memory
|
||||
def test_binary_array_overflow_to_chunked():
|
||||
# ARROW-3762
|
||||
|
||||
# 2^31 + 1 bytes
|
||||
values = [b'x'] + [
|
||||
b'x' * (1 << 20)
|
||||
] * 2 * (1 << 10)
|
||||
df = pd.DataFrame({'byte_col': values})
|
||||
|
||||
tbl = pa.Table.from_pandas(df, preserve_index=False)
|
||||
read_tbl = _simple_table_roundtrip(tbl)
|
||||
|
||||
col0_data = read_tbl[0]
|
||||
assert isinstance(col0_data, pa.ChunkedArray)
|
||||
|
||||
# Split up into 2GB chunks
|
||||
assert col0_data.num_chunks == 2
|
||||
|
||||
assert tbl.equals(read_tbl)
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.pandas
|
||||
@pytest.mark.large_memory
|
||||
def test_list_of_binary_large_cell():
|
||||
# ARROW-4688
|
||||
data = []
|
||||
|
||||
# TODO(wesm): handle chunked children
|
||||
# 2^31 - 1 bytes in a single cell
|
||||
# data.append([b'x' * (1 << 20)] * 2047 + [b'x' * ((1 << 20) - 1)])
|
||||
|
||||
# A little under 2GB total, split across cells of approximately 10MB each
|
||||
data.extend([[b'x' * 1000000] * 10] * 214)
|
||||
|
||||
arr = pa.array(data)
|
||||
table = pa.Table.from_arrays([arr], ['chunky_cells'])
|
||||
read_table = _simple_table_roundtrip(table)
|
||||
assert table.equals(read_table)
|
||||
|
||||
|
||||
def test_large_binary_and_binary_view():
|
||||
data = [b'foo', b'bar'] * 50
|
||||
for type in [pa.large_binary(), pa.binary_view()]:
|
||||
arr = pa.array(data, type=type)
|
||||
table = pa.Table.from_arrays([arr], names=['strs'])
|
||||
for use_dictionary in [False, True]:
|
||||
_check_roundtrip(table, use_dictionary=use_dictionary)
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.large_memory
|
||||
def test_large_binary_and_binary_view_huge():
|
||||
s = b'xy' * 997
|
||||
data = [s] * ((1 << 33) // len(s))
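# len(s) is 1994 bytes, so the column holds a little under 2**33 bytes
# (8 GiB) of data -- well beyond what a single 32-bit-offset binary array
# can hold, which is why only large_binary and binary_view are exercised.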
|
||||
for type in [pa.large_binary(), pa.binary_view()]:
|
||||
arr = pa.array(data, type=type)
|
||||
table = pa.Table.from_arrays([arr], names=['strs'])
|
||||
for use_dictionary in [False, True]:
|
||||
_check_roundtrip(table, use_dictionary=use_dictionary)
|
||||
del arr, table
|
||||
|
||||
|
||||
@pytest.mark.large_memory
|
||||
def test_large_binary_overflow():
|
||||
s = b'x' * (1 << 31)
|
||||
arr = pa.array([s], type=pa.large_binary())
|
||||
table = pa.Table.from_arrays([arr], names=['strs'])
|
||||
for use_dictionary in [False, True]:
|
||||
writer = pa.BufferOutputStream()
|
||||
with pytest.raises(
|
||||
pa.ArrowInvalid,
|
||||
match="Parquet cannot store strings with size 2GB or more"):
|
||||
_write_table(table, writer, use_dictionary=use_dictionary)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("storage_type", (
|
||||
pa.string(), pa.large_string()))
|
||||
def test_json_extension_type(storage_type):
|
||||
data = ['{"a": 1}', '{"b": 2}', None]
|
||||
arr = pa.array(data, type=pa.json_(storage_type))
|
||||
|
||||
table = pa.table([arr], names=["ext"])
|
||||
|
||||
# With defaults, this should roundtrip (because store_schema=True)
|
||||
_check_roundtrip(table, table)
|
||||
|
||||
# When store_schema is False, we get a string back by default
|
||||
_check_roundtrip(
|
||||
table,
|
||||
pa.table({"ext": pa.array(data, pa.string())}),
|
||||
{"arrow_extensions_enabled": False},
|
||||
store_schema=False)
|
||||
|
||||
# With arrow_extensions_enabled=True on read, we get an arrow.json back
|
||||
# (but with string() storage)
|
||||
_check_roundtrip(
|
||||
table,
|
||||
pa.table({"ext": pa.array(data, pa.json_(pa.string()))}),
|
||||
{"arrow_extensions_enabled": True},
|
||||
store_schema=False)
|
||||
|
||||
|
||||
def test_uuid_extension_type():
|
||||
data = [
|
||||
b'\xe4`\xf9p\x83QGN\xac\x7f\xa4g>K\xa8\xcb',
|
||||
b'\x1et\x14\x95\xee\xd5C\xea\x9b\xd7s\xdc\x91BK\xaf',
|
||||
None
|
||||
]
|
||||
arr = pa.array(data, type=pa.uuid())
|
||||
|
||||
table = pa.table([arr], names=["ext"])
|
||||
|
||||
_check_roundtrip(table, table)
|
||||
_check_roundtrip(
|
||||
table,
|
||||
pa.table({"ext": pa.array(data, pa.binary(16))}),
|
||||
{"arrow_extensions_enabled": False},
|
||||
store_schema=False)
|
||||
_check_roundtrip(
|
||||
table,
|
||||
table,
|
||||
{"arrow_extensions_enabled": True},
|
||||
store_schema=False)
|
||||
|
||||
|
||||
def test_undefined_logical_type(parquet_test_datadir):
|
||||
test_file = f"{parquet_test_datadir}/unknown-logical-type.parquet"
|
||||
|
||||
table = _read_table(test_file)
|
||||
assert table.column_names == ["column with known type", "column with unknown type"]
|
||||
assert table["column with unknown type"].to_pylist() == [
|
||||
b"unknown string 1",
|
||||
b"unknown string 2",
|
||||
b"unknown string 3"
|
||||
]
|
||||
@@ -0,0 +1,461 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
import datetime
|
||||
import io
|
||||
import warnings
|
||||
|
||||
try:
|
||||
import numpy as np
|
||||
except ImportError:
|
||||
np = None
|
||||
import pytest
|
||||
|
||||
import pyarrow as pa
|
||||
from pyarrow.tests.parquet.common import _check_roundtrip
|
||||
|
||||
try:
|
||||
import pyarrow.parquet as pq
|
||||
from pyarrow.tests.parquet.common import _read_table, _write_table
|
||||
except ImportError:
|
||||
pq = None
|
||||
|
||||
|
||||
try:
|
||||
import pandas as pd
|
||||
import pandas.testing as tm
|
||||
|
||||
from pyarrow.tests.parquet.common import _roundtrip_pandas_dataframe
|
||||
except ImportError:
|
||||
pd = tm = None
|
||||
|
||||
|
||||
# Marks all of the tests in this module
|
||||
# Ignore these with pytest ... -m 'not parquet'
|
||||
pytestmark = pytest.mark.parquet
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_pandas_parquet_datetime_tz():
|
||||
# Pandas v2 defaults to [ns], but Arrow defaults to [us] time units
|
||||
# so we need to cast the pandas dtype. Pandas v1 will always silently
|
||||
# coerce to [ns] due to lack of non-[ns] support.
|
||||
s = pd.Series([datetime.datetime(2017, 9, 6)], dtype='datetime64[us]')
|
||||
s = s.dt.tz_localize('utc')
|
||||
s.index = s
|
||||
|
||||
# Both a column and an index to hit both use cases
|
||||
df = pd.DataFrame({'tz_aware': s,
|
||||
'tz_eastern': s.dt.tz_convert('US/Eastern')},
|
||||
index=s)
|
||||
|
||||
f = io.BytesIO()
|
||||
|
||||
arrow_table = pa.Table.from_pandas(df)
|
||||
|
||||
_write_table(arrow_table, f)
|
||||
f.seek(0)
|
||||
|
||||
table_read = pq.read_pandas(f)
|
||||
|
||||
df_read = table_read.to_pandas()
|
||||
tm.assert_frame_equal(df, df_read)
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_datetime_timezone_tzinfo():
|
||||
value = datetime.datetime(2018, 1, 1, 1, 23, 45,
|
||||
tzinfo=datetime.timezone.utc)
|
||||
df = pd.DataFrame({'foo': [value]})
|
||||
|
||||
_roundtrip_pandas_dataframe(df, write_kwargs={})
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_coerce_timestamps(tempdir):
|
||||
from collections import OrderedDict
|
||||
|
||||
# ARROW-622
|
||||
arrays = OrderedDict()
|
||||
fields = [pa.field('datetime64',
|
||||
pa.list_(pa.timestamp('ms')))]
|
||||
arrays['datetime64'] = [
|
||||
np.array(['2007-07-13T01:23:34.123456789',
|
||||
None,
|
||||
'2010-08-13T05:46:57.437699912'],
|
||||
dtype='datetime64[ms]'),
|
||||
None,
|
||||
None,
|
||||
np.array(['2007-07-13T02',
|
||||
None,
|
||||
'2010-08-13T05:46:57.437699912'],
|
||||
dtype='datetime64[ms]'),
|
||||
]
|
||||
|
||||
df = pd.DataFrame(arrays)
|
||||
schema = pa.schema(fields)
|
||||
|
||||
filename = tempdir / 'pandas_roundtrip.parquet'
|
||||
arrow_table = pa.Table.from_pandas(df, schema=schema)
|
||||
|
||||
_write_table(arrow_table, filename, version='2.6', coerce_timestamps='us')
|
||||
table_read = _read_table(filename)
|
||||
df_read = table_read.to_pandas()
|
||||
|
||||
df_expected = df.copy()
|
||||
for i, x in enumerate(df_expected['datetime64']):
|
||||
if isinstance(x, np.ndarray):
|
||||
df_expected.loc[i, 'datetime64'] = x.astype('M8[us]')
|
||||
|
||||
tm.assert_frame_equal(df_expected, df_read)
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
_write_table(arrow_table, filename, version='2.6',
|
||||
coerce_timestamps='unknown')
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_coerce_timestamps_truncated(tempdir):
|
||||
"""
|
||||
ARROW-2555: Test that we can truncate timestamps when coercing if
|
||||
explicitly allowed.
|
||||
"""
|
||||
dt_us = datetime.datetime(year=2017, month=1, day=1, hour=1, minute=1,
|
||||
second=1, microsecond=1)
|
||||
dt_ms = datetime.datetime(year=2017, month=1, day=1, hour=1, minute=1,
|
||||
second=1)
|
||||
|
||||
fields_us = [pa.field('datetime64', pa.timestamp('us'))]
|
||||
arrays_us = {'datetime64': [dt_us, dt_ms]}
|
||||
|
||||
df_us = pd.DataFrame(arrays_us)
|
||||
schema_us = pa.schema(fields_us)
|
||||
|
||||
filename = tempdir / 'pandas_truncated.parquet'
|
||||
table_us = pa.Table.from_pandas(df_us, schema=schema_us)
|
||||
|
||||
_write_table(table_us, filename, version='2.6', coerce_timestamps='ms',
|
||||
allow_truncated_timestamps=True)
|
||||
table_ms = _read_table(filename)
|
||||
df_ms = table_ms.to_pandas()
|
||||
|
||||
arrays_expected = {'datetime64': [dt_ms, dt_ms]}
|
||||
df_expected = pd.DataFrame(arrays_expected, dtype='datetime64[ms]')
|
||||
tm.assert_frame_equal(df_expected, df_ms)
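
# For contrast with the test above, a hedged sketch (not part of the
# original suite): without allow_truncated_timestamps, coercing away
# non-zero microseconds is expected to raise instead of silently truncating.
def _sketch_truncation_rejected():
    dt_us = datetime.datetime(2017, 1, 1, 1, 1, 1, microsecond=1)
    table = pa.table({'ts': pa.array([dt_us], type=pa.timestamp('us'))})
    sink = pa.BufferOutputStream()
    with pytest.raises(ValueError):
        pq.write_table(table, sink, version='2.6', coerce_timestamps='ms')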
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_date_time_types(tempdir):
|
||||
t1 = pa.date32()
|
||||
data1 = np.array([17259, 17260, 17261], dtype='int32')
|
||||
a1 = pa.array(data1, type=t1)
|
||||
|
||||
t2 = pa.date64()
|
||||
data2 = data1.astype('int64') * 86400000
|
||||
a2 = pa.array(data2, type=t2)
|
||||
|
||||
t3 = pa.timestamp('us')
|
||||
start = pd.Timestamp('2001-01-01').value / 1000
|
||||
data3 = np.array([start, start + 1, start + 2], dtype='int64')
|
||||
a3 = pa.array(data3, type=t3)
|
||||
|
||||
t4 = pa.time32('ms')
|
||||
data4 = np.arange(3, dtype='i4')
|
||||
a4 = pa.array(data4, type=t4)
|
||||
|
||||
t5 = pa.time64('us')
|
||||
a5 = pa.array(data4.astype('int64'), type=t5)
|
||||
|
||||
t6 = pa.time32('s')
|
||||
a6 = pa.array(data4, type=t6)
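# Parquet's TIME logical type only supports milli-, micro- and nanosecond
# precision, so time32('s') is expected to come back as time32('ms') with
# values scaled by 1000 -- hence the expected arrays built below.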
|
||||
|
||||
ex_t6 = pa.time32('ms')
|
||||
ex_a6 = pa.array(data4 * 1000, type=ex_t6)
|
||||
|
||||
t7 = pa.timestamp('ns')
|
||||
start = pd.Timestamp('2001-01-01').value
|
||||
data7 = np.array([start, start + 1000, start + 2000],
|
||||
dtype='int64')
|
||||
a7 = pa.array(data7, type=t7)
|
||||
|
||||
table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7],
|
||||
['date32', 'date64', 'timestamp[us]',
|
||||
'time32[s]', 'time64[us]',
|
||||
'time32_from64[s]',
|
||||
'timestamp[ns]'])
|
||||
|
||||
# date64 is read back as date32
|
||||
# time32[s] is coerced to time32[ms]
|
||||
expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7],
|
||||
['date32', 'date64', 'timestamp[us]',
|
||||
'time32[s]', 'time64[us]',
|
||||
'time32_from64[s]',
|
||||
'timestamp[ns]'])
|
||||
|
||||
_check_roundtrip(table, expected=expected, version='2.6')
|
||||
|
||||
t0 = pa.timestamp('ms')
|
||||
data0 = np.arange(4, dtype='int64')
|
||||
a0 = pa.array(data0, type=t0)
|
||||
|
||||
t1 = pa.timestamp('us')
|
||||
data1 = np.arange(4, dtype='int64')
|
||||
a1 = pa.array(data1, type=t1)
|
||||
|
||||
t2 = pa.timestamp('ns')
|
||||
data2 = np.arange(4, dtype='int64')
|
||||
a2 = pa.array(data2, type=t2)
|
||||
|
||||
table = pa.Table.from_arrays([a0, a1, a2],
|
||||
['ts[ms]', 'ts[us]', 'ts[ns]'])
|
||||
expected = pa.Table.from_arrays([a0, a1, a2],
|
||||
['ts[ms]', 'ts[us]', 'ts[ns]'])
|
||||
|
||||
# All timestamp resolutions are stored with the INT64 physical type by default
|
||||
filename = tempdir / 'int64_timestamps.parquet'
|
||||
_write_table(table, filename, version='2.6')
|
||||
parquet_schema = pq.ParquetFile(filename).schema
|
||||
for i in range(3):
|
||||
assert parquet_schema.column(i).physical_type == 'INT64'
|
||||
read_table = _read_table(filename)
|
||||
assert read_table.equals(expected)
|
||||
|
||||
t0_ns = pa.timestamp('ns')
|
||||
data0_ns = np.array(data0 * 1000000, dtype='int64')
|
||||
a0_ns = pa.array(data0_ns, type=t0_ns)
|
||||
|
||||
t1_ns = pa.timestamp('ns')
|
||||
data1_ns = np.array(data1 * 1000, dtype='int64')
|
||||
a1_ns = pa.array(data1_ns, type=t1_ns)
|
||||
|
||||
expected = pa.Table.from_arrays([a0_ns, a1_ns, a2],
|
||||
['ts[ms]', 'ts[us]', 'ts[ns]'])
|
||||
|
||||
# int96 nanosecond timestamps produced upon request
|
||||
filename = tempdir / 'explicit_int96_timestamps.parquet'
|
||||
_write_table(table, filename, version='2.6',
|
||||
use_deprecated_int96_timestamps=True)
|
||||
parquet_schema = pq.ParquetFile(filename).schema
|
||||
for i in range(3):
|
||||
assert parquet_schema.column(i).physical_type == 'INT96'
|
||||
read_table = _read_table(filename)
|
||||
assert read_table.equals(expected)
|
||||
|
||||
# int96 nanosecond timestamps implied by flavor 'spark'
|
||||
filename = tempdir / 'spark_int96_timestamps.parquet'
|
||||
_write_table(table, filename, version='2.6',
|
||||
flavor='spark')
|
||||
parquet_schema = pq.ParquetFile(filename).schema
|
||||
for i in range(3):
|
||||
assert parquet_schema.column(i).physical_type == 'INT96'
|
||||
read_table = _read_table(filename)
|
||||
assert read_table.equals(expected)
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
@pytest.mark.parametrize('unit', ['s', 'ms', 'us', 'ns'])
|
||||
def test_coerce_int96_timestamp_unit(unit):
|
||||
i_s = pd.Timestamp('2010-01-01').value / 1000000000 # := 1262304000
|
||||
|
||||
d_s = np.arange(i_s, i_s + 10, 1, dtype='int64')
|
||||
d_ms = d_s * 1000
|
||||
d_us = d_ms * 1000
|
||||
d_ns = d_us * 1000
|
||||
|
||||
a_s = pa.array(d_s, type=pa.timestamp('s'))
|
||||
a_ms = pa.array(d_ms, type=pa.timestamp('ms'))
|
||||
a_us = pa.array(d_us, type=pa.timestamp('us'))
|
||||
a_ns = pa.array(d_ns, type=pa.timestamp('ns'))
|
||||
|
||||
arrays = {"s": a_s, "ms": a_ms, "us": a_us, "ns": a_ns}
|
||||
names = ['ts_s', 'ts_ms', 'ts_us', 'ts_ns']
|
||||
table = pa.Table.from_arrays([a_s, a_ms, a_us, a_ns], names)
|
||||
|
||||
# All four columns are stored as INT96; reading them back with
|
||||
# coerce_int96_timestamp_unit yields the requested resolution for each column
|
||||
expected = pa.Table.from_arrays([arrays.get(unit)]*4, names)
|
||||
read_table_kwargs = {"coerce_int96_timestamp_unit": unit}
|
||||
_check_roundtrip(table, expected,
|
||||
read_table_kwargs=read_table_kwargs,
|
||||
use_deprecated_int96_timestamps=True)
|
||||
_check_roundtrip(table, expected, version='2.6',
|
||||
read_table_kwargs=read_table_kwargs,
|
||||
use_deprecated_int96_timestamps=True)
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
@pytest.mark.parametrize('pq_reader_method', ['ParquetFile', 'read_table'])
|
||||
def test_coerce_int96_timestamp_overflow(pq_reader_method, tempdir):
|
||||
|
||||
def get_table(pq_reader_method, filename, **kwargs):
|
||||
if pq_reader_method == "ParquetFile":
|
||||
return pq.ParquetFile(filename, **kwargs).read()
|
||||
elif pq_reader_method == "read_table":
|
||||
return pq.read_table(filename, **kwargs)
|
||||
|
||||
# Recreating the initial JIRA issue referenced in ARROW-12096
|
||||
oob_dts = [
|
||||
datetime.datetime(1000, 1, 1),
|
||||
datetime.datetime(2000, 1, 1),
|
||||
datetime.datetime(3000, 1, 1)
|
||||
]
|
||||
df = pd.DataFrame({"a": oob_dts})
|
||||
table = pa.table(df)
|
||||
|
||||
filename = tempdir / "test_round_trip_overflow.parquet"
|
||||
pq.write_table(table, filename, use_deprecated_int96_timestamps=True,
|
||||
version="1.0")
|
||||
|
||||
# with the default resolution of ns, we get wrong values for INT96
|
||||
# that are out of bounds for nanosecond range
|
||||
tab_error = get_table(pq_reader_method, filename)
|
||||
with warnings.catch_warnings():
|
||||
warnings.filterwarnings("ignore",
|
||||
"Discarding nonzero nanoseconds in conversion",
|
||||
UserWarning)
|
||||
assert tab_error["a"].to_pylist() != oob_dts
|
||||
|
||||
# avoid this overflow by specifying the resolution to use for INT96 values
|
||||
tab_correct = get_table(
|
||||
pq_reader_method, filename, coerce_int96_timestamp_unit="s"
|
||||
)
|
||||
df_correct = tab_correct.to_pandas(timestamp_as_object=True)
|
||||
df["a"] = df["a"].astype(object)
|
||||
tm.assert_frame_equal(df, df_correct)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('unit', ['ms', 'us', 'ns'])
|
||||
def test_timestamp_restore_timezone(unit):
|
||||
# ARROW-5888, restore timezone from serialized metadata
|
||||
ty = pa.timestamp(unit, tz='America/New_York')
|
||||
arr = pa.array([1, 2, 3], type=ty)
|
||||
t = pa.table([arr], names=['f0'])
|
||||
_check_roundtrip(t)
|
||||
|
||||
|
||||
def test_timestamp_restore_timezone_nanosecond():
|
||||
# ARROW-9634, also restore timezone for nanosecond data that get stored
|
||||
# as microseconds in the parquet file for Parquet ver 2.4 and less
|
||||
ty = pa.timestamp('ns', tz='America/New_York')
|
||||
arr = pa.array([1000, 2000, 3000], type=ty)
|
||||
table = pa.table([arr], names=['f0'])
|
||||
ty_us = pa.timestamp('us', tz='America/New_York')
|
||||
expected = pa.table([arr.cast(ty_us)], names=['f0'])
|
||||
_check_roundtrip(table, expected=expected, version='2.4')
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_list_of_datetime_time_roundtrip():
|
||||
# ARROW-4135
|
||||
times = pd.to_datetime(['09:00', '09:30', '10:00', '10:30', '11:00',
|
||||
'11:30', '12:00'], format="%H:%M")
|
||||
df = pd.DataFrame({'time': [times.time]})
|
||||
_roundtrip_pandas_dataframe(df, write_kwargs={})
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_parquet_version_timestamp_differences():
|
||||
i_s = pd.Timestamp('2010-01-01').value / 1000000000 # := 1262304000
|
||||
|
||||
d_s = np.arange(i_s, i_s + 10, 1, dtype='int64')
|
||||
d_ms = d_s * 1000
|
||||
d_us = d_ms * 1000
|
||||
d_ns = d_us * 1000
|
||||
|
||||
a_s = pa.array(d_s, type=pa.timestamp('s'))
|
||||
a_ms = pa.array(d_ms, type=pa.timestamp('ms'))
|
||||
a_us = pa.array(d_us, type=pa.timestamp('us'))
|
||||
a_ns = pa.array(d_ns, type=pa.timestamp('ns'))
|
||||
|
||||
all_versions = ['1.0', '2.4', '2.6']
|
||||
|
||||
names = ['ts:s', 'ts:ms', 'ts:us', 'ts:ns']
|
||||
table = pa.Table.from_arrays([a_s, a_ms, a_us, a_ns], names)
|
||||
|
||||
# Using Parquet version 1.0 and 2.4, seconds should be coerced to milliseconds
|
||||
# and nanoseconds should be coerced to microseconds by default
|
||||
expected = pa.Table.from_arrays([a_ms, a_ms, a_us, a_us], names)
|
||||
_check_roundtrip(table, expected, version='1.0')
|
||||
_check_roundtrip(table, expected, version='2.4')
|
||||
|
||||
# Using Parquet version 2.6, seconds should be coerced to milliseconds
|
||||
# and nanoseconds should be retained by default
|
||||
expected = pa.Table.from_arrays([a_ms, a_ms, a_us, a_ns], names)
|
||||
_check_roundtrip(table, expected, version='2.6')
|
||||
|
||||
# For either Parquet version coercing to milliseconds or microseconds
|
||||
# is allowed
|
||||
expected = pa.Table.from_arrays([a_ms, a_ms, a_ms, a_ms], names)
|
||||
for ver in all_versions:
|
||||
_check_roundtrip(table, expected, coerce_timestamps='ms', version=ver)
|
||||
|
||||
expected = pa.Table.from_arrays([a_us, a_us, a_us, a_us], names)
|
||||
for ver in all_versions:
|
||||
_check_roundtrip(table, expected, version=ver, coerce_timestamps='us')
|
||||
|
||||
# TODO: after pyarrow allows coerce_timestamps='ns', tests like the
|
||||
# following should pass ...
|
||||
|
||||
# Using Parquet version 1.0, coercing to nanoseconds is not allowed
|
||||
# expected = None
|
||||
# with pytest.raises(NotImplementedError):
|
||||
# _roundtrip_table(table, coerce_timestamps='ns')
|
||||
|
||||
# Using Parquet version 2.0, coercing to nanoseconds is allowed
|
||||
# expected = pa.Table.from_arrays([a_ns, a_ns, a_ns, a_ns], names)
|
||||
# _check_roundtrip(table, expected, version='2.6', coerce_timestamps='ns')
|
||||
|
||||
# For either Parquet version, coercing to nanoseconds is allowed
|
||||
# if Int96 storage is used
|
||||
expected = pa.Table.from_arrays([a_ns, a_ns, a_ns, a_ns], names)
|
||||
for ver in all_versions:
|
||||
_check_roundtrip(table, expected, version=ver,
|
||||
use_deprecated_int96_timestamps=True)
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_noncoerced_nanoseconds_written_without_exception(tempdir):
|
||||
# ARROW-1957: the Parquet format version 2.6 writer preserves Arrow
|
||||
# nanosecond timestamps by default
|
||||
n = 9
|
||||
df = pd.DataFrame({'x': range(n)},
|
||||
index=pd.date_range('2017-01-01', freq='ns', periods=n))
|
||||
tb = pa.Table.from_pandas(df)
|
||||
|
||||
filename = tempdir / 'written.parquet'
|
||||
try:
|
||||
pq.write_table(tb, filename, version='2.6')
|
||||
except Exception:
|
||||
pass
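# The write above is expected to succeed; the bare except only keeps an
# unexpected failure from masking the more informative assertions below.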
|
||||
assert filename.exists()
|
||||
|
||||
recovered_table = pq.read_table(filename)
|
||||
assert tb.equals(recovered_table)
|
||||
|
||||
# Loss of data through coercion (without explicit override) still an error
|
||||
filename = tempdir / 'not_written.parquet'
|
||||
with pytest.raises(ValueError):
|
||||
pq.write_table(tb, filename, coerce_timestamps='ms', version='2.6')
|
||||
|
||||
|
||||
def test_duration_type():
|
||||
# ARROW-6780
|
||||
arrays = [pa.array([0, 1, 2, 3], type=pa.duration(unit))
|
||||
for unit in ["s", "ms", "us", "ns"]]
|
||||
table = pa.Table.from_arrays(arrays, ["d[s]", "d[ms]", "d[us]", "d[ns]"])
|
||||
|
||||
_check_roundtrip(table)
|
||||
@@ -0,0 +1,620 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
import pytest
|
||||
from datetime import timedelta
|
||||
|
||||
import pyarrow as pa
|
||||
try:
|
||||
import pyarrow.parquet as pq
|
||||
import pyarrow.parquet.encryption as pe
|
||||
except ImportError:
|
||||
pq = None
|
||||
pe = None
|
||||
else:
|
||||
from pyarrow.tests.parquet.encryption import (
|
||||
InMemoryKmsClient, verify_file_encrypted)
|
||||
|
||||
|
||||
PARQUET_NAME = 'encrypted_table.in_mem.parquet'
|
||||
FOOTER_KEY = b"0123456789112345"
|
||||
FOOTER_KEY_NAME = "footer_key"
|
||||
COL_KEY = b"1234567890123450"
|
||||
COL_KEY_NAME = "col_key"
|
||||
|
||||
|
||||
# Marks all of the tests in this module
|
||||
# Ignore these with pytest ... -m 'not parquet_encryption'
|
||||
# Ignore these with pytest ... -m 'not parquet'
|
||||
pytestmark = [
|
||||
pytest.mark.parquet_encryption,
|
||||
pytest.mark.parquet
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(scope='module')
|
||||
def data_table():
|
||||
data_table = pa.Table.from_pydict({
|
||||
'a': pa.array([1, 2, 3]),
|
||||
'b': pa.array(['a', 'b', 'c']),
|
||||
'c': pa.array(['x', 'y', 'z'])
|
||||
})
|
||||
return data_table
|
||||
|
||||
|
||||
@pytest.fixture(scope='module')
|
||||
def basic_encryption_config():
|
||||
basic_encryption_config = pe.EncryptionConfiguration(
|
||||
footer_key=FOOTER_KEY_NAME,
|
||||
column_keys={
|
||||
COL_KEY_NAME: ["a", "b"],
|
||||
})
|
||||
return basic_encryption_config
|
||||
|
||||
|
||||
def setup_encryption_environment(custom_kms_conf):
|
||||
"""
|
||||
Sets up and returns the KMS connection configuration and crypto factory
|
||||
based on provided KMS configuration parameters.
|
||||
"""
|
||||
kms_connection_config = pe.KmsConnectionConfig(custom_kms_conf=custom_kms_conf)
|
||||
|
||||
def kms_factory(kms_connection_configuration):
|
||||
return InMemoryKmsClient(kms_connection_configuration)
|
||||
|
||||
# Create our CryptoFactory
|
||||
crypto_factory = pe.CryptoFactory(kms_factory)
|
||||
|
||||
return kms_connection_config, crypto_factory
|
||||
|
||||
|
||||
def write_encrypted_file(path, data_table, footer_key_name, col_key_name,
|
||||
footer_key, col_key, encryption_config):
|
||||
"""
|
||||
Writes an encrypted parquet file based on the provided parameters.
|
||||
"""
|
||||
# Setup the custom KMS configuration with provided keys
|
||||
custom_kms_conf = {
|
||||
footer_key_name: footer_key.decode("UTF-8"),
|
||||
col_key_name: col_key.decode("UTF-8"),
|
||||
}
|
||||
|
||||
# Setup encryption environment
|
||||
kms_connection_config, crypto_factory = setup_encryption_environment(
|
||||
custom_kms_conf)
|
||||
|
||||
# Write the encrypted parquet file
|
||||
write_encrypted_parquet(path, data_table, encryption_config,
|
||||
kms_connection_config, crypto_factory)
|
||||
|
||||
return kms_connection_config, crypto_factory
|
||||
|
||||
|
||||
def test_encrypted_parquet_write_read(tempdir, data_table):
|
||||
"""Write an encrypted parquet, verify it's encrypted, and then read it."""
|
||||
path = tempdir / PARQUET_NAME
|
||||
|
||||
# Encrypt the footer with the footer key,
|
||||
# encrypt column `a` and column `b` with another key,
|
||||
# keep `c` plaintext
|
||||
encryption_config = pe.EncryptionConfiguration(
|
||||
footer_key=FOOTER_KEY_NAME,
|
||||
column_keys={
|
||||
COL_KEY_NAME: ["a", "b"],
|
||||
},
|
||||
encryption_algorithm="AES_GCM_V1",
|
||||
cache_lifetime=timedelta(minutes=5.0),
|
||||
data_key_length_bits=256)
|
||||
assert encryption_config.uniform_encryption is False
|
||||
|
||||
kms_connection_config, crypto_factory = write_encrypted_file(
|
||||
path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, FOOTER_KEY, COL_KEY,
|
||||
encryption_config)
|
||||
|
||||
verify_file_encrypted(path)
|
||||
|
||||
# Read with decryption properties
|
||||
decryption_config = pe.DecryptionConfiguration(
|
||||
cache_lifetime=timedelta(minutes=5.0))
|
||||
result_table = read_encrypted_parquet(
|
||||
path, decryption_config, kms_connection_config, crypto_factory)
|
||||
assert data_table.equals(result_table)
|
||||
|
||||
|
||||
def test_uniform_encrypted_parquet_write_read(tempdir, data_table):
|
||||
"""Write an encrypted parquet, verify it's encrypted, and then read it."""
|
||||
path = tempdir / PARQUET_NAME
|
||||
|
||||
# Encrypt the footer and all columns with the footer key,
|
||||
encryption_config = pe.EncryptionConfiguration(
|
||||
footer_key=FOOTER_KEY_NAME,
|
||||
uniform_encryption=True,
|
||||
encryption_algorithm="AES_GCM_V1",
|
||||
cache_lifetime=timedelta(minutes=5.0),
|
||||
data_key_length_bits=256)
|
||||
assert encryption_config.uniform_encryption is True
|
||||
|
||||
kms_connection_config, crypto_factory = write_encrypted_file(
|
||||
path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, FOOTER_KEY, b"",
|
||||
encryption_config)
|
||||
|
||||
verify_file_encrypted(path)
|
||||
|
||||
# Read with decryption properties
|
||||
decryption_config = pe.DecryptionConfiguration(
|
||||
cache_lifetime=timedelta(minutes=5.0))
|
||||
result_table = read_encrypted_parquet(
|
||||
path, decryption_config, kms_connection_config, crypto_factory)
|
||||
assert data_table.equals(result_table)
|
||||
|
||||
|
||||
def write_encrypted_parquet(path, table, encryption_config,
|
||||
kms_connection_config, crypto_factory):
|
||||
file_encryption_properties = crypto_factory.file_encryption_properties(
|
||||
kms_connection_config, encryption_config)
|
||||
assert file_encryption_properties is not None
|
||||
with pq.ParquetWriter(
|
||||
path, table.schema,
|
||||
encryption_properties=file_encryption_properties) as writer:
|
||||
writer.write_table(table)
|
||||
|
||||
|
||||
def read_encrypted_parquet(path, decryption_config,
|
||||
kms_connection_config, crypto_factory):
|
||||
file_decryption_properties = crypto_factory.file_decryption_properties(
|
||||
kms_connection_config, decryption_config)
|
||||
assert file_decryption_properties is not None
|
||||
meta = pq.read_metadata(
|
||||
path, decryption_properties=file_decryption_properties)
|
||||
assert meta.num_columns == 3
|
||||
schema = pq.read_schema(
|
||||
path, decryption_properties=file_decryption_properties)
|
||||
assert len(schema.names) == 3
|
||||
|
||||
result = pq.ParquetFile(
|
||||
path, decryption_properties=file_decryption_properties)
|
||||
return result.read(use_threads=True)
|
||||
|
||||
|
||||
def test_encrypted_parquet_write_read_wrong_key(tempdir, data_table):
|
||||
"""Write an encrypted parquet, verify it's encrypted,
|
||||
and then read it using wrong keys."""
|
||||
path = tempdir / PARQUET_NAME
|
||||
|
||||
# Encrypt the footer with the footer key,
|
||||
# encrypt column `a` and column `b` with another key,
|
||||
# keep `c` plaintext
|
||||
encryption_config = pe.EncryptionConfiguration(
|
||||
footer_key=FOOTER_KEY_NAME,
|
||||
column_keys={
|
||||
COL_KEY_NAME: ["a", "b"],
|
||||
},
|
||||
encryption_algorithm="AES_GCM_V1",
|
||||
cache_lifetime=timedelta(minutes=5.0),
|
||||
data_key_length_bits=256)
|
||||
|
||||
write_encrypted_file(path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME,
|
||||
FOOTER_KEY, COL_KEY, encryption_config)
|
||||
|
||||
verify_file_encrypted(path)
|
||||
|
||||
wrong_kms_connection_config, wrong_crypto_factory = setup_encryption_environment({
|
||||
FOOTER_KEY_NAME: COL_KEY.decode("UTF-8"), # Intentionally wrong
|
||||
COL_KEY_NAME: FOOTER_KEY.decode("UTF-8"), # Intentionally wrong
|
||||
})
|
||||
|
||||
decryption_config = pe.DecryptionConfiguration(
|
||||
cache_lifetime=timedelta(minutes=5.0))
|
||||
with pytest.raises(ValueError, match=r"Incorrect master key used"):
|
||||
read_encrypted_parquet(
|
||||
path, decryption_config, wrong_kms_connection_config,
|
||||
wrong_crypto_factory)
|
||||
|
||||
|
||||
def test_encrypted_parquet_read_no_decryption_config(tempdir, data_table):
|
||||
"""Write an encrypted parquet, verify it's encrypted,
|
||||
but then try to read it without decryption properties."""
|
||||
test_encrypted_parquet_write_read(tempdir, data_table)
|
||||
# Read without decryption properties
|
||||
with pytest.raises(IOError, match=r"no decryption"):
|
||||
pq.ParquetFile(tempdir / PARQUET_NAME).read()
|
||||
|
||||
|
||||
def test_encrypted_parquet_read_metadata_no_decryption_config(
|
||||
tempdir, data_table):
|
||||
"""Write an encrypted parquet, verify it's encrypted,
|
||||
but then try to read its metadata without decryption properties."""
|
||||
test_encrypted_parquet_write_read(tempdir, data_table)
|
||||
# Read metadata without decryption properties
|
||||
with pytest.raises(IOError, match=r"no decryption"):
|
||||
pq.read_metadata(tempdir / PARQUET_NAME)
|
||||
|
||||
|
||||
def test_encrypted_parquet_read_schema_no_decryption_config(
|
||||
tempdir, data_table):
|
||||
"""Write an encrypted parquet, verify it's encrypted,
|
||||
but then try to read its schema without decryption properties."""
|
||||
test_encrypted_parquet_write_read(tempdir, data_table)
|
||||
with pytest.raises(IOError, match=r"no decryption"):
|
||||
pq.read_schema(tempdir / PARQUET_NAME)
|
||||
|
||||
|
||||
def test_encrypted_parquet_write_no_col_key(tempdir, data_table):
|
||||
"""Write an encrypted parquet, but give only footer key,
|
||||
without column key."""
|
||||
path = tempdir / 'encrypted_table_no_col_key.in_mem.parquet'
|
||||
|
||||
# Encrypt the footer with the footer key
|
||||
encryption_config = pe.EncryptionConfiguration(
|
||||
footer_key=FOOTER_KEY_NAME)
|
||||
|
||||
with pytest.raises(OSError,
|
||||
match="Either column_keys or uniform_encryption "
|
||||
"must be set"):
|
||||
# Write with encryption properties
|
||||
write_encrypted_file(path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME,
|
||||
FOOTER_KEY, b"", encryption_config)
|
||||
|
||||
|
||||
def test_encrypted_parquet_write_col_key_and_uniform_encryption(tempdir, data_table):
|
||||
"""Write an encrypted parquet, but give only footer key,
|
||||
without column key."""
|
||||
path = tempdir / 'encrypted_table_col_key_and_uniform_encryption.in_mem.parquet'
|
||||
|
||||
# Encrypt the footer with the footer key
|
||||
encryption_config = pe.EncryptionConfiguration(
|
||||
footer_key=FOOTER_KEY_NAME,
|
||||
column_keys={
|
||||
COL_KEY_NAME: ["a", "b"],
|
||||
},
|
||||
uniform_encryption=True)
|
||||
|
||||
with pytest.raises(OSError,
|
||||
match=r"Cannot set both column_keys and uniform_encryption"):
|
||||
# Write with encryption properties
|
||||
write_encrypted_file(path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME,
|
||||
FOOTER_KEY, b"", encryption_config)
|
||||
|
||||
|
||||
def test_encrypted_parquet_write_kms_error(tempdir, data_table,
|
||||
basic_encryption_config):
|
||||
"""Write an encrypted parquet, but raise KeyError in KmsClient."""
|
||||
path = tempdir / 'encrypted_table_kms_error.in_mem.parquet'
|
||||
encryption_config = basic_encryption_config
|
||||
|
||||
# Empty master_keys_map
|
||||
kms_connection_config = pe.KmsConnectionConfig()
|
||||
|
||||
def kms_factory(kms_connection_configuration):
|
||||
# Empty master keys map will cause KeyError to be raised
|
||||
# on wrap/unwrap calls
|
||||
return InMemoryKmsClient(kms_connection_configuration)
|
||||
|
||||
crypto_factory = pe.CryptoFactory(kms_factory)
|
||||
with pytest.raises(KeyError, match="footer_key"):
|
||||
# Write with encryption properties
|
||||
write_encrypted_parquet(path, data_table, encryption_config,
|
||||
kms_connection_config, crypto_factory)
|
||||
|
||||
|
||||
def test_encrypted_parquet_write_kms_specific_error(tempdir, data_table,
|
||||
basic_encryption_config):
|
||||
"""Write an encrypted parquet, but raise KeyError in KmsClient."""
|
||||
path = tempdir / 'encrypted_table_kms_error.in_mem.parquet'
|
||||
encryption_config = basic_encryption_config
|
||||
|
||||
# Empty master_keys_map
|
||||
kms_connection_config = pe.KmsConnectionConfig()
|
||||
|
||||
class ThrowingKmsClient(pe.KmsClient):
|
||||
"""A KmsClient implementation that throws exception in
|
||||
wrap/unwrap calls
|
||||
"""
|
||||
|
||||
def __init__(self, config):
|
||||
"""Create an InMemoryKmsClient instance."""
|
||||
pe.KmsClient.__init__(self)
|
||||
self.config = config
|
||||
|
||||
def wrap_key(self, key_bytes, master_key_identifier):
|
||||
raise ValueError("Cannot Wrap Key")
|
||||
|
||||
def unwrap_key(self, wrapped_key, master_key_identifier):
|
||||
raise ValueError("Cannot Unwrap Key")
|
||||
|
||||
def kms_factory(kms_connection_configuration):
|
||||
# Exception thrown in wrap/unwrap calls
|
||||
return ThrowingKmsClient(kms_connection_configuration)
|
||||
|
||||
crypto_factory = pe.CryptoFactory(kms_factory)
|
||||
with pytest.raises(ValueError, match="Cannot Wrap Key"):
|
||||
# Write with encryption properties
|
||||
write_encrypted_parquet(path, data_table, encryption_config,
|
||||
kms_connection_config, crypto_factory)
|
||||
|
||||
|
||||
def test_encrypted_parquet_write_kms_factory_error(tempdir, data_table,
|
||||
basic_encryption_config):
|
||||
"""Write an encrypted parquet, but raise ValueError in kms_factory."""
|
||||
path = tempdir / 'encrypted_table_kms_factory_error.in_mem.parquet'
|
||||
encryption_config = basic_encryption_config
|
||||
|
||||
# Empty master_keys_map
|
||||
kms_connection_config = pe.KmsConnectionConfig()
|
||||
|
||||
def kms_factory(kms_connection_configuration):
|
||||
raise ValueError('Cannot create KmsClient')
|
||||
|
||||
crypto_factory = pe.CryptoFactory(kms_factory)
|
||||
with pytest.raises(ValueError,
|
||||
match="Cannot create KmsClient"):
|
||||
# Write with encryption properties
|
||||
write_encrypted_parquet(path, data_table, encryption_config,
|
||||
kms_connection_config, crypto_factory)
|
||||
|
||||
|
||||
def test_encrypted_parquet_write_kms_factory_type_error(
|
||||
tempdir, data_table, basic_encryption_config):
|
||||
"""Write an encrypted parquet, but use wrong KMS client type
|
||||
that doesn't implement KmsClient."""
|
||||
path = tempdir / 'encrypted_table_kms_factory_error.in_mem.parquet'
|
||||
encryption_config = basic_encryption_config
|
||||
|
||||
# Empty master_keys_map
|
||||
kms_connection_config = pe.KmsConnectionConfig()
|
||||
|
||||
class WrongTypeKmsClient():
|
||||
"""This is not an implementation of KmsClient.
|
||||
"""
|
||||
|
||||
def __init__(self, config):
|
||||
self.master_keys_map = config.custom_kms_conf
|
||||
|
||||
def wrap_key(self, key_bytes, master_key_identifier):
|
||||
return None
|
||||
|
||||
def unwrap_key(self, wrapped_key, master_key_identifier):
|
||||
return None
|
||||
|
||||
def kms_factory(kms_connection_configuration):
|
||||
return WrongTypeKmsClient(kms_connection_configuration)
|
||||
|
||||
crypto_factory = pe.CryptoFactory(kms_factory)
|
||||
with pytest.raises(TypeError):
|
||||
# Write with encryption properties
|
||||
write_encrypted_parquet(path, data_table, encryption_config,
|
||||
kms_connection_config, crypto_factory)
|
||||
|
||||
|
||||
def test_encrypted_parquet_encryption_configuration():
|
||||
def validate_encryption_configuration(encryption_config):
|
||||
assert FOOTER_KEY_NAME == encryption_config.footer_key
|
||||
assert ["a", "b"] == encryption_config.column_keys[COL_KEY_NAME]
|
||||
assert "AES_GCM_CTR_V1" == encryption_config.encryption_algorithm
|
||||
assert encryption_config.plaintext_footer
|
||||
assert not encryption_config.double_wrapping
|
||||
assert timedelta(minutes=10.0) == encryption_config.cache_lifetime
|
||||
assert not encryption_config.internal_key_material
|
||||
assert 192 == encryption_config.data_key_length_bits
|
||||
|
||||
encryption_config = pe.EncryptionConfiguration(
|
||||
footer_key=FOOTER_KEY_NAME,
|
||||
column_keys={COL_KEY_NAME: ["a", "b"], },
|
||||
encryption_algorithm="AES_GCM_CTR_V1",
|
||||
plaintext_footer=True,
|
||||
double_wrapping=False,
|
||||
cache_lifetime=timedelta(minutes=10.0),
|
||||
internal_key_material=False,
|
||||
data_key_length_bits=192,
|
||||
)
|
||||
validate_encryption_configuration(encryption_config)
|
||||
|
||||
encryption_config_1 = pe.EncryptionConfiguration(
|
||||
footer_key=FOOTER_KEY_NAME)
|
||||
encryption_config_1.column_keys = {COL_KEY_NAME: ["a", "b"], }
|
||||
encryption_config_1.encryption_algorithm = "AES_GCM_CTR_V1"
|
||||
encryption_config_1.plaintext_footer = True
|
||||
encryption_config_1.double_wrapping = False
|
||||
encryption_config_1.cache_lifetime = timedelta(minutes=10.0)
|
||||
encryption_config_1.internal_key_material = False
|
||||
encryption_config_1.data_key_length_bits = 192
|
||||
validate_encryption_configuration(encryption_config_1)
|
||||
|
||||
|
||||
def test_encrypted_parquet_decryption_configuration():
|
||||
decryption_config = pe.DecryptionConfiguration(
|
||||
cache_lifetime=timedelta(minutes=10.0))
|
||||
assert timedelta(minutes=10.0) == decryption_config.cache_lifetime
|
||||
|
||||
decryption_config_1 = pe.DecryptionConfiguration()
|
||||
decryption_config_1.cache_lifetime = timedelta(minutes=10.0)
|
||||
assert timedelta(minutes=10.0) == decryption_config_1.cache_lifetime
|
||||
|
||||
|
||||
def test_encrypted_parquet_kms_configuration():
|
||||
def validate_kms_connection_config(kms_connection_config):
|
||||
assert "Instance1" == kms_connection_config.kms_instance_id
|
||||
assert "URL1" == kms_connection_config.kms_instance_url
|
||||
assert "MyToken" == kms_connection_config.key_access_token
|
||||
assert ({"key1": "key_material_1", "key2": "key_material_2"} ==
|
||||
kms_connection_config.custom_kms_conf)
|
||||
|
||||
kms_connection_config = pe.KmsConnectionConfig(
|
||||
kms_instance_id="Instance1",
|
||||
kms_instance_url="URL1",
|
||||
key_access_token="MyToken",
|
||||
custom_kms_conf={
|
||||
"key1": "key_material_1",
|
||||
"key2": "key_material_2",
|
||||
})
|
||||
validate_kms_connection_config(kms_connection_config)
|
||||
|
||||
kms_connection_config_1 = pe.KmsConnectionConfig()
|
||||
kms_connection_config_1.kms_instance_id = "Instance1"
|
||||
kms_connection_config_1.kms_instance_url = "URL1"
|
||||
kms_connection_config_1.key_access_token = "MyToken"
|
||||
kms_connection_config_1.custom_kms_conf = {
|
||||
"key1": "key_material_1",
|
||||
"key2": "key_material_2",
|
||||
}
|
||||
validate_kms_connection_config(kms_connection_config_1)
|
||||
|
||||
|
||||
@pytest.mark.xfail(reason="Plaintext footer - reading plaintext column subset"
|
||||
" reads encrypted columns too")
|
||||
def test_encrypted_parquet_write_read_plain_footer_single_wrapping(
|
||||
tempdir, data_table):
|
||||
"""Write an encrypted parquet, with plaintext footer
|
||||
and with single wrapping,
|
||||
verify it's encrypted, and then read plaintext columns."""
|
||||
path = tempdir / PARQUET_NAME
|
||||
|
||||
# Encrypt the footer with the footer key,
|
||||
# encrypt column `a` and column `b` with another key,
|
||||
# keep `c` plaintext
|
||||
encryption_config = pe.EncryptionConfiguration(
|
||||
footer_key=FOOTER_KEY_NAME,
|
||||
column_keys={
|
||||
COL_KEY_NAME: ["a", "b"],
|
||||
},
|
||||
plaintext_footer=True,
|
||||
double_wrapping=False)
|
||||
|
||||
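    # The test's in-memory KMS resolves key names to key material via custom_kms_conf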
kms_connection_config = pe.KmsConnectionConfig(
|
||||
custom_kms_conf={
|
||||
FOOTER_KEY_NAME: FOOTER_KEY.decode("UTF-8"),
|
||||
COL_KEY_NAME: COL_KEY.decode("UTF-8"),
|
||||
}
|
||||
)
|
||||
|
||||
def kms_factory(kms_connection_configuration):
|
||||
return InMemoryKmsClient(kms_connection_configuration)
|
||||
|
||||
crypto_factory = pe.CryptoFactory(kms_factory)
|
||||
# Write with encryption properties
|
||||
write_encrypted_parquet(path, data_table, encryption_config,
|
||||
kms_connection_config, crypto_factory)
|
||||
|
||||
    # # Read only the plaintext column `c` without decryption properties
    # result = pq.ParquetFile(path)
    # result_table = result.read(columns='c', use_threads=False)
    # assert data_table.num_rows == result_table.num_rows
|
||||
|
||||
|
||||
@pytest.mark.xfail(reason="External key material not supported yet")
|
||||
def test_encrypted_parquet_write_external(tempdir, data_table):
|
||||
"""Write an encrypted parquet, with external key
|
||||
material.
|
||||
Currently it's not implemented, so should throw
|
||||
an exception"""
|
||||
path = tempdir / PARQUET_NAME
|
||||
|
||||
# Encrypt the file with the footer key
|
||||
encryption_config = pe.EncryptionConfiguration(
|
||||
footer_key=FOOTER_KEY_NAME,
|
||||
column_keys={},
|
||||
internal_key_material=False)
|
||||
|
||||
kms_connection_config = pe.KmsConnectionConfig(
|
||||
custom_kms_conf={FOOTER_KEY_NAME: FOOTER_KEY.decode("UTF-8")}
|
||||
)
|
||||
|
||||
def kms_factory(kms_connection_configuration):
|
||||
return InMemoryKmsClient(kms_connection_configuration)
|
||||
|
||||
crypto_factory = pe.CryptoFactory(kms_factory)
|
||||
# Write with encryption properties
|
||||
write_encrypted_parquet(path, data_table, encryption_config,
|
||||
kms_connection_config, crypto_factory)
|
||||
|
||||
|
||||
def test_encrypted_parquet_loop(tempdir, data_table, basic_encryption_config):
|
||||
"""Write an encrypted parquet, verify it's encrypted,
|
||||
and then read it multithreaded in a loop."""
|
||||
path = tempdir / PARQUET_NAME
|
||||
|
||||
# Encrypt the footer with the footer key,
|
||||
# encrypt column `a` and column `b` with another key,
|
||||
# keep `c` plaintext, defined in basic_encryption_config
|
||||
kms_connection_config, crypto_factory = write_encrypted_file(
|
||||
path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, FOOTER_KEY, COL_KEY,
|
||||
basic_encryption_config)
|
||||
|
||||
verify_file_encrypted(path)
|
||||
|
||||
decryption_config = pe.DecryptionConfiguration(
|
||||
cache_lifetime=timedelta(minutes=5.0))
|
||||
|
||||
for i in range(50):
|
||||
# Read with decryption properties
|
||||
file_decryption_properties = crypto_factory.file_decryption_properties(
|
||||
kms_connection_config, decryption_config)
|
||||
assert file_decryption_properties is not None
|
||||
|
||||
result = pq.ParquetFile(
|
||||
path, decryption_properties=file_decryption_properties)
|
||||
result_table = result.read(use_threads=True)
|
||||
assert data_table.equals(result_table)
|
||||
|
||||
|
||||
def test_read_with_deleted_crypto_factory(tempdir, data_table, basic_encryption_config):
|
||||
"""
|
||||
Test that decryption properties can be used if the crypto factory is no longer alive
|
||||
"""
|
||||
path = tempdir / PARQUET_NAME
|
||||
kms_connection_config, crypto_factory = write_encrypted_file(
|
||||
path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, FOOTER_KEY, COL_KEY,
|
||||
basic_encryption_config)
|
||||
verify_file_encrypted(path)
|
||||
|
||||
# Create decryption properties and delete the crypto factory that created
|
||||
# the properties afterwards.
|
||||
decryption_config = pe.DecryptionConfiguration(
|
||||
cache_lifetime=timedelta(minutes=5.0))
|
||||
file_decryption_properties = crypto_factory.file_decryption_properties(
|
||||
kms_connection_config, decryption_config)
|
||||
del crypto_factory
|
||||
|
||||
result = pq.ParquetFile(
|
||||
path, decryption_properties=file_decryption_properties)
|
||||
result_table = result.read(use_threads=True)
|
||||
assert data_table.equals(result_table)
|
||||
|
||||
|
||||
def test_encrypted_parquet_read_table(tempdir, data_table, basic_encryption_config):
|
||||
"""Write an encrypted parquet then read it back using read_table."""
|
||||
path = tempdir / PARQUET_NAME
|
||||
|
||||
# Write the encrypted parquet file using the utility function
|
||||
kms_connection_config, crypto_factory = write_encrypted_file(
|
||||
path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, FOOTER_KEY, COL_KEY,
|
||||
basic_encryption_config)
|
||||
|
||||
decryption_config = pe.DecryptionConfiguration(
|
||||
cache_lifetime=timedelta(minutes=5.0))
|
||||
file_decryption_properties = crypto_factory.file_decryption_properties(
|
||||
kms_connection_config, decryption_config)
|
||||
|
||||
# Read the encrypted parquet file using read_table
|
||||
result_table = pq.read_table(path, decryption_properties=file_decryption_properties)
|
||||
|
||||
# Assert that the read table matches the original data
|
||||
assert data_table.equals(result_table)
|
||||
|
||||
# Read the encrypted parquet folder using read_table
|
||||
result_table = pq.read_table(
|
||||
tempdir, decryption_properties=file_decryption_properties)
|
||||
assert data_table.equals(result_table)
|
||||
@@ -0,0 +1,816 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
import datetime
|
||||
import decimal
|
||||
from collections import OrderedDict
|
||||
import io
|
||||
|
||||
try:
|
||||
import numpy as np
|
||||
except ImportError:
|
||||
np = None
|
||||
import pytest
|
||||
|
||||
import pyarrow as pa
|
||||
from pyarrow.tests.parquet.common import _check_roundtrip, make_sample_file
|
||||
from pyarrow.fs import LocalFileSystem
|
||||
from pyarrow.tests import util
|
||||
|
||||
try:
|
||||
import pyarrow.parquet as pq
|
||||
from pyarrow.tests.parquet.common import _write_table
|
||||
except ImportError:
|
||||
pq = None
|
||||
|
||||
|
||||
try:
|
||||
import pandas as pd
|
||||
import pandas.testing as tm
|
||||
|
||||
from pyarrow.tests.parquet.common import alltypes_sample
|
||||
except ImportError:
|
||||
pd = tm = None
|
||||
|
||||
|
||||
# Marks all of the tests in this module
|
||||
# Ignore these with pytest ... -m 'not parquet'
|
||||
pytestmark = pytest.mark.parquet
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_parquet_metadata_api():
|
||||
df = alltypes_sample(size=10000)
|
||||
df = df.reindex(columns=sorted(df.columns))
|
||||
df.index = np.random.randint(0, 1000000, size=len(df))
|
||||
|
||||
fileh = make_sample_file(df)
|
||||
ncols = len(df.columns)
|
||||
|
||||
# Series of sniff tests
|
||||
meta = fileh.metadata
|
||||
repr(meta)
|
||||
assert meta.num_rows == len(df)
|
||||
assert meta.num_columns == ncols + 1 # +1 for index
|
||||
assert meta.num_row_groups == 1
|
||||
assert meta.format_version == '2.6'
|
||||
assert 'parquet-cpp' in meta.created_by
|
||||
assert isinstance(meta.serialized_size, int)
|
||||
assert isinstance(meta.metadata, dict)
|
||||
|
||||
# Schema
|
||||
schema = fileh.schema
|
||||
assert meta.schema is schema
|
||||
assert len(schema) == ncols + 1 # +1 for index
|
||||
repr(schema)
|
||||
|
||||
col = schema[0]
|
||||
repr(col)
|
||||
assert col.name == df.columns[0]
|
||||
assert col.max_definition_level == 1
|
||||
assert col.max_repetition_level == 0
|
||||
assert col.physical_type == 'BOOLEAN'
|
||||
assert col.converted_type == 'NONE'
|
||||
|
||||
col_float16 = schema[5]
|
||||
assert col_float16.logical_type.type == 'FLOAT16'
|
||||
|
||||
with pytest.raises(IndexError):
|
||||
schema[ncols + 1] # +1 for index
|
||||
|
||||
with pytest.raises(IndexError):
|
||||
schema[-1]
|
||||
|
||||
# Row group
|
||||
for rg in range(meta.num_row_groups):
|
||||
rg_meta = meta.row_group(rg)
|
||||
assert isinstance(rg_meta, pq.RowGroupMetaData)
|
||||
repr(rg_meta)
|
||||
|
||||
for col in range(rg_meta.num_columns):
|
||||
col_meta = rg_meta.column(col)
|
||||
assert isinstance(col_meta, pq.ColumnChunkMetaData)
|
||||
repr(col_meta)
|
||||
|
||||
with pytest.raises(IndexError):
|
||||
meta.row_group(-1)
|
||||
|
||||
with pytest.raises(IndexError):
|
||||
meta.row_group(meta.num_row_groups + 1)
|
||||
|
||||
rg_meta = meta.row_group(0)
|
||||
assert rg_meta.num_rows == len(df)
|
||||
assert rg_meta.num_columns == ncols + 1 # +1 for index
|
||||
assert rg_meta.total_byte_size > 0
|
||||
|
||||
with pytest.raises(IndexError):
|
||||
col_meta = rg_meta.column(-1)
|
||||
|
||||
with pytest.raises(IndexError):
|
||||
col_meta = rg_meta.column(ncols + 2)
|
||||
|
||||
col_meta = rg_meta.column(0)
|
||||
assert col_meta.file_offset == 0
|
||||
assert col_meta.file_path == '' # created from BytesIO
|
||||
assert col_meta.physical_type == 'BOOLEAN'
|
||||
assert col_meta.num_values == 10000
|
||||
assert col_meta.path_in_schema == 'bool'
|
||||
assert col_meta.is_stats_set is True
|
||||
assert isinstance(col_meta.statistics, pq.Statistics)
|
||||
assert col_meta.compression == 'SNAPPY'
|
||||
assert set(col_meta.encodings) == {'PLAIN', 'RLE'}
|
||||
assert col_meta.has_dictionary_page is False
|
||||
assert col_meta.dictionary_page_offset is None
|
||||
assert col_meta.data_page_offset > 0
|
||||
assert col_meta.total_compressed_size > 0
|
||||
assert col_meta.total_uncompressed_size > 0
|
||||
with pytest.raises(NotImplementedError):
|
||||
col_meta.has_index_page
|
||||
with pytest.raises(NotImplementedError):
|
||||
col_meta.index_page_offset
|
||||
|
||||
|
||||
def test_parquet_metadata_lifetime(tempdir):
|
||||
# ARROW-6642 - ensure that chained access keeps parent objects alive
|
||||
table = pa.table({'a': [1, 2, 3]})
|
||||
pq.write_table(table, tempdir / 'test_metadata_segfault.parquet')
|
||||
parquet_file = pq.ParquetFile(tempdir / 'test_metadata_segfault.parquet')
|
||||
parquet_file.metadata.row_group(0).column(0).statistics
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
@pytest.mark.parametrize(
|
||||
(
|
||||
'data',
|
||||
'type',
|
||||
'physical_type',
|
||||
'min_value',
|
||||
'max_value',
|
||||
'null_count',
|
||||
'num_values',
|
||||
'distinct_count'
|
||||
),
|
||||
[
|
||||
([1, 2, 2, None, 4], pa.uint8(), 'INT32', 1, 4, 1, 4, None),
|
||||
([1, 2, 2, None, 4], pa.uint16(), 'INT32', 1, 4, 1, 4, None),
|
||||
([1, 2, 2, None, 4], pa.uint32(), 'INT32', 1, 4, 1, 4, None),
|
||||
([1, 2, 2, None, 4], pa.uint64(), 'INT64', 1, 4, 1, 4, None),
|
||||
([-1, 2, 2, None, 4], pa.int8(), 'INT32', -1, 4, 1, 4, None),
|
||||
([-1, 2, 2, None, 4], pa.int16(), 'INT32', -1, 4, 1, 4, None),
|
||||
([-1, 2, 2, None, 4], pa.int32(), 'INT32', -1, 4, 1, 4, None),
|
||||
([-1, 2, 2, None, 4], pa.int64(), 'INT64', -1, 4, 1, 4, None),
|
||||
(
|
||||
[-1.1, 2.2, 2.3, None, 4.4], pa.float32(),
|
||||
'FLOAT', -1.1, 4.4, 1, 4, None
|
||||
),
|
||||
(
|
||||
[-1.1, 2.2, 2.3, None, 4.4], pa.float64(),
|
||||
'DOUBLE', -1.1, 4.4, 1, 4, None
|
||||
),
|
||||
(
|
||||
['', 'b', chr(1000), None, 'aaa'], pa.binary(),
|
||||
'BYTE_ARRAY', b'', chr(1000).encode('utf-8'), 1, 4, None
|
||||
),
|
||||
(
|
||||
[True, False, False, True, True], pa.bool_(),
|
||||
'BOOLEAN', False, True, 0, 5, None
|
||||
),
|
||||
(
|
||||
[b'\x00', b'b', b'12', None, b'aaa'], pa.binary(),
|
||||
'BYTE_ARRAY', b'\x00', b'b', 1, 4, None
|
||||
),
|
||||
]
|
||||
)
|
||||
def test_parquet_column_statistics_api(data, type, physical_type, min_value,
|
||||
max_value, null_count, num_values,
|
||||
distinct_count):
|
||||
df = pd.DataFrame({'data': data})
|
||||
schema = pa.schema([pa.field('data', type)])
|
||||
table = pa.Table.from_pandas(df, schema=schema, safe=False)
|
||||
fileh = make_sample_file(table)
|
||||
|
||||
meta = fileh.metadata
|
||||
|
||||
rg_meta = meta.row_group(0)
|
||||
col_meta = rg_meta.column(0)
|
||||
|
||||
stat = col_meta.statistics
|
||||
assert stat.has_min_max
|
||||
assert _close(type, stat.min, min_value)
|
||||
assert _close(type, stat.max, max_value)
|
||||
assert stat.null_count == null_count
|
||||
assert stat.num_values == num_values
|
||||
    # TODO(kszucs): until the parquet-cpp API exposes a HasDistinctCount
    # method, a missing distinct_count is simply reported as None
|
||||
assert stat.distinct_count == distinct_count
|
||||
assert stat.physical_type == physical_type
|
||||
|
||||
|
||||
def _close(type, left, right):
|
||||
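    # Compare with a tolerance matching the column's floating-point precision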
if type == pa.float32():
|
||||
return abs(left - right) < 1E-7
|
||||
elif type == pa.float64():
|
||||
return abs(left - right) < 1E-13
|
||||
else:
|
||||
return left == right
|
||||
|
||||
|
||||
# ARROW-6339
|
||||
@pytest.mark.pandas
|
||||
def test_parquet_raise_on_unset_statistics():
|
||||
df = pd.DataFrame({"t": pd.Series([pd.NaT], dtype="datetime64[ns]")})
|
||||
meta = make_sample_file(pa.Table.from_pandas(df)).metadata
|
||||
|
||||
assert not meta.row_group(0).column(0).statistics.has_min_max
|
||||
assert meta.row_group(0).column(0).statistics.max is None
|
||||
|
||||
|
||||
def test_statistics_convert_logical_types(tempdir):
|
||||
# ARROW-5166, ARROW-4139
|
||||
|
||||
# (min, max, type)
|
||||
cases = [(10, 11164359321221007157, pa.uint64()),
|
||||
(10, 4294967295, pa.uint32()),
|
||||
("ähnlich", "öffentlich", pa.utf8()),
|
||||
(datetime.time(10, 30, 0, 1000), datetime.time(15, 30, 0, 1000),
|
||||
pa.time32('ms')),
|
||||
(datetime.time(10, 30, 0, 1000), datetime.time(15, 30, 0, 1000),
|
||||
pa.time64('us')),
|
||||
(datetime.datetime(2019, 6, 24, 0, 0, 0, 1000),
|
||||
datetime.datetime(2019, 6, 25, 0, 0, 0, 1000),
|
||||
pa.timestamp('ms')),
|
||||
(datetime.datetime(2019, 6, 24, 0, 0, 0, 1000),
|
||||
datetime.datetime(2019, 6, 25, 0, 0, 0, 1000),
|
||||
pa.timestamp('us')),
|
||||
(datetime.date(2019, 6, 24),
|
||||
datetime.date(2019, 6, 25),
|
||||
pa.date32()),
|
||||
(decimal.Decimal("20.123"),
|
||||
decimal.Decimal("20.124"),
|
||||
pa.decimal128(12, 5))]
|
||||
|
||||
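    # min/max statistics should round-trip as logical Python values, not raw physical values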
for i, (min_val, max_val, typ) in enumerate(cases):
|
||||
t = pa.Table.from_arrays([pa.array([min_val, max_val], type=typ)],
|
||||
['col'])
|
||||
path = str(tempdir / f'example{i}.parquet')
|
||||
pq.write_table(t, path, version='2.6')
|
||||
pf = pq.ParquetFile(path)
|
||||
stats = pf.metadata.row_group(0).column(0).statistics
|
||||
assert stats.min == min_val
|
||||
assert stats.max == max_val
|
||||
|
||||
|
||||
def test_parquet_write_disable_statistics(tempdir):
|
||||
table = pa.Table.from_pydict(
|
||||
OrderedDict([
|
||||
('a', pa.array([1, 2, 3])),
|
||||
('b', pa.array(['a', 'b', 'c']))
|
||||
])
|
||||
)
|
||||
_write_table(table, tempdir / 'data.parquet')
|
||||
meta = pq.read_metadata(tempdir / 'data.parquet')
|
||||
for col in [0, 1]:
|
||||
cc = meta.row_group(0).column(col)
|
||||
assert cc.is_stats_set is True
|
||||
assert cc.statistics is not None
|
||||
|
||||
_write_table(table, tempdir / 'data2.parquet', write_statistics=False)
|
||||
meta = pq.read_metadata(tempdir / 'data2.parquet')
|
||||
for col in [0, 1]:
|
||||
cc = meta.row_group(0).column(col)
|
||||
assert cc.is_stats_set is False
|
||||
assert cc.statistics is None
|
||||
|
||||
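    # write_statistics also accepts a list of column names to enable statistics selectively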
_write_table(table, tempdir / 'data3.parquet', write_statistics=['a'])
|
||||
meta = pq.read_metadata(tempdir / 'data3.parquet')
|
||||
cc_a = meta.row_group(0).column(0)
|
||||
cc_b = meta.row_group(0).column(1)
|
||||
assert cc_a.is_stats_set is True
|
||||
assert cc_b.is_stats_set is False
|
||||
assert cc_a.statistics is not None
|
||||
assert cc_b.statistics is None
|
||||
|
||||
|
||||
def test_parquet_sorting_column():
|
||||
sorting_col = pq.SortingColumn(10)
|
||||
assert sorting_col.to_dict() == {
|
||||
'column_index': 10,
|
||||
'descending': False,
|
||||
'nulls_first': False
|
||||
}
|
||||
|
||||
sorting_col = pq.SortingColumn(0, descending=True, nulls_first=True)
|
||||
assert sorting_col.to_dict() == {
|
||||
'column_index': 0,
|
||||
'descending': True,
|
||||
'nulls_first': True
|
||||
}
|
||||
|
||||
schema = pa.schema([('a', pa.int64()), ('b', pa.int64())])
|
||||
sorting_cols = (
|
||||
pq.SortingColumn(1, descending=True),
|
||||
pq.SortingColumn(0, descending=False),
|
||||
)
|
||||
sort_order, null_placement = pq.SortingColumn.to_ordering(schema, sorting_cols)
|
||||
assert sort_order == (('b', "descending"), ('a', "ascending"))
|
||||
assert null_placement == "at_end"
|
||||
|
||||
sorting_cols_roundtripped = pq.SortingColumn.from_ordering(
|
||||
schema, sort_order, null_placement)
|
||||
assert sorting_cols_roundtripped == sorting_cols
|
||||
|
||||
sorting_cols = pq.SortingColumn.from_ordering(
|
||||
schema, ('a', ('b', "descending")), null_placement="at_start")
|
||||
expected = (
|
||||
pq.SortingColumn(0, descending=False, nulls_first=True),
|
||||
pq.SortingColumn(1, descending=True, nulls_first=True),
|
||||
)
|
||||
assert sorting_cols == expected
|
||||
|
||||
# Conversions handle empty tuples
|
||||
empty_sorting_cols = pq.SortingColumn.from_ordering(schema, ())
|
||||
assert empty_sorting_cols == ()
|
||||
|
||||
assert pq.SortingColumn.to_ordering(schema, ()) == ((), "at_end")
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
pq.SortingColumn.from_ordering(schema, (("a", "not a valid sort order")))
|
||||
|
||||
with pytest.raises(ValueError, match="inconsistent null placement"):
|
||||
sorting_cols = (
|
||||
pq.SortingColumn(1, nulls_first=True),
|
||||
pq.SortingColumn(0, nulls_first=False),
|
||||
)
|
||||
pq.SortingColumn.to_ordering(schema, sorting_cols)
|
||||
|
||||
|
||||
def test_parquet_sorting_column_nested():
|
||||
schema = pa.schema({
|
||||
'a': pa.struct([('x', pa.int64()), ('y', pa.int64())]),
|
||||
'b': pa.int64()
|
||||
})
|
||||
|
||||
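    # SortingColumn indices refer to leaf (physical) columns: a.x -> 0, a.y -> 1, b -> 2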
sorting_columns = [
|
||||
pq.SortingColumn(0, descending=True), # a.x
|
||||
pq.SortingColumn(2, descending=False) # b
|
||||
]
|
||||
|
||||
sort_order, null_placement = pq.SortingColumn.to_ordering(schema, sorting_columns)
|
||||
assert null_placement == "at_end"
|
||||
assert len(sort_order) == 2
|
||||
assert sort_order[0] == ("a.x", "descending")
|
||||
assert sort_order[1] == ("b", "ascending")
|
||||
|
||||
|
||||
def test_parquet_file_sorting_columns():
|
||||
table = pa.table({'a': [1, 2, 3], 'b': ['a', 'b', 'c']})
|
||||
|
||||
sorting_columns = (
|
||||
pq.SortingColumn(column_index=0, descending=True, nulls_first=True),
|
||||
pq.SortingColumn(column_index=1, descending=False),
|
||||
)
|
||||
writer = pa.BufferOutputStream()
|
||||
_write_table(table, writer, sorting_columns=sorting_columns)
|
||||
reader = pa.BufferReader(writer.getvalue())
|
||||
|
||||
# Can retrieve sorting columns from metadata
|
||||
metadata = pq.read_metadata(reader)
|
||||
assert sorting_columns == metadata.row_group(0).sorting_columns
|
||||
|
||||
metadata_dict = metadata.to_dict()
|
||||
assert metadata_dict.get('num_columns') == 2
|
||||
assert metadata_dict.get('num_rows') == 3
|
||||
assert metadata_dict.get('num_row_groups') == 1
|
||||
|
||||
|
||||
def test_field_id_metadata():
|
||||
# ARROW-7080
|
||||
field_id = b'PARQUET:field_id'
|
||||
inner = pa.field('inner', pa.int32(), metadata={field_id: b'100'})
|
||||
middle = pa.field('middle', pa.struct(
|
||||
[inner]), metadata={field_id: b'101'})
|
||||
fields = [
|
||||
pa.field('basic', pa.int32(), metadata={
|
||||
b'other': b'abc', field_id: b'1'}),
|
||||
pa.field(
|
||||
'list',
|
||||
pa.list_(pa.field('list-inner', pa.int32(),
|
||||
metadata={field_id: b'10'})),
|
||||
metadata={field_id: b'11'}),
|
||||
pa.field('struct', pa.struct([middle]), metadata={field_id: b'102'}),
|
||||
pa.field('no-metadata', pa.int32()),
|
||||
pa.field('non-integral-field-id', pa.int32(),
|
||||
metadata={field_id: b'xyz'}),
|
||||
pa.field('negative-field-id', pa.int32(),
|
||||
metadata={field_id: b'-1000'})
|
||||
]
|
||||
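    # Empty arrays suffice here; only the schema-level field_id metadata is being tested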
arrs = [[] for _ in fields]
|
||||
table = pa.table(arrs, schema=pa.schema(fields))
|
||||
|
||||
bio = pa.BufferOutputStream()
|
||||
pq.write_table(table, bio)
|
||||
contents = bio.getvalue()
|
||||
|
||||
pf = pq.ParquetFile(pa.BufferReader(contents))
|
||||
schema = pf.schema_arrow
|
||||
|
||||
assert schema[0].metadata[field_id] == b'1'
|
||||
assert schema[0].metadata[b'other'] == b'abc'
|
||||
|
||||
list_field = schema[1]
|
||||
assert list_field.metadata[field_id] == b'11'
|
||||
|
||||
list_item_field = list_field.type.value_field
|
||||
assert list_item_field.metadata[field_id] == b'10'
|
||||
|
||||
struct_field = schema[2]
|
||||
assert struct_field.metadata[field_id] == b'102'
|
||||
|
||||
struct_middle_field = struct_field.type[0]
|
||||
assert struct_middle_field.metadata[field_id] == b'101'
|
||||
|
||||
struct_inner_field = struct_middle_field.type[0]
|
||||
assert struct_inner_field.metadata[field_id] == b'100'
|
||||
|
||||
assert schema[3].metadata is None
|
||||
# Invalid input is passed through (ok) but does not
|
||||
# have field_id in parquet (not tested)
|
||||
assert schema[4].metadata[field_id] == b'xyz'
|
||||
assert schema[5].metadata[field_id] == b'-1000'
|
||||
|
||||
|
||||
def test_parquet_file_page_index():
|
||||
for write_page_index in (False, True):
|
||||
table = pa.table({'a': [1, 2, 3]})
|
||||
|
||||
writer = pa.BufferOutputStream()
|
||||
_write_table(table, writer, write_page_index=write_page_index)
|
||||
reader = pa.BufferReader(writer.getvalue())
|
||||
|
||||
        # Page index availability is reflected in the column chunk metadata
|
||||
metadata = pq.read_metadata(reader)
|
||||
cc = metadata.row_group(0).column(0)
|
||||
assert cc.has_offset_index is write_page_index
|
||||
assert cc.has_column_index is write_page_index
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_multi_dataset_metadata(tempdir):
|
||||
filenames = ["ARROW-1983-dataset.0", "ARROW-1983-dataset.1"]
|
||||
metapath = str(tempdir / "_metadata")
|
||||
|
||||
# create a test dataset
|
||||
df = pd.DataFrame({
|
||||
'one': [1, 2, 3],
|
||||
'two': [-1, -2, -3],
|
||||
'three': [[1, 2], [2, 3], [3, 4]],
|
||||
})
|
||||
table = pa.Table.from_pandas(df)
|
||||
|
||||
# write dataset twice and collect/merge metadata
|
||||
_meta = None
|
||||
for filename in filenames:
|
||||
meta = []
|
||||
pq.write_table(table, str(tempdir / filename),
|
||||
metadata_collector=meta)
|
||||
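        # Record the relative file path so the merged _metadata file can locate each piece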
meta[0].set_file_path(filename)
|
||||
if _meta is None:
|
||||
_meta = meta[0]
|
||||
else:
|
||||
_meta.append_row_groups(meta[0])
|
||||
|
||||
# Write merged metadata-only file
|
||||
with open(metapath, "wb") as f:
|
||||
_meta.write_metadata_file(f)
|
||||
|
||||
# Read back the metadata
|
||||
meta = pq.read_metadata(metapath)
|
||||
md = meta.to_dict()
|
||||
_md = _meta.to_dict()
|
||||
for key in _md:
|
||||
if key != 'serialized_size':
|
||||
assert _md[key] == md[key]
|
||||
assert _md['num_columns'] == 3
|
||||
assert _md['num_rows'] == 6
|
||||
assert _md['num_row_groups'] == 2
|
||||
assert _md['serialized_size'] == 0
|
||||
assert md['serialized_size'] > 0
|
||||
|
||||
|
||||
def test_metadata_hashing(tempdir):
|
||||
path1 = str(tempdir / "metadata1")
|
||||
schema1 = pa.schema([("a", "int64"), ("b", "float64")])
|
||||
pq.write_metadata(schema1, path1)
|
||||
parquet_meta1 = pq.read_metadata(path1)
|
||||
|
||||
# Same as 1, just different path
|
||||
path2 = str(tempdir / "metadata2")
|
||||
schema2 = pa.schema([("a", "int64"), ("b", "float64")])
|
||||
pq.write_metadata(schema2, path2)
|
||||
parquet_meta2 = pq.read_metadata(path2)
|
||||
|
||||
# different schema
|
||||
path3 = str(tempdir / "metadata3")
|
||||
schema3 = pa.schema([("a", "int64"), ("b", "float32")])
|
||||
pq.write_metadata(schema3, path3)
|
||||
parquet_meta3 = pq.read_metadata(path3)
|
||||
|
||||
# Deterministic
|
||||
assert hash(parquet_meta1) == hash(parquet_meta1) # equal w/ same instance
|
||||
assert hash(parquet_meta1) == hash(parquet_meta2) # equal w/ different instance
|
||||
|
||||
# Not the same as other metadata with different schema
|
||||
assert hash(parquet_meta1) != hash(parquet_meta3)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore:Parquet format:FutureWarning")
|
||||
def test_write_metadata(tempdir):
|
||||
path = str(tempdir / "metadata")
|
||||
schema = pa.schema([("a", "int64"), ("b", "float64")])
|
||||
|
||||
# write a pyarrow schema
|
||||
pq.write_metadata(schema, path)
|
||||
parquet_meta = pq.read_metadata(path)
|
||||
schema_as_arrow = parquet_meta.schema.to_arrow_schema()
|
||||
assert schema_as_arrow.equals(schema)
|
||||
|
||||
# ARROW-8980: Check that the ARROW:schema metadata key was removed
|
||||
if schema_as_arrow.metadata:
|
||||
assert b'ARROW:schema' not in schema_as_arrow.metadata
|
||||
|
||||
# pass through writer keyword arguments
|
||||
for version in ["1.0", "2.4", "2.6"]:
|
||||
pq.write_metadata(schema, path, version=version)
|
||||
parquet_meta = pq.read_metadata(path)
|
||||
# The version is stored as a single integer in the Parquet metadata,
|
||||
# so it cannot correctly express dotted format versions
|
||||
expected_version = "1.0" if version == "1.0" else "2.6"
|
||||
assert parquet_meta.format_version == expected_version
|
||||
|
||||
# metadata_collector: list of FileMetaData objects
|
||||
table = pa.table({'a': [1, 2], 'b': [.1, .2]}, schema=schema)
|
||||
pq.write_table(table, tempdir / "data.parquet")
|
||||
parquet_meta = pq.read_metadata(str(tempdir / "data.parquet"))
|
||||
pq.write_metadata(
|
||||
schema, path, metadata_collector=[parquet_meta, parquet_meta]
|
||||
)
|
||||
parquet_meta_mult = pq.read_metadata(path)
|
||||
assert parquet_meta_mult.num_row_groups == 2
|
||||
|
||||
# append metadata with different schema raises an error
|
||||
msg = ("AppendRowGroups requires equal schemas.\n"
|
||||
"The two columns with index 0 differ.")
|
||||
with pytest.raises(RuntimeError, match=msg):
|
||||
pq.write_metadata(
|
||||
pa.schema([("a", "int32"), ("b", "null")]),
|
||||
path, metadata_collector=[parquet_meta, parquet_meta]
|
||||
)
|
||||
|
||||
|
||||
def test_table_large_metadata():
|
||||
# ARROW-8694
|
||||
my_schema = pa.schema([pa.field('f0', 'double')],
|
||||
metadata={'large': 'x' * 10000000})
|
||||
|
||||
table = pa.table([range(10)], schema=my_schema)
|
||||
_check_roundtrip(table)
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_compare_schemas():
|
||||
df = alltypes_sample(size=10000)
|
||||
|
||||
fileh = make_sample_file(df)
|
||||
fileh2 = make_sample_file(df)
|
||||
fileh3 = make_sample_file(df[df.columns[::2]])
|
||||
|
||||
# ParquetSchema
|
||||
assert isinstance(fileh.schema, pq.ParquetSchema)
|
||||
assert fileh.schema.equals(fileh.schema)
|
||||
assert fileh.schema == fileh.schema
|
||||
assert fileh.schema.equals(fileh2.schema)
|
||||
assert fileh.schema == fileh2.schema
|
||||
assert fileh.schema != 'arbitrary object'
|
||||
assert not fileh.schema.equals(fileh3.schema)
|
||||
assert fileh.schema != fileh3.schema
|
||||
|
||||
# ColumnSchema
|
||||
assert isinstance(fileh.schema[0], pq.ColumnSchema)
|
||||
assert fileh.schema[0].equals(fileh.schema[0])
|
||||
assert fileh.schema[0] == fileh.schema[0]
|
||||
assert not fileh.schema[0].equals(fileh.schema[1])
|
||||
assert fileh.schema[0] != fileh.schema[1]
|
||||
assert fileh.schema[0] != 'arbitrary object'
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_read_schema(tempdir):
|
||||
N = 100
|
||||
df = pd.DataFrame({
|
||||
'index': np.arange(N),
|
||||
'values': np.random.randn(N)
|
||||
}, columns=['index', 'values'])
|
||||
|
||||
data_path = tempdir / 'test.parquet'
|
||||
|
||||
table = pa.Table.from_pandas(df)
|
||||
_write_table(table, data_path)
|
||||
|
||||
read1 = pq.read_schema(data_path)
|
||||
read2 = pq.read_schema(data_path, memory_map=True)
|
||||
assert table.schema.equals(read1)
|
||||
assert table.schema.equals(read2)
|
||||
|
||||
assert table.schema.metadata[b'pandas'] == read1.metadata[b'pandas']
|
||||
|
||||
|
||||
def test_parquet_metadata_empty_to_dict(tempdir):
|
||||
# https://issues.apache.org/jira/browse/ARROW-10146
|
||||
table = pa.table({"a": pa.array([], type="int64")})
|
||||
pq.write_table(table, tempdir / "data.parquet")
|
||||
metadata = pq.read_metadata(tempdir / "data.parquet")
|
||||
# ensure this doesn't error / statistics set to None
|
||||
metadata_dict = metadata.to_dict()
|
||||
assert len(metadata_dict["row_groups"]) == 1
|
||||
assert len(metadata_dict["row_groups"][0]["columns"]) == 1
|
||||
assert metadata_dict["row_groups"][0]["columns"][0]["statistics"] is None
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.large_memory
|
||||
def test_metadata_exceeds_message_size():
|
||||
    # ARROW-13655: Thrift may enforce a default message size limit that caps
|
||||
# the size of Parquet metadata that can be written.
|
||||
NCOLS = 1000
|
||||
NREPEATS = 4000
|
||||
|
||||
table = pa.table({str(i): np.random.randn(10) for i in range(NCOLS)})
|
||||
|
||||
with pa.BufferOutputStream() as out:
|
||||
pq.write_table(table, out)
|
||||
buf = out.getvalue()
|
||||
|
||||
original_metadata = pq.read_metadata(pa.BufferReader(buf))
|
||||
metadata = pq.read_metadata(pa.BufferReader(buf))
|
||||
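    # Repeatedly appending the same row groups inflates the metadata well past the default Thrift message size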
for i in range(NREPEATS):
|
||||
metadata.append_row_groups(original_metadata)
|
||||
|
||||
with pa.BufferOutputStream() as out:
|
||||
metadata.write_metadata_file(out)
|
||||
buf = out.getvalue()
|
||||
|
||||
metadata = pq.read_metadata(pa.BufferReader(buf))
|
||||
|
||||
|
||||
def test_metadata_schema_filesystem(tempdir):
|
||||
table = pa.table({"a": [1, 2, 3]})
|
||||
|
||||
# URI writing to local file.
|
||||
fname = "data.parquet"
|
||||
file_path = str(tempdir / fname)
|
||||
file_uri = 'file:///' + file_path
|
||||
|
||||
pq.write_table(table, file_path)
|
||||
|
||||
# Get expected `metadata` from path.
|
||||
metadata = pq.read_metadata(tempdir / fname)
|
||||
schema = table.schema
|
||||
|
||||
assert pq.read_metadata(file_uri).equals(metadata)
|
||||
assert pq.read_metadata(
|
||||
file_path, filesystem=LocalFileSystem()).equals(metadata)
|
||||
assert pq.read_metadata(
|
||||
fname, filesystem=f'file:///{tempdir}').equals(metadata)
|
||||
|
||||
assert pq.read_schema(file_uri).equals(schema)
|
||||
assert pq.read_schema(
|
||||
file_path, filesystem=LocalFileSystem()).equals(schema)
|
||||
assert pq.read_schema(
|
||||
fname, filesystem=f'file:///{tempdir}').equals(schema)
|
||||
|
||||
with util.change_cwd(tempdir):
|
||||
# Pass `filesystem` arg
|
||||
assert pq.read_metadata(
|
||||
fname, filesystem=LocalFileSystem()).equals(metadata)
|
||||
|
||||
assert pq.read_schema(
|
||||
fname, filesystem=LocalFileSystem()).equals(schema)
|
||||
|
||||
|
||||
def test_metadata_equals():
|
||||
table = pa.table({"a": [1, 2, 3]})
|
||||
with pa.BufferOutputStream() as out:
|
||||
pq.write_table(table, out)
|
||||
buf = out.getvalue()
|
||||
|
||||
original_metadata = pq.read_metadata(pa.BufferReader(buf))
|
||||
match = "Argument 'other' has incorrect type"
|
||||
with pytest.raises(TypeError, match=match):
|
||||
original_metadata.equals(None)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("t1,t2,expected_error", (
|
||||
({'col1': range(10)}, {'col1': range(10)}, None),
|
||||
({'col1': range(10)}, {'col2': range(10)},
|
||||
"The two columns with index 0 differ."),
|
||||
({'col1': range(10), 'col2': range(10)}, {'col3': range(10)},
|
||||
"This schema has 2 columns, other has 1")
|
||||
))
|
||||
def test_metadata_append_row_groups_diff(t1, t2, expected_error):
|
||||
table1 = pa.table(t1)
|
||||
table2 = pa.table(t2)
|
||||
|
||||
buf1 = io.BytesIO()
|
||||
buf2 = io.BytesIO()
|
||||
pq.write_table(table1, buf1)
|
||||
pq.write_table(table2, buf2)
|
||||
buf1.seek(0)
|
||||
buf2.seek(0)
|
||||
|
||||
meta1 = pq.ParquetFile(buf1).metadata
|
||||
meta2 = pq.ParquetFile(buf2).metadata
|
||||
|
||||
if expected_error:
|
||||
        # The error message makes clear that it occurs in the append_row_groups call
|
||||
prefix = "AppendRowGroups requires equal schemas.\n"
|
||||
with pytest.raises(RuntimeError, match=prefix + expected_error):
|
||||
meta1.append_row_groups(meta2)
|
||||
else:
|
||||
meta1.append_row_groups(meta2)
|
||||
|
||||
|
||||
@pytest.mark.s3
|
||||
def test_write_metadata_fs_file_combinations(tempdir, s3_example_s3fs):
|
||||
s3_fs, s3_path = s3_example_s3fs
|
||||
|
||||
meta1 = tempdir / "meta1"
|
||||
meta2 = tempdir / "meta2"
|
||||
meta3 = tempdir / "meta3"
|
||||
meta4 = tempdir / "meta4"
|
||||
meta5 = f"{s3_path}/meta5"
|
||||
|
||||
table = pa.table({"col": range(5)})
|
||||
|
||||
# plain local path
|
||||
pq.write_metadata(table.schema, meta1, [])
|
||||
|
||||
    # Use the LocalFileSystem to resolve opening an output stream
|
||||
pq.write_metadata(table.schema, meta2, [], filesystem=LocalFileSystem())
|
||||
|
||||
# Can resolve local file URI
|
||||
pq.write_metadata(table.schema, meta3.as_uri(), [])
|
||||
|
||||
    # Pass a file-like object all the way through
|
||||
with meta4.open('wb+') as meta4_stream:
|
||||
pq.write_metadata(table.schema, meta4_stream, [])
|
||||
|
||||
# S3FileSystem
|
||||
pq.write_metadata(table.schema, meta5, [], filesystem=s3_fs)
|
||||
|
||||
assert meta1.read_bytes() == meta2.read_bytes() \
|
||||
== meta3.read_bytes() == meta4.read_bytes() \
|
||||
== s3_fs.open(meta5).read()
|
||||
|
||||
|
||||
def test_column_chunk_key_value_metadata(parquet_test_datadir):
|
||||
metadata = pq.read_metadata(parquet_test_datadir /
|
||||
'column_chunk_key_value_metadata.parquet')
|
||||
key_value_metadata1 = metadata.row_group(0).column(0).metadata
|
||||
assert key_value_metadata1 == {b'foo': b'bar', b'thisiskeywithoutvalue': b''}
|
||||
key_value_metadata2 = metadata.row_group(0).column(1).metadata
|
||||
assert key_value_metadata2 is None
|
||||
|
||||
|
||||
def test_internal_class_instantiation():
|
||||
def msg(c):
|
||||
return f"Do not call {c}'s constructor directly"
|
||||
|
||||
with pytest.raises(TypeError, match=msg("Statistics")):
|
||||
pq.Statistics()
|
||||
|
||||
with pytest.raises(TypeError, match=msg("ParquetLogicalType")):
|
||||
pq.ParquetLogicalType()
|
||||
|
||||
with pytest.raises(TypeError, match=msg("ColumnChunkMetaData")):
|
||||
pq.ColumnChunkMetaData()
|
||||
|
||||
with pytest.raises(TypeError, match=msg("RowGroupMetaData")):
|
||||
pq.RowGroupMetaData()
|
||||
|
||||
with pytest.raises(TypeError, match=msg("FileMetaData")):
|
||||
pq.FileMetaData()
|
||||
@@ -0,0 +1,680 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
import io
|
||||
import json
|
||||
|
||||
try:
|
||||
import numpy as np
|
||||
except ImportError:
|
||||
np = None
|
||||
import pytest
|
||||
|
||||
import pyarrow as pa
|
||||
from pyarrow.fs import LocalFileSystem, SubTreeFileSystem
|
||||
from pyarrow.util import guid
|
||||
from pyarrow.vendored.version import Version
|
||||
|
||||
try:
|
||||
import pyarrow.parquet as pq
|
||||
from pyarrow.tests.parquet.common import (_read_table, _test_dataframe,
|
||||
_write_table)
|
||||
except ImportError:
|
||||
pq = None
|
||||
|
||||
|
||||
try:
|
||||
import pandas as pd
|
||||
import pandas.testing as tm
|
||||
|
||||
from pyarrow.tests.parquet.common import (_roundtrip_pandas_dataframe,
|
||||
alltypes_sample)
|
||||
except ImportError:
|
||||
pd = tm = None
|
||||
|
||||
|
||||
# Marks all of the tests in this module
|
||||
# Ignore these with pytest ... -m 'not parquet'
|
||||
pytestmark = pytest.mark.parquet
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_pandas_parquet_custom_metadata(tempdir):
|
||||
df = alltypes_sample(size=10000)
|
||||
|
||||
filename = tempdir / 'pandas_roundtrip.parquet'
|
||||
arrow_table = pa.Table.from_pandas(df)
|
||||
assert b'pandas' in arrow_table.schema.metadata
|
||||
|
||||
_write_table(arrow_table, filename)
|
||||
|
||||
metadata = pq.read_metadata(filename).metadata
|
||||
assert b'pandas' in metadata
|
||||
|
||||
js = json.loads(metadata[b'pandas'].decode('utf8'))
|
||||
assert js['index_columns'] == [{'kind': 'range',
|
||||
'name': None,
|
||||
'start': 0, 'stop': 10000,
|
||||
'step': 1}]
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_merging_parquet_tables_with_different_pandas_metadata(tempdir):
|
||||
# ARROW-3728: Merging Parquet Files - Pandas Meta in Schema Mismatch
|
||||
schema = pa.schema([
|
||||
pa.field('int', pa.int16()),
|
||||
pa.field('float', pa.float32()),
|
||||
pa.field('string', pa.string())
|
||||
])
|
||||
df1 = pd.DataFrame({
|
||||
'int': np.arange(3, dtype=np.uint8),
|
||||
'float': np.arange(3, dtype=np.float32),
|
||||
'string': ['ABBA', 'EDDA', 'ACDC']
|
||||
})
|
||||
df2 = pd.DataFrame({
|
||||
'int': [4, 5],
|
||||
'float': [1.1, None],
|
||||
'string': [None, None]
|
||||
})
|
||||
table1 = pa.Table.from_pandas(df1, schema=schema, preserve_index=False)
|
||||
table2 = pa.Table.from_pandas(df2, schema=schema, preserve_index=False)
|
||||
|
||||
assert not table1.schema.equals(table2.schema, check_metadata=True)
|
||||
assert table1.schema.equals(table2.schema)
|
||||
|
||||
writer = pq.ParquetWriter(tempdir / 'merged.parquet', schema=schema)
|
||||
writer.write_table(table1)
|
||||
writer.write_table(table2)
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_attributes_metadata_persistence(tempdir):
|
||||
# GH-45382: Add support for pandas DataFrame.attrs
|
||||
# During the .parquet file writing, the attrs are serialised into json
|
||||
# along with the rest of the pandas.DataFrame metadata.
|
||||
|
||||
filename = tempdir / "metadata_persistence.parquet"
|
||||
df = alltypes_sample(size=10000)
|
||||
df.attrs = {
|
||||
'float16': 'half-precision',
|
||||
'float32': 'single precision',
|
||||
'float64': 'double precision',
|
||||
        'description': 'Attributes Persistence Test DataFrame',
|
||||
}
|
||||
|
||||
table = pa.Table.from_pandas(df)
|
||||
assert b'attributes' in table.schema.metadata[b'pandas']
|
||||
|
||||
_write_table(table, filename)
|
||||
metadata = pq.read_metadata(filename).metadata
|
||||
js = json.loads(metadata[b'pandas'].decode('utf8'))
|
||||
assert 'attributes' in js
|
||||
assert js['attributes'] == df.attrs
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_pandas_parquet_column_multiindex(tempdir):
|
||||
df = alltypes_sample(size=10)
|
||||
df.columns = pd.MultiIndex.from_tuples(
|
||||
list(zip(df.columns, df.columns[::-1])),
|
||||
names=['level_1', 'level_2']
|
||||
)
|
||||
|
||||
filename = tempdir / 'pandas_roundtrip.parquet'
|
||||
arrow_table = pa.Table.from_pandas(df)
|
||||
assert arrow_table.schema.pandas_metadata is not None
|
||||
|
||||
_write_table(arrow_table, filename)
|
||||
|
||||
table_read = pq.read_pandas(filename)
|
||||
df_read = table_read.to_pandas()
|
||||
tm.assert_frame_equal(df, df_read)
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_pandas_parquet_2_roundtrip_read_pandas_no_index_written(tempdir):
|
||||
df = alltypes_sample(size=10000)
|
||||
|
||||
filename = tempdir / 'pandas_roundtrip.parquet'
|
||||
arrow_table = pa.Table.from_pandas(df, preserve_index=False)
|
||||
js = arrow_table.schema.pandas_metadata
|
||||
assert not js['index_columns']
|
||||
# ARROW-2170
|
||||
    # While index_columns should be empty, the 'columns' entry still needs to be filled.
|
||||
assert js['columns']
|
||||
|
||||
_write_table(arrow_table, filename)
|
||||
table_read = pq.read_pandas(filename)
|
||||
|
||||
js = table_read.schema.pandas_metadata
|
||||
assert not js['index_columns']
|
||||
|
||||
read_metadata = table_read.schema.metadata
|
||||
assert arrow_table.schema.metadata == read_metadata
|
||||
|
||||
df_read = table_read.to_pandas()
|
||||
tm.assert_frame_equal(df, df_read)
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_pandas_parquet_native_file_roundtrip():
|
||||
df = _test_dataframe(10000)
|
||||
arrow_table = pa.Table.from_pandas(df)
|
||||
imos = pa.BufferOutputStream()
|
||||
_write_table(arrow_table, imos, version='2.6')
|
||||
buf = imos.getvalue()
|
||||
reader = pa.BufferReader(buf)
|
||||
df_read = _read_table(reader).to_pandas()
|
||||
tm.assert_frame_equal(df, df_read)
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_read_pandas_column_subset():
|
||||
df = _test_dataframe(10000)
|
||||
arrow_table = pa.Table.from_pandas(df)
|
||||
imos = pa.BufferOutputStream()
|
||||
_write_table(arrow_table, imos, version='2.6')
|
||||
buf = imos.getvalue()
|
||||
reader = pa.BufferReader(buf)
|
||||
df_read = pq.read_pandas(
|
||||
reader, columns=['strings', 'uint8'],
|
||||
).to_pandas()
|
||||
tm.assert_frame_equal(df[['strings', 'uint8']], df_read)
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_pandas_parquet_empty_roundtrip():
|
||||
df = _test_dataframe(0)
|
||||
arrow_table = pa.Table.from_pandas(df)
|
||||
imos = pa.BufferOutputStream()
|
||||
_write_table(arrow_table, imos, version='2.6')
|
||||
buf = imos.getvalue()
|
||||
reader = pa.BufferReader(buf)
|
||||
df_read = _read_table(reader).to_pandas()
|
||||
tm.assert_frame_equal(df, df_read)
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_pandas_can_write_nested_data():
|
||||
data = {
|
||||
"agg_col": [
|
||||
{"page_type": 1},
|
||||
{"record_type": 1},
|
||||
{"non_consecutive_home": 0},
|
||||
],
|
||||
"uid_first": "1001"
|
||||
}
|
||||
df = pd.DataFrame(data=data)
|
||||
arrow_table = pa.Table.from_pandas(df)
|
||||
imos = pa.BufferOutputStream()
|
||||
# This succeeds under V2
|
||||
_write_table(arrow_table, imos)
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_pandas_parquet_pyfile_roundtrip(tempdir):
|
||||
filename = tempdir / 'pandas_pyfile_roundtrip.parquet'
|
||||
size = 5
|
||||
df = pd.DataFrame({
|
||||
'int64': np.arange(size, dtype=np.int64),
|
||||
'float32': np.arange(size, dtype=np.float32),
|
||||
'float64': np.arange(size, dtype=np.float64),
|
||||
'bool': np.random.randn(size) > 0,
|
||||
'strings': ['foo', 'bar', None, 'baz', 'qux']
|
||||
})
|
||||
|
||||
arrow_table = pa.Table.from_pandas(df)
|
||||
|
||||
with filename.open('wb') as f:
|
||||
_write_table(arrow_table, f, version="2.6")
|
||||
|
||||
data = io.BytesIO(filename.read_bytes())
|
||||
|
||||
table_read = _read_table(data)
|
||||
df_read = table_read.to_pandas()
|
||||
tm.assert_frame_equal(df, df_read)
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_pandas_parquet_configuration_options(tempdir):
|
||||
size = 10000
|
||||
np.random.seed(0)
|
||||
df = pd.DataFrame({
|
||||
'uint8': np.arange(size, dtype=np.uint8),
|
||||
'uint16': np.arange(size, dtype=np.uint16),
|
||||
'uint32': np.arange(size, dtype=np.uint32),
|
||||
'uint64': np.arange(size, dtype=np.uint64),
|
||||
'int8': np.arange(size, dtype=np.int16),
|
||||
'int16': np.arange(size, dtype=np.int16),
|
||||
'int32': np.arange(size, dtype=np.int32),
|
||||
'int64': np.arange(size, dtype=np.int64),
|
||||
'float32': np.arange(size, dtype=np.float32),
|
||||
'float64': np.arange(size, dtype=np.float64),
|
||||
'bool': np.random.randn(size) > 0
|
||||
})
|
||||
filename = tempdir / 'pandas_roundtrip.parquet'
|
||||
arrow_table = pa.Table.from_pandas(df)
|
||||
|
||||
for use_dictionary in [True, False]:
|
||||
_write_table(arrow_table, filename, version='2.6',
|
||||
use_dictionary=use_dictionary)
|
||||
table_read = _read_table(filename)
|
||||
df_read = table_read.to_pandas()
|
||||
tm.assert_frame_equal(df, df_read)
|
||||
|
||||
for write_statistics in [True, False]:
|
||||
_write_table(arrow_table, filename, version='2.6',
|
||||
write_statistics=write_statistics)
|
||||
table_read = _read_table(filename)
|
||||
df_read = table_read.to_pandas()
|
||||
tm.assert_frame_equal(df, df_read)
|
||||
|
||||
for compression in ['NONE', 'SNAPPY', 'GZIP', 'LZ4', 'ZSTD']:
|
||||
if (compression != 'NONE' and
|
||||
not pa.lib.Codec.is_available(compression)):
|
||||
continue
|
||||
_write_table(arrow_table, filename, version='2.6',
|
||||
compression=compression)
|
||||
table_read = _read_table(filename)
|
||||
df_read = table_read.to_pandas()
|
||||
tm.assert_frame_equal(df, df_read)
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_spark_flavor_preserves_pandas_metadata():
|
||||
df = _test_dataframe(size=100)
|
||||
df.index = np.arange(0, 10 * len(df), 10)
|
||||
df.index.name = 'foo'
|
||||
|
||||
result = _roundtrip_pandas_dataframe(df, {'flavor': 'spark'})
|
||||
tm.assert_frame_equal(result, df)
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_index_column_name_duplicate(tempdir):
|
||||
data = {
|
||||
'close': {
|
||||
pd.Timestamp('2017-06-30 01:31:00'): 154.99958999999998,
|
||||
pd.Timestamp('2017-06-30 01:32:00'): 154.99958999999998,
|
||||
},
|
||||
'time': {
|
||||
pd.Timestamp('2017-06-30 01:31:00'): pd.Timestamp(
|
||||
'2017-06-30 01:31:00'
|
||||
),
|
||||
pd.Timestamp('2017-06-30 01:32:00'): pd.Timestamp(
|
||||
'2017-06-30 01:32:00'
|
||||
),
|
||||
}
|
||||
}
|
||||
path = str(tempdir / 'data.parquet')
|
||||
|
||||
# Pandas v2 defaults to [ns], but Arrow defaults to [us] time units
|
||||
# so we need to cast the pandas dtype. Pandas v1 will always silently
|
||||
# coerce to [ns] due to lack of non-[ns] support.
|
||||
dfx = pd.DataFrame(data, dtype='datetime64[us]').set_index('time', drop=False)
|
||||
|
||||
tdfx = pa.Table.from_pandas(dfx)
|
||||
_write_table(tdfx, path)
|
||||
arrow_table = _read_table(path)
|
||||
result_df = arrow_table.to_pandas()
|
||||
tm.assert_frame_equal(result_df, dfx)
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_multiindex_duplicate_values(tempdir):
|
||||
num_rows = 3
|
||||
numbers = list(range(num_rows))
|
||||
index = pd.MultiIndex.from_arrays(
|
||||
[['foo', 'foo', 'bar'], numbers],
|
||||
names=['foobar', 'some_numbers'],
|
||||
)
|
||||
|
||||
df = pd.DataFrame({'numbers': numbers}, index=index)
|
||||
table = pa.Table.from_pandas(df)
|
||||
|
||||
filename = tempdir / 'dup_multi_index_levels.parquet'
|
||||
|
||||
_write_table(table, filename)
|
||||
result_table = _read_table(filename)
|
||||
assert table.equals(result_table)
|
||||
|
||||
result_df = result_table.to_pandas()
|
||||
tm.assert_frame_equal(result_df, df)
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_backwards_compatible_index_naming(datadir):
|
||||
expected_string = b"""\
|
||||
carat cut color clarity depth table price x y z
|
||||
0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
|
||||
0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
|
||||
0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31
|
||||
0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63
|
||||
0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75
|
||||
0.24 Very Good J VVS2 62.8 57.0 336 3.94 3.96 2.48
|
||||
0.24 Very Good I VVS1 62.3 57.0 336 3.95 3.98 2.47
|
||||
0.26 Very Good H SI1 61.9 55.0 337 4.07 4.11 2.53
|
||||
0.22 Fair E VS2 65.1 61.0 337 3.87 3.78 2.49
|
||||
0.23 Very Good H VS1 59.4 61.0 338 4.00 4.05 2.39"""
|
||||
expected = pd.read_csv(io.BytesIO(expected_string), sep=r'\s{2,}',
|
||||
index_col=None, header=0, engine='python')
|
||||
table = _read_table(datadir / 'v0.7.1.parquet')
|
||||
result = table.to_pandas()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_backwards_compatible_index_multi_level_named(datadir):
|
||||
expected_string = b"""\
|
||||
carat cut color clarity depth table price x y z
|
||||
0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
|
||||
0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
|
||||
0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31
|
||||
0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63
|
||||
0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75
|
||||
0.24 Very Good J VVS2 62.8 57.0 336 3.94 3.96 2.48
|
||||
0.24 Very Good I VVS1 62.3 57.0 336 3.95 3.98 2.47
|
||||
0.26 Very Good H SI1 61.9 55.0 337 4.07 4.11 2.53
|
||||
0.22 Fair E VS2 65.1 61.0 337 3.87 3.78 2.49
|
||||
0.23 Very Good H VS1 59.4 61.0 338 4.00 4.05 2.39"""
|
||||
expected = pd.read_csv(
|
||||
io.BytesIO(expected_string), sep=r'\s{2,}',
|
||||
index_col=['cut', 'color', 'clarity'],
|
||||
header=0, engine='python'
|
||||
).sort_index()
|
||||
|
||||
table = _read_table(datadir / 'v0.7.1.all-named-index.parquet')
|
||||
result = table.to_pandas()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_backwards_compatible_index_multi_level_some_named(datadir):
|
||||
expected_string = b"""\
|
||||
carat cut color clarity depth table price x y z
|
||||
0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
|
||||
0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
|
||||
0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31
|
||||
0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63
|
||||
0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75
|
||||
0.24 Very Good J VVS2 62.8 57.0 336 3.94 3.96 2.48
|
||||
0.24 Very Good I VVS1 62.3 57.0 336 3.95 3.98 2.47
|
||||
0.26 Very Good H SI1 61.9 55.0 337 4.07 4.11 2.53
|
||||
0.22 Fair E VS2 65.1 61.0 337 3.87 3.78 2.49
|
||||
0.23 Very Good H VS1 59.4 61.0 338 4.00 4.05 2.39"""
|
||||
expected = pd.read_csv(
|
||||
io.BytesIO(expected_string),
|
||||
sep=r'\s{2,}', index_col=['cut', 'color', 'clarity'],
|
||||
header=0, engine='python'
|
||||
).sort_index()
|
||||
expected.index = expected.index.set_names(['cut', None, 'clarity'])
|
||||
|
||||
table = _read_table(datadir / 'v0.7.1.some-named-index.parquet')
|
||||
result = table.to_pandas()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_backwards_compatible_column_metadata_handling(datadir):
|
||||
if Version("2.2.0") <= Version(pd.__version__):
|
||||
# TODO: regression in pandas
|
||||
# https://github.com/pandas-dev/pandas/issues/56775
|
||||
pytest.skip("Regression in pandas 2.2.0")
|
||||
expected = pd.DataFrame(
|
||||
{'a': [1, 2, 3], 'b': [.1, .2, .3],
|
||||
'c': pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')})
|
||||
expected.index = pd.MultiIndex.from_arrays(
|
||||
[['a', 'b', 'c'],
|
||||
pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')],
|
||||
names=['index', None])
|
||||
|
||||
path = datadir / 'v0.7.1.column-metadata-handling.parquet'
|
||||
table = _read_table(path)
|
||||
result = table.to_pandas()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
table = _read_table(
|
||||
path, columns=['a'])
|
||||
result = table.to_pandas()
|
||||
tm.assert_frame_equal(result, expected[['a']].reset_index(drop=True))
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_categorical_index_survives_roundtrip():
|
||||
# ARROW-3652, addressed by ARROW-3246
|
||||
df = pd.DataFrame([['a', 'b'], ['c', 'd']], columns=['c1', 'c2'])
|
||||
df['c1'] = df['c1'].astype('category')
|
||||
df = df.set_index(['c1'])
|
||||
|
||||
table = pa.Table.from_pandas(df)
|
||||
bos = pa.BufferOutputStream()
|
||||
pq.write_table(table, bos)
|
||||
ref_df = pq.read_pandas(bos.getvalue()).to_pandas()
|
||||
assert isinstance(ref_df.index, pd.CategoricalIndex)
|
||||
assert ref_df.index.equals(df.index)
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_categorical_order_survives_roundtrip():
|
||||
# ARROW-6302
|
||||
df = pd.DataFrame({"a": pd.Categorical(
|
||||
["a", "b", "c", "a"], categories=["b", "c", "d"], ordered=True)})
|
||||
|
||||
table = pa.Table.from_pandas(df)
|
||||
bos = pa.BufferOutputStream()
|
||||
pq.write_table(table, bos)
|
||||
|
||||
contents = bos.getvalue()
|
||||
result = pq.read_pandas(contents).to_pandas()
|
||||
|
||||
tm.assert_frame_equal(result, df)
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_pandas_categorical_na_type_row_groups():
|
||||
# ARROW-5085
|
||||
df = pd.DataFrame({"col": [None] * 100, "int": [1.0] * 100})
|
||||
df_category = df.astype({"col": "category", "int": "category"})
|
||||
table = pa.Table.from_pandas(df)
|
||||
table_cat = pa.Table.from_pandas(df_category)
|
||||
buf = pa.BufferOutputStream()
|
||||
|
||||
# it works
|
||||
pq.write_table(table_cat, buf, version='2.6', chunk_size=10)
|
||||
result = pq.read_table(buf.getvalue())
|
||||
|
||||
# Result is non-categorical
|
||||
assert result[0].equals(table[0])
|
||||
assert result[1].equals(table[1])
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_pandas_categorical_roundtrip():
|
||||
# ARROW-5480, this was enabled by ARROW-3246
|
||||
|
||||
# Have one of the categories unobserved and include a null (-1)
|
||||
codes = np.array([2, 0, 0, 2, 0, -1, 2], dtype='int32')
|
||||
categories = ['foo', 'bar', 'baz']
|
||||
df = pd.DataFrame({'x': pd.Categorical.from_codes(
|
||||
codes, categories=categories)})
|
||||
|
||||
buf = pa.BufferOutputStream()
|
||||
pq.write_table(pa.table(df), buf)
|
||||
|
||||
result = pq.read_table(buf.getvalue()).to_pandas()
|
||||
assert result.x.dtype == 'category'
|
||||
assert (result.x.cat.categories == categories).all()
|
||||
tm.assert_frame_equal(result, df)
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_categories_with_string_pyarrow_dtype(tempdir):
|
||||
# gh-33727: writing to parquet should not fail
|
||||
if Version(pd.__version__) < Version("1.3.0"):
|
||||
pytest.skip("PyArrow backed string data type introduced in pandas 1.3.0")
|
||||
|
||||
df1 = pd.DataFrame({"x": ["foo", "bar", "foo"]}, dtype="string[pyarrow]")
|
||||
df1 = df1.astype("category")
|
||||
|
||||
df2 = pd.DataFrame({"x": ["foo", "bar", "foo"]})
|
||||
df2 = df2.astype("category")
|
||||
|
||||
# categories should be converted to pa.Array
|
||||
assert pa.array(df1["x"]).to_pylist() == pa.array(df2["x"]).to_pylist()
|
||||
assert pa.array(df1["x"].cat.categories.values).to_pylist() == pa.array(
|
||||
df2["x"].cat.categories.values).to_pylist()
|
||||
|
||||
path = str(tempdir / 'cat.parquet')
|
||||
pq.write_table(pa.table(df1), path)
|
||||
result = pq.read_table(path).to_pandas()
|
||||
|
||||
tm.assert_frame_equal(result, df2)
|
||||
|
||||
|
||||
@pytest.mark.pandas
|
||||
def test_write_to_dataset_pandas_preserve_extensiondtypes(tempdir):
|
||||
df = pd.DataFrame({'part': 'a', "col": [1, 2, 3]})
|
||||
df['col'] = df['col'].astype("Int64")
|
||||
table = pa.table(df)
|
||||
|
||||
    pq.write_to_dataset(
        table, str(tempdir / "case1"), partition_cols=['part'],
    )
    result = pq.read_table(str(tempdir / "case1")).to_pandas()
    tm.assert_frame_equal(result[["col"]], df[["col"]])

    pq.write_to_dataset(table, str(tempdir / "case2"))
    result = pq.read_table(str(tempdir / "case2")).to_pandas()
    tm.assert_frame_equal(result[["col"]], df[["col"]])

    pq.write_table(table, str(tempdir / "data.parquet"))
    result = pq.read_table(str(tempdir / "data.parquet")).to_pandas()
    tm.assert_frame_equal(result[["col"]], df[["col"]])


@pytest.mark.pandas
def test_write_to_dataset_pandas_preserve_index(tempdir):
    # ARROW-8251 - preserve pandas index in roundtrip

    df = pd.DataFrame({'part': ['a', 'a', 'b'], "col": [1, 2, 3]})
    df.index = pd.Index(['a', 'b', 'c'], name="idx")
    table = pa.table(df)
    df_cat = df[["col", "part"]].copy()
    df_cat["part"] = df_cat["part"].astype("category")

    pq.write_to_dataset(
        table, str(tempdir / "case1"), partition_cols=['part'],
    )
    result = pq.read_table(str(tempdir / "case1")).to_pandas()
    tm.assert_frame_equal(result, df_cat)

    pq.write_to_dataset(table, str(tempdir / "case2"))
    result = pq.read_table(str(tempdir / "case2")).to_pandas()
    tm.assert_frame_equal(result, df)

    pq.write_table(table, str(tempdir / "data.parquet"))
    result = pq.read_table(str(tempdir / "data.parquet")).to_pandas()
    tm.assert_frame_equal(result, df)


@pytest.mark.pandas
@pytest.mark.parametrize('preserve_index', [True, False, None])
@pytest.mark.parametrize('metadata_fname', ["_metadata", "_common_metadata"])
def test_dataset_read_pandas_common_metadata(
    tempdir, preserve_index, metadata_fname
):
    # ARROW-1103
    nfiles = 5
    size = 5

    dirpath = tempdir / guid()
    dirpath.mkdir()

    test_data = []
    frames = []
    paths = []
    for i in range(nfiles):
        df = _test_dataframe(size, seed=i)
        df.index = pd.Index(
            np.arange(i * size, (i + 1) * size, dtype="int64"), name='index'
        )

        path = dirpath / f'{i}.parquet'

        table = pa.Table.from_pandas(df, preserve_index=preserve_index)

        # Obliterate metadata
        table = table.replace_schema_metadata(None)
        assert table.schema.metadata is None

        _write_table(table, path)
        test_data.append(table)
        frames.append(df)
        paths.append(path)

    # Write _metadata common file
    table_for_metadata = pa.Table.from_pandas(
        df, preserve_index=preserve_index
    )
    pq.write_metadata(table_for_metadata.schema, dirpath / metadata_fname)

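    # The per-file pandas metadata was wiped above, so the reader must pick up
    # the index/column information from the _metadata or _common_metadata
    # sidecar file written here.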
    dataset = pq.ParquetDataset(dirpath)
    columns = ['uint8', 'strings']
    result = dataset.read_pandas(columns=columns).to_pandas()
    expected = pd.concat([x[columns] for x in frames])
    expected.index.name = (
        df.index.name if preserve_index is not False else None)
    tm.assert_frame_equal(result, expected)


@pytest.mark.pandas
def test_read_pandas_passthrough_keywords(tempdir):
    # ARROW-11464 - previously not all keywords were passed through (such as
    # the filesystem keyword)
    df = pd.DataFrame({'a': [1, 2, 3]})

    filename = tempdir / 'data.parquet'
    _write_table(df, filename)

    result = pq.read_pandas(
        'data.parquet',
        filesystem=SubTreeFileSystem(str(tempdir), LocalFileSystem())
    )
    assert result.equals(pa.table(df))


@pytest.mark.pandas
def test_read_pandas_map_fields(tempdir):
    # ARROW-10140 - table created from Pandas with mapping fields
    df = pd.DataFrame({
        'col1': pd.Series([
            [('id', 'something'), ('value2', 'else')],
            [('id', 'something2'), ('value', 'else2')],
        ]),
        'col2': pd.Series(['foo', 'bar'])
    })

    filename = tempdir / 'data.parquet'

    udt = pa.map_(pa.string(), pa.string())
    schema = pa.schema([pa.field('col1', udt), pa.field('col2', pa.string())])
    arrow_table = pa.Table.from_pandas(df, schema)
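    # With the explicit schema the list-of-tuples column is stored as a
    # map<string, string>; read_pandas below should reconstruct the original
    # tuples when converting back to pandas.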

    _write_table(arrow_table, filename)

    result = pq.read_pandas(filename).to_pandas()
    tm.assert_frame_equal(result, df)
@@ -0,0 +1,446 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import io
import os
import re
import sys
import types

import pytest
from unittest import mock

import pyarrow as pa

try:
    import pyarrow.parquet as pq
    from pyarrow.tests.parquet.common import _write_table
except ImportError:
    pq = None

try:
    import pandas as pd
    import pandas.testing as tm

    from pyarrow.tests.parquet.common import alltypes_sample
except ImportError:
    pd = tm = None


# Marks all of the tests in this module
# Ignore these with pytest ... -m 'not parquet'
pytestmark = pytest.mark.parquet


@pytest.mark.pandas
def test_pass_separate_metadata():
    # ARROW-471
    df = alltypes_sample(size=10000)

    a_table = pa.Table.from_pandas(df)

    buf = io.BytesIO()
    _write_table(a_table, buf, compression='snappy', version='2.6')

    buf.seek(0)
    metadata = pq.read_metadata(buf)

    buf.seek(0)

    fileh = pq.ParquetFile(buf, metadata=metadata)

    tm.assert_frame_equal(df, fileh.read().to_pandas())


@pytest.mark.pandas
def test_read_single_row_group():
    # ARROW-471
    N, K = 10000, 4
    df = alltypes_sample(size=N)

    a_table = pa.Table.from_pandas(df)

    buf = io.BytesIO()
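    # Writing with row_group_size=N / K should produce K row groups of
    # N / K rows each, which the reader can then fetch individually.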
    _write_table(a_table, buf, row_group_size=N / K,
                 compression='snappy', version='2.6')

    buf.seek(0)

    pf = pq.ParquetFile(buf)

    assert pf.num_row_groups == K

    row_groups = [pf.read_row_group(i) for i in range(K)]
    result = pa.concat_tables(row_groups)
    tm.assert_frame_equal(df, result.to_pandas())


@pytest.mark.pandas
def test_read_single_row_group_with_column_subset():
    N, K = 10000, 4
    df = alltypes_sample(size=N)
    a_table = pa.Table.from_pandas(df)

    buf = io.BytesIO()
    _write_table(a_table, buf, row_group_size=N / K,
                 compression='snappy', version='2.6')

    buf.seek(0)
    pf = pq.ParquetFile(buf)

    cols = list(df.columns[:2])
    row_groups = [pf.read_row_group(i, columns=cols) for i in range(K)]
    result = pa.concat_tables(row_groups)
    tm.assert_frame_equal(df[cols], result.to_pandas())

    # ARROW-4267: Selection of duplicate columns still leads to these columns
    # being read uniquely.
    row_groups = [pf.read_row_group(i, columns=cols + cols) for i in range(K)]
    result = pa.concat_tables(row_groups)
    tm.assert_frame_equal(df[cols], result.to_pandas())


@pytest.mark.pandas
def test_read_multiple_row_groups():
    N, K = 10000, 4
    df = alltypes_sample(size=N)

    a_table = pa.Table.from_pandas(df)

    buf = io.BytesIO()
    _write_table(a_table, buf, row_group_size=N / K,
                 compression='snappy', version='2.6')

    buf.seek(0)

    pf = pq.ParquetFile(buf)

    assert pf.num_row_groups == K

    result = pf.read_row_groups(range(K))
    tm.assert_frame_equal(df, result.to_pandas())


@pytest.mark.pandas
def test_read_multiple_row_groups_with_column_subset():
    N, K = 10000, 4
    df = alltypes_sample(size=N)
    a_table = pa.Table.from_pandas(df)

    buf = io.BytesIO()
    _write_table(a_table, buf, row_group_size=N / K,
                 compression='snappy', version='2.6')

    buf.seek(0)
    pf = pq.ParquetFile(buf)

    cols = list(df.columns[:2])
    result = pf.read_row_groups(range(K), columns=cols)
    tm.assert_frame_equal(df[cols], result.to_pandas())

    # ARROW-4267: Selection of duplicate columns still leads to these columns
    # being read uniquely.
    result = pf.read_row_groups(range(K), columns=cols + cols)
    tm.assert_frame_equal(df[cols], result.to_pandas())


@pytest.mark.pandas
def test_scan_contents():
    N, K = 10000, 4
    df = alltypes_sample(size=N)
    a_table = pa.Table.from_pandas(df)

    buf = io.BytesIO()
    _write_table(a_table, buf, row_group_size=N / K,
                 compression='snappy', version='2.6')

    buf.seek(0)
    pf = pq.ParquetFile(buf)

    assert pf.scan_contents() == 10000
    assert pf.scan_contents(df.columns[:4]) == 10000


def test_parquet_file_pass_directory_instead_of_file(tempdir):
    # ARROW-7208
    path = tempdir / 'directory'
    os.mkdir(str(path))

    msg = f"Cannot open for reading: path '{str(path)}' is a directory"
    with pytest.raises(IOError) as exc:
        pq.ParquetFile(path)
    if exc.errisinstance(PermissionError) and sys.platform == 'win32':
        return  # Windows CI can get a PermissionError here.
    exc.match(msg)


def test_read_column_invalid_index():
    table = pa.table([pa.array([4, 5]), pa.array(["foo", "bar"])],
                     names=['ints', 'strs'])
    bio = pa.BufferOutputStream()
    pq.write_table(table, bio)
    f = pq.ParquetFile(bio.getvalue())
    assert f.reader.read_column(0).to_pylist() == [4, 5]
    assert f.reader.read_column(1).to_pylist() == ["foo", "bar"]
    for index in (-1, 2):
        with pytest.raises((ValueError, IndexError)):
            f.reader.read_column(index)


@pytest.mark.pandas
@pytest.mark.parametrize('batch_size', [300, 1000, 1300])
def test_iter_batches_columns_reader(tempdir, batch_size):
    total_size = 3000
    chunk_size = 1000
    # TODO: Add categorical support
    df = alltypes_sample(size=total_size)

    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df)
    _write_table(arrow_table, filename, version='2.6',
                 chunk_size=chunk_size)

    file_ = pq.ParquetFile(filename)
    for columns in [df.columns[:10], df.columns[10:]]:
        batches = file_.iter_batches(batch_size=batch_size, columns=columns)
        batch_starts = range(0, total_size+batch_size, batch_size)
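        # The final batch may be shorter than batch_size, hence the min()
        # on the expected slice end below.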
        for batch, start in zip(batches, batch_starts):
            end = min(total_size, start + batch_size)
            tm.assert_frame_equal(
                batch.to_pandas(),
                df.iloc[start:end, :].loc[:, columns].reset_index(drop=True)
            )


@pytest.mark.pandas
@pytest.mark.parametrize('chunk_size', [1000])
def test_iter_batches_reader(tempdir, chunk_size):
    df = alltypes_sample(size=10000, categorical=True)

    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df)
    assert arrow_table.schema.pandas_metadata is not None

    _write_table(arrow_table, filename, version='2.6',
                 chunk_size=chunk_size)

    file_ = pq.ParquetFile(filename)

    def get_all_batches(f):
        for row_group in range(f.num_row_groups):
            batches = f.iter_batches(
                batch_size=900,
                row_groups=[row_group],
            )

            for batch in batches:
                yield batch

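    # With chunk_size=1000 and batch_size=900, each row group yields one full
    # 900-row batch followed by a 100-row remainder; the loop below checks
    # both batches against slices of the corresponding row group.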
    batches = list(get_all_batches(file_))
    batch_no = 0

    for i in range(file_.num_row_groups):
        tm.assert_frame_equal(
            batches[batch_no].to_pandas(),
            file_.read_row_groups([i]).to_pandas().head(900)
        )

        batch_no += 1

        tm.assert_frame_equal(
            batches[batch_no].to_pandas().reset_index(drop=True),
            file_.read_row_groups([i]).to_pandas().iloc[900:].reset_index(
                drop=True
            )
        )

        batch_no += 1


@pytest.mark.pandas
@pytest.mark.parametrize('pre_buffer', [False, True])
def test_pre_buffer(pre_buffer):
    N, K = 10000, 4
    df = alltypes_sample(size=N)
    a_table = pa.Table.from_pandas(df)

    buf = io.BytesIO()
    _write_table(a_table, buf, row_group_size=N / K,
                 compression='snappy', version='2.6')

    buf.seek(0)
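    # pre_buffer only changes how column data is fetched (coalesced reads);
    # the decoded table should be identical either way.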
    pf = pq.ParquetFile(buf, pre_buffer=pre_buffer)
    assert pf.read().num_rows == N


def test_parquet_file_explicitly_closed(tempdir):
    """
    Unopened files should be closed explicitly after use,
    and previously opened files should be left open.
    Applies to read_table, ParquetDataset, and ParquetFile
    """
    # create test parquet file
    fn = tempdir.joinpath('file.parquet')
    table = pa.table({'col1': [0, 1], 'col2': [0, 1]})
    pq.write_table(table, fn)

    # ParquetFile with opened file (will leave open)
    with open(fn, 'rb') as f:
        with pq.ParquetFile(f) as p:
            p.read()
            assert not f.closed
            assert not p.closed
        assert not f.closed  # opened input file was not closed
        assert not p.closed  # parquet file obj reports as not closed
    assert f.closed
    assert p.closed  # parquet file being closed reflects underlying file

    # ParquetFile with unopened file (will close)
    with pq.ParquetFile(fn) as p:
        p.read()
        assert not p.closed
    assert p.closed  # parquet file obj reports as closed


@pytest.mark.s3
@pytest.mark.parametrize("use_uri", (True, False))
def test_parquet_file_with_filesystem(s3_example_fs, use_uri):
    s3_fs, s3_uri, s3_path = s3_example_fs

    args = (s3_uri if use_uri else s3_path,)
    kwargs = {} if use_uri else dict(filesystem=s3_fs)

    table = pa.table({"a": range(10)})
    pq.write_table(table, s3_path, filesystem=s3_fs)

    parquet_file = pq.ParquetFile(*args, **kwargs)
    assert parquet_file.read() == table
    assert not parquet_file.closed
    parquet_file.close()
    assert parquet_file.closed

    with pq.ParquetFile(*args, **kwargs) as f:
        assert f.read() == table
        assert not f.closed
    assert f.closed


def test_read_statistics():
    table = pa.table({"value": pa.array([-1, None, 3])})
    buf = io.BytesIO()
    _write_table(table, buf)
    buf.seek(0)

    statistics = pq.ParquetFile(buf).read().columns[0].chunks[0].statistics
    assert statistics.null_count == 1
    assert statistics.distinct_count is None
    # TODO: add tests for is_distinct_count_exact == None and True
    # once Python API allows
    assert statistics.is_distinct_count_exact is False
    assert statistics.min == -1
    assert statistics.is_min_exact
    assert statistics.max == 3
    assert statistics.is_max_exact
    assert repr(statistics) == ("arrow.ArrayStatistics<"
                                "null_count=1, distinct_count=None, "
                                "min=-1, is_min_exact=True, "
                                "max=3, is_max_exact=True>")


def test_read_undefined_logical_type(parquet_test_datadir):
    test_file = f"{parquet_test_datadir}/unknown-logical-type.parquet"

    table = pq.ParquetFile(test_file).read()
    assert table.column_names == ["column with known type", "column with unknown type"]
    assert table["column with unknown type"].to_pylist() == [
        b"unknown string 1",
        b"unknown string 2",
        b"unknown string 3"
    ]


def test_parquet_file_fsspec_support():
    pytest.importorskip("fsspec")

    table = pa.table({"a": range(10)})
    pq.write_table(table, "fsspec+memory://example.parquet")
    table2 = pq.read_table("fsspec+memory://example.parquet")
    assert table.equals(table2)

    msg = "Unrecognized filesystem type in URI"
    with pytest.raises(pa.ArrowInvalid, match=msg):
        pq.read_table("non-existing://example.parquet")


def test_parquet_file_fsspec_support_through_filesystem_argument():
    try:
        from fsspec.implementations.memory import MemoryFileSystem
    except ImportError:
        pytest.skip("fsspec is not installed, skipping test")

    table = pa.table({"b": range(10)})

    fs = MemoryFileSystem()
    fs.mkdir("/path/to/prefix", create_parents=True)
    assert fs.exists("/path/to/prefix")

    fs_str = "fsspec+memory://path/to/prefix"
    pq.write_table(table, "b.parquet", filesystem=fs_str)
    table2 = pq.read_table("fsspec+memory://path/to/prefix/b.parquet")
    assert table.equals(table2)

def test_parquet_file_huggingface_support():
    try:
        from fsspec.implementations.memory import MemoryFileSystem
    except ImportError:
        pytest.skip("fsspec is not installed, skipping Hugging Face test")

    fake_hf_module = types.ModuleType("huggingface_hub")
    fake_hf_module.HfFileSystem = MemoryFileSystem
    with mock.patch.dict("sys.modules", {"huggingface_hub": fake_hf_module}):
        uri = "hf://datasets/apache/arrow/test.parquet"
        table = pa.table({"a": range(10)})
        pq.write_table(table, uri)
        table2 = pq.read_table(uri)
        assert table.equals(table2)

def test_fsspec_uri_raises_if_fsspec_is_not_available():
    # sadly cannot patch sys.modules because cython will still be able to import fsspec
    try:
        import fsspec  # noqa: F401
    except ImportError:
        pass
    else:
        pytest.skip("fsspec is available, skipping test")

    msg = re.escape(
        "`fsspec` is required to handle `fsspec+<filesystem>://` and `hf://` URIs.")
    with pytest.raises(ImportError, match=msg):
        pq.read_table("fsspec+memory://example.parquet")


def test_iter_batches_raises_batch_size_zero(tempdir):
    # See https://github.com/apache/arrow/issues/46811
    schema = pa.schema([])
    empty_table = pa.Table.from_batches([], schema=schema)
    parquet_file_path = tempdir / "empty_file.parquet"
    pq.write_table(empty_table, parquet_file_path)
    parquet_file = pq.ParquetFile(parquet_file_path)
    with pytest.raises(ValueError):
        parquet_file.iter_batches(batch_size=0)
@@ -0,0 +1,449 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import pytest

import pyarrow as pa
from pyarrow import fs

try:
    import pyarrow.parquet as pq
    from pyarrow.tests.parquet.common import (_read_table, _test_dataframe,
                                              _test_table, _range_integers)
except ImportError:
    pq = None


try:
    import pandas as pd
    import pandas.testing as tm

except ImportError:
    pd = tm = None


# Marks all of the tests in this module
# Ignore these with pytest ... -m 'not parquet'
pytestmark = pytest.mark.parquet


@pytest.mark.pandas
def test_parquet_incremental_file_build(tempdir):
    df = _test_dataframe(100)
    df['unique_id'] = 0

    arrow_table = pa.Table.from_pandas(df, preserve_index=False)
    out = pa.BufferOutputStream()

    writer = pq.ParquetWriter(out, arrow_table.schema, version='2.6')

    frames = []
    for i in range(10):
        df['unique_id'] = i
        arrow_table = pa.Table.from_pandas(df, preserve_index=False)
        writer.write_table(arrow_table)

        frames.append(df.copy())

    writer.close()

    buf = out.getvalue()
    result = _read_table(pa.BufferReader(buf))

    expected = pd.concat(frames, ignore_index=True)
    tm.assert_frame_equal(result.to_pandas(), expected)


def test_validate_schema_write_table(tempdir):
    # ARROW-2926
    simple_fields = [
        pa.field('POS', pa.uint32()),
        pa.field('desc', pa.string())
    ]

    simple_schema = pa.schema(simple_fields)

    # simple_table schema does not match simple_schema
    simple_from_array = [pa.array([1]), pa.array(['bla'])]
    simple_table = pa.Table.from_arrays(simple_from_array, ['POS', 'desc'])

    path = tempdir / 'simple_validate_schema.parquet'

    with pq.ParquetWriter(path, simple_schema,
                          version='2.6',
                          compression='snappy', flavor='spark') as w:
        with pytest.raises(ValueError):
            w.write_table(simple_table)


def test_parquet_invalid_writer(tempdir):
    # avoid segfaults with invalid construction
    with pytest.raises(TypeError):
        some_schema = pa.schema([pa.field("x", pa.int32())])
        pq.ParquetWriter(None, some_schema)

    with pytest.raises(TypeError):
        pq.ParquetWriter(tempdir / "some_path", None)


@pytest.mark.pandas
def test_parquet_writer_context_obj(tempdir):
    df = _test_dataframe(100)
    df['unique_id'] = 0

    arrow_table = pa.Table.from_pandas(df, preserve_index=False)
    out = pa.BufferOutputStream()

    with pq.ParquetWriter(out, arrow_table.schema, version='2.6') as writer:

        frames = []
        for i in range(10):
            df['unique_id'] = i
            arrow_table = pa.Table.from_pandas(df, preserve_index=False)
            writer.write_table(arrow_table)

            frames.append(df.copy())

    buf = out.getvalue()
    result = _read_table(pa.BufferReader(buf))

    expected = pd.concat(frames, ignore_index=True)
    tm.assert_frame_equal(result.to_pandas(), expected)


@pytest.mark.pandas
def test_parquet_writer_context_obj_with_exception(tempdir):
    df = _test_dataframe(100)
    df['unique_id'] = 0

    arrow_table = pa.Table.from_pandas(df, preserve_index=False)
    out = pa.BufferOutputStream()
    error_text = 'Artificial Error'

    try:
        with pq.ParquetWriter(out,
                              arrow_table.schema,
                              version='2.6') as writer:

            frames = []
            for i in range(10):
                df['unique_id'] = i
                arrow_table = pa.Table.from_pandas(df, preserve_index=False)
                writer.write_table(arrow_table)
                frames.append(df.copy())
                if i == 5:
                    raise ValueError(error_text)
    except Exception as e:
        assert str(e) == error_text

    buf = out.getvalue()
    result = _read_table(pa.BufferReader(buf))

    expected = pd.concat(frames, ignore_index=True)
    tm.assert_frame_equal(result.to_pandas(), expected)


@pytest.mark.pandas
@pytest.mark.parametrize("filesystem", [
    None,
    fs.LocalFileSystem(),
])
def test_parquet_writer_write_wrappers(tempdir, filesystem):
    df = _test_dataframe(100)
    table = pa.Table.from_pandas(df, preserve_index=False)
    batch = pa.RecordBatch.from_pandas(df, preserve_index=False)
    path_table = str(tempdir / 'data_table.parquet')
    path_batch = str(tempdir / 'data_batch.parquet')

    with pq.ParquetWriter(
        path_table, table.schema, filesystem=filesystem, version='2.6'
    ) as writer:
        writer.write_table(table)

    result = _read_table(path_table).to_pandas()
    tm.assert_frame_equal(result, df)

    with pq.ParquetWriter(
        path_batch, table.schema, filesystem=filesystem, version='2.6'
    ) as writer:
        writer.write_batch(batch)

    result = _read_table(path_batch).to_pandas()
    tm.assert_frame_equal(result, df)

    with pq.ParquetWriter(
        path_table, table.schema, filesystem=filesystem, version='2.6'
    ) as writer:
        writer.write(table)

    result = _read_table(path_table).to_pandas()
    tm.assert_frame_equal(result, df)

    with pq.ParquetWriter(
        path_batch, table.schema, filesystem=filesystem, version='2.6'
    ) as writer:
        writer.write(batch)

    result = _read_table(path_batch).to_pandas()
    tm.assert_frame_equal(result, df)


@pytest.mark.large_memory
@pytest.mark.pandas
def test_parquet_writer_chunk_size(tempdir):
    default_chunk_size = 1024 * 1024
    abs_max_chunk_size = 64 * 1024 * 1024
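    # The writer defaults to ~1Mi rows per row group and caps any requested
    # row_group_size at 64Mi rows (values assumed from the assertions below).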

    def check_chunk_size(data_size, chunk_size, expect_num_chunks):
        table = pa.Table.from_arrays([
            _range_integers(data_size, 'b')
        ], names=['x'])
        if chunk_size is None:
            pq.write_table(table, tempdir / 'test.parquet')
        else:
            pq.write_table(table, tempdir / 'test.parquet', row_group_size=chunk_size)
        metadata = pq.read_metadata(tempdir / 'test.parquet')
        expected_chunk_size = default_chunk_size if chunk_size is None else chunk_size
        assert metadata.num_row_groups == expect_num_chunks
        latched_chunk_size = min(expected_chunk_size, abs_max_chunk_size)
        # First chunks should be full size
        for chunk_idx in range(expect_num_chunks - 1):
            assert metadata.row_group(chunk_idx).num_rows == latched_chunk_size
        # Last chunk may be smaller
        remainder = data_size - (expected_chunk_size * (expect_num_chunks - 1))
        if remainder == 0:
            assert metadata.row_group(
                expect_num_chunks - 1).num_rows == latched_chunk_size
        else:
            assert metadata.row_group(expect_num_chunks - 1).num_rows == remainder

    check_chunk_size(default_chunk_size * 2, default_chunk_size - 100, 3)
    check_chunk_size(default_chunk_size * 2, default_chunk_size, 2)
    check_chunk_size(default_chunk_size * 2, default_chunk_size + 100, 2)
    check_chunk_size(default_chunk_size + 100, default_chunk_size + 100, 1)
    # Even though the chunk size requested is large enough it will be capped
    # by the absolute max chunk size
    check_chunk_size(abs_max_chunk_size * 2, abs_max_chunk_size * 2, 2)

    # These tests don't pass a chunk_size to write_table and so the chunk size
    # should be default_chunk_size
    check_chunk_size(default_chunk_size, None, 1)
    check_chunk_size(default_chunk_size + 1, None, 2)


@pytest.mark.pandas
@pytest.mark.parametrize("filesystem", [
    None,
    fs.LocalFileSystem(),
])
def test_parquet_writer_filesystem_local(tempdir, filesystem):
    df = _test_dataframe(100)
    table = pa.Table.from_pandas(df, preserve_index=False)
    path = str(tempdir / 'data.parquet')

    with pq.ParquetWriter(
        path, table.schema, filesystem=filesystem, version='2.6'
    ) as writer:
        writer.write_table(table)

    result = _read_table(path).to_pandas()
    tm.assert_frame_equal(result, df)


@pytest.mark.pandas
@pytest.mark.s3
def test_parquet_writer_filesystem_s3(s3_example_fs):
    df = _test_dataframe(100)
    table = pa.Table.from_pandas(df, preserve_index=False)

    fs, uri, path = s3_example_fs

    with pq.ParquetWriter(
        path, table.schema, filesystem=fs, version='2.6'
    ) as writer:
        writer.write_table(table)

    result = _read_table(uri).to_pandas()
    tm.assert_frame_equal(result, df)


@pytest.mark.pandas
@pytest.mark.s3
def test_parquet_writer_filesystem_s3_uri(s3_example_fs):
    df = _test_dataframe(100)
    table = pa.Table.from_pandas(df, preserve_index=False)

    fs, uri, path = s3_example_fs

    with pq.ParquetWriter(uri, table.schema, version='2.6') as writer:
        writer.write_table(table)

    result = _read_table(path, filesystem=fs).to_pandas()
    tm.assert_frame_equal(result, df)


@pytest.mark.pandas
@pytest.mark.s3
def test_parquet_writer_filesystem_s3fs(s3_example_s3fs):
    df = _test_dataframe(100)
    table = pa.Table.from_pandas(df, preserve_index=False)

    fs, directory = s3_example_s3fs
    path = directory + "/test.parquet"

    with pq.ParquetWriter(
        path, table.schema, filesystem=fs, version='2.6'
    ) as writer:
        writer.write_table(table)

    result = _read_table(path, filesystem=fs).to_pandas()
    tm.assert_frame_equal(result, df)


@pytest.mark.numpy
def test_parquet_writer_filesystem_buffer_raises():
    table = _test_table(100)
    filesystem = fs.LocalFileSystem()

    # Should raise ValueError when filesystem is passed with file-like object
    with pytest.raises(ValueError, match="specified path is file-like"):
        pq.ParquetWriter(
            pa.BufferOutputStream(), table.schema, filesystem=filesystem
        )


def test_parquet_writer_store_schema(tempdir):
    table = pa.table({'a': [1, 2, 3]})

    # default -> write schema information
    path1 = tempdir / 'test_with_schema.parquet'
    with pq.ParquetWriter(path1, table.schema) as writer:
        writer.write_table(table)

    meta = pq.read_metadata(path1)
    assert b'ARROW:schema' in meta.metadata
    assert meta.metadata[b'ARROW:schema']
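    # The ARROW:schema entry carries the serialized Arrow schema (assumed to be
    # base64-encoded IPC data) used to reconstruct the original types on read;
    # store_schema=False below drops it.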

    # disable adding schema information
    path2 = tempdir / 'test_without_schema.parquet'
    with pq.ParquetWriter(path2, table.schema, store_schema=False) as writer:
        writer.write_table(table)

    meta = pq.read_metadata(path2)
    assert meta.metadata is None


def test_parquet_writer_append_key_value_metadata(tempdir):
    table = pa.Table.from_arrays([pa.array([], type='int32')], ['f0'])
    path = tempdir / 'metadata.parquet'

    with pq.ParquetWriter(path, table.schema) as writer:
        writer.write_table(table)
        writer.add_key_value_metadata({'key1': '1', 'key2': 'x'})
        writer.add_key_value_metadata({'key2': '2', 'key3': '3'})
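        # The second call overrides 'key2'; all keys and values end up as
        # bytes in the file footer metadata read back below.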
    reader = pq.ParquetFile(path)
    metadata = reader.metadata.metadata
    assert metadata[b'key1'] == b'1'
    assert metadata[b'key2'] == b'2'
    assert metadata[b'key3'] == b'3'


def test_parquet_content_defined_chunking(tempdir):
    table = pa.table({'a': range(100_000)})

    # use PLAIN encoding because we compare the overall size of the row groups
    # which would vary depending on the encoding making the assertions wrong
    pq.write_table(table, tempdir / 'unchunked.parquet',
                   use_dictionary=False,
                   column_encoding="PLAIN")
    pq.write_table(table, tempdir / 'chunked-default.parquet',
                   use_dictionary=False,
                   column_encoding="PLAIN",
                   use_content_defined_chunking=True)
    pq.write_table(table, tempdir / 'chunked-custom.parquet',
                   use_dictionary=False,
                   column_encoding="PLAIN",
                   use_content_defined_chunking={"min_chunk_size": 32_768,
                                                 "max_chunk_size": 65_536})
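    # Content-defined chunking changes how values are split into data pages,
    # not how rows are split into row groups, so the row group counts and row
    # counts are expected to match across all three files.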

    # the data must be the same
    unchunked = pq.read_table(tempdir / 'unchunked.parquet')
    chunked_default = pq.read_table(tempdir / 'chunked-default.parquet')
    chunked_custom = pq.read_table(tempdir / 'chunked-custom.parquet')
    assert unchunked.equals(chunked_default)
    assert unchunked.equals(chunked_custom)

    # number of row groups and their sizes are not affected by content defined chunking
    unchunked_metadata = pq.read_metadata(tempdir / 'unchunked.parquet')
    chunked_default_metadata = pq.read_metadata(tempdir / 'chunked-default.parquet')
    chunked_custom_metadata = pq.read_metadata(tempdir / 'chunked-custom.parquet')

    assert unchunked_metadata.num_row_groups == chunked_default_metadata.num_row_groups
    assert unchunked_metadata.num_row_groups == chunked_custom_metadata.num_row_groups

    for i in range(unchunked_metadata.num_row_groups):
        rg_unchunked = unchunked_metadata.row_group(i)
        rg_chunked_default = chunked_default_metadata.row_group(i)
        rg_chunked_custom = chunked_custom_metadata.row_group(i)
        assert rg_unchunked.num_rows == rg_chunked_default.num_rows
        assert rg_unchunked.num_rows == rg_chunked_custom.num_rows
        # since PageReader is not exposed we cannot inspect the page sizes
        # so just check that the total byte size is different
        assert rg_unchunked.total_byte_size < rg_chunked_default.total_byte_size
        assert rg_unchunked.total_byte_size < rg_chunked_custom.total_byte_size
        assert rg_chunked_default.total_byte_size < rg_chunked_custom.total_byte_size


def test_parquet_content_defined_chunking_parameters(tempdir):
    table = pa.table({'a': range(100)})
    path = tempdir / 'chunked-invalid.parquet'

    # it raises OSError, not ideal but this is how parquet exceptions are handled
    # currently
    msg = "max_chunk_size must be greater than min_chunk_size"
    with pytest.raises(Exception, match=msg):
        cdc_options = {"min_chunk_size": 65_536, "max_chunk_size": 32_768}
        pq.write_table(table, path, use_content_defined_chunking=cdc_options)

    cases = [
        (
            {"min_chunk_size": 64 * 1024, "unknown_option": True},
            "Unknown options in 'use_content_defined_chunking': {'unknown_option'}"
        ),
        (
            {"min_chunk_size": 64 * 1024},
            "Missing options in 'use_content_defined_chunking': {'max_chunk_size'}"
        ),
        (
            {"max_chunk_size": 64 * 1024},
            "Missing options in 'use_content_defined_chunking': {'min_chunk_size'}"
        )
    ]
    for cdc_options, msg in cases:
        with pytest.raises(ValueError, match=msg):
            pq.write_table(table, path, use_content_defined_chunking=cdc_options)

    # using the default parametrization
    pq.write_table(table, path, use_content_defined_chunking=True)

    # using min_chunk_size and max_chunk_size
    cdc_options = {"min_chunk_size": 32_768, "max_chunk_size": 65_536}
    pq.write_table(table, path, use_content_defined_chunking=cdc_options)

    # using min_chunk_size, max_chunk_size and norm_level
    cdc_options = {"min_chunk_size": 32_768, "max_chunk_size": 65_536, "norm_level": 1}
    pq.write_table(table, path, use_content_defined_chunking=cdc_options)