Initial commit
This commit is contained in:
@@ -0,0 +1,323 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/adapters/orc/options.h"
|
||||
#include "arrow/io/interfaces.h"
|
||||
#include "arrow/memory_pool.h"
|
||||
#include "arrow/record_batch.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace adapters {
|
||||
namespace orc {
|
||||
|
||||
/// \brief Information about an ORC stripe
|
||||
struct StripeInformation {
|
||||
/// \brief Offset of the stripe from the start of the file, in bytes
|
||||
int64_t offset;
|
||||
/// \brief Length of the stripe, in bytes
|
||||
int64_t length;
|
||||
/// \brief Number of rows in the stripe
|
||||
int64_t num_rows;
|
||||
/// \brief Index of the first row of the stripe
|
||||
int64_t first_row_id;
|
||||
};
|
||||
|
||||
/// \class ORCFileReader
|
||||
/// \brief Read an Arrow Table or RecordBatch from an ORC file.
|
||||
class ARROW_EXPORT ORCFileReader {
|
||||
public:
|
||||
~ORCFileReader();
|
||||
|
||||
/// \brief Creates a new ORC reader
|
||||
///
|
||||
/// \param[in] file the data source
|
||||
/// \param[in] pool a MemoryPool to use for buffer allocations
|
||||
/// \return the returned reader object
|
||||
static Result<std::unique_ptr<ORCFileReader>> Open(
|
||||
const std::shared_ptr<io::RandomAccessFile>& file, MemoryPool* pool);
|
||||
|
||||
/// \brief Return the schema read from the ORC file
|
||||
///
|
||||
/// \return the returned Schema object
|
||||
Result<std::shared_ptr<Schema>> ReadSchema();
|
||||
|
||||
/// \brief Read the file as a Table
|
||||
///
|
||||
/// The table will be composed of one record batch per stripe.
|
||||
///
|
||||
/// \return the returned Table
|
||||
Result<std::shared_ptr<Table>> Read();
|
||||
|
||||
/// \brief Read the file as a Table
|
||||
///
|
||||
/// The table will be composed of one record batch per stripe.
|
||||
///
|
||||
/// \param[in] schema the Table schema
|
||||
/// \return the returned Table
|
||||
Result<std::shared_ptr<Table>> Read(const std::shared_ptr<Schema>& schema);
|
||||
|
||||
/// \brief Read the file as a Table
|
||||
///
|
||||
/// The table will be composed of one record batch per stripe.
|
||||
///
|
||||
/// \param[in] include_indices the selected field indices to read
|
||||
/// \return the returned Table
|
||||
Result<std::shared_ptr<Table>> Read(const std::vector<int>& include_indices);
|
||||
|
||||
/// \brief Read the file as a Table
|
||||
///
|
||||
/// The table will be composed of one record batch per stripe.
|
||||
///
|
||||
/// \param[in] include_names the selected field names to read
|
||||
/// \return the returned Table
|
||||
Result<std::shared_ptr<Table>> Read(const std::vector<std::string>& include_names);
|
||||
|
||||
/// \brief Read the file as a Table
|
||||
///
|
||||
/// The table will be composed of one record batch per stripe.
|
||||
///
|
||||
/// \param[in] schema the Table schema
|
||||
/// \param[in] include_indices the selected field indices to read
|
||||
/// \return the returned Table
|
||||
Result<std::shared_ptr<Table>> Read(const std::shared_ptr<Schema>& schema,
|
||||
const std::vector<int>& include_indices);
|
||||
|
||||
/// \brief Read a single stripe as a RecordBatch
|
||||
///
|
||||
/// \param[in] stripe the stripe index
|
||||
/// \return the returned RecordBatch
|
||||
Result<std::shared_ptr<RecordBatch>> ReadStripe(int64_t stripe);
|
||||
|
||||
/// \brief Read a single stripe as a RecordBatch
|
||||
///
|
||||
/// \param[in] stripe the stripe index
|
||||
/// \param[in] include_indices the selected field indices to read
|
||||
/// \return the returned RecordBatch
|
||||
Result<std::shared_ptr<RecordBatch>> ReadStripe(
|
||||
int64_t stripe, const std::vector<int>& include_indices);
|
||||
|
||||
/// \brief Read a single stripe as a RecordBatch
|
||||
///
|
||||
/// \param[in] stripe the stripe index
|
||||
/// \param[in] include_names the selected field names to read
|
||||
/// \return the returned RecordBatch
|
||||
Result<std::shared_ptr<RecordBatch>> ReadStripe(
|
||||
int64_t stripe, const std::vector<std::string>& include_names);
|
||||
|
||||
/// \brief Seek to designated row. Invoke NextStripeReader() after seek
|
||||
/// will return stripe reader starting from designated row.
|
||||
///
|
||||
/// \param[in] row_number the rows number to seek
|
||||
Status Seek(int64_t row_number);
|
||||
|
||||
/// \brief Get a stripe level record batch iterator.
|
||||
///
|
||||
/// Each record batch will have up to `batch_size` rows.
|
||||
/// NextStripeReader serves as a fine-grained alternative to ReadStripe
|
||||
/// which may cause OOM issues by loading the whole stripe into memory.
|
||||
///
|
||||
/// Note this will only read rows for the current stripe, not the entire
|
||||
/// file.
|
||||
///
|
||||
/// \param[in] batch_size the maximum number of rows in each record batch
|
||||
/// \return the returned stripe reader
|
||||
Result<std::shared_ptr<RecordBatchReader>> NextStripeReader(int64_t batch_size);
|
||||
|
||||
/// \brief Get a stripe level record batch iterator.
|
||||
///
|
||||
/// Each record batch will have up to `batch_size` rows.
|
||||
/// NextStripeReader serves as a fine-grained alternative to ReadStripe
|
||||
/// which may cause OOM issues by loading the whole stripe into memory.
|
||||
///
|
||||
/// Note this will only read rows for the current stripe, not the entire
|
||||
/// file.
|
||||
///
|
||||
/// \param[in] batch_size the maximum number of rows in each record batch
|
||||
/// \param[in] include_indices the selected field indices to read
|
||||
/// \return the stripe reader
|
||||
Result<std::shared_ptr<RecordBatchReader>> NextStripeReader(
|
||||
int64_t batch_size, const std::vector<int>& include_indices);
|
||||
|
||||
/// \brief Get a record batch iterator for the entire file.
|
||||
///
|
||||
/// Each record batch will have up to `batch_size` rows.
|
||||
///
|
||||
/// \param[in] batch_size the maximum number of rows in each record batch
|
||||
/// \param[in] include_names the selected field names to read, if not empty
|
||||
/// (otherwise all fields are read)
|
||||
/// \return the record batch iterator
|
||||
Result<std::shared_ptr<RecordBatchReader>> GetRecordBatchReader(
|
||||
int64_t batch_size, const std::vector<std::string>& include_names);
|
||||
|
||||
/// \brief The number of stripes in the file
|
||||
int64_t NumberOfStripes();
|
||||
|
||||
/// \brief The number of rows in the file
|
||||
int64_t NumberOfRows();
|
||||
|
||||
/// \brief StripeInformation for each stripe.
|
||||
StripeInformation GetStripeInformation(int64_t stripe);
|
||||
|
||||
/// \brief Get the format version of the file.
|
||||
/// Currently known values are 0.11 and 0.12.
|
||||
///
|
||||
/// \return The FileVersion of the ORC file.
|
||||
FileVersion GetFileVersion();
|
||||
|
||||
/// \brief Get the software instance and version that wrote this file.
|
||||
///
|
||||
/// \return a user-facing string that specifies the software version
|
||||
std::string GetSoftwareVersion();
|
||||
|
||||
/// \brief Get the compression kind of the file.
|
||||
///
|
||||
/// \return The kind of compression in the ORC file.
|
||||
Result<Compression::type> GetCompression();
|
||||
|
||||
/// \brief Get the buffer size for the compression.
|
||||
///
|
||||
/// \return Number of bytes to buffer for the compression codec.
|
||||
int64_t GetCompressionSize();
|
||||
|
||||
/// \brief Get the number of rows per an entry in the row index.
|
||||
/// \return the number of rows per an entry in the row index or 0 if there
|
||||
/// is no row index.
|
||||
int64_t GetRowIndexStride();
|
||||
|
||||
/// \brief Get ID of writer that generated the file.
|
||||
///
|
||||
/// \return UNKNOWN_WRITER if the writer ID is undefined
|
||||
WriterId GetWriterId();
|
||||
|
||||
/// \brief Get the writer id value when getWriterId() returns an unknown writer.
|
||||
///
|
||||
/// \return the integer value of the writer ID.
|
||||
int32_t GetWriterIdValue();
|
||||
|
||||
/// \brief Get the version of the writer.
|
||||
///
|
||||
/// \return the version of the writer.
|
||||
|
||||
WriterVersion GetWriterVersion();
|
||||
|
||||
/// \brief Get the number of stripe statistics in the file.
|
||||
///
|
||||
/// \return the number of stripe statistics
|
||||
int64_t GetNumberOfStripeStatistics();
|
||||
|
||||
/// \brief Get the length of the data stripes in the file.
|
||||
///
|
||||
/// \return return the number of bytes in stripes
|
||||
int64_t GetContentLength();
|
||||
|
||||
/// \brief Get the length of the file stripe statistics.
|
||||
///
|
||||
/// \return the number of compressed bytes in the file stripe statistics
|
||||
int64_t GetStripeStatisticsLength();
|
||||
|
||||
/// \brief Get the length of the file footer.
|
||||
///
|
||||
/// \return the number of compressed bytes in the file footer
|
||||
int64_t GetFileFooterLength();
|
||||
|
||||
/// \brief Get the length of the file postscript.
|
||||
///
|
||||
/// \return the number of bytes in the file postscript
|
||||
int64_t GetFilePostscriptLength();
|
||||
|
||||
/// \brief Get the total length of the file.
|
||||
///
|
||||
/// \return the number of bytes in the file
|
||||
int64_t GetFileLength();
|
||||
|
||||
/// \brief Get the serialized file tail.
|
||||
/// Useful if another reader of the same file wants to avoid re-reading
|
||||
/// the file tail. See ReadOptions.SetSerializedFileTail().
|
||||
///
|
||||
/// \return a string of bytes with the file tail
|
||||
std::string GetSerializedFileTail();
|
||||
|
||||
/// \brief Return the metadata read from the ORC file
|
||||
///
|
||||
/// \return A KeyValueMetadata object containing the ORC metadata
|
||||
Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata();
|
||||
|
||||
private:
|
||||
class Impl;
|
||||
std::unique_ptr<Impl> impl_;
|
||||
ORCFileReader();
|
||||
};
|
||||
|
||||
/// \class ORCFileWriter
|
||||
/// \brief Write an Arrow Table or RecordBatch to an ORC file.
|
||||
class ARROW_EXPORT ORCFileWriter {
|
||||
public:
|
||||
~ORCFileWriter();
|
||||
/// \brief Creates a new ORC writer.
|
||||
///
|
||||
/// \param[in] output_stream a pointer to the io::OutputStream to write into
|
||||
/// \param[in] write_options the ORC writer options for Arrow
|
||||
/// \return the returned writer object
|
||||
static Result<std::unique_ptr<ORCFileWriter>> Open(
|
||||
io::OutputStream* output_stream,
|
||||
const WriteOptions& write_options = WriteOptions());
|
||||
|
||||
/// \brief Write a table. This can be called multiple times.
|
||||
///
|
||||
/// Tables passed in subsequent calls must match the schema of the table that was
|
||||
/// written first.
|
||||
///
|
||||
/// \param[in] table the Arrow table from which data is extracted.
|
||||
/// \return Status
|
||||
Status Write(const Table& table);
|
||||
|
||||
/// \brief Write a RecordBatch. This can be called multiple times.
|
||||
///
|
||||
/// RecordBatches passed in subsequent calls must match the schema of the
|
||||
/// RecordBatch that was written first.
|
||||
///
|
||||
/// \param[in] record_batch the Arrow RecordBatch from which data is extracted.
|
||||
/// \return Status
|
||||
Status Write(const RecordBatch& record_batch);
|
||||
|
||||
/// \brief Close an ORC writer (orc::Writer)
|
||||
///
|
||||
/// \return Status
|
||||
Status Close();
|
||||
|
||||
private:
|
||||
class Impl;
|
||||
std::unique_ptr<Impl> impl_;
|
||||
|
||||
private:
|
||||
ORCFileWriter();
|
||||
};
|
||||
|
||||
} // namespace orc
|
||||
} // namespace adapters
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,120 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/io/interfaces.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/util/type_fwd.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
namespace adapters {
|
||||
|
||||
namespace orc {
|
||||
|
||||
enum class WriterId : int32_t {
|
||||
kOrcJava = 0,
|
||||
kOrcCpp = 1,
|
||||
kPresto = 2,
|
||||
kScritchleyGo = 3,
|
||||
kTrino = 4,
|
||||
kUnknown = INT32_MAX
|
||||
};
|
||||
|
||||
enum class WriterVersion : int32_t {
|
||||
kOriginal = 0,
|
||||
kHive8732 = 1,
|
||||
kHive4243 = 2,
|
||||
kHive12055 = 3,
|
||||
kHive13083 = 4,
|
||||
kOrc101 = 5,
|
||||
kOrc135 = 6,
|
||||
kOrc517 = 7,
|
||||
kOrc203 = 8,
|
||||
kOrc14 = 9,
|
||||
kMax = INT32_MAX
|
||||
};
|
||||
|
||||
enum class CompressionStrategy : int32_t { kSpeed = 0, kCompression };
|
||||
|
||||
class ARROW_EXPORT FileVersion {
|
||||
private:
|
||||
int32_t major_version_;
|
||||
int32_t minor_version_;
|
||||
|
||||
public:
|
||||
static const FileVersion& v_0_11();
|
||||
static const FileVersion& v_0_12();
|
||||
|
||||
FileVersion(int32_t major, int32_t minor)
|
||||
: major_version_(major), minor_version_(minor) {}
|
||||
|
||||
/**
|
||||
* Get major version
|
||||
*/
|
||||
int32_t major_version() const { return this->major_version_; }
|
||||
|
||||
/**
|
||||
* Get minor version
|
||||
*/
|
||||
int32_t minor_version() const { return this->minor_version_; }
|
||||
|
||||
bool operator==(const FileVersion& right) const {
|
||||
return this->major_version() == right.major_version() &&
|
||||
this->minor_version() == right.minor_version();
|
||||
}
|
||||
|
||||
bool operator!=(const FileVersion& right) const { return !(*this == right); }
|
||||
|
||||
std::string ToString() const;
|
||||
};
|
||||
|
||||
/// Options for the ORC Writer
|
||||
struct ARROW_EXPORT WriteOptions {
|
||||
/// Number of rows the ORC writer writes at a time, default 1024
|
||||
int64_t batch_size = 1024;
|
||||
/// Which ORC file version to use, default FileVersion(0, 12)
|
||||
FileVersion file_version = FileVersion(0, 12);
|
||||
/// Size of each ORC stripe in bytes, default 64 MiB
|
||||
int64_t stripe_size = 64 * 1024 * 1024;
|
||||
/// The compression codec of the ORC file, there is no compression by default
|
||||
Compression::type compression = Compression::UNCOMPRESSED;
|
||||
/// The size of each compression block in bytes, default 64 KiB
|
||||
int64_t compression_block_size = 64 * 1024;
|
||||
/// The compression strategy i.e. speed vs size reduction, default
|
||||
/// CompressionStrategy::kSpeed
|
||||
CompressionStrategy compression_strategy = CompressionStrategy::kSpeed;
|
||||
/// The number of rows per an entry in the row index, default 10000
|
||||
int64_t row_index_stride = 10000;
|
||||
/// The padding tolerance, default 0.0
|
||||
double padding_tolerance = 0.0;
|
||||
/// The dictionary key size threshold. 0 to disable dictionary encoding.
|
||||
/// 1 to always enable dictionary encoding, default 0.0
|
||||
double dictionary_key_size_threshold = 0.0;
|
||||
/// The array of columns that use the bloom filter, default empty
|
||||
std::vector<int64_t> bloom_filter_columns;
|
||||
/// The upper limit of the false-positive rate of the bloom filter, default 0.05
|
||||
double bloom_filter_fpp = 0.05;
|
||||
};
|
||||
|
||||
} // namespace orc
|
||||
} // namespace adapters
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,128 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "tensorflow/core/framework/op.h"
|
||||
|
||||
#include "arrow/type.h"
|
||||
|
||||
// These utilities are supposed to be included in TensorFlow operators
|
||||
// that need to be compiled separately from Arrow because of ABI issues.
|
||||
// They therefore need to be header-only.
|
||||
|
||||
namespace arrow {
|
||||
|
||||
namespace adapters {
|
||||
|
||||
namespace tensorflow {
|
||||
|
||||
Status GetArrowType(::tensorflow::DataType dtype, std::shared_ptr<DataType>* out) {
|
||||
switch (dtype) {
|
||||
case ::tensorflow::DT_BOOL:
|
||||
*out = arrow::boolean();
|
||||
break;
|
||||
case ::tensorflow::DT_FLOAT:
|
||||
*out = arrow::float32();
|
||||
break;
|
||||
case ::tensorflow::DT_DOUBLE:
|
||||
*out = arrow::float64();
|
||||
break;
|
||||
case ::tensorflow::DT_HALF:
|
||||
*out = arrow::float16();
|
||||
break;
|
||||
case ::tensorflow::DT_INT8:
|
||||
*out = arrow::int8();
|
||||
break;
|
||||
case ::tensorflow::DT_INT16:
|
||||
*out = arrow::int16();
|
||||
break;
|
||||
case ::tensorflow::DT_INT32:
|
||||
*out = arrow::int32();
|
||||
break;
|
||||
case ::tensorflow::DT_INT64:
|
||||
*out = arrow::int64();
|
||||
break;
|
||||
case ::tensorflow::DT_UINT8:
|
||||
*out = arrow::uint8();
|
||||
break;
|
||||
case ::tensorflow::DT_UINT16:
|
||||
*out = arrow::uint16();
|
||||
break;
|
||||
case ::tensorflow::DT_UINT32:
|
||||
*out = arrow::uint32();
|
||||
break;
|
||||
case ::tensorflow::DT_UINT64:
|
||||
*out = arrow::uint64();
|
||||
break;
|
||||
default:
|
||||
return Status::TypeError("TensorFlow data type is not supported");
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status GetTensorFlowType(std::shared_ptr<DataType> dtype, ::tensorflow::DataType* out) {
|
||||
switch (dtype->id()) {
|
||||
case Type::BOOL:
|
||||
*out = ::tensorflow::DT_BOOL;
|
||||
break;
|
||||
case Type::UINT8:
|
||||
*out = ::tensorflow::DT_UINT8;
|
||||
break;
|
||||
case Type::INT8:
|
||||
*out = ::tensorflow::DT_INT8;
|
||||
break;
|
||||
case Type::UINT16:
|
||||
*out = ::tensorflow::DT_UINT16;
|
||||
break;
|
||||
case Type::INT16:
|
||||
*out = ::tensorflow::DT_INT16;
|
||||
break;
|
||||
case Type::UINT32:
|
||||
*out = ::tensorflow::DT_UINT32;
|
||||
break;
|
||||
case Type::INT32:
|
||||
*out = ::tensorflow::DT_INT32;
|
||||
break;
|
||||
case Type::UINT64:
|
||||
*out = ::tensorflow::DT_UINT64;
|
||||
break;
|
||||
case Type::INT64:
|
||||
*out = ::tensorflow::DT_INT64;
|
||||
break;
|
||||
case Type::HALF_FLOAT:
|
||||
*out = ::tensorflow::DT_HALF;
|
||||
break;
|
||||
case Type::FLOAT:
|
||||
*out = ::tensorflow::DT_FLOAT;
|
||||
break;
|
||||
case Type::DOUBLE:
|
||||
*out = ::tensorflow::DT_DOUBLE;
|
||||
break;
|
||||
default:
|
||||
return Status::TypeError("Arrow data type is not supported");
|
||||
}
|
||||
return arrow::Status::OK();
|
||||
}
|
||||
|
||||
} // namespace tensorflow
|
||||
|
||||
} // namespace adapters
|
||||
|
||||
} // namespace arrow
|
||||
Reference in New Issue
Block a user