Initial commit

2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions
--- a/venv/lib/python3.10/site-packages/pyarrow/include/arrow/adapters/orc/adapter.h
+++ b/venv/lib/python3.10/site-packages/pyarrow/include/arrow/adapters/orc/adapter.h
@@ -0,0 +1,323 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "arrow/adapters/orc/options.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/memory_pool.h"
+#include "arrow/record_batch.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace adapters {
+namespace orc {
+
+/// \brief Information about an ORC stripe (see ORCFileReader::GetStripeInformation)
+struct StripeInformation {  // plain aggregate: members have no default initializers
+  /// \brief Offset of the stripe from the start of the file, in bytes
+  int64_t offset;
+  /// \brief Length of the stripe, in bytes
+  int64_t length;
+  /// \brief Number of rows in the stripe
+  int64_t num_rows;
+  /// \brief Index of the first row of the stripe
+  int64_t first_row_id;  // NOTE(review): presumably a file-wide row index — confirm
+};
+
+/// \class ORCFileReader
+/// \brief Read an Arrow Table or RecordBatch from an ORC file.
+class ARROW_EXPORT ORCFileReader {
+ public:
+  ~ORCFileReader();
+
+  /// \brief Creates a new ORC reader
+  ///
+  /// \param[in] file the data source
+  /// \param[in] pool a MemoryPool to use for buffer allocations
+  /// \return the newly created reader on success
+  static Result<std::unique_ptr<ORCFileReader>> Open(
+      const std::shared_ptr<io::RandomAccessFile>& file, MemoryPool* pool);
+
+  /// \brief Return the schema read from the ORC file
+  ///
+  /// \return the Schema of the file
+  Result<std::shared_ptr<Schema>> ReadSchema();
+
+  /// \brief Read the file as a Table
+  ///
+  /// The table will be composed of one record batch per stripe.
+  ///
+  /// \return the Table holding the file contents
+  Result<std::shared_ptr<Table>> Read();
+
+  /// \brief Read the file as a Table
+  ///
+  /// The table will be composed of one record batch per stripe.
+  ///
+  /// \param[in] schema the Table schema
+  /// \return the Table holding the file contents
+  Result<std::shared_ptr<Table>> Read(const std::shared_ptr<Schema>& schema);
+
+  /// \brief Read the file as a Table
+  ///
+  /// The table will be composed of one record batch per stripe.
+  ///
+  /// \param[in] include_indices the selected field indices to read
+  /// \return the Table holding the selected fields
+  Result<std::shared_ptr<Table>> Read(const std::vector<int>& include_indices);
+
+  /// \brief Read the file as a Table
+  ///
+  /// The table will be composed of one record batch per stripe.
+  ///
+  /// \param[in] include_names the selected field names to read
+  /// \return the Table holding the selected fields
+  Result<std::shared_ptr<Table>> Read(const std::vector<std::string>& include_names);
+
+  /// \brief Read the file as a Table
+  ///
+  /// The table will be composed of one record batch per stripe.
+  ///
+  /// \param[in] schema the Table schema
+  /// \param[in] include_indices the selected field indices to read
+  /// \return the Table holding the selected fields
+  Result<std::shared_ptr<Table>> Read(const std::shared_ptr<Schema>& schema,
+                                      const std::vector<int>& include_indices);
+
+  /// \brief Read a single stripe as a RecordBatch
+  ///
+  /// \param[in] stripe the stripe index
+  /// \return the RecordBatch holding the stripe
+  Result<std::shared_ptr<RecordBatch>> ReadStripe(int64_t stripe);
+
+  /// \brief Read a single stripe as a RecordBatch
+  ///
+  /// \param[in] stripe the stripe index
+  /// \param[in] include_indices the selected field indices to read
+  /// \return the RecordBatch holding the selected fields of the stripe
+  Result<std::shared_ptr<RecordBatch>> ReadStripe(
+      int64_t stripe, const std::vector<int>& include_indices);
+
+  /// \brief Read a single stripe as a RecordBatch
+  ///
+  /// \param[in] stripe the stripe index
+  /// \param[in] include_names the selected field names to read
+  /// \return the RecordBatch holding the selected fields of the stripe
+  Result<std::shared_ptr<RecordBatch>> ReadStripe(
+      int64_t stripe, const std::vector<std::string>& include_names);
+
+  /// \brief Seek to the designated row, so that a subsequent NextStripeReader()
+  ///        call returns a stripe reader starting from that row.
+  ///
+  /// \param[in] row_number the row number to seek to
+  Status Seek(int64_t row_number);
+
+  /// \brief Get a stripe level record batch iterator.
+  ///
+  /// Each record batch will have up to `batch_size` rows.
+  /// NextStripeReader serves as a fine-grained alternative to ReadStripe
+  /// which may cause OOM issues by loading the whole stripe into memory.
+  ///
+  /// Note this will only read rows for the current stripe, not the entire
+  /// file.
+  ///
+  /// \param[in] batch_size the maximum number of rows in each record batch
+  /// \return the stripe reader
+  Result<std::shared_ptr<RecordBatchReader>> NextStripeReader(int64_t batch_size);
+
+  /// \brief Get a stripe level record batch iterator.
+  ///
+  /// Each record batch will have up to `batch_size` rows.
+  /// NextStripeReader serves as a fine-grained alternative to ReadStripe
+  /// which may cause OOM issues by loading the whole stripe into memory.
+  ///
+  /// Note this will only read rows for the current stripe, not the entire
+  /// file.
+  ///
+  /// \param[in] batch_size the maximum number of rows in each record batch
+  /// \param[in] include_indices the selected field indices to read
+  /// \return the stripe reader
+  Result<std::shared_ptr<RecordBatchReader>> NextStripeReader(
+      int64_t batch_size, const std::vector<int>& include_indices);
+
+  /// \brief Get a record batch iterator for the entire file.
+  ///
+  /// Each record batch will have up to `batch_size` rows.
+  ///
+  /// \param[in] batch_size the maximum number of rows in each record batch
+  /// \param[in] include_names the selected field names to read, if not empty
+  /// (otherwise all fields are read)
+  /// \return the record batch iterator
+  Result<std::shared_ptr<RecordBatchReader>> GetRecordBatchReader(
+      int64_t batch_size, const std::vector<std::string>& include_names);
+
+  /// \brief The number of stripes in the file
+  int64_t NumberOfStripes();
+
+  /// \brief The number of rows in the file
+  int64_t NumberOfRows();
+
+  /// \brief StripeInformation for the stripe at the given index.
+  StripeInformation GetStripeInformation(int64_t stripe);
+
+  /// \brief Get the format version of the file.
+  ///         Currently known values are 0.11 and 0.12.
+  ///
+  /// \return The FileVersion of the ORC file.
+  FileVersion GetFileVersion();
+
+  /// \brief Get the software instance and version that wrote this file.
+  ///
+  /// \return a user-facing string that specifies the software version
+  std::string GetSoftwareVersion();
+
+  /// \brief Get the compression kind of the file.
+  ///
+  /// \return The kind of compression in the ORC file.
+  Result<Compression::type> GetCompression();
+
+  /// \brief Get the buffer size for the compression.
+  ///
+  /// \return Number of bytes to buffer for the compression codec.
+  int64_t GetCompressionSize();
+
+  /// \brief Get the number of rows per entry in the row index.
+  /// \return the number of rows per entry in the row index, or 0 if there
+  ///          is no row index.
+  int64_t GetRowIndexStride();
+
+  /// \brief Get ID of writer that generated the file.
+  ///
+  /// \return WriterId::kUnknown (ORC's UNKNOWN_WRITER) if the writer ID is undefined
+  WriterId GetWriterId();
+
+  /// \brief Get the writer id value when GetWriterId() returns an unknown writer.
+  ///
+  /// \return the integer value of the writer ID.
+  int32_t GetWriterIdValue();
+
+  /// \brief Get the version of the writer.
+  ///
+  /// \return the version of the writer.
+
+  WriterVersion GetWriterVersion();
+
+  /// \brief Get the number of stripe statistics in the file.
+  ///
+  /// \return the number of stripe statistics
+  int64_t GetNumberOfStripeStatistics();
+
+  /// \brief Get the length of the data stripes in the file.
+  ///
+  /// \return the number of bytes in the stripes
+  int64_t GetContentLength();
+
+  /// \brief Get the length of the file stripe statistics.
+  ///
+  /// \return the number of compressed bytes in the file stripe statistics
+  int64_t GetStripeStatisticsLength();
+
+  /// \brief Get the length of the file footer.
+  ///
+  /// \return the number of compressed bytes in the file footer
+  int64_t GetFileFooterLength();
+
+  /// \brief Get the length of the file postscript.
+  ///
+  /// \return the number of bytes in the file postscript
+  int64_t GetFilePostscriptLength();
+
+  /// \brief Get the total length of the file.
+  ///
+  /// \return the number of bytes in the file
+  int64_t GetFileLength();
+
+  /// \brief Get the serialized file tail.
+  ///         Useful if another reader of the same file wants to avoid re-reading
+  ///         the file tail. See ReadOptions.SetSerializedFileTail().
+  ///
+  /// \return a string of bytes with the file tail
+  std::string GetSerializedFileTail();
+
+  /// \brief Return the metadata read from the ORC file
+  ///
+  /// \return A KeyValueMetadata object containing the ORC metadata
+  Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata();
+
+ private:
+  class Impl;  // implementation type, only forward-declared in this header
+  std::unique_ptr<Impl> impl_;
+  ORCFileReader();  // private: instances are created through Open()
+};
+
+/// \class ORCFileWriter
+/// \brief Write an Arrow Table or RecordBatch to an ORC file.
+class ARROW_EXPORT ORCFileWriter {
+ public:
+  ~ORCFileWriter();
+  /// \brief Creates a new ORC writer.
+  ///
+  /// \param[in] output_stream a pointer to the io::OutputStream to write into
+  /// \param[in] write_options the ORC writer options for Arrow
+  /// \return the newly created writer on success
+  static Result<std::unique_ptr<ORCFileWriter>> Open(
+      io::OutputStream* output_stream,
+      const WriteOptions& write_options = WriteOptions());
+
+  /// \brief Write a table. This can be called multiple times.
+  ///
+  /// Tables passed in subsequent calls must match the schema of the table that was
+  /// written first.
+  ///
+  /// \param[in] table the Arrow table from which data is extracted.
+  /// \return Status reporting whether the write succeeded
+  Status Write(const Table& table);
+
+  /// \brief Write a RecordBatch. This can be called multiple times.
+  ///
+  /// RecordBatches passed in subsequent calls must match the schema of the
+  /// RecordBatch that was written first.
+  ///
+  /// \param[in] record_batch the Arrow RecordBatch from which data is extracted.
+  /// \return Status reporting whether the write succeeded
+  Status Write(const RecordBatch& record_batch);
+
+  /// \brief Close the ORC writer (orc::Writer)
+  ///
+  /// \return Status reporting whether the close succeeded
+  Status Close();
+
+ private:
+  class Impl;  // implementation type, only forward-declared in this header
+  std::unique_ptr<Impl> impl_;
+
+ private:
+  ORCFileWriter();  // private: instances are created through Open()
+};
+
+}  // namespace orc
+}  // namespace adapters
+}  // namespace arrow
--- a/venv/lib/python3.10/site-packages/pyarrow/include/arrow/adapters/orc/options.h
+++ b/venv/lib/python3.10/site-packages/pyarrow/include/arrow/adapters/orc/options.h
@@ -0,0 +1,120 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <vector>
+
+#include "arrow/io/interfaces.h"
+#include "arrow/status.h"
+#include "arrow/util/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+namespace adapters {
+
+namespace orc {
+
+enum class WriterId : int32_t {  // ID of the writer that generated an ORC file
+  kOrcJava = 0,
+  kOrcCpp = 1,
+  kPresto = 2,
+  kScritchleyGo = 3,
+  kTrino = 4,
+  kUnknown = INT32_MAX  // writer ID is undefined (see ORCFileReader::GetWriterId)
+};
+
+enum class WriterVersion : int32_t {  // version of the writer that produced an ORC file
+  kOriginal = 0,
+  kHive8732 = 1,
+  kHive4243 = 2,
+  kHive12055 = 3,
+  kHive13083 = 4,
+  kOrc101 = 5,
+  kOrc135 = 6,
+  kOrc517 = 7,
+  kOrc203 = 8,
+  kOrc14 = 9,
+  kMax = INT32_MAX  // NOTE(review): presumably an upper-bound sentinel — confirm
+};
+
+enum class CompressionStrategy : int32_t { kSpeed = 0, kCompression };  // speed vs. size
+
+/// \brief Version of the ORC file format, held as a (major, minor) pair.
+class ARROW_EXPORT FileVersion {
+ public:
+  static const FileVersion& v_0_11();
+  static const FileVersion& v_0_12();
+
+  /// Build a version from its major and minor components.
+  FileVersion(int32_t major, int32_t minor)
+      : major_version_(major), minor_version_(minor) {}
+
+  /// Get major version
+  int32_t major_version() const { return major_version_; }
+
+  /// Get minor version
+  int32_t minor_version() const { return minor_version_; }
+
+  /// Two versions compare equal when both components match.
+  bool operator==(const FileVersion& right) const {
+    if (major_version_ != right.major_version_) return false;
+    return minor_version_ == right.minor_version_;
+  }
+
+  bool operator!=(const FileVersion& right) const { return !operator==(right); }
+
+  /// String form of the version; defined outside this header.
+  std::string ToString() const;
+
+ private:
+  int32_t major_version_;
+  int32_t minor_version_;
+};
+
+/// Options for the ORC writer (passed to ORCFileWriter::Open)
+struct ARROW_EXPORT WriteOptions {
+  /// Number of rows the ORC writer writes at a time, default 1024
+  int64_t batch_size = 1024;
+  /// Which ORC file version to use, default FileVersion(0, 12)
+  FileVersion file_version = FileVersion(0, 12);
+  /// Size of each ORC stripe in bytes, default 64 MiB
+  int64_t stripe_size = 64 * 1024 * 1024;
+  /// The compression codec of the ORC file, default Compression::UNCOMPRESSED (none)
+  Compression::type compression = Compression::UNCOMPRESSED;
+  /// The size of each compression block in bytes, default 64 KiB
+  int64_t compression_block_size = 64 * 1024;
+  /// The compression strategy, i.e. speed vs. size reduction, default
+  /// CompressionStrategy::kSpeed
+  CompressionStrategy compression_strategy = CompressionStrategy::kSpeed;
+  /// The number of rows per entry in the row index, default 10000
+  int64_t row_index_stride = 10000;
+  /// The padding tolerance, default 0.0
+  double padding_tolerance = 0.0;
+  /// The dictionary key size threshold: 0 disables dictionary encoding and
+  /// 1 always enables it, default 0.0
+  double dictionary_key_size_threshold = 0.0;
+  /// The array of columns that use the bloom filter, default empty
+  std::vector<int64_t> bloom_filter_columns;  // NOTE(review): presumably column indices
+  /// The upper limit of the false-positive rate of the bloom filter, default 0.05
+  double bloom_filter_fpp = 0.05;
+};
+
+}  // namespace orc
+}  // namespace adapters
+}  // namespace arrow
--- a/venv/lib/python3.10/site-packages/pyarrow/include/arrow/adapters/tensorflow/convert.h
+++ b/venv/lib/python3.10/site-packages/pyarrow/include/arrow/adapters/tensorflow/convert.h
@@ -0,0 +1,128 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "tensorflow/core/framework/op.h"
+
+#include "arrow/type.h"
+
+// These utilities are supposed to be included in TensorFlow operators
+// that need to be compiled separately from Arrow because of ABI issues.
+// They therefore need to be header-only.
+
+namespace arrow {
+
+namespace adapters {
+
+namespace tensorflow {
+
+/// \brief Map a TensorFlow dtype to its Arrow type. Inline: this header is header-only.
+inline Status GetArrowType(::tensorflow::DataType dtype, std::shared_ptr<DataType>* out) {
+  switch (dtype) {  // every supported dtype stores its Arrow type in *out and returns OK
+    case ::tensorflow::DT_BOOL:
+      *out = arrow::boolean();
+      return Status::OK();
+    case ::tensorflow::DT_FLOAT:
+      *out = arrow::float32();
+      return Status::OK();
+    case ::tensorflow::DT_DOUBLE:
+      *out = arrow::float64();
+      return Status::OK();
+    case ::tensorflow::DT_HALF:
+      *out = arrow::float16();
+      return Status::OK();
+    case ::tensorflow::DT_INT8:
+      *out = arrow::int8();
+      return Status::OK();
+    case ::tensorflow::DT_INT16:
+      *out = arrow::int16();
+      return Status::OK();
+    case ::tensorflow::DT_INT32:
+      *out = arrow::int32();
+      return Status::OK();
+    case ::tensorflow::DT_INT64:
+      *out = arrow::int64();
+      return Status::OK();
+    case ::tensorflow::DT_UINT8:
+      *out = arrow::uint8();
+      return Status::OK();
+    case ::tensorflow::DT_UINT16:
+      *out = arrow::uint16();
+      return Status::OK();
+    case ::tensorflow::DT_UINT32:
+      *out = arrow::uint32();
+      return Status::OK();
+    case ::tensorflow::DT_UINT64:
+      *out = arrow::uint64();
+      return Status::OK();
+    default:  // no Arrow counterpart: *out is left untouched
+      return Status::TypeError("TensorFlow data type is not supported");
+  }
+}
+
+/// \brief Map an Arrow type to its TensorFlow dtype. Inline: this header is header-only.
+inline Status GetTensorFlowType(std::shared_ptr<DataType> dtype, ::tensorflow::DataType* out) {
+  switch (dtype->id()) {  // NOTE(review): dtype is dereferenced unchecked; must be non-null
+    case Type::BOOL:
+      *out = ::tensorflow::DT_BOOL;
+      return Status::OK();
+    case Type::UINT8:
+      *out = ::tensorflow::DT_UINT8;
+      return Status::OK();
+    case Type::INT8:
+      *out = ::tensorflow::DT_INT8;
+      return Status::OK();
+    case Type::UINT16:
+      *out = ::tensorflow::DT_UINT16;
+      return Status::OK();
+    case Type::INT16:
+      *out = ::tensorflow::DT_INT16;
+      return Status::OK();
+    case Type::UINT32:
+      *out = ::tensorflow::DT_UINT32;
+      return Status::OK();
+    case Type::INT32:
+      *out = ::tensorflow::DT_INT32;
+      return Status::OK();
+    case Type::UINT64:
+      *out = ::tensorflow::DT_UINT64;
+      return Status::OK();
+    case Type::INT64:
+      *out = ::tensorflow::DT_INT64;
+      return Status::OK();
+    case Type::HALF_FLOAT:
+      *out = ::tensorflow::DT_HALF;
+      return Status::OK();
+    case Type::FLOAT:
+      *out = ::tensorflow::DT_FLOAT;
+      return Status::OK();
+    case Type::DOUBLE:
+      *out = ::tensorflow::DT_DOUBLE;
+      return Status::OK();
+    default:  // no TensorFlow counterpart: *out is left untouched
+      return Status::TypeError("Arrow data type is not supported");
+  }
+}
+
+}  // namespace tensorflow
+
+}  // namespace adapters
+
+}  // namespace arrow