Initial commit

This commit is contained in:
kdusek
2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions

View File

@@ -0,0 +1,21 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include "arrow/json/options.h"
#include "arrow/json/reader.h"

View File

@@ -0,0 +1,68 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include <vector>
#include "arrow/status.h"
#include "arrow/type_fwd.h"
#include "arrow/util/type_fwd.h"
#include "arrow/util/visibility.h"
namespace arrow {
namespace json {
class PromotionGraph;
/// \brief Abstract interface for assembling a ChunkedArray from individually
/// inserted blocks
///
/// Blocks are submitted through Insert() and the assembled result is obtained
/// from Finish(). The TaskGroup supplied at construction is stored in task_group_.
class ARROW_EXPORT ChunkedArrayBuilder {
public:
virtual ~ChunkedArrayBuilder() = default;
/// \brief Spawn a task that will try to convert and insert the given JSON block
///
/// \param[in] block_index index identifying the block being inserted
/// \param[in] unconverted_field field associated with the unconverted array
/// \param[in] unconverted the not-yet-converted array for this block
virtual void Insert(int64_t block_index,
const std::shared_ptr<Field>& unconverted_field,
const std::shared_ptr<Array>& unconverted) = 0;
/// \brief Return the final chunked array.
///
/// Every chunk must be inserted before this is called!
/// \param[out] out receives the final chunked array
virtual Status Finish(std::shared_ptr<ChunkedArray>* out) = 0;
/// \brief Finish current task group and substitute a new one
///
/// \param[in] task_group the replacement task group
virtual Status ReplaceTaskGroup(
const std::shared_ptr<arrow::internal::TaskGroup>& task_group) = 0;
protected:
/// Store the task group so that implementations can use it.
explicit ChunkedArrayBuilder(
const std::shared_ptr<arrow::internal::TaskGroup>& task_group)
: task_group_(task_group) {}
std::shared_ptr<arrow::internal::TaskGroup> task_group_;
};
/// \brief Create a chunked builder
///
/// If unexpected fields and promotion need to be handled, promotion_graph must be
/// non-null.
///
/// \param[in] task_group task group handed to the builder
/// \param[in] pool memory pool handed to the builder
/// \param[in] promotion_graph promotion rules; may be null when unexpected fields
/// and promotion do not need to be handled
/// \param[in] type type of the array to build
/// \param[out] out receives the constructed builder
ARROW_EXPORT Status MakeChunkedArrayBuilder(
const std::shared_ptr<arrow::internal::TaskGroup>& task_group, MemoryPool* pool,
const PromotionGraph* promotion_graph, const std::shared_ptr<DataType>& type,
std::shared_ptr<ChunkedArrayBuilder>* out);
} // namespace json
} // namespace arrow

View File

@@ -0,0 +1,35 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include "arrow/util/delimiting.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
namespace json {
struct ParseOptions;
/// \brief Create a Chunker configured from the given parse options
///
/// NOTE(review): which ParseOptions members influence chunking is decided by the
/// implementation, which is not visible from this header.
ARROW_EXPORT
std::unique_ptr<Chunker> MakeChunker(const ParseOptions& options);
} // namespace json
} // namespace arrow

View File

@@ -0,0 +1,94 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include <string>
#include "arrow/status.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
class Array;
class DataType;
class Field;
class MemoryPool;
namespace json {
/// \brief interface for conversion of Arrays
///
/// Converters are not required to be correct for arbitrary input- only
/// for unconverted arrays emitted by a corresponding parser.
class ARROW_EXPORT Converter {
 public:
  virtual ~Converter() = default;

  /// \brief Convert an array
  ///
  /// On failure, this converter may be promoted to another converter which
  /// *can* convert the given input.
  ///
  /// \param[in] in the array to convert
  /// \param[out] out receives the converted array
  virtual Status Convert(const std::shared_ptr<Array>& in,
                         std::shared_ptr<Array>* out) = 0;

  /// \brief The output type this converter was constructed with
  std::shared_ptr<DataType> out_type() const { return out_type_; }

  /// \brief The memory pool this converter was constructed with
  ///
  /// Const-qualified, like out_type(), so that it can be called through a
  /// const Converter; it only reads pool_.
  MemoryPool* pool() const { return pool_; }

 protected:
  ARROW_DISALLOW_COPY_AND_ASSIGN(Converter);

  /// Record the pool and output type for use by concrete converters.
  Converter(MemoryPool* pool, const std::shared_ptr<DataType>& out_type)
      : pool_(pool), out_type_(out_type) {}

  MemoryPool* pool_;
  std::shared_ptr<DataType> out_type_;
};
/// \brief produce a single converter to the specified out_type
///
/// \param[in] out_type the type the converter should produce
/// \param[in] pool memory pool handed to the converter
/// \param[out] out receives the constructed converter
ARROW_EXPORT Status MakeConverter(const std::shared_ptr<DataType>& out_type,
MemoryPool* pool, std::shared_ptr<Converter>* out);
/// \brief Abstract description of how types are inferred for unexpected fields
/// and which type to try next after a conversion fails
///
/// Instances are neither copyable nor assignable, and the default constructor is
/// protected.
class ARROW_EXPORT PromotionGraph {
public:
virtual ~PromotionGraph() = default;
/// \brief produce a valid field which will be inferred as null
///
/// \param[in] name name given to the produced field
virtual std::shared_ptr<Field> Null(const std::string& name) const = 0;
/// \brief given an unexpected field encountered during parsing, return a type to which
/// it may be convertible (may return null if none is available)
virtual std::shared_ptr<DataType> Infer(
const std::shared_ptr<Field>& unexpected_field) const = 0;
/// \brief given a type to which conversion failed, return a promoted type to which
/// conversion may succeed (may return null if none is available)
///
/// \param[in] failed the type to which conversion failed
/// \param[in] unexpected_field the field whose conversion is being attempted
virtual std::shared_ptr<DataType> Promote(
const std::shared_ptr<DataType>& failed,
const std::shared_ptr<Field>& unexpected_field) const = 0;
protected:
ARROW_DISALLOW_COPY_AND_ASSIGN(PromotionGraph);
PromotionGraph() = default;
};
/// \brief Return a pointer to a PromotionGraph
///
/// NOTE(review): presumably a shared, library-owned default instance that callers
/// must not delete -- confirm against the implementation.
ARROW_EXPORT const PromotionGraph* GetPromotionGraph();
} // namespace json
} // namespace arrow

View File

@@ -0,0 +1,112 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// Implement a simple JSON representation format for arrays
#pragma once
#include <memory>
#include <string>
#include <string_view>
#include "arrow/status.h"
#include "arrow/type_fwd.h"
#include "arrow/util/visibility.h"
namespace arrow {
class Array;
class DataType;
namespace json {
/// \defgroup array-from-json-string FromJSONString Helpers
///
/// These helpers are intended to be used in examples, tests, or for quick
/// prototyping and are not intended to be used where performance matters.
///
/// See the <a href="../arrays.html#fromjsonstring-helpers">User Guide</a> for
/// more information.
///
/// @{
/// \brief Create an Array from a JSON string
///
/// The first argument is the type of the array to create; `json` is the JSON text
/// describing its values. Overloads are provided for std::string,
/// std::string_view and C strings.
///
/// \code {.cpp}
/// Result<std::shared_ptr<Array>> maybe_array =
/// ArrayFromJSONString(int64(), "[2, 3, null, 7, 11]");
/// \endcode
ARROW_EXPORT
Result<std::shared_ptr<Array>> ArrayFromJSONString(const std::shared_ptr<DataType>&,
const std::string& json);
/// \copydoc ArrayFromJSONString(const std::shared_ptr<DataType>&, const std::string&)
ARROW_EXPORT
Result<std::shared_ptr<Array>> ArrayFromJSONString(const std::shared_ptr<DataType>&,
std::string_view json);
/// \copydoc ArrayFromJSONString(const std::shared_ptr<DataType>&, const std::string&)
ARROW_EXPORT
Result<std::shared_ptr<Array>> ArrayFromJSONString(const std::shared_ptr<DataType>&,
const char* json);
/// \brief Create a ChunkedArray from a JSON string
///
/// \param[in] type the type of the chunked array to create
/// \param[in] json_strings JSON texts; the example below passes one JSON array
/// per string
///
/// \code {.cpp}
/// Result<std::shared_ptr<ChunkedArray>> maybe_chunked_array =
/// ChunkedArrayFromJSONString(int64(), {R"([5, 10])", R"([null])", R"([16])"});
/// \endcode
ARROW_EXPORT
Result<std::shared_ptr<ChunkedArray>> ChunkedArrayFromJSONString(
const std::shared_ptr<DataType>& type, const std::vector<std::string>& json_strings);
/// \brief Create a DictionaryArray from a JSON string
///
/// The first argument is the dictionary type to create.
///
/// \param[in] indices_json JSON text describing the indices
/// \param[in] dictionary_json JSON text describing the dictionary values
///
/// \code {.cpp}
/// Result<std::shared_ptr<Array>> maybe_dict_array =
/// DictArrayFromJSONString(dictionary(int32(), utf8()), "[0, 1, 0, 2, 0, 3]",
/// R"(["k1", "k2", "k3", "k4"])");
/// \endcode
ARROW_EXPORT
Result<std::shared_ptr<Array>> DictArrayFromJSONString(const std::shared_ptr<DataType>&,
std::string_view indices_json,
std::string_view dictionary_json);
/// \brief Create a Scalar from a JSON string
///
/// The first argument is the type of the scalar to create; `json` is the JSON
/// text describing its value. The scalar is returned through the Result.
///
/// \code {.cpp}
/// Result<std::shared_ptr<Scalar>> maybe_scalar =
/// ScalarFromJSONString(float64(), "42");
/// \endcode
ARROW_EXPORT
Result<std::shared_ptr<Scalar>> ScalarFromJSONString(const std::shared_ptr<DataType>&,
std::string_view json);
/// \brief Create a DictionaryScalar from a JSON string
///
/// The first argument is the dictionary type to create. The scalar is returned
/// through the Result.
///
/// \param[in] index_json JSON text describing the index
/// \param[in] dictionary_json JSON text describing the dictionary values
///
/// \code {.cpp}
/// Result<std::shared_ptr<Scalar>> maybe_dict_scalar =
/// DictScalarFromJSONString(dictionary(int32(), utf8()), "3",
/// R"(["k1", "k2", "k3", "k4"])");
/// \endcode
ARROW_EXPORT
Result<std::shared_ptr<Scalar>> DictScalarFromJSONString(
const std::shared_ptr<DataType>&, std::string_view index_json,
std::string_view dictionary_json);
/// @}
} // namespace json
} // namespace arrow

View File

@@ -0,0 +1,54 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include <string_view>
#include <unordered_map>
#include "arrow/result.h"
#include "arrow/util/visibility.h"
namespace arrow {
namespace json {
namespace internal {
/// This class is a helper to parse a json object from a string.
/// It uses rapidjson::Document in implementation.
///
/// The implementation is hidden behind the private Impl class held in impl_.
class ARROW_EXPORT ObjectParser {
public:
ObjectParser();
~ObjectParser();
/// Parse the given JSON text.
/// NOTE(review): presumably this must succeed before any getter is called --
/// confirm against the implementation.
Status Parse(std::string_view json);
/// Get the member named `key` as a string.
/// NOTE(review): the error cases (missing key, non-string value) are defined by
/// the implementation and are not visible from this header.
Result<std::string> GetString(const char* key) const;
/// Get the member named `key` as a boolean.
/// NOTE(review): error cases are defined by the implementation.
Result<bool> GetBool(const char* key) const;
// Get all members of the object as a map from string keys to string values
Result<std::unordered_map<std::string, std::string>> GetStringMap() const;
private:
class Impl;
std::unique_ptr<Impl> impl_;
};
} // namespace internal
} // namespace json
} // namespace arrow

View File

@@ -0,0 +1,49 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include <string>
#include <string_view>
#include "arrow/util/visibility.h"
namespace arrow {
namespace json {
namespace internal {
/// This class is a helper to serialize a json object to a string.
/// It uses rapidjson in implementation.
///
/// The implementation is hidden behind the private Impl class held in impl_.
class ARROW_EXPORT ObjectWriter {
public:
ObjectWriter();
~ObjectWriter();
/// Set the member named `key` to the given string value.
/// NOTE(review): behavior when `key` was already set is defined by the
/// implementation -- confirm there.
void SetString(std::string_view key, std::string_view value);
/// Set the member named `key` to the given boolean value.
void SetBool(std::string_view key, bool value);
/// Return the object serialized as a JSON string.
std::string Serialize();
private:
class Impl;
std::unique_ptr<Impl> impl_;
};
} // namespace internal
} // namespace json
} // namespace arrow

View File

@@ -0,0 +1,74 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <memory>
#include "arrow/json/type_fwd.h"
#include "arrow/util/visibility.h"
namespace arrow {
class DataType;
class Schema;
namespace json {
/// Policy applied to JSON fields that are not part of the expected schema
enum class UnexpectedFieldBehavior : char {
  /// Skip unexpected JSON fields
  Ignore = 0,
  /// Treat unexpected JSON fields as an error
  Error = 1,
  /// Infer a type for unexpected JSON fields and include them in the output
  InferType = 2
};
/// \brief Options controlling how JSON text is parsed
struct ARROW_EXPORT ParseOptions {
// Parsing options
/// Optional explicit schema (disables type inference on those fields)
std::shared_ptr<Schema> explicit_schema;
/// Whether objects may be printed across multiple lines (for example pretty-printed)
///
/// If true, parsing may be slower. Defaults to false.
bool newlines_in_values = false;
/// How JSON fields outside of explicit_schema (if given) are treated
///
/// Defaults to UnexpectedFieldBehavior::InferType.
UnexpectedFieldBehavior unexpected_field_behavior = UnexpectedFieldBehavior::InferType;
/// Create parsing options with default values
static ParseOptions Defaults();
};
/// \brief Options controlling how JSON input is read
struct ARROW_EXPORT ReadOptions {
// Reader options
/// Whether to use the global CPU thread pool. Defaults to true.
bool use_threads = true;
/// Block size we request from the IO layer; also determines the size of
/// chunks when use_threads is true
///
/// Defaults to 1 << 20 (1048576).
int32_t block_size = 1 << 20; // 1 MB
/// Create read options with default values
static ReadOptions Defaults();
};
} // namespace json
} // namespace arrow

View File

@@ -0,0 +1,107 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include <string>
#include "arrow/json/options.h"
#include "arrow/status.h"
#include "arrow/util/key_value_metadata.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
class Array;
class Buffer;
class MemoryPool;
class KeyValueMetadata;
class ResizableBuffer;
namespace json {
/// \brief Enumeration of JSON value kinds, with helpers for mapping a kind to
/// and from other representations
struct Kind {
/// The kinds themselves; stored as uint8_t
enum type : uint8_t {
kNull,
kBoolean,
kNumber,
kString,
kArray,
kObject,
kNumberOrString
};
/// Return a name for the given kind
static const std::string& Name(Kind::type);
/// Return a KeyValueMetadata tag for the given kind
static const std::shared_ptr<const KeyValueMetadata>& Tag(Kind::type);
/// Return the kind corresponding to a KeyValueMetadata tag
/// NOTE(review): presumably the inverse of Tag() -- confirm against the
/// implementation, including the behavior for unrecognized tags.
static Kind::type FromTag(const std::shared_ptr<const KeyValueMetadata>& tag);
/// Determine the kind for the given DataType, writing it to *kind
static Status ForType(const DataType& type, Kind::type* kind);
};
/// \class BlockParser
/// \brief A reusable block-based parser for JSON data
///
/// The parser takes a block of newline delimited JSON data and extracts Arrays
/// of unconverted strings which can be fed to a Converter to obtain a usable Array.
///
/// Note that in addition to parse errors (such as malformed JSON) some conversion
/// errors are caught at parse time:
/// - A null value in non-nullable column
/// - Change in the JSON kind of a column. For example, if an explicit schema is provided
/// which stipulates that field "a" is integral, a row of {"a": "not a number"} will
/// result in an error. This also applies to fields outside an explicit schema.
class ARROW_EXPORT BlockParser {
public:
virtual ~BlockParser() = default;
/// \brief Reserve storage for scalars parsed from a block of json
///
/// \param[in] nbytes number of bytes to reserve
virtual Status ReserveScalarStorage(int64_t nbytes) = 0;
/// \brief Parse a block of data
///
/// \param[in] json buffer holding the JSON block
virtual Status Parse(const std::shared_ptr<Buffer>& json) = 0;
/// \brief Extract parsed data
///
/// \param[out] parsed receives the array of parsed data
virtual Status Finish(std::shared_ptr<Array>* parsed) = 0;
/// \brief Return the number of parsed rows
int32_t num_rows() const { return num_rows_; }
/// \brief Construct a BlockParser
///
/// \param[in] pool MemoryPool to use when constructing parsed array
/// \param[in] options ParseOptions to use when parsing JSON
/// \param[out] out constructed BlockParser
static Status Make(MemoryPool* pool, const ParseOptions& options,
std::unique_ptr<BlockParser>* out);
/// \brief Construct a BlockParser without naming a MemoryPool
///
/// NOTE(review): presumably uses the default memory pool -- confirm against the
/// implementation.
///
/// \param[in] options ParseOptions to use when parsing JSON
/// \param[out] out constructed BlockParser
static Status Make(const ParseOptions& options, std::unique_ptr<BlockParser>* out);
protected:
ARROW_DISALLOW_COPY_AND_ASSIGN(BlockParser);
explicit BlockParser(MemoryPool* pool) : pool_(pool) {}
MemoryPool* pool_;
// Value reported by num_rows(); starts at zero
int32_t num_rows_ = 0;
};
} // namespace json
} // namespace arrow

View File

@@ -0,0 +1,43 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// Include this file before including any RapidJSON headers.
#pragma once
// Feature switches consumed by the RapidJSON headers. Going by their names they
// turn on std::string, rvalue-reference and range-based-for support; see the
// RapidJSON configuration documentation for the exact effect of each.
#define RAPIDJSON_HAS_STDSTRING 1
#define RAPIDJSON_HAS_CXX11_RVALUE_REFS 1
#define RAPIDJSON_HAS_CXX11_RANGE_FOR 1
// rapidjson will be defined in namespace arrow::rapidjson
// The BEGIN/END macros below open and close the two nested namespaces so that
// they match RAPIDJSON_NAMESPACE.
#define RAPIDJSON_NAMESPACE arrow::rapidjson
#define RAPIDJSON_NAMESPACE_BEGIN \
namespace arrow { \
namespace rapidjson {
#define RAPIDJSON_NAMESPACE_END \
} \
}
// enable SIMD whitespace skipping, if available
// (the choice is driven by the ARROW_HAVE_SSE4_2 / ARROW_HAVE_NEON macros, which
// are defined outside this header)
#if defined(ARROW_HAVE_SSE4_2)
# define RAPIDJSON_SSE2 1
# define RAPIDJSON_SSE42 1
#endif
#if defined(ARROW_HAVE_NEON)
# define RAPIDJSON_NEON 1
#endif

View File

@@ -0,0 +1,118 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include "arrow/io/type_fwd.h"
#include "arrow/json/options.h"
#include "arrow/record_batch.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/util/macros.h"
#include "arrow/util/type_fwd.h"
#include "arrow/util/visibility.h"
namespace arrow {
namespace json {
/// A class that reads an entire JSON file into an Arrow Table
///
/// The file is expected to consist of individual line-separated JSON objects
class ARROW_EXPORT TableReader {
public:
virtual ~TableReader() = default;
/// Read the entire JSON file and convert it to an Arrow Table
virtual Result<std::shared_ptr<Table>> Read() = 0;
/// Create a TableReader instance
///
/// \param[in] pool memory pool handed to the reader
/// \param[in] input stream providing the JSON data
///
/// The two unnamed parameters are the read options and the parse options.
static Result<std::shared_ptr<TableReader>> Make(MemoryPool* pool,
std::shared_ptr<io::InputStream> input,
const ReadOptions&,
const ParseOptions&);
};
/// \brief Parse the JSON held in a buffer into a RecordBatch
///
/// NOTE(review): presumably the whole buffer is treated as a single block and
/// yields exactly one batch -- confirm against the implementation.
///
/// \param[in] options options to use when parsing
/// \param[in] json buffer holding the JSON text
/// \return the parsed RecordBatch, or an error
ARROW_EXPORT Result<std::shared_ptr<RecordBatch>> ParseOne(ParseOptions options,
std::shared_ptr<Buffer> json);
/// \brief A class that reads a JSON file incrementally
///
/// JSON data is read from a stream in fixed-size blocks (configurable with
/// `ReadOptions::block_size`). Each block is converted to a `RecordBatch`. Yielded
/// batches have a consistent schema but may differ in row count.
///
/// The supplied `ParseOptions` are used to determine a schema, based either on a
/// provided explicit schema or inferred from the first non-empty block.
/// Afterwards, the target schema is frozen. If `UnexpectedFieldBehavior::InferType` is
/// specified, unexpected fields will only be inferred for the first block. Afterwards
/// they'll be treated as errors.
///
/// If `ReadOptions::use_threads` is `true`, each block's parsing/decoding task will be
/// parallelized on the given `cpu_executor` (with readahead corresponding to the
/// executor's capacity). If an executor isn't provided, the global thread pool will be
/// used.
///
/// If `ReadOptions::use_threads` is `false`, computations will be run on the calling
/// thread and `cpu_executor` will be ignored.
///
/// The class derives from `RecordBatchReader`, so it can be used wherever that
/// interface is expected.
class ARROW_EXPORT StreamingReader : public RecordBatchReader {
public:
virtual ~StreamingReader() = default;
/// \brief Read the next `RecordBatch` asynchronously
/// This function is async-reentrant (but not synchronously reentrant). However, if
/// threading is disabled, this will block until completion.
///
/// \return Future for the next batch
virtual Future<std::shared_ptr<RecordBatch>> ReadNextAsync() = 0;
/// \brief Get the number of bytes which have been successfully converted to record
/// batches and consumed
///
/// Marked [[nodiscard]]: the call is only useful for its return value.
[[nodiscard]] virtual int64_t bytes_processed() const = 0;
/// \brief Create a `StreamingReader` from an `InputStream`
/// Blocks until the initial batch is loaded
///
/// \param[in] stream JSON source stream
/// \param[in] read_options Options for reading
/// \param[in] parse_options Options for chunking, parsing, and conversion
/// \param[in] io_context Context for IO operations (optional)
/// \param[in] cpu_executor Executor for computation tasks (optional)
/// \return The initialized reader
static Result<std::shared_ptr<StreamingReader>> Make(
std::shared_ptr<io::InputStream> stream, const ReadOptions& read_options,
const ParseOptions& parse_options,
const io::IOContext& io_context = io::default_io_context(),
::arrow::internal::Executor* cpu_executor = NULLPTR);
/// \brief Create a `StreamingReader` from an `InputStream` asynchronously
/// Returned future completes after loading the first batch
///
/// \param[in] stream JSON source stream
/// \param[in] read_options Options for reading
/// \param[in] parse_options Options for chunking, parsing, and conversion
/// \param[in] io_context Context for IO operations (optional)
/// \param[in] cpu_executor Executor for computation tasks (optional)
/// \return Future for the initialized reader
static Future<std::shared_ptr<StreamingReader>> MakeAsync(
std::shared_ptr<io::InputStream> stream, const ReadOptions& read_options,
const ParseOptions& parse_options,
const io::IOContext& io_context = io::default_io_context(),
::arrow::internal::Executor* cpu_executor = NULLPTR);
};
} // namespace json
} // namespace arrow

View File

@@ -0,0 +1,330 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <algorithm>
#include <memory>
#include <random>
#include <sstream>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
#include "arrow/array.h"
#include "arrow/array/builder_binary.h"
#include "arrow/io/memory.h"
#include "arrow/json/converter.h"
#include "arrow/json/options.h"
#include "arrow/json/parser.h"
#include "arrow/json/rapidjson_defs.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/type.h"
#include "arrow/util/checked_cast.h"
#include "arrow/visit_type_inline.h"
#include "rapidjson/document.h"
#include "rapidjson/prettywriter.h"
#include "rapidjson/reader.h"
#include "rapidjson/writer.h"
namespace arrow {
using internal::checked_cast;
namespace json {
// Short alias for the RapidJSON namespace as configured by rapidjson_defs.h
namespace rj = arrow::rapidjson;
using rj::StringBuffer;
using std::string_view;
// JSON writer that emits into a StringBuffer; used by the generators below
using Writer = rj::Writer<StringBuffer>;
// Tunables for the random JSON generators
struct GenerateOptions {
  // Chance that any given field is emitted
  double field_probability{1.0};

  // Chance that a generated value is null
  double null_probability{0.2};

  // When true, fields are emitted in a shuffled order
  bool randomize_field_order{false};

  // Options with every member left at its default
  static constexpr GenerateOptions Defaults() { return GenerateOptions{}; }
};
// Turn a boolean success flag into a Status: OK when true, otherwise Invalid
// with an empty message
inline static Status OK(bool ok) {
  if (ok) {
    return Status::OK();
  }
  return Status::Invalid("");
}
// Forward declaration: write one random JSON value of the given type to `writer`,
// drawing randomness from engine `e`. Defined further down in this header.
template <typename Engine>
inline static Status Generate(
const std::shared_ptr<DataType>& type, Engine& e, Writer* writer,
const GenerateOptions& options = GenerateOptions::Defaults());
// Forward declaration: write one random JSON object built from the given fields to
// `writer`. Defined further down in this header.
template <typename Engine>
inline static Status Generate(
const std::vector<std::shared_ptr<Field>>& fields, Engine& e, Writer* writer,
const GenerateOptions& options = GenerateOptions::Defaults());
// Write one random JSON object for the given schema by delegating to the
// field-vector overload with the schema's fields.
template <typename Engine>
inline static Status Generate(
    const std::shared_ptr<Schema>& schm, Engine& e, Writer* writer,
    const GenerateOptions& options = GenerateOptions::Defaults()) {
  const auto& schema_fields = schm->fields();
  return Generate(schema_fields, e, writer, options);
}
// Type visitor that writes a single random JSON value of the visited type to
// `writer`, using `e` as the source of randomness. Types without a generator
// report Status::NotImplemented.
template <typename Engine>
struct GenerateImpl {
  Status Visit(const NullType&) { return OK(writer.Null()); }

  // Booleans: the low bit of a random 16-bit integer
  Status Visit(const BooleanType&) {
    std::uniform_int_distribution<uint16_t> bits;
    return OK(writer.Bool(bits(e) & 1));
  }

  // Unsigned integers: a random int narrowed to the type's C type
  template <typename T>
  enable_if_physical_unsigned_integer<T, Status> Visit(const T&) {
    using CType = typename T::c_type;
    std::uniform_int_distribution<> dist;
    auto drawn = dist(e);
    return OK(writer.Uint64(static_cast<CType>(drawn)));
  }

  // Signed integers: a random int narrowed to the type's C type
  template <typename T>
  enable_if_physical_signed_integer<T, Status> Visit(const T&) {
    using CType = typename T::c_type;
    std::uniform_int_distribution<> dist;
    auto drawn = dist(e);
    return OK(writer.Int64(static_cast<CType>(drawn)));
  }

  // Floating point: normally distributed around 0 with standard deviation 1 << 10
  template <typename T>
  enable_if_physical_floating_point<T, Status> Visit(const T&) {
    using CType = typename T::c_type;
    std::normal_distribution<CType> dist{0, 1 << 10};
    auto drawn = dist(e);
    return OK(writer.Double(drawn));
  }

  // Write a string of printable ASCII characters (codes 32..126) whose length is
  // Poisson distributed with mean 4
  Status GenerateAscii(const DataType&) {
    auto length = std::poisson_distribution<>{4}(e);
    std::uniform_int_distribution<uint16_t> printable(32, 126);  // FIXME generate UTF8
    std::string text(length, '\0');
    for (auto it = text.begin(); it != text.end(); ++it) {
      *it = static_cast<char>(printable(e));
    }
    return OK(writer.String(text.c_str()));
  }

  template <typename T>
  enable_if_base_binary<T, Status> Visit(const T& t) {
    return GenerateAscii(t);
  }

  Status Visit(const BinaryViewType& t) { return GenerateAscii(t); }

  // Lists: a JSON array whose length is Poisson distributed with mean 4; each
  // element is generated from the list's value type
  template <typename T>
  enable_if_list_like<T, Status> Visit(const T& t) {
    auto length = std::poisson_distribution<>{4}(e);
    writer.StartArray();
    for (auto remaining = length; remaining > 0; --remaining) {
      RETURN_NOT_OK(Generate(t.value_type(), e, &writer, options));
    }
    return OK(writer.EndArray(length));
  }

  Status Visit(const ListViewType& t) { return NotImplemented(t); }
  Status Visit(const LargeListViewType& t) { return NotImplemented(t); }

  // Structs: a JSON object generated from the struct's fields
  Status Visit(const StructType& t) { return Generate(t.fields(), e, &writer, options); }

  // Everything below has no random generator
  Status Visit(const DayTimeIntervalType& t) { return NotImplemented(t); }
  Status Visit(const MonthDayNanoIntervalType& t) { return NotImplemented(t); }
  Status Visit(const DictionaryType& t) { return NotImplemented(t); }
  Status Visit(const ExtensionType& t) { return NotImplemented(t); }
  Status Visit(const Decimal128Type& t) { return NotImplemented(t); }
  Status Visit(const FixedSizeBinaryType& t) { return NotImplemented(t); }
  Status Visit(const UnionType& t) { return NotImplemented(t); }
  Status Visit(const RunEndEncodedType& t) { return NotImplemented(t); }

  Status NotImplemented(const DataType& t) {
    return Status::NotImplemented("random generation of arrays of type ", t);
  }

  // Member order matters: the visitor is aggregate-initialized as
  // {engine, writer, options}
  Engine& e;
  rj::Writer<rj::StringBuffer>& writer;
  const GenerateOptions& options;
};
// Write one random JSON value of the given type to `writer`.
//
// With probability options.null_probability a JSON null is written instead of a
// value; otherwise the type is dispatched to GenerateImpl. The result of writing
// the null is converted with OK(), like every other writer call in this header,
// so a failed write is reported instead of being dropped.
//
// Returns the Status of the write, or the visitor's Status for types without a
// generator.
template <typename Engine>
inline static Status Generate(const std::shared_ptr<DataType>& type, Engine& e,
                              Writer* writer, const GenerateOptions& options) {
  if (std::bernoulli_distribution(options.null_probability)(e)) {
    return OK(writer->Null());
  }
  GenerateImpl<Engine> visitor = {e, *writer, options};
  return VisitTypeInline(*type, &visitor);
}
// Write one random JSON object to `writer`, with a member for each selected field.
//
// Each field is selected independently with probability options.field_probability.
// When options.randomize_field_order is set, the selected fields are written in a
// shuffled order; otherwise they are written in the order given.
//
// Returns the first failing Status from a writer call or a nested Generate call,
// otherwise the result of closing the object. The key write is checked as well,
// so a failure there is no longer silently ignored.
template <typename Engine>
inline static Status Generate(const std::vector<std::shared_ptr<Field>>& fields,
                              Engine& e, Writer* writer, const GenerateOptions& options) {
  RETURN_NOT_OK(OK(writer->StartObject()));

  // Counts the members written; EndObject() is passed this count.
  int num_fields = 0;
  auto write_field = [&](const Field& f) -> Status {
    ++num_fields;
    RETURN_NOT_OK(OK(writer->Key(f.name().c_str())));
    return Generate(f.type(), e, writer, options);
  };

  std::bernoulli_distribution bool_dist(options.field_probability);
  if (options.randomize_field_order) {
    // Select first, then shuffle, so that the selection draws happen in field order.
    std::vector<size_t> indices;
    indices.reserve(static_cast<size_t>(fields.size() * options.field_probability));
    for (size_t i = 0; i < fields.size(); ++i) {
      if (bool_dist(e)) {
        indices.push_back(i);
      }
    }
    std::shuffle(indices.begin(), indices.end(), e);
    for (auto i : indices) {
      RETURN_NOT_OK(write_field(*fields[i]));
    }
  } else {
    for (const auto& f : fields) {
      if (bool_dist(e)) {
        RETURN_NOT_OK(write_field(*f));
      }
    }
  }
  return OK(writer->EndObject(num_fields));
}
// Exposes `src_str` as an input stream: a Buffer is built over the view and
// handed to an io::BufferReader, which is stored in `*out`.
// Always returns Status::OK().
// NOTE(review): Buffer presumably references the bytes of `src_str` without
// copying them, in which case the caller must keep them alive -- confirm.
inline static Status MakeStream(string_view src_str,
                                std::shared_ptr<io::InputStream>* out) {
  *out = std::make_shared<io::BufferReader>(std::make_shared<Buffer>(src_str));
  return Status::OK();
}
// Scalar values (numbers and strings) are parsed into a
// dictionary<index: int32, value: string>. This helper expands such an array
// into a plain string array so that it is easier to compare.
//
// `dict_array` must have a StringArray dictionary and Int32Array indices (both
// are obtained with checked_cast). Null indices become null entries in the
// result, which is stored in `*decoded`.
inline static Status DecodeStringDictionary(const DictionaryArray& dict_array,
                                            std::shared_ptr<Array>* decoded) {
  const auto& values = checked_cast<const StringArray&>(*dict_array.dictionary());
  const auto& codes = checked_cast<const Int32Array&>(*dict_array.indices());
  const int64_t length = codes.length();

  StringBuilder out;
  // Slots for every row are reserved up front, so the Unsafe* appends below
  // only need additional room for the character data.
  RETURN_NOT_OK(out.Resize(length));
  for (int64_t row = 0; row < length; ++row) {
    if (!codes.IsNull(row)) {
      const auto text = values.GetView(codes.GetView(row));
      RETURN_NOT_OK(out.ReserveData(text.size()));
      out.UnsafeAppend(text);
    } else {
      out.UnsafeAppendNull();
    }
  }
  return out.Finish(decoded);
}
// Parses `src_str` with a BlockParser created from `options` and stores the
// finished array in `*parsed`.
// Returns the Status of the first step that fails (Make, Parse or Finish).
inline static Status ParseFromString(ParseOptions options, string_view src_str,
                                     std::shared_ptr<Array>* parsed) {
  // Kept in a local so the buffer stays alive until Finish() has run.
  std::shared_ptr<Buffer> input = std::make_shared<Buffer>(src_str);
  std::unique_ptr<BlockParser> block_parser;
  RETURN_NOT_OK(BlockParser::Make(options, &block_parser));
  RETURN_NOT_OK(block_parser->Parse(input));
  return block_parser->Finish(parsed);
}
// Convenience overload: parses through the Array-returning overload, then
// casts the result to StructArray with checked_pointer_cast and stores it in
// `*parsed`. Parse failures are returned unchanged.
inline static Status ParseFromString(ParseOptions options, string_view src_str,
                                     std::shared_ptr<StructArray>* parsed) {
  std::shared_ptr<Array> untyped;
  RETURN_NOT_OK(ParseFromString(options, src_str, &untyped));
  *parsed = internal::checked_pointer_cast<StructArray>(untyped);
  return Status::OK();
}
// Re-serializes the JSON text in `one_line` with rapidjson's PrettyWriter and
// returns the result. The parse result is not inspected here.
static inline std::string PrettyPrint(string_view one_line) {
  rj::Document doc;
  // Must pass size to avoid ASAN issues (use the length-bounded Parse
  // overload rather than relying on a terminator).
  doc.Parse(one_line.data(), one_line.size());

  rj::StringBuffer rendered;
  rj::PrettyWriter<rj::StringBuffer> pretty(rendered);
  doc.Accept(pretty);
  return std::string(rendered.GetString());
}
// Renders one single-member JSON object per value, each followed by '\n':
//   {"<name>":<std::to_string(value)>}
// The defaulted third parameter is never read; it only removes this template
// from overload resolution when std::to_string(T) is not well-formed.
// `name` is inserted verbatim (no escaping).
template <typename T>
std::string RowsOfOneColumn(std::string_view name, std::initializer_list<T> values,
                            decltype(std::to_string(*values.begin()))* = nullptr) {
  std::string rows;
  for (const T& value : values) {
    rows += R"({")";
    rows += name;
    rows += R"(":)";
    rows += std::to_string(value);
    rows += "}\n";
  }
  return rows;
}
// Renders one single-member JSON object per value, each followed by '\n':
//   {"<name>":<value>}
// Every value is inserted verbatim, so it must already be valid JSON text
// (for example, a string value has to carry its own quotes). `name` is not
// escaped either.
inline std::string RowsOfOneColumn(std::string_view name,
                                   std::initializer_list<std::string> values) {
  std::stringstream ss;
  // Bind by const reference: iterating by value copied every string.
  for (const std::string& value : values) {
    ss << R"({")" << name << R"(":)" << value << "}\n";
  }
  return ss.str();
}
// Newline-delimited JSON fixture with scalar columns only: "hello" (numbers),
// "world" (booleans and nulls) and "yo" (strings, a null, and absent from the
// second row).
inline static std::string scalars_only_src() {
  static constexpr char kRows[] = R"(
{ "hello": 3.5, "world": false, "yo": "thing" }
{ "hello": 3.25, "world": null }
{ "hello": 3.125, "world": null, "yo": "\u5fcd" }
{ "hello": 0.0, "world": true, "yo": null }
)";
  return std::string(kRows);
}
// Newline-delimited JSON fixture that extends the scalar columns with a list
// column "arr" and an object column "nuf" (empty, null, or holding "ps").
inline static std::string nested_src() {
  static constexpr char kRows[] = R"(
{ "hello": 3.5, "world": false, "yo": "thing", "arr": [1, 2, 3], "nuf": {} }
{ "hello": 3.25, "world": null, "arr": [2], "nuf": null }
{ "hello": 3.125, "world": null, "yo": "\u5fcd", "arr": [], "nuf": { "ps": 78 } }
{ "hello": 0.0, "world": true, "yo": null, "arr": null, "nuf": { "ps": 90 } }
)";
  return std::string(kRows);
}
// Newline-delimited JSON fixture whose columns carry only nulls, empty lists,
// a list holding a single null, and objects that are empty or hold a null.
inline static std::string null_src() {
  static constexpr char kRows[] = R"(
{ "plain": null, "list1": [], "list2": [], "struct": { "plain": null } }
{ "plain": null, "list1": [], "list2": [null], "struct": {} }
)";
  return std::string(kRows);
}
// Newline-delimited JSON fixture in which "price" and "cost" are written as
// bare (unquoted) decimal numbers in every row.
inline static std::string unquoted_decimal_src() {
  static constexpr char kRows[] = R"(
{ "price": 30.04, "cost":30.001 }
{ "price": 1.23, "cost":1.229 }
)";
  return std::string(kRows);
}
// Newline-delimited JSON fixture that mixes representations: the first row
// holds bare decimal numbers, the second holds the decimals as quoted strings.
inline static std::string mixed_decimal_src() {
  static constexpr char kRows[] = R"(
{ "price": 30.04, "cost": 30.001 }
{ "price": "1.23", "cost": "1.229" }
)";
  return std::string(kRows);
}
} // namespace json
} // namespace arrow

View File

@@ -0,0 +1,26 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
namespace arrow {
namespace json {
// Forward declarations of the JSON table reader and its option structs; this
// header declares the names only and provides no definitions.
class TableReader;
struct ReadOptions;
struct ParseOptions;
} // namespace json
} // namespace arrow