[opt](FileReader) InMemoryReader is only used in s3 (#23486)
If the file size is < 8MB, the whole file is read into memory; this idea comes from https://github.com/apache/hadoop/blob/trunk/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/prefetching.md#s3inmemoryinputstream. However, in some cases we only read one or two columns of a file, and the bytes actually required are only 1% of the file, so reading the whole file multiplies the amount of data read. Therefore, `InMemoryReader` is now used only for object storage, and the threshold is reduced.
This commit is contained in:
@ -850,6 +850,9 @@ DEFINE_Validator(jsonb_type_length_soft_limit_bytes,
|
||||
// is greater than object_pool_buffer_size, release the object in the unused_object_pool.
|
||||
DEFINE_Int32(object_pool_buffer_size, "100");
|
||||
|
||||
// Threshold of reading a small file into memory
|
||||
DEFINE_mInt32(in_memory_file_size, "1048576"); // 1MB
|
||||
|
||||
// ParquetReaderWrap prefetch buffer size
|
||||
DEFINE_Int32(parquet_reader_max_buffer_size, "50");
|
||||
// Max size of parquet page header in bytes
|
||||
|
||||
@ -897,6 +897,9 @@ DECLARE_mInt32(jsonb_type_length_soft_limit_bytes);
|
||||
// is greater than object_pool_buffer_size, release the object in the unused_object_pool.
|
||||
DECLARE_Int32(object_pool_buffer_size);
|
||||
|
||||
// Threshold of reading a small file into memory
|
||||
DECLARE_mInt32(in_memory_file_size);
|
||||
|
||||
// ParquetReaderWrap prefetch buffer size
|
||||
DECLARE_Int32(parquet_reader_max_buffer_size);
|
||||
// Max size of parquet page header in bytes
|
||||
|
||||
@ -778,8 +778,12 @@ Status DelegateReader::create_file_reader(RuntimeProfile* profile,
|
||||
io::FileReaderSPtr reader;
|
||||
RETURN_IF_ERROR(FileFactory::create_file_reader(system_properties, file_description,
|
||||
reader_options, file_system, &reader, profile));
|
||||
if (reader->size() < IN_MEMORY_FILE_SIZE) {
|
||||
*file_reader = std::make_shared<InMemoryFileReader>(reader);
|
||||
if (reader->size() < config::in_memory_file_size) {
|
||||
if (typeid_cast<io::S3FileReader*>(reader.get())) {
|
||||
*file_reader = std::make_shared<InMemoryFileReader>(reader);
|
||||
} else {
|
||||
*file_reader = std::move(reader);
|
||||
}
|
||||
} else if (access_mode == AccessMode::SEQUENTIAL) {
|
||||
bool is_thread_safe = false;
|
||||
if (typeid_cast<io::S3FileReader*>(reader.get())) {
|
||||
|
||||
@ -238,7 +238,7 @@ private:
|
||||
|
||||
/**
|
||||
* Create a file reader suitable for accessing scenarios:
|
||||
* 1. When file size < 8MB, create InMemoryFileReader file reader
|
||||
* 1. When file size < config::in_memory_file_size and the underlying reader is an S3FileReader, create InMemoryFileReader
|
||||
* 2. When reading sequential file(csv/json), create PrefetchBufferedReader
|
||||
* 3. When reading random access file(parquet/orc), create normal file reader
|
||||
*/
|
||||
@ -246,8 +246,6 @@ class DelegateReader {
|
||||
public:
|
||||
enum AccessMode { SEQUENTIAL, RANDOM };
|
||||
|
||||
static constexpr size_t IN_MEMORY_FILE_SIZE = 8 * 1024 * 1024;
|
||||
|
||||
static Status create_file_reader(
|
||||
RuntimeProfile* profile, const FileSystemProperties& system_properties,
|
||||
const FileDescription& file_description, const io::FileReaderOptions& reader_options,
|
||||
|
||||
Reference in New Issue
Block a user