From 569d0bb3af4cba51ff0e3b736ae9f360beaf90da Mon Sep 17 00:00:00 2001
From: LingBin <lingbinlb@gmail.com>
Date: Tue, 26 Nov 2019 08:22:14 -0600
Subject: [PATCH] Replace all remaining boost::split() with strings::Split()
 (#2302)

---
 be/src/exec/csv_scan_node.cpp                 |  6 ---
 be/src/runtime/routine_load/data_consumer.cpp | 21 +++++----
 be/src/runtime/small_file_mgr.cpp             | 16 +++----
 be/src/runtime/user_function_cache.cpp        | 14 +++---
 be/src/service/backend_options.cpp            |  6 +--
 be/src/util/cidr.cpp                          |  5 +-
 be/src/util/disk_info.cpp                     |  5 +-
 be/src/util/file_utils.cpp                    | 46 ++++++++-----------
 be/src/util/mem_info.cpp                      |  6 +--
 be/src/util/string_util.cpp                   |  9 ++--
 10 files changed, 59 insertions(+), 75 deletions(-)
diff --git a/be/src/exec/csv_scan_node.cpp b/be/src/exec/csv_scan_node.cpp
index fbcd3ae50f..4eda57a15f 100644
--- a/be/src/exec/csv_scan_node.cpp
+++ b/be/src/exec/csv_scan_node.cpp
@@ -20,10 +20,6 @@
 #include <string>
 #include <vector>
 
-#include <boost/algorithm/string.hpp>
-#include <boost/filesystem.hpp>
-#include <boost/foreach.hpp>
-
 #include <thrift/protocol/TDebugProtocol.h>
 
 #include "exec/text_converter.hpp"
@@ -61,7 +57,6 @@ public:
     }
 
     char const* c_str() const {
-
         return _begin;
     }
     char const* begin() const {
@@ -559,7 +554,6 @@ bool CsvScanNode::split_check_fill(const std::string& line, RuntimeState* state)
     std::vector<StringRef> fields;
     {
         SCOPED_TIMER(_split_line_timer);
-        // boost::split(fields, line, boost::is_any_of(_column_separator));
         split_line(line, _column_separator[0], fields);
     }
 
diff --git a/be/src/runtime/routine_load/data_consumer.cpp b/be/src/runtime/routine_load/data_consumer.cpp
index f0a6b96e76..ca2675aa78 100644
--- a/be/src/runtime/routine_load/data_consumer.cpp
+++ b/be/src/runtime/routine_load/data_consumer.cpp
@@ -22,6 +22,8 @@
 #include <string>
 #include <vector>
 
+#include "gutil/strings/split.h"
+
 #include "common/status.h"
 #include "service/backend_options.h"
 #include "runtime/small_file_mgr.h"
@@ -34,14 +36,14 @@ namespace doris {
 // init kafka consumer will only set common configs such as
 // brokers, groupid
 Status KafkaDataConsumer::init(StreamLoadContext* ctx) {
-    std::unique_lock<std::mutex> l(_lock); 
+    std::unique_lock<std::mutex> l(_lock);
     if (_init) {
         // this consumer has already been initialized.
         return Status::OK();
     }
 
     RdKafka::Conf *conf = RdKafka::Conf::create(RdKafka::Conf::CONF_GLOBAL);
-    
+
     // conf has to be deleted finally
     auto conf_deleter = [conf] () { delete conf; };
     DeferOp delete_conf(std::bind<void>(conf_deleter));
@@ -84,8 +86,8 @@ Status KafkaDataConsumer::init(StreamLoadContext* ctx) {
     for (auto& item : ctx->kafka_info->properties) {
         if (boost::algorithm::starts_with(item.second, "FILE:")) {
             // file property should has format: FILE:file_id:md5
-            std::vector<std::string> parts;
-            boost::split(parts, item.second, boost::is_any_of(":"));
+            std::vector<std::string> parts = strings::Split(
+                    item.second, ":", strings::SkipWhitespace());
             if (parts.size() != 3) {
                 return Status::InternalError("PAUSE: Invalid file property of kafka: " + item.second);
             }
@@ -94,7 +96,8 @@ Status KafkaDataConsumer::init(StreamLoadContext* ctx) {
             Status st = ctx->exec_env()->small_file_mgr()->get_file(file_id, parts[2], &file_path);
             if (!st.ok()) {
                 std::stringstream ss;
-                ss << "PAUSE: failed to get file for config: " << item.first << ", error: " << st.get_error_msg();
+                ss << "PAUSE: failed to get file for config: " << item.first
+                    << ", error: " << st.get_error_msg();
                 return Status::InternalError(ss.str());
             }
             RETURN_IF_ERROR(set_conf(item.first, file_path));
@@ -112,7 +115,7 @@ Status KafkaDataConsumer::init(StreamLoadContext* ctx) {
     }
 
     // create consumer
-    _k_consumer = RdKafka::KafkaConsumer::create(conf, errstr); 
+    _k_consumer = RdKafka::KafkaConsumer::create(conf, errstr);
     if (!_k_consumer) {
         LOG(WARNING) << "PAUSE: failed to create kafka consumer: " << errstr;
         return Status::InternalError("PAUSE: failed to create kafka consumer: " + errstr);
@@ -263,7 +266,7 @@ Status KafkaDataConsumer::get_partition_meta(std::vector<int32_t>* partition_ids
         if ((*it)->topic() != _topic) {
             continue;
         }
-        
+
         if ((*it)->err() != RdKafka::ERR_NO_ERROR) {
             std::stringstream ss;
             ss << "error: " << err2str((*it)->err());
@@ -284,7 +287,7 @@ Status KafkaDataConsumer::get_partition_meta(std::vector<int32_t>* partition_ids
         return Status::InternalError("no partition in this topic");
     }
 
-    return Status::OK();    
+    return Status::OK();
 }
 
 Status KafkaDataConsumer::cancel(StreamLoadContext* ctx) {
@@ -309,7 +312,7 @@ Status KafkaDataConsumer::commit(std::vector<RdKafka::TopicPartition*>& offset)
     if (err != RdKafka::ERR_NO_ERROR) {
         std::stringstream ss;
         ss << "failed to commit kafka offset : " << RdKafka::err2str(err);
-        return Status::InternalError(ss.str());                                   
+        return Status::InternalError(ss.str());
     }
     return Status::OK();
 }
diff --git a/be/src/runtime/small_file_mgr.cpp b/be/src/runtime/small_file_mgr.cpp
index 7b6c80fbcb..7954b04e0d 100644
--- a/be/src/runtime/small_file_mgr.cpp
+++ b/be/src/runtime/small_file_mgr.cpp
@@ -21,10 +21,11 @@
 #include <stdio.h>
 #include <sstream>
 
-#include <boost/algorithm/string/split.hpp> // boost::split
 #include <boost/algorithm/string/predicate.hpp> // boost::algorithm::starts_with
 #include <boost/algorithm/string/classification.hpp> // boost::is_any_of
 
+#include "gutil/strings/split.h"
+
 #include "common/status.h"
 #include "env/env.h"
 #include "gen_cpp/HeartbeatService.h"
@@ -74,14 +75,13 @@ Status SmallFileMgr::_load_single_file(
         const std::string& file_name) {
     // file name format should be like:
     // file_id.md5
-    std::vector<std::string> parts;
-    boost::split(parts, file_name, boost::is_any_of("."));
+    std::vector<std::string> parts = strings::Split(file_name, ".");
     if (parts.size() != 2) {
         return Status::InternalError("Not a valid file name: " + file_name);
     }
     int64_t file_id = std::stol(parts[0]);
     std::string md5 = parts[1];
-    
+
     if (_file_cache.find(file_id) != _file_cache.end()) {
         return Status::InternalError("File with same id is already been loaded: " + file_id);
     }
@@ -95,7 +95,7 @@ Status SmallFileMgr::_load_single_file(
     CacheEntry entry;
     entry.path = path + "/" + file_name;
     entry.md5 = file_md5;
-    
+
     _file_cache.emplace(file_id, entry);
     return Status::OK();
 }
@@ -106,7 +106,7 @@ Status SmallFileMgr::get_file(
         std::string* file_path) {
 
     std::unique_lock<std::mutex> l(_lock);
-    // find in cache 
+    // find in cache
     auto it = _file_cache.find(file_id);
     if (it != _file_cache.end()) {
         // find the cached file, check it
@@ -130,7 +130,7 @@ Status SmallFileMgr::get_file(
     // file not found in cache. download it from FE
     RETURN_IF_ERROR(_download_file(file_id, md5, file_path));
 
-    return Status::OK(); 
+    return Status::OK();
 }
 
 Status SmallFileMgr::_check_file(const CacheEntry& entry, const std::string& md5) {
@@ -226,7 +226,7 @@ Status SmallFileMgr::_download_file(
     entry.md5 = md5;
     _file_cache.emplace(file_id, entry);
 
-    *file_path = real_file_path; 
+    *file_path = real_file_path;
 
     LOG(INFO) << "finished to download file: " << file_path;
     return Status::OK();
diff --git a/be/src/runtime/user_function_cache.cpp b/be/src/runtime/user_function_cache.cpp
index 9e15435319..531998045a 100644
--- a/be/src/runtime/user_function_cache.cpp
+++ b/be/src/runtime/user_function_cache.cpp
@@ -20,10 +20,11 @@
 #include <vector>
 #include <regex>
 
-#include <boost/algorithm/string/split.hpp> // boost::split
 #include <boost/algorithm/string/predicate.hpp> // boost::algorithm::ends_with
 #include <boost/algorithm/string/classification.hpp> // boost::is_any_of
 
+#include "gutil/strings/split.h"
+
 #include "env/env.h"
 #include "http/http_client.h"
 #include "util/dynamic_util.h"
@@ -35,7 +36,7 @@ namespace doris {
 
 static const int kLibShardNum = 128;
 
-// function cache entry, store information for 
+// function cache entry, store information for
 struct UserFunctionCacheEntry {
     UserFunctionCacheEntry(int64_t fid_, const std::string& checksum_,
                            const std::string& lib_file_)
@@ -118,7 +119,7 @@ Status UserFunctionCache::init(const std::string& lib_dir) {
     _lib_dir = lib_dir;
     // 1. dynamic open current process
     RETURN_IF_ERROR(dynamic_open(nullptr, &_current_process_handle));
-    // 2. load all cached 
+    // 2. load all cached
     RETURN_IF_ERROR(_load_cached_lib());
     return Status::OK();
 }
@@ -128,8 +129,7 @@ Status UserFunctionCache::_load_entry_from_lib(const std::string& dir, const std
         return Status::InternalError("unknown library file format");
     }
 
-    std::vector<std::string> split_parts;
-    boost::split(split_parts, file, boost::is_any_of("."));
+    std::vector<std::string> split_parts = strings::Split(file, ".");
     if (split_parts.size() != 3) {
         return Status::InternalError("user function's name should be function_id.checksum.so");
     }
@@ -283,7 +283,7 @@ void UserFunctionCache::_destroy_cache_entry(UserFunctionCacheEntry* entry) {
         entry->unref();
     }
     entry->should_delete_library.store(true);
-    // now we need to drop 
+    // now we need to drop
     if (entry->unref()) {
         delete entry;
     }
@@ -353,7 +353,7 @@ Status UserFunctionCache::_download_lib(
             << ", errno=" << errno << ", errmsg=" << strerror_r(errno, buf, 64);
         return Status::InternalError("fail to rename file");
     }
-    
+
     // check download
     entry->is_downloaded = true;
     return Status::OK();
diff --git a/be/src/service/backend_options.cpp b/be/src/service/backend_options.cpp
index 2ac054c0ee..10c7867496 100644
--- a/be/src/service/backend_options.cpp
+++ b/be/src/service/backend_options.cpp
@@ -19,7 +19,7 @@
 
 #include <algorithm>
 
-#include <boost/algorithm/string.hpp>
+#include "gutil/strings/split.h"
 
 #include "common/logging.h"
 #include "common/status.h"
@@ -87,8 +87,8 @@ bool BackendOptions::analyze_priority_cidrs() {
     }
     LOG(INFO) << "priority cidrs in conf: " << config::priority_networks;
 
-    std::vector<std::string> cidr_strs;
-    boost::split(cidr_strs, config::priority_networks, boost::is_any_of(PRIORITY_CIDR_SEPARATOR));
+    std::vector<std::string> cidr_strs = strings::Split(
+            config::priority_networks, PRIORITY_CIDR_SEPARATOR);
 
     for (auto& cidr_str : cidr_strs) {
         CIDR cidr;
diff --git a/be/src/util/cidr.cpp b/be/src/util/cidr.cpp
index cac754ec55..8a70cf6ca0 100644
--- a/be/src/util/cidr.cpp
+++ b/be/src/util/cidr.cpp
@@ -19,7 +19,7 @@
 
 #include <arpa/inet.h>
 
-#include <boost/algorithm/string.hpp>
+#include "gutil/strings/split.h"
 
 #include "common/logging.h"
 
@@ -44,8 +44,7 @@ bool CIDR::reset(const std::string& cidr_str) {
     }
     VLOG(2) << "cidr format str: " << cidr_format_str;
 
-    std::vector<std::string> cidr_items;
-    boost::split(cidr_items, cidr_format_str, boost::is_any_of("/"));
+    std::vector<std::string> cidr_items = strings::Split(cidr_format_str, "/");
     if (cidr_items.size() != 2) {
         LOG(WARNING) << "wrong CIDR format. network=" << cidr_str;
         return false;
diff --git a/be/src/util/disk_info.cpp b/be/src/util/disk_info.cpp
index 56287406e5..be3e2f7af2 100644
--- a/be/src/util/disk_info.cpp
+++ b/be/src/util/disk_info.cpp
@@ -29,6 +29,8 @@
 #include <boost/algorithm/string.hpp>
 #include <boost/algorithm/string/join.hpp>
 
+#include "gutil/strings/split.h"
+
 namespace doris {
 
 bool DiskInfo::_s_initialized;
@@ -52,8 +54,7 @@ void DiskInfo::get_device_names() {
         getline(partitions, line);
         boost::trim(line);
 
-        std::vector<std::string> fields;
-        boost::split(fields, line, boost::is_any_of(" "), boost::token_compress_on);
+        std::vector<std::string> fields = strings::Split(line, " ", strings::SkipWhitespace());
 
         if (fields.size() != 4) {
             continue;
diff --git a/be/src/util/file_utils.cpp b/be/src/util/file_utils.cpp
index 7f5723c83e..e54df9c6c4 100644
--- a/be/src/util/file_utils.cpp
+++ b/be/src/util/file_utils.cpp
@@ -28,11 +28,11 @@
 
 #include <boost/filesystem.hpp>
 #include <boost/system/error_code.hpp>
-#include <boost/algorithm/string/split.hpp>
-#include <boost/algorithm/string/trim.hpp>
 
 #include <openssl/md5.h>
 
+#include "gutil/strings/split.h"
+#include "gutil/strings/strip.h"
 #include "gutil/strings/substitute.h"
 
 #include "env/env.h"
@@ -56,7 +56,7 @@ Status FileUtils::create_dir(const std::string& path, Env* env) {
         bool is_dir = false;
 
         Status s = env->is_directory(partial_path, &is_dir);
-        
+
         if (s.ok()) {
             if (is_dir) {
                 // It's a normal directory.
@@ -66,7 +66,7 @@ Status FileUtils::create_dir(const std::string& path, Env* env) {
             // Maybe a file or a symlink. Let's try to follow the symlink.
             string real_partial_path;
             RETURN_IF_ERROR(env->canonicalize(partial_path, &real_partial_path));
-            
+
             RETURN_IF_ERROR(env->is_directory(real_partial_path, &is_dir));
             if (is_dir) {
                 // It's a symlink to a directory.
@@ -75,7 +75,7 @@ Status FileUtils::create_dir(const std::string& path, Env* env) {
                 return Status::IOError(partial_path + " exists but is not a directory");
             }
         }
-        
+
         RETURN_IF_ERROR(env->create_dir_if_missing(partial_path));
     }
 
@@ -108,7 +108,7 @@ Status FileUtils::remove_all(const std::string& file_path) {
 Status FileUtils::remove(const std::string& path, doris::Env* env) {
     bool is_dir;
     RETURN_IF_ERROR(env->is_directory(path, &is_dir));
- 
+
     if (is_dir) {
         return env->delete_dir(path);
     } else {
@@ -144,10 +144,10 @@ Status FileUtils::list_dirs_files(const std::string& path, std::set<std::string>
         if (is_dot_or_dotdot(name)) {
             return true;
         }
-        
+
         string temp_path =  path + "/" + name;
         bool is_dir;
-        
+
         auto st = env->is_directory(temp_path, &is_dir);
         if (st.ok()) {
             if (is_dir) {
@@ -160,10 +160,10 @@ Status FileUtils::list_dirs_files(const std::string& path, std::set<std::string>
         } else {
             LOG(WARNING) << "check path " << path << "is directory error: " << st.to_string();
         }
-        
+
         return true;
     };
-    
+
     return env->iterate_dir(path, cb);
 }
 
@@ -180,9 +180,9 @@ Status FileUtils::get_children_count(Env* env, const std::string& dir, int64_t*
 bool FileUtils::is_dir(const std::string& file_path, Env* env) {
     bool ret;
     if (env->is_directory(file_path, &ret).ok()) {
-        return ret;   
+        return ret;
     }
-    
+
     return false;
 }
 
@@ -204,18 +204,10 @@ std::string FileUtils::path_of_fd(int fd) {
 
 Status FileUtils::split_pathes(const char* path, std::vector<std::string>* path_vec) {
     path_vec->clear();
-    try {
-        boost::split(*path_vec, path,
-                     boost::is_any_of(";"),
-                     boost::token_compress_on);
-    } catch (...) {
-        std::stringstream ss;
-        ss << "Boost split path failed.[path=" << path << "]";
-        return Status::InternalError(ss.str());
-    }
+    *path_vec = strings::Split(path, ";", strings::SkipWhitespace());
 
     for (std::vector<std::string>::iterator it = path_vec->begin(); it != path_vec->end();) {
-        boost::trim(*it);
+        StripWhiteSpace(&(*it));
 
         it->erase(it->find_last_not_of("/") + 1);
         if (it->size() == 0) {
@@ -243,7 +235,7 @@ Status FileUtils::split_pathes(const char* path, std::vector<std::string>* path_
 }
 
 Status FileUtils::copy_file(const std::string& src_path, const std::string& dest_path) {
-   // open src file 
+   // open src file
     FileHandler src_file;
     if (src_file.open(src_path.c_str(), O_RDONLY) != OLAP_SUCCESS) {
         char errmsg[64];
@@ -258,7 +250,7 @@ Status FileUtils::copy_file(const std::string& src_path, const std::string& dest
         LOG(ERROR) << "open file failed: " << dest_path << strerror_r(errno, errmsg, 64);
         return Status::InternalError("Internal Error");
     }
-    
+
     const int64_t BUF_SIZE = 8192;
     char *buf = new char[BUF_SIZE];
     DeferOp free_buf(std::bind<void>(std::default_delete<char[]>(), buf));
@@ -284,7 +276,7 @@ Status FileUtils::md5sum(const std::string& file, std::string* md5sum) {
     if (fd < 0) {
         return Status::InternalError("failed to open file");
     }
-    
+
     struct stat statbuf;
     if (fstat(fd, &statbuf) < 0) {
         close(fd);
@@ -295,14 +287,14 @@ Status FileUtils::md5sum(const std::string& file, std::string* md5sum) {
 
     unsigned char result[MD5_DIGEST_LENGTH];
     MD5((unsigned char*) buf, file_len, result);
-    munmap(buf, file_len); 
+    munmap(buf, file_len);
 
     std::stringstream ss;
     for (int32_t i = 0; i < MD5_DIGEST_LENGTH; i++) {
         ss << std::setfill('0') << std::setw(2) << std::hex << (int) result[i];
     }
     ss >> *md5sum;
-    
+
     close(fd);
     return Status::OK();
 }
diff --git a/be/src/util/mem_info.cpp b/be/src/util/mem_info.cpp
index 2950e3c4f0..68e65742fa 100644
--- a/be/src/util/mem_info.cpp
+++ b/be/src/util/mem_info.cpp
@@ -24,8 +24,7 @@
 #include <sstream>
 #include <unistd.h>
 
-#include <boost/algorithm/string.hpp>
-#include <boost/lexical_cast.hpp>
+#include "gutil/strings/split.h"
 
 #include "util/pretty_printer.h"
 #include "util/string_parser.hpp"
@@ -42,8 +41,7 @@ void MemInfo::init() {
 
     while (meminfo.good() && !meminfo.eof()) {
         getline(meminfo, line);
-        std::vector<std::string> fields;
-        boost::split(fields, line, boost::is_any_of(" "), boost::token_compress_on);
+        std::vector<std::string> fields = strings::Split(line, " ", strings::SkipWhitespace());
 
         // We expect lines such as, e.g., 'MemTotal: 16129508 kB'
         if (fields.size() < 3) {
diff --git a/be/src/util/string_util.cpp b/be/src/util/string_util.cpp
index 738b03da8d..b61dbcea7a 100644
--- a/be/src/util/string_util.cpp
+++ b/be/src/util/string_util.cpp
@@ -17,17 +17,14 @@
 
 #include "util/string_util.h"
 
+#include "gutil/strings/split.h"
+
 namespace doris {
 
 std::size_t hash_of_path(const std::string& identifier, const std::string& path) {
     std::size_t hash = std::hash<std::string>()(identifier);
-    std::vector<std::string> path_parts;
-    boost::split(path_parts, path, boost::is_any_of("/"));
+    std::vector<std::string> path_parts = strings::Split(path, "/", strings::SkipWhitespace());
     for (auto& part : path_parts) {
-        if (part.empty()) {
-            continue;
-        }
-
         boost::hash_combine<std::string>(hash, part);
     }
     return hash;