From ed4f523a630d0c718fb7498dcec3265bdc1ea296 Mon Sep 17 00:00:00 2001 From: chejinge Date: Thu, 25 Dec 2025 10:35:09 +0800 Subject: [PATCH 01/10] feat:add big_key --- tools/CMakeLists.txt | 1 + tools/bigkey_analyzer/CMakeLists.txt | 30 + tools/bigkey_analyzer/README.md | 120 ++++ tools/bigkey_analyzer/bigkey_analyzer.cc | 773 +++++++++++++++++++++++ 4 files changed, 924 insertions(+) create mode 100644 tools/bigkey_analyzer/CMakeLists.txt create mode 100644 tools/bigkey_analyzer/README.md create mode 100644 tools/bigkey_analyzer/bigkey_analyzer.cc diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index c1f39f6fd8..949ce94cc1 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -1,5 +1,6 @@ add_subdirectory(./aof_to_pika) add_subdirectory(./benchmark_client) +add_subdirectory(./bigkey_analyzer) add_subdirectory(./binlog_sender) add_subdirectory(./manifest_generator) add_subdirectory(./rdb_to_pika) diff --git a/tools/bigkey_analyzer/CMakeLists.txt b/tools/bigkey_analyzer/CMakeLists.txt new file mode 100644 index 0000000000..e3182adc9e --- /dev/null +++ b/tools/bigkey_analyzer/CMakeLists.txt @@ -0,0 +1,30 @@ +set(WARNING_FLAGS "-W -Wextra -Wall -Wsign-compare \ +-Wno-unused-parameter -Wno-redundant-decls -Wwrite-strings \ +-Wpointer-arith -Wreorder -Wswitch -Wsign-promo \ +-Woverloaded-virtual -Wnon-virtual-dtor -Wno-missing-field-initializers") + +set(CXXFLAGS "${WARNING_FLAGS} -std=c++17 -g") + +set(SRC_DIR .) +aux_source_directory(${SRC_DIR} BASE_OBJS) + +add_executable(bigkey_analyzer ${BASE_OBJS}) + +target_include_directories(bigkey_analyzer + PRIVATE + ${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/src + ${PROJECT_SOURCE_DIR}/src/storage/include +) + +target_link_libraries(bigkey_analyzer + storage + rocksdb + pthread +) + +set_target_properties(bigkey_analyzer PROPERTIES + RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR} + CMAKE_COMPILER_IS_GNUCXX TRUE + COMPILE_FLAGS ${CXXFLAGS} +) diff --git a/tools/bigkey_analyzer/README.md b/tools/bigkey_analyzer/README.md new file mode 100644 index 0000000000..c9b422b525 --- /dev/null +++ b/tools/bigkey_analyzer/README.md @@ -0,0 +1,120 @@ +# Big Key Analyzer + +大key分析工具,用于分析PikiwiDB实例中的大key情况。本工具适用于unstable分支新的存储结构,支持单实例和多DB实例(db/0, db/1, db/2...)。 + +## 功能特点 + +- 支持分析各种数据类型(strings, hashes, lists, sets, zsets)的大key +- 可以按大小过滤key +- 可以限制输出结果数量(top N) +- 支持按key前缀统计 +- 输出结果包含key类型、大小和过期时间(TTL) +- 可以将结果输出到文件 + +## 编译 + +在PikiwiDB根目录下执行: + +```bash +mkdir -p build +cd build +cmake .. +make bigkey_analyzer +``` + +编译完成后,可执行文件会生成在build目录下。 + +## 使用方法 + +``` +Usage: bigkey_analyzer [OPTIONS] +Options: + --min-size=SIZE Only show keys larger than SIZE bytes + --top=N Only show top N largest keys + --prefix-stat Show statistics by key prefix + --prefix-delimiter=C Character used to delimit prefix (default: ':') + --type=TYPE Only analyze specific type (strings|hashes|lists|sets|zsets|all) + --output=FILE Write output to file instead of stdout + --help Display this help message +``` + +## 示例 + +1. 分析所有大key: + +```bash +# 单实例 +./bigkey_analyzer /path/to/pikiwidb/data + +# 多DB实例(db/0, db/1, db/2...) +./bigkey_analyzer /path/to/pikiwidb +``` + +2. 只分析大于1MB的key: + +```bash +./bigkey_analyzer --min-size=1048576 /path/to/pikiwidb/data +``` + +3. 只显示前10个最大的key: + +```bash +./bigkey_analyzer --top=10 /path/to/pikiwidb/data +``` + +4. 只分析hash类型的key: + +```bash +./bigkey_analyzer --type=hashes /path/to/pikiwidb/data +``` + +5. 分析并按前缀统计: + +```bash +./bigkey_analyzer --prefix-stat /path/to/pikiwidb/data +``` + +6. 输出结果到文件: + +```bash +./bigkey_analyzer --output=result.txt /path/to/pikiwidb/data +``` + +## 输出格式 + +工具输出包括三部分: + +1. 大key列表 - 按大小降序排列 +2. 按前缀统计(如果使用--prefix-stat选项) +3. 总结统计信息 + +示例输出: + +``` +===== Big Key Analysis ===== +Type Size Key TTL +hash 1048576 user:profile:1001 -1 +zset 524288 ranking:global 3600 +string 262144 config:settings -1 +... + +===== Key Prefix Statistics ===== +Prefix Count Total Size Avg Size +user 100 10485760 104857.6 +ranking 50 2621440 52428.8 +config 10 524288 52428.8 +... + +===== Summary ===== +Total keys analyzed: 160 +Keys by type: + hash: 50 keys, 25.0 MB total, 524288.0 bytes avg + zset: 30 keys, 15.0 MB total, 524288.0 bytes avg + string: 80 keys, 10.0 MB total, 131072.0 bytes avg +``` + +## 注意事项 + +- 工具只读取数据库,不会进行任何写操作 +- 大key的大小包括key和value的总大小 +- 已过期的key不会被包含在分析结果中 diff --git a/tools/bigkey_analyzer/bigkey_analyzer.cc b/tools/bigkey_analyzer/bigkey_analyzer.cc new file mode 100644 index 0000000000..046c393bdb --- /dev/null +++ b/tools/bigkey_analyzer/bigkey_analyzer.cc @@ -0,0 +1,773 @@ +// Copyright (c) 2025-present, PikiwiDB Project +// Licensed under the BSD-style license found in the LICENSE file in the root directory of this source tree. +// This source code is also available under the terms of the GNU General Public License, version 3. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rocksdb/options.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" + +#include "storage/storage_define.h" +#include "src/storage/src/base_value_format.h" +#include "src/storage/src/base_meta_value_format.h" +#include "src/storage/src/strings_value_format.h" +#include "src/storage/src/base_data_value_format.h" +#include "src/storage/src/coding.h" + +// Utility function to check if a directory exists +bool DirectoryExists(const std::string& path) { + struct stat st; + return stat(path.c_str(), &st) == 0 && S_ISDIR(st.st_mode); +} + +// Replace special characters for consistent display +std::string ReplaceAll(std::string str, const std::string& from, const std::string& to) { + size_t start_pos = 0; + while((start_pos = str.find(from, start_pos)) != std::string::npos) { + str.replace(start_pos, from.length(), to); + start_pos += to.length(); + } + return str; +} + +// Decode user key from encoded key +std::string DecodeUserKey(const rocksdb::Slice& encoded_key) { + std::string user_key; + storage::DecodeUserKey(encoded_key.data(), encoded_key.size(), &user_key); + return user_key; +} + +// Print usage information +void PrintUsage() { + std::cout << "Usage: bigkey_analyzer [OPTIONS] " << std::endl; + std::cout << "Options:" << std::endl; + std::cout << " --min-size=SIZE Only show keys larger than SIZE bytes" << std::endl; + std::cout << " --top=N Only show top N largest keys" << std::endl; + std::cout << " --prefix-stat Show statistics by key prefix" << std::endl; + std::cout << " --prefix-delimiter=C Character used to delimit prefix (default: ':')" << std::endl; + std::cout << " --type=TYPE Only analyze specific type (strings|hashes|lists|sets|zsets|all)" << std::endl; + std::cout << " --output=FILE Write output to file instead of stdout" << std::endl; + std::cout << " --help Display this help message" << std::endl; +} + +// Data structure to hold key information +struct KeyInfo { + std::string type; + std::string key; + int64_t size; + int64_t ttl; + + KeyInfo() : type(""), key(""), size(0), ttl(-1) {} + + KeyInfo(const std::string& t, const std::string& k, int64_t s, int64_t tt) + : type(t), key(k), size(s), ttl(tt) {} + + KeyInfo(std::string&& t, std::string&& k, int64_t s, int64_t tt) + : type(std::move(t)), key(std::move(k)), size(s), ttl(tt) {} + + KeyInfo(const char* t, const std::string& k, int64_t s, int64_t tt) + : type(t), key(k), size(s), ttl(tt) {} + + KeyInfo(const char* t, std::string&& k, int64_t s, int64_t tt) + : type(t), key(std::move(k)), size(s), ttl(tt) {} + + bool operator<(const KeyInfo& other) const { + return size > other.size; // Sort in descending order by size + } +}; + +// Data structure for prefix statistics +struct PrefixStat { + size_t count = 0; + int64_t total_size = 0; + + void Add(int64_t size) { + count++; + total_size += size; + } +}; + +// Configuration for the analyzer +struct Config { + std::string db_path; + int64_t min_size = 0; + int top_n = -1; + bool prefix_stat = false; + std::string prefix_delimiter = ":"; + std::string type_filter = "all"; + std::string output_file; +}; + +// Parse command line arguments +bool ParseArgs(int argc, char* argv[], Config& config) { + if (argc < 2) { + PrintUsage(); + return false; + } + + static struct option long_options[] = { + {"min-size", required_argument, 0, 'm'}, + {"top", required_argument, 0, 't'}, + {"prefix-stat", no_argument, 0, 'p'}, + {"prefix-delimiter", required_argument, 0, 'd'}, + {"type", required_argument, 0, 'y'}, + {"output", required_argument, 0, 'o'}, + {"help", no_argument, 0, 'h'}, + {0, 0, 0, 0} + }; + + int opt; + int option_index = 0; + + config.min_size = 0; + config.top_n = -1; + config.prefix_stat = false; + config.prefix_delimiter = ":"; + config.type_filter = "all"; + + while ((opt = getopt_long(argc, argv, "m:t:pd:y:o:h", long_options, &option_index)) != -1) { + switch (opt) { + case 'm': + config.min_size = std::stoll(optarg); + break; + case 't': + config.top_n = std::stoi(optarg); + break; + case 'p': + config.prefix_stat = true; + break; + case 'd': + config.prefix_delimiter = optarg; + break; + case 'y': + config.type_filter = optarg; + break; + case 'o': + config.output_file = optarg; + break; + case 'h': + PrintUsage(); + return false; + default: + PrintUsage(); + return false; + } + } + + if (optind >= argc) { + std::cerr << "Error: Missing database path" << std::endl; + PrintUsage(); + return false; + } + + config.db_path = argv[optind]; + + if (!DirectoryExists(config.db_path)) { + std::cerr << "Error: Database directory does not exist: " << config.db_path << std::endl; + return false; + } + + return true; +} + +// Analyze strings in MetaCF +void AnalyzeStrings(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* meta_handle, + std::vector& key_infos, const Config& config) { + std::cout << "Analyzing strings..." << std::endl; + + int64_t curtime; + db->GetEnv()->GetCurrentTime(&curtime).ok(); + curtime *= 1000; // Convert to milliseconds + + rocksdb::ReadOptions read_options; + std::unique_ptr iter(db->NewIterator(read_options, meta_handle)); + + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + const rocksdb::Slice& encoded_key = iter->key(); + const rocksdb::Slice& value = iter->value(); + + // Check if this is a string type + if (value.size() < 1) continue; + + storage::DataType type = static_cast(static_cast(value[0])); + if (type != storage::DataType::kStrings) { + continue; + } + + // Decode the user key + std::string user_key = DecodeUserKey(encoded_key); + + // Parse the value + storage::ParsedStringsValue parsed_value(&value.ToString()); + + // Calculate TTL + int64_t ttl = -1; + if (!parsed_value.IsPermanentSurvival()) { + int64_t etime = parsed_value.Etime(); + if (etime > curtime) { + ttl = (etime - curtime) / 1000; // Convert to seconds + } + } + + // Skip if expired + if (parsed_value.IsStale()) { + continue; + } + + int64_t size = encoded_key.size() + value.size(); + + if (size >= config.min_size) { + std::string display_key = ReplaceAll(user_key, "\n", "\\n"); + display_key = ReplaceAll(display_key, " ", "\\x20"); + key_infos.emplace_back("string", std::move(display_key), size, ttl); + } + } + + if (!iter->status().ok()) { + std::cerr << "Error iterating strings: " << iter->status().ToString() << std::endl; + } +} + +// Analyze hashes +void AnalyzeHashes(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* meta_handle, + rocksdb::ColumnFamilyHandle* data_handle, + std::vector& key_infos, const Config& config) { + std::cout << "Analyzing hashes..." << std::endl; + + int64_t curtime; + db->GetEnv()->GetCurrentTime(&curtime).ok(); + curtime *= 1000; // Convert to milliseconds + + rocksdb::ReadOptions read_options; + std::unique_ptr meta_iter(db->NewIterator(read_options, meta_handle)); + + // Map to store hash sizes: encoded_key -> (size, ttl, version) + std::unordered_map> hash_info; + + // First pass: scan metadata + for (meta_iter->SeekToFirst(); meta_iter->Valid(); meta_iter->Next()) { + const rocksdb::Slice& encoded_key = meta_iter->key(); + const rocksdb::Slice& value = meta_iter->value(); + + if (value.size() < 1) continue; + + storage::DataType type = static_cast(static_cast(value[0])); + if (type != storage::DataType::kHashes) { + continue; + } + + storage::ParsedHashesMetaValue parsed_meta(&value.ToString()); + + // Skip if expired or empty + if (parsed_meta.IsStale() || parsed_meta.Count() == 0) { + continue; + } + + // Calculate TTL + int64_t ttl = -1; + if (!parsed_meta.IsPermanentSurvival()) { + int64_t etime = parsed_meta.Etime(); + if (etime > curtime) { + ttl = (etime - curtime) / 1000; + } + } + + int64_t meta_size = encoded_key.size() + value.size(); + hash_info[encoded_key.ToString()] = std::make_tuple(meta_size, ttl, parsed_meta.Version()); + } + + // Second pass: scan data and accumulate sizes + std::unique_ptr data_iter(db->NewIterator(read_options, data_handle)); + + for (data_iter->SeekToFirst(); data_iter->Valid(); data_iter->Next()) { + const rocksdb::Slice& data_key = data_iter->key(); + const rocksdb::Slice& data_value = data_iter->value(); + + // Extract the encoded user key from data key + // Data key format: encoded_key + version + field + const char* ptr = storage::SeekUserkeyDelim(data_key.data(), data_key.size()); + size_t user_key_len = ptr - data_key.data(); + + if (user_key_len == 0 || user_key_len > data_key.size()) continue; + + std::string encoded_user_key(data_key.data(), user_key_len); + + auto it = hash_info.find(encoded_user_key); + if (it != hash_info.end()) { + std::get<0>(it->second) += data_key.size() + data_value.size(); + } + } + + // Add results + for (const auto& entry : hash_info) { + int64_t size = std::get<0>(entry.second); + if (size >= config.min_size) { + std::string user_key = DecodeUserKey(entry.first); + std::string display_key = ReplaceAll(user_key, "\n", "\\n"); + display_key = ReplaceAll(display_key, " ", "\\x20"); + key_infos.emplace_back("hash", std::move(display_key), size, std::get<1>(entry.second)); + } + } +} + +// Analyze sets +void AnalyzeSets(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* meta_handle, + rocksdb::ColumnFamilyHandle* data_handle, + std::vector& key_infos, const Config& config) { + std::cout << "Analyzing sets..." << std::endl; + + int64_t curtime; + db->GetEnv()->GetCurrentTime(&curtime).ok(); + curtime *= 1000; + + rocksdb::ReadOptions read_options; + std::unique_ptr meta_iter(db->NewIterator(read_options, meta_handle)); + + std::unordered_map> set_info; + + for (meta_iter->SeekToFirst(); meta_iter->Valid(); meta_iter->Next()) { + const rocksdb::Slice& encoded_key = meta_iter->key(); + const rocksdb::Slice& value = meta_iter->value(); + + if (value.size() < 1) continue; + + storage::DataType type = static_cast(static_cast(value[0])); + if (type != storage::DataType::kSets) { + continue; + } + + storage::ParsedSetsMetaValue parsed_meta(&value.ToString()); + + if (parsed_meta.IsStale() || parsed_meta.Count() == 0) { + continue; + } + + int64_t ttl = -1; + if (!parsed_meta.IsPermanentSurvival()) { + int64_t etime = parsed_meta.Etime(); + if (etime > curtime) { + ttl = (etime - curtime) / 1000; + } + } + + int64_t meta_size = encoded_key.size() + value.size(); + set_info[encoded_key.ToString()] = std::make_tuple(meta_size, ttl, parsed_meta.Version()); + } + + std::unique_ptr data_iter(db->NewIterator(read_options, data_handle)); + + for (data_iter->SeekToFirst(); data_iter->Valid(); data_iter->Next()) { + const rocksdb::Slice& data_key = data_iter->key(); + const rocksdb::Slice& data_value = data_iter->value(); + + const char* ptr = storage::SeekUserkeyDelim(data_key.data(), data_key.size()); + size_t user_key_len = ptr - data_key.data(); + + if (user_key_len == 0 || user_key_len > data_key.size()) continue; + + std::string encoded_user_key(data_key.data(), user_key_len); + + auto it = set_info.find(encoded_user_key); + if (it != set_info.end()) { + std::get<0>(it->second) += data_key.size() + data_value.size(); + } + } + + for (const auto& entry : set_info) { + int64_t size = std::get<0>(entry.second); + if (size >= config.min_size) { + std::string user_key = DecodeUserKey(entry.first); + std::string display_key = ReplaceAll(user_key, "\n", "\\n"); + display_key = ReplaceAll(display_key, " ", "\\x20"); + key_infos.emplace_back("set", std::move(display_key), size, std::get<1>(entry.second)); + } + } +} + +// Analyze zsets +void AnalyzeZsets(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* meta_handle, + rocksdb::ColumnFamilyHandle* data_handle, + rocksdb::ColumnFamilyHandle* score_handle, + std::vector& key_infos, const Config& config) { + std::cout << "Analyzing zsets..." << std::endl; + + int64_t curtime; + db->GetEnv()->GetCurrentTime(&curtime).ok(); + curtime *= 1000; + + rocksdb::ReadOptions read_options; + std::unique_ptr meta_iter(db->NewIterator(read_options, meta_handle)); + + std::unordered_map> zset_info; + + for (meta_iter->SeekToFirst(); meta_iter->Valid(); meta_iter->Next()) { + const rocksdb::Slice& encoded_key = meta_iter->key(); + const rocksdb::Slice& value = meta_iter->value(); + + if (value.size() < 1) continue; + + storage::DataType type = static_cast(static_cast(value[0])); + if (type != storage::DataType::kZSets) { + continue; + } + + storage::ParsedZSetsMetaValue parsed_meta(&value.ToString()); + + if (parsed_meta.IsStale() || parsed_meta.Count() == 0) { + continue; + } + + int64_t ttl = -1; + if (!parsed_meta.IsPermanentSurvival()) { + int64_t etime = parsed_meta.Etime(); + if (etime > curtime) { + ttl = (etime - curtime) / 1000; + } + } + + int64_t meta_size = encoded_key.size() + value.size(); + zset_info[encoded_key.ToString()] = std::make_tuple(meta_size, ttl, parsed_meta.Version()); + } + + // Scan data CF + std::unique_ptr data_iter(db->NewIterator(read_options, data_handle)); + + for (data_iter->SeekToFirst(); data_iter->Valid(); data_iter->Next()) { + const rocksdb::Slice& data_key = data_iter->key(); + const rocksdb::Slice& data_value = data_iter->value(); + + const char* ptr = storage::SeekUserkeyDelim(data_key.data(), data_key.size()); + size_t user_key_len = ptr - data_key.data(); + + if (user_key_len == 0 || user_key_len > data_key.size()) continue; + + std::string encoded_user_key(data_key.data(), user_key_len); + + auto it = zset_info.find(encoded_user_key); + if (it != zset_info.end()) { + std::get<0>(it->second) += data_key.size() + data_value.size(); + } + } + + // Scan score CF + std::unique_ptr score_iter(db->NewIterator(read_options, score_handle)); + + for (score_iter->SeekToFirst(); score_iter->Valid(); score_iter->Next()) { + const rocksdb::Slice& score_key = score_iter->key(); + const rocksdb::Slice& score_value = score_iter->value(); + + const char* ptr = storage::SeekUserkeyDelim(score_key.data(), score_key.size()); + size_t user_key_len = ptr - score_key.data(); + + if (user_key_len == 0 || user_key_len > score_key.size()) continue; + + std::string encoded_user_key(score_key.data(), user_key_len); + + auto it = zset_info.find(encoded_user_key); + if (it != zset_info.end()) { + std::get<0>(it->second) += score_key.size() + score_value.size(); + } + } + + for (const auto& entry : zset_info) { + int64_t size = std::get<0>(entry.second); + if (size >= config.min_size) { + std::string user_key = DecodeUserKey(entry.first); + std::string display_key = ReplaceAll(user_key, "\n", "\\n"); + display_key = ReplaceAll(display_key, " ", "\\x20"); + key_infos.emplace_back("zset", std::move(display_key), size, std::get<1>(entry.second)); + } + } +} + +// Analyze lists +void AnalyzeLists(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* meta_handle, + rocksdb::ColumnFamilyHandle* data_handle, + std::vector& key_infos, const Config& config) { + std::cout << "Analyzing lists..." << std::endl; + + int64_t curtime; + db->GetEnv()->GetCurrentTime(&curtime).ok(); + curtime *= 1000; + + rocksdb::ReadOptions read_options; + std::unique_ptr meta_iter(db->NewIterator(read_options, meta_handle)); + + std::unordered_map> list_info; + + for (meta_iter->SeekToFirst(); meta_iter->Valid(); meta_iter->Next()) { + const rocksdb::Slice& encoded_key = meta_iter->key(); + const rocksdb::Slice& value = meta_iter->value(); + + if (value.size() < 1) continue; + + storage::DataType type = static_cast(static_cast(value[0])); + if (type != storage::DataType::kLists) { + continue; + } + + storage::ParsedListsMetaValue parsed_meta(&value.ToString()); + + if (parsed_meta.IsStale() || parsed_meta.Count() == 0) { + continue; + } + + int64_t ttl = -1; + if (!parsed_meta.IsPermanentSurvival()) { + int64_t etime = parsed_meta.Etime(); + if (etime > curtime) { + ttl = (etime - curtime) / 1000; + } + } + + int64_t meta_size = encoded_key.size() + value.size(); + list_info[encoded_key.ToString()] = std::make_tuple(meta_size, ttl, parsed_meta.Version()); + } + + std::unique_ptr data_iter(db->NewIterator(read_options, data_handle)); + + for (data_iter->SeekToFirst(); data_iter->Valid(); data_iter->Next()) { + const rocksdb::Slice& data_key = data_iter->key(); + const rocksdb::Slice& data_value = data_iter->value(); + + const char* ptr = storage::SeekUserkeyDelim(data_key.data(), data_key.size()); + size_t user_key_len = ptr - data_key.data(); + + if (user_key_len == 0 || user_key_len > data_key.size()) continue; + + std::string encoded_user_key(data_key.data(), user_key_len); + + auto it = list_info.find(encoded_user_key); + if (it != list_info.end()) { + std::get<0>(it->second) += data_key.size() + data_value.size(); + } + } + + for (const auto& entry : list_info) { + int64_t size = std::get<0>(entry.second); + if (size >= config.min_size) { + std::string user_key = DecodeUserKey(entry.first); + std::string display_key = ReplaceAll(user_key, "\n", "\\n"); + display_key = ReplaceAll(display_key, " ", "\\x20"); + key_infos.emplace_back("list", std::move(display_key), size, std::get<1>(entry.second)); + } + } +} + +// Get the prefix of a key +std::string GetKeyPrefix(const std::string& key, const std::string& delimiter) { + size_t pos = key.find(delimiter); + if (pos != std::string::npos) { + return key.substr(0, pos); + } + return key; +} + +// Generate prefix statistics +void GeneratePrefixStats(const std::vector& key_infos, const std::string& delimiter, std::ostream& out) { + std::unordered_map prefix_stats; + + for (const auto& info : key_infos) { + std::string prefix = GetKeyPrefix(info.key, delimiter); + prefix_stats[prefix].Add(info.size); + } + + std::vector> sorted_stats; + for (const auto& entry : prefix_stats) { + sorted_stats.emplace_back(entry); + } + + std::sort(sorted_stats.begin(), sorted_stats.end(), + [](const auto& a, const auto& b) { + return a.second.total_size > b.second.total_size; + }); + + out << "\n===== Key Prefix Statistics =====\n"; + out << "Prefix\tCount\tTotal Size\tAvg Size\n"; + + for (const auto& entry : sorted_stats) { + double avg_size = static_cast(entry.second.total_size) / entry.second.count; + out << entry.first << "\t" + << entry.second.count << "\t" + << entry.second.total_size << "\t" + << avg_size << "\n"; + } +} + +// Analyze a single database instance +void AnalyzeSingleDB(const std::string& db_path, std::vector& key_infos, const Config& config) { + rocksdb::DBOptions db_options; + db_options.create_if_missing = false; + + std::vector cf_names = { + "default", // kMetaCF + "hashes_data_cf", // kHashesDataCF + "sets_data_cf", // kSetsDataCF + "lists_data_cf", // kListsDataCF + "zsets_data_cf", // kZsetsDataCF + "zsets_score_cf", // kZsetsScoreCF + "streams_data_cf" // kStreamsDataCF + }; + + std::vector column_families; + for (const auto& cf_name : cf_names) { + column_families.push_back(rocksdb::ColumnFamilyDescriptor( + cf_name, rocksdb::ColumnFamilyOptions())); + } + + std::vector handles; + rocksdb::DB* db; + rocksdb::Status status = rocksdb::DB::OpenForReadOnly(db_options, db_path, + column_families, &handles, &db); + + if (!status.ok()) { + std::cerr << "Error opening database at " << db_path << ": " << status.ToString() << std::endl; + return; + } + + std::cout << "Analyzing database at " << db_path << std::endl; + + // Analyze each type + if (config.type_filter == "all" || config.type_filter == "strings") { + AnalyzeStrings(db, handles[storage::kMetaCF], key_infos, config); + } + + if (config.type_filter == "all" || config.type_filter == "hashes") { + AnalyzeHashes(db, handles[storage::kMetaCF], handles[storage::kHashesDataCF], key_infos, config); + } + + if (config.type_filter == "all" || config.type_filter == "sets") { + AnalyzeSets(db, handles[storage::kMetaCF], handles[storage::kSetsDataCF], key_infos, config); + } + + if (config.type_filter == "all" || config.type_filter == "zsets") { + AnalyzeZsets(db, handles[storage::kMetaCF], handles[storage::kZsetsDataCF], + handles[storage::kZsetsScoreCF], key_infos, config); + } + + if (config.type_filter == "all" || config.type_filter == "lists") { + AnalyzeLists(db, handles[storage::kMetaCF], handles[storage::kListsDataCF], key_infos, config); + } + + // Cleanup + for (auto handle : handles) { + delete handle; + } + delete db; +} + +int main(int argc, char *argv[]){ + Config config; + if (!ParseArgs(argc, argv, config)) { + return 1; + } + + std::vector key_infos; + + std::unique_ptr file_out; + std::ostream* out = &std::cout; + + if (!config.output_file.empty()) { + file_out = std::make_unique(config.output_file); + if (!file_out->is_open()) { + std::cerr << "Error opening output file: " << config.output_file << std::endl; + return 1; + } + out = file_out.get(); + } + + // Check if this is a single DB or multiple DB instances + // Try to detect db/0, db/1, db/2, etc. + std::vector db_paths; + + // First, check if db_path itself is a valid RocksDB + std::string test_path = config.db_path; + if (DirectoryExists(test_path + "/CURRENT")) { + // This is a single database instance + db_paths.push_back(test_path); + std::cout << "Detected single database instance" << std::endl; + } else { + // Check for multiple database instances (db/0, db/1, db/2, ...) + int db_index = 0; + while (true) { + std::string db_inst_path = config.db_path + "/db/" + std::to_string(db_index); + if (DirectoryExists(db_inst_path) && DirectoryExists(db_inst_path + "/CURRENT")) { + db_paths.push_back(db_inst_path); + db_index++; + } else { + break; + } + } + + if (db_paths.empty()) { + std::cerr << "Error: No valid database found at " << config.db_path << std::endl; + std::cerr << "Checked for single instance and db/0, db/1, ... directories" << std::endl; + return 1; + } + + std::cout << "Detected " << db_paths.size() << " database instances" << std::endl; + } + + // Analyze each database instance + for (const auto& db_path : db_paths) { + AnalyzeSingleDB(db_path, key_infos, config); + } + + // Sort keys by size + std::sort(key_infos.begin(), key_infos.end()); + + // Limit to top N if requested + if (config.top_n > 0 && config.top_n < static_cast(key_infos.size())) { + key_infos.resize(config.top_n); + } + + // Output results + *out << "===== Big Key Analysis =====\n"; + *out << "Type\tSize\tKey\tTTL\n"; + + for (const auto& info : key_infos) { + *out << info.type << "\t" << info.size << "\t" << info.key << "\t" << info.ttl << "\n"; + } + + // Generate prefix statistics if requested + if (config.prefix_stat) { + GeneratePrefixStats(key_infos, config.prefix_delimiter, *out); + } + + // Output summary + *out << "\n===== Summary =====\n"; + *out << "Total keys analyzed: " << key_infos.size() << "\n"; + + std::unordered_map type_counts; + std::unordered_map type_sizes; + + for (const auto& info : key_infos) { + type_counts[info.type]++; + type_sizes[info.type] += info.size; + } + + *out << "Keys by type:\n"; + for (const auto& entry : type_counts) { + double avg_size = static_cast(type_sizes[entry.first]) / entry.second; + double mb_size = static_cast(type_sizes[entry.first]) / (1024 * 1024); + + *out << " " << entry.first << ": " << entry.second << " keys, " + << mb_size << " MB total, " + << avg_size << " bytes avg\n"; + } + + return 0; +} From 5488d60e0e1c5af5c026bed3cbcd88a211a818d7 Mon Sep 17 00:00:00 2001 From: chejinge Date: Thu, 25 Dec 2025 10:40:42 +0800 Subject: [PATCH 02/10] feat:add big_key --- tools/bigkey_analyzer/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/bigkey_analyzer/CMakeLists.txt b/tools/bigkey_analyzer/CMakeLists.txt index e3182adc9e..6c0c545a75 100644 --- a/tools/bigkey_analyzer/CMakeLists.txt +++ b/tools/bigkey_analyzer/CMakeLists.txt @@ -19,7 +19,6 @@ target_include_directories(bigkey_analyzer target_link_libraries(bigkey_analyzer storage - rocksdb pthread ) From 10b8a9abecad681c6a85c9481c1ad5209cb12190 Mon Sep 17 00:00:00 2001 From: chejinge Date: Thu, 25 Dec 2025 11:08:16 +0800 Subject: [PATCH 03/10] feat:add big_key --- tools/bigkey_analyzer/bigkey_analyzer.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/bigkey_analyzer/bigkey_analyzer.cc b/tools/bigkey_analyzer/bigkey_analyzer.cc index 046c393bdb..532e41a124 100644 --- a/tools/bigkey_analyzer/bigkey_analyzer.cc +++ b/tools/bigkey_analyzer/bigkey_analyzer.cc @@ -25,6 +25,7 @@ #include "src/storage/src/base_meta_value_format.h" #include "src/storage/src/strings_value_format.h" #include "src/storage/src/base_data_value_format.h" +#include "src/storage/src/lists_meta_value_format.h" #include "src/storage/src/coding.h" // Utility function to check if a directory exists @@ -424,7 +425,8 @@ void AnalyzeZsets(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* meta_handle, continue; } - storage::ParsedZSetsMetaValue parsed_meta(&value.ToString()); + std::string value_str = value.ToString(); + storage::ParsedZSetsMetaValue parsed_meta(&value_str); if (parsed_meta.IsStale() || parsed_meta.Count() == 0) { continue; @@ -519,7 +521,8 @@ void AnalyzeLists(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* meta_handle, continue; } - storage::ParsedListsMetaValue parsed_meta(&value.ToString()); + std::string value_str = value.ToString(); + storage::ParsedListsMetaValue parsed_meta(&value_str); if (parsed_meta.IsStale() || parsed_meta.Count() == 0) { continue; From 23668f3d8c37e76c6a6f808ad98c04259cc5d942 Mon Sep 17 00:00:00 2001 From: chejinge Date: Thu, 25 Dec 2025 13:09:37 +0800 Subject: [PATCH 04/10] feat:add big_key --- tools/bigkey_analyzer/bigkey_analyzer.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/bigkey_analyzer/bigkey_analyzer.cc b/tools/bigkey_analyzer/bigkey_analyzer.cc index 532e41a124..e99c5472ec 100644 --- a/tools/bigkey_analyzer/bigkey_analyzer.cc +++ b/tools/bigkey_analyzer/bigkey_analyzer.cc @@ -212,7 +212,8 @@ void AnalyzeStrings(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* meta_handle, std::string user_key = DecodeUserKey(encoded_key); // Parse the value - storage::ParsedStringsValue parsed_value(&value.ToString()); + std::string value_str = value.ToString(); + storage::ParsedStringsValue parsed_value(&value_str); // Calculate TTL int64_t ttl = -1; @@ -270,7 +271,8 @@ void AnalyzeHashes(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* meta_handle, continue; } - storage::ParsedHashesMetaValue parsed_meta(&value.ToString()); + std::string value_str = value.ToString(); + storage::ParsedHashesMetaValue parsed_meta(&value_str); // Skip if expired or empty if (parsed_meta.IsStale() || parsed_meta.Count() == 0) { @@ -350,7 +352,8 @@ void AnalyzeSets(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* meta_handle, continue; } - storage::ParsedSetsMetaValue parsed_meta(&value.ToString()); + std::string value_str = value.ToString(); + storage::ParsedSetsMetaValue parsed_meta(&value_str); if (parsed_meta.IsStale() || parsed_meta.Count() == 0) { continue; From be1e4bded9f9cbc91b1f310bd6d9731450d1bfcd Mon Sep 17 00:00:00 2001 From: chejinge Date: Mon, 29 Dec 2025 14:18:45 +0800 Subject: [PATCH 05/10] feat:add big_key --- tools/bigkey_analyzer/bigkey_analyzer.cc | 46 +++++++++++++----------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/tools/bigkey_analyzer/bigkey_analyzer.cc b/tools/bigkey_analyzer/bigkey_analyzer.cc index e99c5472ec..562cda4f52 100644 --- a/tools/bigkey_analyzer/bigkey_analyzer.cc +++ b/tools/bigkey_analyzer/bigkey_analyzer.cc @@ -26,6 +26,8 @@ #include "src/storage/src/strings_value_format.h" #include "src/storage/src/base_data_value_format.h" #include "src/storage/src/lists_meta_value_format.h" +#include "src/storage/src/lists_data_key_format.h" +#include "src/storage/src/zsets_data_key_format.h" #include "src/storage/src/coding.h" // Utility function to check if a directory exists @@ -474,16 +476,18 @@ void AnalyzeZsets(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* meta_handle, const rocksdb::Slice& score_key = score_iter->key(); const rocksdb::Slice& score_value = score_iter->value(); - const char* ptr = storage::SeekUserkeyDelim(score_key.data(), score_key.size()); - size_t user_key_len = ptr - score_key.data(); - - if (user_key_len == 0 || user_key_len > score_key.size()) continue; - - std::string encoded_user_key(score_key.data(), user_key_len); - - auto it = zset_info.find(encoded_user_key); - if (it != zset_info.end()) { - std::get<0>(it->second) += score_key.size() + score_value.size(); + // Parse the score key using ParsedZSetsScoreKey + try { + storage::ParsedZSetsScoreKey parsed_key(score_key); + std::string encoded_user_key = parsed_key.key().ToString(); + + auto it = zset_info.find(encoded_user_key); + if (it != zset_info.end()) { + std::get<0>(it->second) += score_key.size() + score_value.size(); + } + } catch (...) { + // Skip malformed keys + continue; } } @@ -549,16 +553,18 @@ void AnalyzeLists(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* meta_handle, const rocksdb::Slice& data_key = data_iter->key(); const rocksdb::Slice& data_value = data_iter->value(); - const char* ptr = storage::SeekUserkeyDelim(data_key.data(), data_key.size()); - size_t user_key_len = ptr - data_key.data(); - - if (user_key_len == 0 || user_key_len > data_key.size()) continue; - - std::string encoded_user_key(data_key.data(), user_key_len); - - auto it = list_info.find(encoded_user_key); - if (it != list_info.end()) { - std::get<0>(it->second) += data_key.size() + data_value.size(); + // Parse the data key using ParsedListsDataKey + try { + storage::ParsedListsDataKey parsed_key(data_key); + std::string encoded_user_key = parsed_key.key().ToString(); + + auto it = list_info.find(encoded_user_key); + if (it != list_info.end()) { + std::get<0>(it->second) += data_key.size() + data_value.size(); + } + } catch (...) { + // Skip malformed keys + continue; } } From fdc4e5dc9d53471e22c20bd033735e5fcad4f0f6 Mon Sep 17 00:00:00 2001 From: chejinge Date: Tue, 30 Dec 2025 16:32:13 +0800 Subject: [PATCH 06/10] feat:add big_key --- tools/bigkey_analyzer/bigkey_analyzer.cc | 28 +++++++++++++++++------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/tools/bigkey_analyzer/bigkey_analyzer.cc b/tools/bigkey_analyzer/bigkey_analyzer.cc index 562cda4f52..f256e1728f 100644 --- a/tools/bigkey_analyzer/bigkey_analyzer.cc +++ b/tools/bigkey_analyzer/bigkey_analyzer.cc @@ -702,7 +702,6 @@ int main(int argc, char *argv[]){ } // Check if this is a single DB or multiple DB instances - // Try to detect db/0, db/1, db/2, etc. std::vector db_paths; // First, check if db_path itself is a valid RocksDB @@ -712,21 +711,34 @@ int main(int argc, char *argv[]){ db_paths.push_back(test_path); std::cout << "Detected single database instance" << std::endl; } else { - // Check for multiple database instances (db/0, db/1, db/2, ...) - int db_index = 0; - while (true) { - std::string db_inst_path = config.db_path + "/db/" + std::to_string(db_index); + // Check if each subdirectory is a valid RocksDB (direct subdirectories like 0/, 1/, 2/) + for (int db_index = 0; db_index < 1000; db_index++) { // 防止无限循环,设置上限 + std::string db_inst_path = config.db_path + "/" + std::to_string(db_index); if (DirectoryExists(db_inst_path) && DirectoryExists(db_inst_path + "/CURRENT")) { db_paths.push_back(db_inst_path); - db_index++; - } else { + } else if (db_index > 0 && !DirectoryExists(db_inst_path)) { + // 如果目录不存在且已找到至少一个DB,则认为已到达末尾 break; } } + // 如果上面的检测失败,尝试经典的db/N格式 + if (db_paths.empty()) { + int db_index = 0; + while (true) { + std::string db_inst_path = config.db_path + "/db/" + std::to_string(db_index); + if (DirectoryExists(db_inst_path) && DirectoryExists(db_inst_path + "/CURRENT")) { + db_paths.push_back(db_inst_path); + db_index++; + } else { + break; + } + } + } + if (db_paths.empty()) { std::cerr << "Error: No valid database found at " << config.db_path << std::endl; - std::cerr << "Checked for single instance and db/0, db/1, ... directories" << std::endl; + std::cerr << "Checked for single instance, direct subdirectories (0, 1, 2...), and db/0, db/1, ... directories" << std::endl; return 1; } From 33ef7e2ce9dbbe4f72506b56c36b1fe41c83f911 Mon Sep 17 00:00:00 2001 From: chejinge Date: Tue, 30 Dec 2025 18:42:59 +0800 Subject: [PATCH 07/10] feat:add big_key --- tools/bigkey_analyzer/README.md | 14 ++- tools/bigkey_analyzer/bigkey_analyzer.cc | 127 +++++++++++++++-------- 2 files changed, 95 insertions(+), 46 deletions(-) diff --git a/tools/bigkey_analyzer/README.md b/tools/bigkey_analyzer/README.md index c9b422b525..184520bc9b 100644 --- a/tools/bigkey_analyzer/README.md +++ b/tools/bigkey_analyzer/README.md @@ -1,6 +1,10 @@ # Big Key Analyzer -大key分析工具,用于分析PikiwiDB实例中的大key情况。本工具适用于unstable分支新的存储结构,支持单实例和多DB实例(db/0, db/1, db/2...)。 +大key分析工具,用于分析PikiwiDB实例中的大key情况。本工具适用于unstable分支新的存储结构,支持多种目录结构: +- 单实例 RocksDB +- 多DB实例 (db/0, db/1, db/2...) +- 直接分区目录 (0/, 1/, 2/...) +- **新增**: dbN/M 三层嵌套结构 (db0/0, db0/1, db1/0...) ## 功能特点 @@ -92,10 +96,10 @@ Options: ``` ===== Big Key Analysis ===== -Type Size Key TTL -hash 1048576 user:profile:1001 -1 -zset 524288 ranking:global 3600 -string 262144 config:settings -1 +DB Partition Type Size Key TTL +db0 1 hash 1048576 user:profile:1001 -1 +db0 2 zset 524288 ranking:global 3600 +db1 0 string 262144 config:settings -1 ... ===== Key Prefix Statistics ===== diff --git a/tools/bigkey_analyzer/bigkey_analyzer.cc b/tools/bigkey_analyzer/bigkey_analyzer.cc index f256e1728f..2b40b64ed8 100644 --- a/tools/bigkey_analyzer/bigkey_analyzer.cc +++ b/tools/bigkey_analyzer/bigkey_analyzer.cc @@ -72,20 +72,26 @@ struct KeyInfo { std::string key; int64_t size; int64_t ttl; + std::string db_name; + std::string partition; - KeyInfo() : type(""), key(""), size(0), ttl(-1) {} + KeyInfo() : type(""), key(""), size(0), ttl(-1), db_name(""), partition("") {} - KeyInfo(const std::string& t, const std::string& k, int64_t s, int64_t tt) - : type(t), key(k), size(s), ttl(tt) {} + KeyInfo(const std::string& t, const std::string& k, int64_t s, int64_t tt, + const std::string& db = "", const std::string& part = "") + : type(t), key(k), size(s), ttl(tt), db_name(db), partition(part) {} - KeyInfo(std::string&& t, std::string&& k, int64_t s, int64_t tt) - : type(std::move(t)), key(std::move(k)), size(s), ttl(tt) {} + KeyInfo(std::string&& t, std::string&& k, int64_t s, int64_t tt, + const std::string& db = "", const std::string& part = "") + : type(std::move(t)), key(std::move(k)), size(s), ttl(tt), db_name(db), partition(part) {} - KeyInfo(const char* t, const std::string& k, int64_t s, int64_t tt) - : type(t), key(k), size(s), ttl(tt) {} + KeyInfo(const char* t, const std::string& k, int64_t s, int64_t tt, + const std::string& db = "", const std::string& part = "") + : type(t), key(k), size(s), ttl(tt), db_name(db), partition(part) {} - KeyInfo(const char* t, std::string&& k, int64_t s, int64_t tt) - : type(t), key(std::move(k)), size(s), ttl(tt) {} + KeyInfo(const char* t, std::string&& k, int64_t s, int64_t tt, + const std::string& db = "", const std::string& part = "") + : type(t), key(std::move(k)), size(s), ttl(tt), db_name(db), partition(part) {} bool operator<(const KeyInfo& other) const { return size > other.size; // Sort in descending order by size @@ -188,7 +194,8 @@ bool ParseArgs(int argc, char* argv[], Config& config) { // Analyze strings in MetaCF void AnalyzeStrings(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* meta_handle, - std::vector& key_infos, const Config& config) { + std::vector& key_infos, const Config& config, + const std::string& db_name, const std::string& partition) { std::cout << "Analyzing strings..." << std::endl; int64_t curtime; @@ -236,7 +243,7 @@ void AnalyzeStrings(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* meta_handle, if (size >= config.min_size) { std::string display_key = ReplaceAll(user_key, "\n", "\\n"); display_key = ReplaceAll(display_key, " ", "\\x20"); - key_infos.emplace_back("string", std::move(display_key), size, ttl); + key_infos.emplace_back("string", std::move(display_key), size, ttl, db_name, partition); } } @@ -248,7 +255,8 @@ void AnalyzeStrings(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* meta_handle, // Analyze hashes void AnalyzeHashes(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* meta_handle, rocksdb::ColumnFamilyHandle* data_handle, - std::vector& key_infos, const Config& config) { + std::vector& key_infos, const Config& config, + const std::string& db_name, const std::string& partition) { std::cout << "Analyzing hashes..." << std::endl; int64_t curtime; @@ -323,7 +331,7 @@ void AnalyzeHashes(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* meta_handle, std::string user_key = DecodeUserKey(entry.first); std::string display_key = ReplaceAll(user_key, "\n", "\\n"); display_key = ReplaceAll(display_key, " ", "\\x20"); - key_infos.emplace_back("hash", std::move(display_key), size, std::get<1>(entry.second)); + key_infos.emplace_back("hash", std::move(display_key), size, std::get<1>(entry.second), db_name, partition); } } } @@ -331,7 +339,8 @@ void AnalyzeHashes(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* meta_handle, // Analyze sets void AnalyzeSets(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* meta_handle, rocksdb::ColumnFamilyHandle* data_handle, - std::vector& key_infos, const Config& config) { + std::vector& key_infos, const Config& config, + const std::string& db_name, const std::string& partition) { std::cout << "Analyzing sets..." << std::endl; int64_t curtime; @@ -398,7 +407,7 @@ void AnalyzeSets(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* meta_handle, std::string user_key = DecodeUserKey(entry.first); std::string display_key = ReplaceAll(user_key, "\n", "\\n"); display_key = ReplaceAll(display_key, " ", "\\x20"); - key_infos.emplace_back("set", std::move(display_key), size, std::get<1>(entry.second)); + key_infos.emplace_back("set", std::move(display_key), size, std::get<1>(entry.second), db_name, partition); } } } @@ -407,7 +416,8 @@ void AnalyzeSets(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* meta_handle, void AnalyzeZsets(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* meta_handle, rocksdb::ColumnFamilyHandle* data_handle, rocksdb::ColumnFamilyHandle* score_handle, - std::vector& key_infos, const Config& config) { + std::vector& key_infos, const Config& config, + const std::string& db_name, const std::string& partition) { std::cout << "Analyzing zsets..." << std::endl; int64_t curtime; @@ -497,7 +507,7 @@ void AnalyzeZsets(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* meta_handle, std::string user_key = DecodeUserKey(entry.first); std::string display_key = ReplaceAll(user_key, "\n", "\\n"); display_key = ReplaceAll(display_key, " ", "\\x20"); - key_infos.emplace_back("zset", std::move(display_key), size, std::get<1>(entry.second)); + key_infos.emplace_back("zset", std::move(display_key), size, std::get<1>(entry.second), db_name, partition); } } } @@ -505,7 +515,8 @@ void AnalyzeZsets(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* meta_handle, // Analyze lists void AnalyzeLists(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* meta_handle, rocksdb::ColumnFamilyHandle* data_handle, - std::vector& key_infos, const Config& config) { + std::vector& key_infos, const Config& config, + const std::string& db_name, const std::string& partition) { std::cout << "Analyzing lists..." << std::endl; int64_t curtime; @@ -574,7 +585,7 @@ void AnalyzeLists(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* meta_handle, std::string user_key = DecodeUserKey(entry.first); std::string display_key = ReplaceAll(user_key, "\n", "\\n"); display_key = ReplaceAll(display_key, " ", "\\x20"); - key_infos.emplace_back("list", std::move(display_key), size, std::get<1>(entry.second)); + key_infos.emplace_back("list", std::move(display_key), size, std::get<1>(entry.second), db_name, partition); } } } @@ -620,7 +631,8 @@ void GeneratePrefixStats(const std::vector& key_infos, const std::strin } // Analyze a single database instance -void AnalyzeSingleDB(const std::string& db_path, std::vector& key_infos, const Config& config) { +void AnalyzeSingleDB(const std::string& db_path, std::vector& key_infos, const Config& config, + const std::string& db_name, const std::string& partition) { rocksdb::DBOptions db_options; db_options.create_if_missing = false; @@ -654,24 +666,24 @@ void AnalyzeSingleDB(const std::string& db_path, std::vector& key_infos // Analyze each type if (config.type_filter == "all" || config.type_filter == "strings") { - AnalyzeStrings(db, handles[storage::kMetaCF], key_infos, config); + AnalyzeStrings(db, handles[storage::kMetaCF], key_infos, config, db_name, partition); } if (config.type_filter == "all" || config.type_filter == "hashes") { - AnalyzeHashes(db, handles[storage::kMetaCF], handles[storage::kHashesDataCF], key_infos, config); + AnalyzeHashes(db, handles[storage::kMetaCF], handles[storage::kHashesDataCF], key_infos, config, db_name, partition); } if (config.type_filter == "all" || config.type_filter == "sets") { - AnalyzeSets(db, handles[storage::kMetaCF], handles[storage::kSetsDataCF], key_infos, config); + AnalyzeSets(db, handles[storage::kMetaCF], handles[storage::kSetsDataCF], key_infos, config, db_name, partition); } if (config.type_filter == "all" || config.type_filter == "zsets") { AnalyzeZsets(db, handles[storage::kMetaCF], handles[storage::kZsetsDataCF], - handles[storage::kZsetsScoreCF], key_infos, config); + handles[storage::kZsetsScoreCF], key_infos, config, db_name, partition); } if (config.type_filter == "all" || config.type_filter == "lists") { - AnalyzeLists(db, handles[storage::kMetaCF], handles[storage::kListsDataCF], key_infos, config); + AnalyzeLists(db, handles[storage::kMetaCF], handles[storage::kListsDataCF], key_infos, config, db_name, partition); } // Cleanup @@ -702,33 +714,61 @@ int main(int argc, char *argv[]){ } // Check if this is a single DB or multiple DB instances - std::vector db_paths; + std::vector> db_paths; // (path, db_name, partition) // First, check if db_path itself is a valid RocksDB std::string test_path = config.db_path; if (DirectoryExists(test_path + "/CURRENT")) { // This is a single database instance - db_paths.push_back(test_path); + db_paths.push_back(std::make_tuple(test_path, "", "")); std::cout << "Detected single database instance" << std::endl; } else { - // Check if each subdirectory is a valid RocksDB (direct subdirectories like 0/, 1/, 2/) - for (int db_index = 0; db_index < 1000; db_index++) { // 防止无限循环,设置上限 - std::string db_inst_path = config.db_path + "/" + std::to_string(db_index); - if (DirectoryExists(db_inst_path) && DirectoryExists(db_inst_path + "/CURRENT")) { - db_paths.push_back(db_inst_path); - } else if (db_index > 0 && !DirectoryExists(db_inst_path)) { - // 如果目录不存在且已找到至少一个DB,则认为已到达末尾 + // 尝试检测dbN/M/格式 (如 db0/1, db0/2, db1/0 等) + bool found_dbn_format = false; + for (int db_index = 0; db_index < 1000; db_index++) { + std::string db_name = "db" + std::to_string(db_index); + std::string db_dir = config.db_path + "/" + db_name; + + if (DirectoryExists(db_dir)) { + // 检查这个db下的所有分区子目录 + bool found_partitions = false; + for (int partition = 0; partition < 1000; partition++) { + std::string partition_path = db_dir + "/" + std::to_string(partition); + if (DirectoryExists(partition_path) && DirectoryExists(partition_path + "/CURRENT")) { + db_paths.push_back(std::make_tuple(partition_path, db_name, std::to_string(partition))); + found_partitions = true; + found_dbn_format = true; + } else if (partition > 0 && !DirectoryExists(partition_path) && found_partitions) { + // 当前partition不存在且已找到至少一个partition,认为已到达该db的末尾 + break; + } + } + } else if (db_index > 0 && found_dbn_format) { + // 当前db不存在且已找到至少一个db,认为已到达末尾 break; } } - // 如果上面的检测失败,尝试经典的db/N格式 + // 如果没有找到dbN/M格式,尝试检测直接的分区目录格式 (如 0/, 1/, 2/) + if (db_paths.empty()) { + for (int db_index = 0; db_index < 1000; db_index++) { + std::string db_inst_path = config.db_path + "/" + std::to_string(db_index); + if (DirectoryExists(db_inst_path) && DirectoryExists(db_inst_path + "/CURRENT")) { + db_paths.push_back(std::make_tuple(db_inst_path, "", std::to_string(db_index))); + } else if (db_index > 0 && !DirectoryExists(db_inst_path) && !db_paths.empty()) { + // 如果目录不存在且已找到至少一个DB,则认为已到达末尾 + break; + } + } + } + + // 尝试经典的db/N格式 if (db_paths.empty()) { int db_index = 0; while (true) { std::string db_inst_path = config.db_path + "/db/" + std::to_string(db_index); if (DirectoryExists(db_inst_path) && DirectoryExists(db_inst_path + "/CURRENT")) { - db_paths.push_back(db_inst_path); + db_paths.push_back(std::make_tuple(db_inst_path, "db", std::to_string(db_index))); db_index++; } else { break; @@ -738,7 +778,7 @@ int main(int argc, char *argv[]){ if (db_paths.empty()) { std::cerr << "Error: No valid database found at " << config.db_path << std::endl; - std::cerr << "Checked for single instance, direct subdirectories (0, 1, 2...), and db/0, db/1, ... directories" << std::endl; + std::cerr << "Checked for single instance, dbN/M format, direct subdirectories (0, 1, 2...), and db/0, db/1, ... directories" << std::endl; return 1; } @@ -746,8 +786,11 @@ int main(int argc, char *argv[]){ } // Analyze each database instance - for (const auto& db_path : db_paths) { - AnalyzeSingleDB(db_path, key_infos, config); + for (const auto& db_info : db_paths) { + const std::string& db_path = std::get<0>(db_info); + const std::string& db_name = std::get<1>(db_info); + const std::string& partition = std::get<2>(db_info); + AnalyzeSingleDB(db_path, key_infos, config, db_name, partition); } // Sort keys by size @@ -760,10 +803,12 @@ int main(int argc, char *argv[]){ // Output results *out << "===== Big Key Analysis =====\n"; - *out << "Type\tSize\tKey\tTTL\n"; + *out << "DB\tPartition\tType\tSize\tKey\tTTL\n"; for (const auto& info : key_infos) { - *out << info.type << "\t" << info.size << "\t" << info.key << "\t" << info.ttl << "\n"; + *out << info.db_name << "\t" << info.partition << "\t" + << info.type << "\t" << info.size << "\t" + << info.key << "\t" << info.ttl << "\n"; } // Generate prefix statistics if requested From 5d2930843a9f38a484be0dfdc32a51812b18564e Mon Sep 17 00:00:00 2001 From: chejinge Date: Tue, 30 Dec 2025 18:47:15 +0800 Subject: [PATCH 08/10] feat:add big_key --- tools/bigkey_analyzer/bigkey_analyzer.cc | 85 ++++++++++++++++++------ 1 file changed, 65 insertions(+), 20 deletions(-) diff --git a/tools/bigkey_analyzer/bigkey_analyzer.cc b/tools/bigkey_analyzer/bigkey_analyzer.cc index 2b40b64ed8..d705c1965a 100644 --- a/tools/bigkey_analyzer/bigkey_analyzer.cc +++ b/tools/bigkey_analyzer/bigkey_analyzer.cc @@ -723,29 +723,74 @@ int main(int argc, char *argv[]){ db_paths.push_back(std::make_tuple(test_path, "", "")); std::cout << "Detected single database instance" << std::endl; } else { - // 尝试检测dbN/M/格式 (如 db0/1, db0/2, db1/0 等) - bool found_dbn_format = false; - for (int db_index = 0; db_index < 1000; db_index++) { - std::string db_name = "db" + std::to_string(db_index); - std::string db_dir = config.db_path + "/" + db_name; + // 处理几种常见的情况: + + // 1. 如果输入路径本身是dbN格式,直接检测其子目录(无需额外的dbN前缀) + std::string db_name_input = ""; + bool is_db_dir = false; + std::string db_dir = config.db_path; + + // 检查输入路径的末尾目录名是否匹配dbN模式 + size_t last_slash = config.db_path.find_last_of("/\\"); + if (last_slash != std::string::npos) { + std::string dir_name = config.db_path.substr(last_slash + 1); + if (dir_name.size() > 2 && dir_name.substr(0, 2) == "db" && + std::all_of(dir_name.begin() + 2, dir_name.end(), ::isdigit)) { + db_name_input = dir_name; + is_db_dir = true; + } + } + + // 如果是dbN格式目录,直接检查其下的子目录 + if (is_db_dir) { + // 检查这个db下的所有分区子目录 + bool found_partitions = false; + for (int partition = 0; partition < 1000; partition++) { + std::string partition_path = db_dir + "/" + std::to_string(partition); + if (DirectoryExists(partition_path) && DirectoryExists(partition_path + "/CURRENT")) { + db_paths.push_back(std::make_tuple(partition_path, db_name_input, std::to_string(partition))); + found_partitions = true; + } else if (partition > 0 && !DirectoryExists(partition_path) && found_partitions) { + // 当前partition不存在且已找到至少一个partition,认为已到达该db的末尾 + break; + } + } - if (DirectoryExists(db_dir)) { - // 检查这个db下的所有分区子目录 - bool found_partitions = false; - for (int partition = 0; partition < 1000; partition++) { - std::string partition_path = db_dir + "/" + std::to_string(partition); - if (DirectoryExists(partition_path) && DirectoryExists(partition_path + "/CURRENT")) { - db_paths.push_back(std::make_tuple(partition_path, db_name, std::to_string(partition))); - found_partitions = true; - found_dbn_format = true; - } else if (partition > 0 && !DirectoryExists(partition_path) && found_partitions) { - // 当前partition不存在且已找到至少一个partition,认为已到达该db的末尾 - break; + // 如果在dbN目录下找到了有效的子目录,就不需要继续搜索其他格式了 + if (!db_paths.empty()) { + std::cout << "Detected " << db_paths.size() << " database partitions in " << db_name_input << std::endl; + } + } + + // 2. 如果上面的检测未能找到数据库,尝试标准的dbN/M格式 + if (db_paths.empty()) { + bool found_dbn_format = false; + for (int db_index = 0; db_index < 1000; db_index++) { + std::string db_name = "db" + std::to_string(db_index); + std::string db_dir = config.db_path + "/" + db_name; + + if (DirectoryExists(db_dir)) { + // 检查这个db下的所有分区子目录 + bool found_partitions = false; + for (int partition = 0; partition < 1000; partition++) { + std::string partition_path = db_dir + "/" + std::to_string(partition); + if (DirectoryExists(partition_path) && DirectoryExists(partition_path + "/CURRENT")) { + db_paths.push_back(std::make_tuple(partition_path, db_name, std::to_string(partition))); + found_partitions = true; + found_dbn_format = true; + } else if (partition > 0 && !DirectoryExists(partition_path) && found_partitions) { + // 当前partition不存在且已找到至少一个partition,认为已到达该db的末尾 + break; + } } + } else if (db_index > 0 && found_dbn_format) { + // 当前db不存在且已找到至少一个db,认为已到达末尾 + break; } - } else if (db_index > 0 && found_dbn_format) { - // 当前db不存在且已找到至少一个db,认为已到达末尾 - break; + } + + if (found_dbn_format) { + std::cout << "Detected " << db_paths.size() << " database partitions in dbN/M format" << std::endl; } } From 71a65d6de2869b8e91c94f0de439682b24f3d033 Mon Sep 17 00:00:00 2001 From: chejinge Date: Tue, 30 Dec 2025 19:14:38 +0800 Subject: [PATCH 09/10] feat:add big_key --- tools/bigkey_analyzer/bigkey_analyzer.cc | 38 ++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/tools/bigkey_analyzer/bigkey_analyzer.cc b/tools/bigkey_analyzer/bigkey_analyzer.cc index d705c1965a..101191321d 100644 --- a/tools/bigkey_analyzer/bigkey_analyzer.cc +++ b/tools/bigkey_analyzer/bigkey_analyzer.cc @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include "rocksdb/options.h" #include "rocksdb/db.h" @@ -716,6 +718,9 @@ int main(int argc, char *argv[]){ // Check if this is a single DB or multiple DB instances std::vector> db_paths; // (path, db_name, partition) + // 先显示当前正在检测的路径,帮助调试 + std::cout << "Checking path: " << config.db_path << std::endl; + // First, check if db_path itself is a valid RocksDB std::string test_path = config.db_path; if (DirectoryExists(test_path + "/CURRENT")) { @@ -725,6 +730,39 @@ int main(int argc, char *argv[]){ } else { // 处理几种常见的情况: + // 调试信息:显示目录内容 + std::cout << "Directory contents of " << config.db_path << ":" << std::endl; + DIR* dir = opendir(config.db_path.c_str()); + if (dir) { + struct dirent* entry; + while ((entry = readdir(dir)) != NULL) { + if (entry->d_name[0] != '.') { // 跳过 . 和 .. + std::string full_path = config.db_path + "/" + entry->d_name; + std::cout << " - " << entry->d_name; + if (DirectoryExists(full_path)) { + std::cout << " (dir)"; + // 显示子目录内容 + DIR* subdir = opendir(full_path.c_str()); + if (subdir) { + std::cout << " contains: "; + struct dirent* subentry; + int count = 0; + while ((subentry = readdir(subdir)) != NULL && count < 5) { + if (subentry->d_name[0] != '.') { + std::cout << subentry->d_name << " "; + count++; + } + } + if (count == 5) std::cout << "..."; + closedir(subdir); + } + } + std::cout << std::endl; + } + } + closedir(dir); + } + // 1. 如果输入路径本身是dbN格式,直接检测其子目录(无需额外的dbN前缀) std::string db_name_input = ""; bool is_db_dir = false; From fee80360652a76c483b6fc31929d9e7776eedf3b Mon Sep 17 00:00:00 2001 From: chejinge Date: Tue, 30 Dec 2025 19:30:18 +0800 Subject: [PATCH 10/10] feat:add big_key --- tools/bigkey_analyzer/bigkey_analyzer.cc | 61 +++++++++++++++++------- 1 file changed, 44 insertions(+), 17 deletions(-) diff --git a/tools/bigkey_analyzer/bigkey_analyzer.cc b/tools/bigkey_analyzer/bigkey_analyzer.cc index 101191321d..70538a3546 100644 --- a/tools/bigkey_analyzer/bigkey_analyzer.cc +++ b/tools/bigkey_analyzer/bigkey_analyzer.cc @@ -35,7 +35,17 @@ // Utility function to check if a directory exists bool DirectoryExists(const std::string& path) { struct stat st; - return stat(path.c_str(), &st) == 0 && S_ISDIR(st.st_mode); + bool result = stat(path.c_str(), &st) == 0 && S_ISDIR(st.st_mode); + std::cout << "Checking directory: " << path << " - " << (result ? "EXISTS" : "NOT FOUND") << std::endl; + return result; +} + +// Utility function to check if a file exists +bool FileExists(const std::string& path) { + struct stat st; + bool result = stat(path.c_str(), &st) == 0 && S_ISREG(st.st_mode); + std::cout << "Checking file: " << path << " - " << (result ? "EXISTS" : "NOT FOUND") << std::endl; + return result; } // Replace special characters for consistent display @@ -723,7 +733,9 @@ int main(int argc, char *argv[]){ // First, check if db_path itself is a valid RocksDB std::string test_path = config.db_path; - if (DirectoryExists(test_path + "/CURRENT")) { + + // Debug info: Does CURRENT file exist? + if (FileExists(test_path + "/CURRENT")) { // This is a single database instance db_paths.push_back(std::make_tuple(test_path, "", "")); std::cout << "Detected single database instance" << std::endl; @@ -782,13 +794,19 @@ int main(int argc, char *argv[]){ // 如果是dbN格式目录,直接检查其下的子目录 if (is_db_dir) { // 检查这个db下的所有分区子目录 + std::cout << "Found dbN directory: " << db_name_input << std::endl; bool found_partitions = false; for (int partition = 0; partition < 1000; partition++) { std::string partition_path = db_dir + "/" + std::to_string(partition); - if (DirectoryExists(partition_path) && DirectoryExists(partition_path + "/CURRENT")) { - db_paths.push_back(std::make_tuple(partition_path, db_name_input, std::to_string(partition))); - found_partitions = true; - } else if (partition > 0 && !DirectoryExists(partition_path) && found_partitions) { + std::cout << "Checking partition path: " << partition_path << std::endl; + if (DirectoryExists(partition_path)) { + std::cout << "Partition directory exists, checking for CURRENT file..." << std::endl; + if (FileExists(partition_path + "/CURRENT")) { + db_paths.push_back(std::make_tuple(partition_path, db_name_input, std::to_string(partition))); + found_partitions = true; + std::cout << "Found valid partition: " << partition << std::endl; + } + } else if (partition > 0 && found_partitions) { // 当前partition不存在且已找到至少一个partition,认为已到达该db的末尾 break; } @@ -812,11 +830,14 @@ int main(int argc, char *argv[]){ bool found_partitions = false; for (int partition = 0; partition < 1000; partition++) { std::string partition_path = db_dir + "/" + std::to_string(partition); - if (DirectoryExists(partition_path) && DirectoryExists(partition_path + "/CURRENT")) { - db_paths.push_back(std::make_tuple(partition_path, db_name, std::to_string(partition))); - found_partitions = true; - found_dbn_format = true; - } else if (partition > 0 && !DirectoryExists(partition_path) && found_partitions) { + if (DirectoryExists(partition_path)) { + if (FileExists(partition_path + "/CURRENT")) { + db_paths.push_back(std::make_tuple(partition_path, db_name, std::to_string(partition))); + std::cout << "Found valid dbN/M path: " << db_name << "/" << partition << std::endl; + found_partitions = true; + found_dbn_format = true; + } + } else if (partition > 0 && found_partitions) { // 当前partition不存在且已找到至少一个partition,认为已到达该db的末尾 break; } @@ -836,9 +857,12 @@ int main(int argc, char *argv[]){ if (db_paths.empty()) { for (int db_index = 0; db_index < 1000; db_index++) { std::string db_inst_path = config.db_path + "/" + std::to_string(db_index); - if (DirectoryExists(db_inst_path) && DirectoryExists(db_inst_path + "/CURRENT")) { - db_paths.push_back(std::make_tuple(db_inst_path, "", std::to_string(db_index))); - } else if (db_index > 0 && !DirectoryExists(db_inst_path) && !db_paths.empty()) { + if (DirectoryExists(db_inst_path)) { + if (FileExists(db_inst_path + "/CURRENT")) { + db_paths.push_back(std::make_tuple(db_inst_path, "", std::to_string(db_index))); + std::cout << "Found direct partition directory: " << db_index << std::endl; + } + } else if (db_index > 0 && !db_paths.empty()) { // 如果目录不存在且已找到至少一个DB,则认为已到达末尾 break; } @@ -850,9 +874,12 @@ int main(int argc, char *argv[]){ int db_index = 0; while (true) { std::string db_inst_path = config.db_path + "/db/" + std::to_string(db_index); - if (DirectoryExists(db_inst_path) && DirectoryExists(db_inst_path + "/CURRENT")) { - db_paths.push_back(std::make_tuple(db_inst_path, "db", std::to_string(db_index))); - db_index++; + if (DirectoryExists(db_inst_path)) { + if (FileExists(db_inst_path + "/CURRENT")) { + db_paths.push_back(std::make_tuple(db_inst_path, "db", std::to_string(db_index))); + std::cout << "Found classic db/N format: " << db_index << std::endl; + db_index++; + } } else { break; }