diff --git a/Makefile b/Makefile index 7c878e45d2b4..e9078c863e17 100644 --- a/Makefile +++ b/Makefile @@ -909,7 +909,7 @@ $(parallel_tests): TEST_SCRIPT=t/run-$$TEST_BINARY-shard-$$SHARD_IDX; \ printf '%s\n' \ '#!/bin/sh' \ - "d=\$(TEST_TMPDIR)$$TEST_SCRIPT" \ + "d=\$(TEST_TMPDIR)/runs/$$TEST_BINARY-shard-$$SHARD_IDX" \ 'mkdir -p $$d' \ "TEST_TMPDIR=\$$d GTEST_TOTAL_SHARDS=$$NUM_SHARDS GTEST_SHARD_INDEX=$$SHARD_IDX $(DRIVER) ./$$TEST_BINARY" \ 'test_retcode=$$?' \ @@ -956,6 +956,8 @@ prioritize_long_running_tests = \ # The default is to run one job per core (J=100%). # See "man parallel" for its "-j ..." option. J ?= 100% +MAKE_CHECK_RUNS_ROOT = $(TEST_TMPDIR)/runs +MAKE_CHECK_DISK_REPORT = $(PYTHON) build_tools/make_check_dbdir_report.py --test_tmpdir="$(TEST_TMPDIR)" --runs_root="$(MAKE_CHECK_RUNS_ROOT)" --top_n=10 # Use this regexp to select the subset of tests whose names match. tests-regexp = . @@ -1051,10 +1053,15 @@ check: all && (build_tools/gnu_parallel --gnu --help 2>/dev/null) | \ grep -q 'GNU Parallel'; \ then \ - $(MAKE) T="$$t" check_0; \ + $(MAKE) T="$$t" check_0 || { ret=$$?; $(MAKE_CHECK_DISK_REPORT); exit $$ret; }; \ else \ for t in $(TESTS); do \ - echo "===== Running $$t (`date`)"; ./$$t || exit 1; done; \ + d="$(MAKE_CHECK_RUNS_ROOT)/$$t"; \ + mkdir -p "$$d"; \ + echo "===== Running $$t (`date`)"; \ + TEST_TMPDIR="$$d" $(DRIVER) ./$$t || { ret=$$?; $(MAKE_CHECK_DISK_REPORT); exit $$ret; }; \ + rm -rf "$$d"; \ + done; \ fi rm -rf $(TEST_TMPDIR) ifneq ($(PLATFORM), OS_AIX) @@ -1062,6 +1069,7 @@ ifneq ($(PLATFORM), OS_AIX) ifndef ASSERT_STATUS_CHECKED # not yet working with these tests $(PYTHON) tools/ldb_test.py $(PYTHON) tools/db_crashtest_test.py + $(PYTHON) build_tools/make_check_dbdir_report_test.py sh tools/rocksdb_dump_test.sh endif endif diff --git a/build_tools/make_check_dbdir_report.py b/build_tools/make_check_dbdir_report.py new file mode 100644 index 000000000000..4ac623422a57 --- /dev/null +++ b/build_tools/make_check_dbdir_report.py @@ -0,0 +1,258 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +import argparse +import os +import sys + + +DB_MARKER_FILE_NAMES = { + "CURRENT", + "IDENTITY", + "LOCK", + "LOG", +} +DB_MARKER_PREFIXES = ( + "MANIFEST-", + "OPTIONS-", + "LOG.old.", +) +DB_MARKER_SUFFIXES = ( + ".blob", + ".dbtmp", + ".ldb", + ".log", + ".sst", + ".sst.trash", +) + + +def human_readable_bytes(num_bytes): + units = ("B", "KiB", "MiB", "GiB", "TiB", "PiB") + value = float(num_bytes) + unit_index = 0 + while value >= 1024.0 and unit_index + 1 < len(units): + value /= 1024.0 + unit_index += 1 + if unit_index == 0: + return f"{num_bytes}{units[unit_index]}" + return f"{value:.2f}{units[unit_index]}" + + +def file_looks_db_related(filename): + return ( + filename in DB_MARKER_FILE_NAMES + or any(filename.startswith(prefix) for prefix in DB_MARKER_PREFIXES) + or any(filename.endswith(suffix) for suffix in DB_MARKER_SUFFIXES) + ) + + +def format_filesystem_usage(path): + if not hasattr(os, "statvfs"): + return f" {path}: filesystem usage unavailable on this platform" + + try: + stats = os.statvfs(path) + except OSError as exc: + return f" {path}: failed to collect filesystem usage: {exc}" + + block_size = stats.f_frsize or stats.f_bsize + total_bytes = stats.f_blocks * block_size + available_bytes = stats.f_bavail * block_size + used_bytes = max(total_bytes - available_bytes, 0) + used_pct = 0.0 if total_bytes == 0 else 100.0 * used_bytes / total_bytes + return ( + f" {path}: total={human_readable_bytes(total_bytes)} " + f"used={human_readable_bytes(used_bytes)} " + f"avail={human_readable_bytes(available_bytes)} " + f"use={used_pct:.1f}%" + ) + + +def scan_directory(path, ancestor_is_db_dir=False): + total_bytes = 0 + db_entries = [] + errors = [] + child_dirs = [] + has_db_marker = False + + try: + with os.scandir(path) as iterator: + children = sorted(list(iterator), key=lambda entry: entry.name) + except OSError as exc: + return 0, [], [(path, f"failed to enumerate directory contents: {exc}")] + + for child in children: + try: + if child.is_dir(follow_symlinks=False): + child_dirs.append(child.path) + continue + + if not child.is_file(follow_symlinks=False): + continue + + file_size = child.stat(follow_symlinks=False).st_size + except FileNotFoundError: + continue + except OSError as exc: + errors.append((child.path, f"failed to stat child path: {exc}")) + continue + + total_bytes += file_size + if file_looks_db_related(child.name): + has_db_marker = True + + current_is_db_dir = has_db_marker and not ancestor_is_db_dir + for child_dir in child_dirs: + child_total, child_db_entries, child_errors = scan_directory( + child_dir, ancestor_is_db_dir or current_is_db_dir + ) + total_bytes += child_total + db_entries.extend(child_db_entries) + errors.extend(child_errors) + + if current_is_db_dir: + db_entries.append({"path": path, "bytes": total_bytes}) + + return total_bytes, db_entries, errors + + +def collect_run_diagnostics(runs_root): + run_entries = [] + db_entries = [] + errors = [] + + try: + with os.scandir(runs_root) as iterator: + run_dirs = sorted( + [entry for entry in iterator if entry.is_dir(follow_symlinks=False)], + key=lambda entry: entry.name, + ) + except FileNotFoundError: + return {"run_entries": [], "db_entries": [], "errors": []} + except OSError as exc: + return { + "run_entries": [], + "db_entries": [], + "errors": [(runs_root, f"failed to enumerate test runs: {exc}")], + } + + for run_dir in run_dirs: + run_bytes, run_db_entries, run_errors = scan_directory(run_dir.path) + run_label = os.path.relpath(run_dir.path, runs_root) + run_entries.append( + { + "name": run_label, + "path": run_dir.path, + "bytes": run_bytes, + } + ) + for db_entry in run_db_entries: + relpath = os.path.relpath(db_entry["path"], run_dir.path) + if relpath == ".": + relpath = "" + db_entries.append( + { + "run_name": run_label, + "path": db_entry["path"], + "relpath": relpath, + "bytes": db_entry["bytes"], + } + ) + errors.extend(run_errors) + + run_entries.sort(key=lambda entry: (-entry["bytes"], entry["name"])) + db_entries.sort( + key=lambda entry: (-entry["bytes"], entry["run_name"], entry["relpath"]) + ) + return { + "run_entries": run_entries, + "db_entries": db_entries, + "errors": errors, + } + + +def build_make_check_disk_report( + runs_root, test_tmpdir=None, top_n=10, include_dev_shm=True +): + diagnostics = collect_run_diagnostics(runs_root) + lines = ["=== make check disk usage diagnostics ===", "Filesystem usage:"] + + printed_paths = set() + if include_dev_shm and os.path.isdir("/dev/shm"): + lines.append(format_filesystem_usage("/dev/shm")) + printed_paths.add(os.path.normpath("/dev/shm")) + + for path in [test_tmpdir, runs_root]: + if not path: + continue + normalized = os.path.normpath(path) + if normalized in printed_paths or not os.path.isdir(normalized): + continue + lines.append(format_filesystem_usage(normalized)) + printed_paths.add(normalized) + + lines.append(f"Runs root: {runs_root}") + + run_entries = diagnostics["run_entries"] + db_entries = diagnostics["db_entries"] + + lines.append(f"Top {min(top_n, len(run_entries))} test tmpdirs by size:") + if not run_entries: + lines.append(" no per-test temp directories found") + else: + for index, entry in enumerate(run_entries[:top_n], start=1): + lines.append( + " {}. {} {} path={}".format( + index, + human_readable_bytes(entry["bytes"]), + entry["name"], + entry["path"], + ) + ) + + lines.append(f"Top {min(top_n, len(db_entries))} DB-like directories by size:") + if not db_entries: + lines.append(" no DB-like directories found") + else: + for index, entry in enumerate(db_entries[:top_n], start=1): + lines.append( + " {}. {} owner={} db_dir={} path={}".format( + index, + human_readable_bytes(entry["bytes"]), + entry["run_name"], + entry["relpath"], + entry["path"], + ) + ) + + if diagnostics["errors"]: + lines.append("Collection errors:") + for path, error in diagnostics["errors"]: + lines.append(f" {path}: {error}") + + return "\n".join(lines) + "\n" + + +def main(): + parser = argparse.ArgumentParser( + description="Print disk-usage diagnostics for make check temp directories." + ) + parser.add_argument("--runs_root", required=True) + parser.add_argument("--test_tmpdir") + parser.add_argument("--top_n", type=int, default=10) + parser.add_argument("--no_dev_shm", action="store_true", default=False) + args = parser.parse_args() + + report = build_make_check_disk_report( + args.runs_root, + args.test_tmpdir, + args.top_n, + not args.no_dev_shm, + ) + sys.stdout.write(report) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/build_tools/make_check_dbdir_report_test.py b/build_tools/make_check_dbdir_report_test.py new file mode 100644 index 000000000000..4001dd0732f9 --- /dev/null +++ b/build_tools/make_check_dbdir_report_test.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +import importlib.util +import os +import shutil +import sys +import tempfile +import unittest + + +_REPORT_PATH = os.path.join( + os.path.dirname(__file__), "make_check_dbdir_report.py" +) + + +def load_report_module(): + spec = importlib.util.spec_from_file_location( + "make_check_dbdir_report_under_test", _REPORT_PATH + ) + module = importlib.util.module_from_spec(spec) + old_argv = sys.argv[:] + try: + sys.argv = [_REPORT_PATH] + spec.loader.exec_module(module) + finally: + sys.argv = old_argv + return module + + +class MakeCheckDbdirReportTest(unittest.TestCase): + def setUp(self): + self.test_tmpdir = tempfile.mkdtemp(prefix="make_check_report_test_") + self.runs_root = os.path.join(self.test_tmpdir, "runs") + os.makedirs(self.runs_root) + + def tearDown(self): + shutil.rmtree(self.test_tmpdir) + + def write_file(self, path, size): + os.makedirs(os.path.dirname(path), exist_ok=True) + with open(path, "wb") as f: + f.write(b"x" * size) + + def test_collect_run_diagnostics_sorts_runs_and_db_dirs_by_size(self): + report = load_report_module() + + run_a = os.path.join(self.runs_root, "db_test-shard-0") + run_b = os.path.join(self.runs_root, "table_test-shard-1") + self.write_file(os.path.join(run_a, "case_a", "CURRENT"), 1) + self.write_file(os.path.join(run_a, "case_a", "000001.sst"), 4) + self.write_file(os.path.join(run_b, "nested", "case_b", "CURRENT"), 1) + self.write_file(os.path.join(run_b, "nested", "case_b", "000002.sst.trash"), 8) + + diagnostics = report.collect_run_diagnostics(self.runs_root) + + self.assertEqual( + ["table_test-shard-1", "db_test-shard-0"], + [entry["name"] for entry in diagnostics["run_entries"]], + ) + self.assertEqual( + ["table_test-shard-1", "db_test-shard-0"], + [entry["run_name"] for entry in diagnostics["db_entries"]], + ) + self.assertEqual("nested/case_b", diagnostics["db_entries"][0]["relpath"]) + self.assertEqual("case_a", diagnostics["db_entries"][1]["relpath"]) + + def test_build_make_check_disk_report_formats_top_entries(self): + report = load_report_module() + + run_dir = os.path.join(self.runs_root, "db_test-shard-0") + self.write_file(os.path.join(run_dir, "db_case", "CURRENT"), 1) + self.write_file(os.path.join(run_dir, "db_case", "MANIFEST-000001"), 2) + self.write_file(os.path.join(run_dir, "db_case", "000001.sst"), 3) + + output = report.build_make_check_disk_report( + self.runs_root, + self.test_tmpdir, + top_n=10, + include_dev_shm=False, + ) + + self.assertIn("=== make check disk usage diagnostics ===", output) + self.assertIn(f"Runs root: {self.runs_root}", output) + self.assertIn( + f"1. 6B db_test-shard-0 path={run_dir}", + output, + ) + self.assertIn( + f"1. 6B owner=db_test-shard-0 db_dir=db_case path={os.path.join(run_dir, 'db_case')}", + output, + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/test_util/testharness.cc b/test_util/testharness.cc index 8b8a43bf1d42..0852401340cd 100644 --- a/test_util/testharness.cc +++ b/test_util/testharness.cc @@ -9,55 +9,149 @@ #include "test_util/testharness.h" +#include #include +#include #include #include #include +#include +#include #ifndef OS_WIN #include #endif +#include "file/file_util.h" #ifndef NDEBUG #include "test_util/sync_point.h" #endif namespace { +std::mutex& RegisteredPerTestPathsMutex() { + static std::mutex mutex; + return mutex; +} + +std::unordered_set& RegisteredPerTestPaths() { + static auto* paths = new std::unordered_set(); + return *paths; +} + +void RegisterPerTestPath(std::string path) { + std::lock_guard lock(RegisteredPerTestPathsMutex()); + RegisteredPerTestPaths().insert(std::move(path)); +} + +void ClearRegisteredPerTestPathsImpl() { + std::lock_guard lock(RegisteredPerTestPathsMutex()); + RegisteredPerTestPaths().clear(); +} + +ROCKSDB_NAMESPACE::Status CleanupRegisteredPerTestPathsImpl() { + std::vector paths; + { + std::lock_guard lock(RegisteredPerTestPathsMutex()); + paths.assign(RegisteredPerTestPaths().begin(), + RegisteredPerTestPaths().end()); + RegisteredPerTestPaths().clear(); + } + + std::sort(paths.begin(), paths.end(), + [](const std::string& lhs, const std::string& rhs) { + if (lhs.size() != rhs.size()) { + return lhs.size() > rhs.size(); + } + return lhs < rhs; + }); + + ROCKSDB_NAMESPACE::Env* env = ROCKSDB_NAMESPACE::Env::Default(); + for (const auto& path : paths) { + if (path.empty()) { + continue; + } + + ROCKSDB_NAMESPACE::Status exists = env->FileExists(path); + if (exists.IsNotFound()) { + continue; + } + if (!exists.ok()) { + return ROCKSDB_NAMESPACE::Status::IOError( + "Failed to stat registered test path " + path + ": " + + exists.ToString()); + } + + bool is_dir = false; + ROCKSDB_NAMESPACE::Status s = env->IsDirectory(path, &is_dir); + if (s.ok()) { + s = is_dir ? ROCKSDB_NAMESPACE::DestroyDir(env, path) + : env->DeleteFile(path); + } + if (!s.ok() && !s.IsNotFound()) { + return ROCKSDB_NAMESPACE::Status::IOError( + "Failed to clean up registered test path " + path + ": " + + s.ToString()); + } + } + return ROCKSDB_NAMESPACE::Status::OK(); +} + +void CleanupSyncPointState() { #ifndef NDEBUG -// Global gtest event listener that cleans up SyncPoint state after every -// test. Many tests set SyncPoint callbacks with captured local variables -// but forget to disable/clear them. Under sharded execution (multiple -// tests per process), stale callbacks cause segfaults or corruption. -class SyncPointCleanupListener : public ::testing::EmptyTestEventListener { - void OnTestEnd(const ::testing::TestInfo& /*test_info*/) override { - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace(); - // LoadDependency({}) clears successors_, predecessors_, and - // cleared_points_ maps. Without this, stale dependencies from a - // previous test can block SyncPoint::Process() in the next test - // (e.g. a background compaction thread hitting CompactFilesImpl:1 - // whose predecessor was never fired). - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace(); + // LoadDependency({}) clears successors_, predecessors_, and + // cleared_points_ maps. Without this, stale dependencies from a previous + // test can block SyncPoint::Process() in the next test. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({}); +#endif // !NDEBUG +} + +// Global gtest event listener that cleans up shared per-test state after every +// test. Under sharded execution, multiple tests now share one process, so any +// leaked SyncPoint callbacks or leftover PerThreadDBPath() directories can +// accumulate across thousands of test cases. +class TestStateCleanupListener : public ::testing::EmptyTestEventListener { + void OnTestStart(const ::testing::TestInfo& /*test_info*/) override { + ClearRegisteredPerTestPathsImpl(); + } + + void OnTestEnd(const ::testing::TestInfo& test_info) override { + CleanupSyncPointState(); + if (getenv("KEEP_DB") == nullptr && test_info.result()->Passed()) { + EXPECT_OK(CleanupRegisteredPerTestPathsImpl()); + } else { + ClearRegisteredPerTestPathsImpl(); + } } }; // Auto-register the listener via static initialization. // This runs before main() and before any test fixtures are constructed. -static int RegisterSyncPointCleanup() noexcept { +static int RegisterTestStateCleanup() noexcept { ::testing::TestEventListeners& listeners = ::testing::UnitTest::GetInstance()->listeners(); - listeners.Append(new SyncPointCleanupListener()); + listeners.Append(new TestStateCleanupListener()); return 0; } // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) -[[maybe_unused]] static int sync_point_cleanup_registered_ = - RegisterSyncPointCleanup(); -#endif // !NDEBUG +[[maybe_unused]] static int test_state_cleanup_registered_ = + RegisterTestStateCleanup(); } // namespace namespace ROCKSDB_NAMESPACE::test { +namespace detail { + +void ClearRegisteredPerTestPaths() { ClearRegisteredPerTestPathsImpl(); } + +Status CleanupRegisteredPerTestPaths() { + return CleanupRegisteredPerTestPathsImpl(); +} + +} // namespace detail + #ifdef OS_WIN #include @@ -83,7 +177,10 @@ std::string TmpDir(Env* env) { std::string PerThreadDBPath(std::string dir, std::string name) { size_t tid = std::hash()(std::this_thread::get_id()); - return dir + "/" + name + "_" + GetPidStr() + "_" + std::to_string(tid); + std::string path = + dir + "/" + name + "_" + GetPidStr() + "_" + std::to_string(tid); + RegisterPerTestPath(path); + return path; } std::string PerThreadDBPath(std::string name) { diff --git a/test_util/testharness.h b/test_util/testharness.h index 9379225372ec..1075ef71c6e7 100644 --- a/test_util/testharness.h +++ b/test_util/testharness.h @@ -72,6 +72,14 @@ std::string PerThreadDBPath(std::string name); std::string PerThreadDBPath(Env* env, std::string name); std::string PerThreadDBPath(std::string dir, std::string name); +namespace detail { + +// Test-infra hooks used to clean up paths registered via PerThreadDBPath(). +void ClearRegisteredPerTestPaths(); +Status CleanupRegisteredPerTestPaths(); + +} // namespace detail + // Return a randomization seed for this run. Typically returns the // same number on repeated invocations of this binary, but automated // runs may be able to vary the seed. diff --git a/test_util/testutil_test.cc b/test_util/testutil_test.cc index 41f26e3890db..819d35c7d1de 100644 --- a/test_util/testutil_test.cc +++ b/test_util/testutil_test.cc @@ -34,6 +34,31 @@ TEST(TestUtil, DestroyDirRecursively) { ASSERT_TRUE(s.IsNotFound()); } +TEST(TestUtil, CleanupRegisteredPerTestPathsRemovesDirectory) { + auto env = Env::Default(); + test::detail::ClearRegisteredPerTestPaths(); + + std::string test_dir = test::PerThreadDBPath("test_util_cleanup_dir"); + DestroyDir(env, test_dir).PermitUncheckedError(); + ASSERT_OK(env->CreateDirIfMissing(test_dir)); + CreateFile(env, test_dir + "/file"); + + ASSERT_OK(test::detail::CleanupRegisteredPerTestPaths()); + ASSERT_TRUE(env->FileExists(test_dir).IsNotFound()); +} + +TEST(TestUtil, CleanupRegisteredPerTestPathsRemovesFile) { + auto env = Env::Default(); + test::detail::ClearRegisteredPerTestPaths(); + + std::string test_file = test::PerThreadDBPath("test_util_cleanup_file"); + env->DeleteFile(test_file).PermitUncheckedError(); + CreateFile(env, test_file); + + ASSERT_OK(test::detail::CleanupRegisteredPerTestPaths()); + ASSERT_TRUE(env->FileExists(test_file).IsNotFound()); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) {