diff --git a/db/builder.cc b/db/builder.cc index 36c80e144981..40b4c996202f 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -467,7 +467,9 @@ Status BuildTable( MaxFileSizeForL0MetaPin(mutable_cf_options), /*smallest_compaction_key=*/nullptr, /*largest_compaction_key*/ nullptr, - /*allow_unprepared_value*/ false)); + /*allow_unprepared_value*/ false, + /*range_del_read_seqno=*/nullptr, /*range_del_iter=*/nullptr, + /*maybe_pin_table_handle=*/false, &meta->file_open_metadata)); s = it->status(); if (s.ok() && paranoid_file_checks) { OutputValidator file_validator(tboptions.internal_comparator, diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index 43c4217d88a2..00f5f6fff4d0 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -2133,6 +2133,18 @@ Status CompactionJob::FinishCompactionOutputFile( meta = nullptr; } + if (s.ok() && meta != nullptr && db_options_.fast_sst_open) { + FileOptions metadata_file_options = file_options_; + metadata_file_options.temperature = meta->temperature; + metadata_file_options.file_checksum = meta->file_checksum; + metadata_file_options.file_checksum_func_name = + meta->file_checksum_func_name; + IOStatus metadata_io_s = MaybeGetFileOpenMetadata( + cfd->ioptions().fs.get(), GetTableFileName(meta->fd.GetNumber()), + metadata_file_options, &meta->file_open_metadata); + metadata_io_s.PermitUncheckedError(); + } + if (s.ok() && (current_entries > 0 || tp.num_range_deletions > 0)) { // Output to event logger and fire events. outputs.UpdateTableProperties(); @@ -2862,6 +2874,11 @@ Status CompactionJob::ReadTablePropertiesDirectly( FileOptions fopts = file_options_; fopts.file_checksum = file_meta->file_checksum; fopts.file_checksum_func_name = file_meta->file_checksum_func_name; + std::string file_open_metadata; + if (ioptions.fast_sst_open && !file_meta->file_open_metadata.empty()) { + file_open_metadata = file_meta->file_open_metadata; + fopts.file_metadata = &file_open_metadata; + } Status s = ioptions.fs->NewRandomAccessFile(file_name, fopts, &file, nullptr /* dbg */); if (!s.ok()) { diff --git a/db/compaction/compaction_job.h b/db/compaction/compaction_job.h index 21486f89538e..dab14ad48d14 100644 --- a/db/compaction/compaction_job.h +++ b/db/compaction/compaction_job.h @@ -617,6 +617,7 @@ struct CompactionServiceOutputFile { bool marked_for_compaction; UniqueId64x2 unique_id{}; TableProperties table_properties; + std::string file_open_metadata; bool is_proximal_level_output; Temperature file_temperature = Temperature::kUnknown; @@ -629,7 +630,8 @@ struct CompactionServiceOutputFile { const std::string& _file_checksum, const std::string& _file_checksum_func_name, uint64_t _paranoid_hash, bool _marked_for_compaction, UniqueId64x2 _unique_id, - const TableProperties& _table_properties, bool _is_proximal_level_output, + const TableProperties& _table_properties, + std::string _file_open_metadata, bool _is_proximal_level_output, Temperature _file_temperature) : file_name(name), file_size(size), @@ -646,6 +648,7 @@ struct CompactionServiceOutputFile { marked_for_compaction(_marked_for_compaction), unique_id(std::move(_unique_id)), table_properties(_table_properties), + file_open_metadata(std::move(_file_open_metadata)), is_proximal_level_output(_is_proximal_level_output), file_temperature(_file_temperature) {} }; diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc index 7a6f77ee222a..81e0640b14a6 100644 --- a/db/compaction/compaction_job_test.cc +++ b/db/compaction/compaction_job_test.cc @@ -1686,6 +1686,7 @@ TEST_F(CompactionJobTest, ResultSerialization) { file_checksum_func_name /* file_checksum_func_name */, rnd64.Uniform(UINT64_MAX) /* paranoid_hash */, rnd.OneIn(2) /* marked_for_compaction */, id /* unique_id */, tp, + rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)) /* file_open_metadata */, false /* is_proximal_level_output */, Temperature::kHot); } result.output_level = rnd.Uniform(10); diff --git a/db/compaction/compaction_service_job.cc b/db/compaction/compaction_service_job.cc index cb88c53d8f8d..e0e3794e865c 100644 --- a/db/compaction/compaction_service_job.cc +++ b/db/compaction/compaction_service_job.cc @@ -258,6 +258,7 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService( meta.file_checksum_func_name = file.file_checksum_func_name; meta.marked_for_compaction = file.marked_for_compaction; meta.unique_id = file.unique_id; + meta.file_open_metadata = file.file_open_metadata; meta.temperature = file.file_temperature; meta.tail_size = FileMetaData::CalculateTailSize(file_size, file.table_properties); @@ -445,6 +446,7 @@ Status CompactionServiceCompactionJob::Run() { meta.file_checksum, meta.file_checksum_func_name, output_file.validator.GetHash(), meta.marked_for_compaction, meta.unique_id, *output_file.table_properties, + meta.file_open_metadata, output_file.is_proximal_level, meta.temperature); } } @@ -622,6 +624,10 @@ static std::unordered_map const auto that_one = static_cast(addr2); return this_one->AreEqual(opts, that_one, mismatch); }}}, + {"file_open_metadata", + {offsetof(struct CompactionServiceOutputFile, file_open_metadata), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, {"is_proximal_level_output", {offsetof(struct CompactionServiceOutputFile, is_proximal_level_output), diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc index a04863a2f527..1ac04fb150c6 100644 --- a/db/db_basic_test.cc +++ b/db/db_basic_test.cc @@ -88,6 +88,15 @@ class DBBasicTest : public DBTestBase { DBBasicTest() : DBTestBase("db_basic_test", /*env_do_fsync=*/false) {} }; +class CloseDBOnExit { + public: + explicit CloseDBOnExit(DBTestBase* test) : test_(test) {} + ~CloseDBOnExit() { test_->Close(); } + + private: + DBTestBase* test_; +}; + TEST_F(DBBasicTest, OpenWhenOpen) { Options options = CurrentOptions(); options.env = env_; @@ -5782,7 +5791,24 @@ INSTANTIATE_TEST_CASE_P(DBBasicTestDeadline, DBBasicTestDeadline, std::make_tuple(true, true))); // FileSystemWrapper that captures FileOptions passed to NewRandomAccessFile -// for .sst files, so we can verify file_checksum fields are populated. +// for .sst files, so we can verify file_checksum and file_metadata fields are +// populated. +class MetadataAwareRandomAccessFile : public FSRandomAccessFileOwnerWrapper { + public: + MetadataAwareRandomAccessFile(std::unique_ptr&& target, + std::string metadata) + : FSRandomAccessFileOwnerWrapper(std::move(target)), + metadata_(std::move(metadata)) {} + + IOStatus GetFileOpenMetadata(std::string* metadata) override { + *metadata = metadata_; + return IOStatus::OK(); + } + + private: + std::string metadata_; +}; + class ChecksumCapturingFS : public FileSystemWrapper { public: explicit ChecksumCapturingFS(const std::shared_ptr& base) @@ -5799,9 +5825,17 @@ class ChecksumCapturingFS : public FileSystemWrapper { std::lock_guard lock(mu_); captured_file_checksum_ = opts.file_checksum; captured_file_checksum_func_name_ = opts.file_checksum_func_name; + captured_has_file_metadata_ = opts.file_metadata != nullptr; + captured_file_metadata_ = + opts.file_metadata != nullptr ? *opts.file_metadata : ""; capture_count_++; } - return target()->NewRandomAccessFile(fname, opts, result, dbg); + IOStatus io_s = target()->NewRandomAccessFile(fname, opts, result, dbg); + if (io_s.ok() && fname.find(".sst") != std::string::npos) { + *result = std::make_unique( + std::move(*result), "fast-open-metadata:" + fname); + } + return io_s; } std::string GetCapturedFileChecksum() { @@ -5819,10 +5853,22 @@ class ChecksumCapturingFS : public FileSystemWrapper { return capture_count_; } + bool HasCapturedFileMetadata() { + std::lock_guard lock(mu_); + return captured_has_file_metadata_; + } + + std::string GetCapturedFileMetadata() { + std::lock_guard lock(mu_); + return captured_file_metadata_; + } + void Reset() { std::lock_guard lock(mu_); captured_file_checksum_.clear(); captured_file_checksum_func_name_.clear(); + captured_file_metadata_.clear(); + captured_has_file_metadata_ = false; capture_count_ = 0; } @@ -5830,9 +5876,259 @@ class ChecksumCapturingFS : public FileSystemWrapper { std::mutex mu_; std::string captured_file_checksum_; std::string captured_file_checksum_func_name_; + std::string captured_file_metadata_; + bool captured_has_file_metadata_ = false; int capture_count_ = 0; }; +class FileOpenMetadataCapturingFS : public FileSystemWrapper { + public: + explicit FileOpenMetadataCapturingFS(const std::shared_ptr& base, + bool metadata_supported = true) + : FileSystemWrapper(base), metadata_supported_(metadata_supported) {} + + static const char* kClassName() { return "FileOpenMetadataCapturingFS"; } + const char* Name() const override { return kClassName(); } + + IOStatus NewRandomAccessFile(const std::string& fname, + const FileOptions& opts, + std::unique_ptr* result, + IODebugContext* dbg) override { + if (fname.find(".sst") != std::string::npos) { + std::lock_guard lock(mu_); + last_open_metadata_ = + opts.file_metadata != nullptr ? *opts.file_metadata : ""; + if (!last_open_metadata_.empty()) { + ++open_with_metadata_count_; + } + } + + std::unique_ptr file; + IOStatus s = target()->NewRandomAccessFile(fname, opts, &file, dbg); + if (!s.ok()) { + return s; + } + if (fname.find(".sst") == std::string::npos) { + *result = std::move(file); + return s; + } + result->reset(new MetadataRandomAccessFile( + std::move(file), MetadataForFile(fname), metadata_supported_, this)); + return s; + } + + std::string GetLastGeneratedMetadata() { + std::lock_guard lock(mu_); + return last_generated_metadata_; + } + + std::string GetLastOpenMetadata() { + std::lock_guard lock(mu_); + return last_open_metadata_; + } + + int GetMetadataRequestCount() { + std::lock_guard lock(mu_); + return metadata_request_count_; + } + + int GetOpenWithMetadataCount() { + std::lock_guard lock(mu_); + return open_with_metadata_count_; + } + + void Reset() { + std::lock_guard lock(mu_); + last_generated_metadata_.clear(); + last_open_metadata_.clear(); + metadata_request_count_ = 0; + open_with_metadata_count_ = 0; + } + + private: + class MetadataRandomAccessFile : public FSRandomAccessFileOwnerWrapper { + public: + MetadataRandomAccessFile(std::unique_ptr&& base, + std::string metadata, bool metadata_supported, + FileOpenMetadataCapturingFS* owner) + : FSRandomAccessFileOwnerWrapper(std::move(base)), + metadata_(std::move(metadata)), + metadata_supported_(metadata_supported), + owner_(owner) {} + + IOStatus GetFileOpenMetadata(std::string* metadata) override { + owner_->RecordMetadataRequest(metadata_); + if (!metadata_supported_) { + return IOStatus::NotSupported("GetFileOpenMetadata not supported"); + } + *metadata = metadata_; + return IOStatus::OK(); + } + + private: + std::string metadata_; + bool metadata_supported_; + FileOpenMetadataCapturingFS* owner_; + }; + + static std::string MetadataForFile(const std::string& fname) { + return "file-open-metadata:" + fname; + } + + void RecordMetadataRequest(const std::string& metadata) { + std::lock_guard lock(mu_); + last_generated_metadata_ = metadata; + ++metadata_request_count_; + } + + std::mutex mu_; + bool metadata_supported_; + std::string last_generated_metadata_; + std::string last_open_metadata_; + int metadata_request_count_ = 0; + int open_with_metadata_count_ = 0; +}; + +TEST_F(DBBasicTest, FastSstOpenFlushPersistsAndReusesMetadata) { + auto capturing_fs = + std::make_shared(env_->GetFileSystem()); + std::unique_ptr env(new CompositeEnvWrapper(env_, capturing_fs)); + + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.env = env.get(); + options.fast_sst_open = true; + options.disable_auto_compactions = true; + DestroyAndReopen(options); + CloseDBOnExit close_db(this); + + ASSERT_OK(Put("key1", "value1")); + ASSERT_OK(Flush()); + ASSERT_GT(capturing_fs->GetMetadataRequestCount(), 0); + std::string expected_metadata = capturing_fs->GetLastGeneratedMetadata(); + ASSERT_FALSE(expected_metadata.empty()); + + capturing_fs->Reset(); + Reopen(options); + ASSERT_EQ("value1", Get("key1")); + ASSERT_GT(capturing_fs->GetOpenWithMetadataCount(), 0); + ASSERT_EQ(capturing_fs->GetLastOpenMetadata(), expected_metadata); + +} + +TEST_F(DBBasicTest, FastSstOpenCompactionPersistsAndReusesMetadata) { + auto capturing_fs = + std::make_shared(env_->GetFileSystem()); + std::unique_ptr env(new CompositeEnvWrapper(env_, capturing_fs)); + + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.env = env.get(); + options.fast_sst_open = true; + options.disable_auto_compactions = true; + DestroyAndReopen(options); + CloseDBOnExit close_db(this); + + ASSERT_OK(Put("a", "1")); + ASSERT_OK(Put("b", "1")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("a", "2")); + ASSERT_OK(Put("c", "2")); + ASSERT_OK(Flush()); + + capturing_fs->Reset(); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + EXPECT_GT(capturing_fs->GetMetadataRequestCount(), 0); + std::string expected_metadata = capturing_fs->GetLastGeneratedMetadata(); + EXPECT_FALSE(expected_metadata.empty()); + + capturing_fs->Reset(); + Reopen(options); + ASSERT_EQ("2", Get("a")); + EXPECT_GT(capturing_fs->GetOpenWithMetadataCount(), 0); + EXPECT_EQ(capturing_fs->GetLastOpenMetadata(), expected_metadata); +} + +TEST_F(DBBasicTest, FastSstOpenCanBeDisabledOnReopen) { + auto capturing_fs = + std::make_shared(env_->GetFileSystem()); + std::unique_ptr env(new CompositeEnvWrapper(env_, capturing_fs)); + + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.env = env.get(); + options.fast_sst_open = true; + DestroyAndReopen(options); + CloseDBOnExit close_db(this); + + ASSERT_OK(Put("key1", "value1")); + ASSERT_OK(Flush()); + ASSERT_GT(capturing_fs->GetMetadataRequestCount(), 0); + + options.fast_sst_open = false; + capturing_fs->Reset(); + Reopen(options); + ASSERT_EQ("value1", Get("key1")); + ASSERT_EQ(capturing_fs->GetOpenWithMetadataCount(), 0); + ASSERT_TRUE(capturing_fs->GetLastOpenMetadata().empty()); + +} + +TEST_F(DBBasicTest, FastSstOpenHandlesUnsupportedMetadataGracefully) { + auto capturing_fs = std::make_shared( + env_->GetFileSystem(), /*metadata_supported=*/false); + std::unique_ptr env(new CompositeEnvWrapper(env_, capturing_fs)); + + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.env = env.get(); + options.fast_sst_open = true; + DestroyAndReopen(options); + CloseDBOnExit close_db(this); + + ASSERT_OK(Put("key1", "value1")); + ASSERT_OK(Flush()); + ASSERT_GT(capturing_fs->GetMetadataRequestCount(), 0); + + capturing_fs->Reset(); + Reopen(options); + ASSERT_EQ("value1", Get("key1")); + ASSERT_EQ(capturing_fs->GetOpenWithMetadataCount(), 0); + +} + +TEST_F(DBBasicTest, FastSstOpenPersistsMetadataForIngestedFiles) { + auto capturing_fs = + std::make_shared(env_->GetFileSystem()); + std::unique_ptr env(new CompositeEnvWrapper(env_, capturing_fs)); + + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.env = env.get(); + options.fast_sst_open = true; + DestroyAndReopen(options); + CloseDBOnExit close_db(this); + + const std::string external_sst = dbname_ + "/external.sst"; + SstFileWriter sst_file_writer(EnvOptions(), options); + ASSERT_OK(sst_file_writer.Open(external_sst)); + ASSERT_OK(sst_file_writer.Put("ingest-key", "ingest-value")); + ASSERT_OK(sst_file_writer.Finish()); + + capturing_fs->Reset(); + ASSERT_OK(db_->IngestExternalFile({external_sst}, IngestExternalFileOptions())); + ASSERT_GT(capturing_fs->GetMetadataRequestCount(), 0); + std::string expected_metadata = capturing_fs->GetLastGeneratedMetadata(); + ASSERT_FALSE(expected_metadata.empty()); + + capturing_fs->Reset(); + Reopen(options); + ASSERT_EQ("ingest-value", Get("ingest-key")); + ASSERT_GT(capturing_fs->GetOpenWithMetadataCount(), 0); + ASSERT_EQ(capturing_fs->GetLastOpenMetadata(), expected_metadata); + +} + TEST_F(DBBasicTest, FileChecksumInFileOptions) { // Verify that file_checksum and file_checksum_func_name from FileMetaData // are propagated through FileOptions when opening SST files. @@ -5868,6 +6164,58 @@ TEST_F(DBBasicTest, FileChecksumInFileOptions) { Close(); } +TEST_F(DBBasicTest, FastSstOpenInFileOptions) { + auto capturing_fs = + std::make_shared(env_->GetFileSystem()); + std::unique_ptr env(new CompositeEnvWrapper(env_, capturing_fs)); + + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.env = env.get(); + options.fast_sst_open = true; + DestroyAndReopen(options); + + ASSERT_OK(Put("key1", "value1")); + ASSERT_OK(Flush()); + + capturing_fs->Reset(); + Reopen(options); + ASSERT_EQ("value1", Get("key1")); + + ASSERT_GT(capturing_fs->GetCaptureCount(), 0); + ASSERT_TRUE(capturing_fs->HasCapturedFileMetadata()); + ASSERT_FALSE(capturing_fs->GetCapturedFileMetadata().empty()); + ASSERT_TRUE(capturing_fs->GetCapturedFileMetadata().find( + "fast-open-metadata:") == 0); + + Close(); +} + +TEST_F(DBBasicTest, FastSstOpenDisabledDoesNotPassFileMetadata) { + auto capturing_fs = + std::make_shared(env_->GetFileSystem()); + std::unique_ptr env(new CompositeEnvWrapper(env_, capturing_fs)); + + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.env = env.get(); + options.fast_sst_open = false; + DestroyAndReopen(options); + + ASSERT_OK(Put("key1", "value1")); + ASSERT_OK(Flush()); + + capturing_fs->Reset(); + Reopen(options); + ASSERT_EQ("value1", Get("key1")); + + ASSERT_GT(capturing_fs->GetCaptureCount(), 0); + ASSERT_FALSE(capturing_fs->HasCapturedFileMetadata()); + ASSERT_TRUE(capturing_fs->GetCapturedFileMetadata().empty()); + + Close(); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 4f152f7b1bcf..9882f7fb9d11 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -6985,13 +6985,19 @@ Status DBImpl::VerifyChecksumInternal(const ReadOptions& read_options, std::string fname = TableFileName(cfd->ioptions().cf_paths, fd.GetNumber(), fd.GetPathId()); if (use_file_checksum) { - s = VerifyFullFileChecksum(fmeta->file_checksum, - fmeta->file_checksum_func_name, fname, - read_options); + s = VerifyFullFileChecksum( + fmeta->file_checksum, fmeta->file_checksum_func_name, fname, + read_options, &fmeta->file_open_metadata); } else { FileOptions fopts = file_options_; fopts.file_checksum = fmeta->file_checksum; fopts.file_checksum_func_name = fmeta->file_checksum_func_name; + std::string file_open_metadata; + if (immutable_db_options_.fast_sst_open && + !fmeta->file_open_metadata.empty()) { + file_open_metadata = fmeta->file_open_metadata; + fopts.file_metadata = &file_open_metadata; + } s = ROCKSDB_NAMESPACE::VerifySstFileChecksumInternal( opts, fopts, read_options, fname, fd.largest_seqno); } @@ -7054,7 +7060,8 @@ Status DBImpl::VerifyChecksumInternal(const ReadOptions& read_options, Status DBImpl::VerifyFullFileChecksum(const std::string& file_checksum_expected, const std::string& func_name_expected, const std::string& fname, - const ReadOptions& read_options) { + const ReadOptions& read_options, + const std::string* file_open_metadata) { Status s; if (file_checksum_expected == kUnknownFileChecksum) { return s; @@ -7064,6 +7071,12 @@ Status DBImpl::VerifyFullFileChecksum(const std::string& file_checksum_expected, FileOptions fopts; fopts.file_checksum = file_checksum_expected; fopts.file_checksum_func_name = func_name_expected; + std::string copied_file_open_metadata; + if (immutable_db_options_.fast_sst_open && file_open_metadata != nullptr && + !file_open_metadata->empty()) { + copied_file_open_metadata = *file_open_metadata; + fopts.file_metadata = &copied_file_open_metadata; + } s = ROCKSDB_NAMESPACE::GenerateOneFileChecksum( fs_.get(), fname, immutable_db_options_.file_checksum_gen_factory.get(), func_name_expected, &file_checksum, &func_name, diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index ad2094c17d2e..443e5bfa9a8f 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -636,7 +636,9 @@ class DBImpl : public DB { Status VerifyFullFileChecksum(const std::string& file_checksum_expected, const std::string& func_name_expected, const std::string& fpath, - const ReadOptions& read_options); + const ReadOptions& read_options, + const std::string* file_open_metadata = + nullptr); using DB::StartTrace; Status StartTrace(const TraceOptions& options, diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index 87474965228a..df00dcd84a09 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -1587,7 +1587,8 @@ Status DBImpl::PerformTrivialMove(Compaction& c, LogBuffer* log_buffer, f->file_checksum, f->file_checksum_func_name, f->unique_id, f->compensated_range_deletion_size, f->tail_size, f->user_defined_timestamps_persisted, - f->min_timestamp, f->max_timestamp); + f->min_timestamp, f->max_timestamp, + f->file_open_metadata); moved_bytes += static_cast(c.input(l, i)->fd.GetFileSize()); ROCKS_LOG_BUFFER( log_buffer, "[%s] Moved #%" PRIu64 " to level-%d %" PRIu64 " bytes\n", @@ -2120,7 +2121,7 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { f->file_checksum, f->file_checksum_func_name, f->unique_id, f->compensated_range_deletion_size, f->tail_size, f->user_defined_timestamps_persisted, f->min_timestamp, - f->max_timestamp); + f->max_timestamp, f->file_open_metadata); } ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] Apply version edit:\n%s", cfd->GetName().c_str(), @@ -4432,6 +4433,17 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, in_file->min_timestamp, in_file->max_timestamp}; + if (immutable_db_options_.fast_sst_open) { + FileOptions metadata_file_options = copied_file_options; + metadata_file_options.file_checksum = out_file_metadata.file_checksum; + metadata_file_options.file_checksum_func_name = + out_file_metadata.file_checksum_func_name; + IOStatus metadata_io_s = MaybeGetFileOpenMetadata( + immutable_db_options_.fs.get(), out_fname, metadata_file_options, + &out_file_metadata.file_open_metadata); + metadata_io_s.PermitUncheckedError(); + } + out_files.push_back(std::move(out_file_metadata)); } diff --git a/db/db_impl/db_impl_experimental.cc b/db/db_impl/db_impl_experimental.cc index bb6a9a2e409c..1829a53c6f9e 100644 --- a/db/db_impl/db_impl_experimental.cc +++ b/db/db_impl/db_impl_experimental.cc @@ -145,7 +145,7 @@ Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) { f->file_checksum_func_name, f->unique_id, f->compensated_range_deletion_size, f->tail_size, f->user_defined_timestamps_persisted, f->min_timestamp, - f->max_timestamp); + f->max_timestamp, f->file_open_metadata); } status = versions_->LogAndApply(cfd, read_options, write_options, &edit, diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index 1d1eb9b5c423..a8f3852559ac 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -660,7 +660,8 @@ Status DBImpl::Recover( f->file_checksum, f->file_checksum_func_name, f->unique_id, f->compensated_range_deletion_size, f->tail_size, f->user_defined_timestamps_persisted, - f->min_timestamp, f->max_timestamp); + f->min_timestamp, f->max_timestamp, + f->file_open_metadata); ROCKS_LOG_WARN(immutable_db_options_.info_log, "[%s] Moving #%" PRIu64 " from from_level-%d to from_level-%d %" PRIu64 @@ -2182,7 +2183,8 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, meta.file_creation_time, meta.epoch_number, meta.file_checksum, meta.file_checksum_func_name, meta.unique_id, meta.compensated_range_deletion_size, - meta.tail_size, meta.user_defined_timestamps_persisted); + meta.tail_size, meta.user_defined_timestamps_persisted, "", + "", meta.file_open_metadata); for (const auto& blob : blob_file_additions) { edit->AddBlobFile(blob); diff --git a/db/experimental.cc b/db/experimental.cc index b6efc1a47534..8fdb3f97b552 100644 --- a/db/experimental.cc +++ b/db/experimental.cc @@ -168,7 +168,8 @@ Status UpdateManifestForFilesState( lf->file_checksum_func_name, lf->unique_id, lf->compensated_range_deletion_size, lf->tail_size, lf->user_defined_timestamps_persisted, - lf->min_timestamp, lf->max_timestamp); + lf->min_timestamp, lf->max_timestamp, + lf->file_open_metadata); } } } else { diff --git a/db/external_sst_file_basic_test.cc b/db/external_sst_file_basic_test.cc index 326b3d567a09..4c2cc4ed9ef9 100644 --- a/db/external_sst_file_basic_test.cc +++ b/db/external_sst_file_basic_test.cc @@ -22,6 +22,89 @@ namespace ROCKSDB_NAMESPACE { +class CloseDBOnExit { + public: + explicit CloseDBOnExit(DBTestBase* test) : test_(test) {} + ~CloseDBOnExit() { test_->Close(); } + + private: + DBTestBase* test_; +}; + +class MetadataAwareRandomAccessFile : public FSRandomAccessFileWrapper { + public: + MetadataAwareRandomAccessFile(std::unique_ptr target, + std::string metadata) + : FSRandomAccessFileWrapper(target.get()), + guard_(std::move(target)), + metadata_(std::move(metadata)) {} + + IOStatus GetFileOpenMetadata(std::string* metadata) override { + *metadata = metadata_; + return IOStatus::OK(); + } + + private: + std::unique_ptr guard_; + std::string metadata_; +}; + +class FileOpenMetadataCapturingFS : public FileSystemWrapper { + public: + explicit FileOpenMetadataCapturingFS(const std::shared_ptr& base) + : FileSystemWrapper(base) {} + + static const char* kClassName() { return "FileOpenMetadataCapturingFS"; } + const char* Name() const override { return kClassName(); } + + IOStatus NewRandomAccessFile(const std::string& fname, + const FileOptions& opts, + std::unique_ptr* result, + IODebugContext* dbg) override { + if (fname.find(".sst") != std::string::npos) { + std::lock_guard lock(mu_); + captured_has_file_metadata_ = opts.file_metadata != nullptr; + captured_file_metadata_ = + opts.file_metadata != nullptr ? *opts.file_metadata : ""; + capture_count_++; + } + IOStatus io_s = target()->NewRandomAccessFile(fname, opts, result, dbg); + if (io_s.ok() && fname.find(".sst") != std::string::npos) { + *result = std::make_unique( + std::move(*result), "fast-open-metadata:" + fname); + } + return io_s; + } + + void Reset() { + std::lock_guard lock(mu_); + captured_has_file_metadata_ = false; + captured_file_metadata_.clear(); + capture_count_ = 0; + } + + bool HasCapturedFileMetadata() { + std::lock_guard lock(mu_); + return captured_has_file_metadata_; + } + + std::string GetCapturedFileMetadata() { + std::lock_guard lock(mu_); + return captured_file_metadata_; + } + + int GetCaptureCount() { + std::lock_guard lock(mu_); + return capture_count_; + } + + private: + std::mutex mu_; + bool captured_has_file_metadata_ = false; + std::string captured_file_metadata_; + int capture_count_ = 0; +}; + class ExternalSSTFileBasicTest : public DBTestBase, public ::testing::WithParamInterface> { @@ -264,6 +347,37 @@ TEST_F(ExternalSSTFileBasicTest, Basic) { DestroyAndRecreateExternalSSTFilesDir(); } +TEST_F(ExternalSSTFileBasicTest, FastSstOpenAfterExternalIngestion) { + auto capturing_fs = + std::make_shared(env_->GetFileSystem()); + std::unique_ptr env(new CompositeEnvWrapper(env_, capturing_fs)); + + Options options = CurrentOptions(); + options.create_if_missing = true; + options.env = env.get(); + options.fast_sst_open = true; + Reopen(options); + CloseDBOnExit close_db(this); + + std::map true_data; + ASSERT_OK(GenerateAndAddExternalFile(options, {1, 2, 3}, kTypeValue, + /*file_id=*/1, + /*write_global_seqno=*/true, + /*verify_checksums_before_ingest=*/false, + &true_data)); + + capturing_fs->Reset(); + Reopen(options); + + ASSERT_EQ(Key(1) + "1", Get(Key(1))); + ASSERT_GT(capturing_fs->GetCaptureCount(), 0); + ASSERT_TRUE(capturing_fs->HasCapturedFileMetadata()); + ASSERT_TRUE(capturing_fs->GetCapturedFileMetadata().find( + "fast-open-metadata:") == 0); + + Close(); +} + TEST_F(ExternalSSTFileBasicTest, AlignedBufferedWrite) { class AlignedWriteFS : public FileSystemWrapper { public: diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index 7a379b9df790..8153b111eb3b 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -676,6 +676,17 @@ Status ExternalSstFileIngestionJob::AssignLevelsForOneBatch( status.ToString().c_str()); return status; } + if (db_options_.fast_sst_open) { + FileOptions metadata_file_options{env_options_}; + metadata_file_options.temperature = file->file_temperature; + metadata_file_options.file_checksum = file->file_checksum; + metadata_file_options.file_checksum_func_name = + file->file_checksum_func_name; + IOStatus metadata_io_s = MaybeGetFileOpenMetadata( + fs_.get(), file->internal_file_path, metadata_file_options, + &file->file_open_metadata); + metadata_io_s.PermitUncheckedError(); + } // We use the import time as the ancester time. This is the time the data // is written to the database. @@ -713,6 +724,7 @@ Status ExternalSstFileIngestionJob::AssignLevelsForOneBatch( : cfd_->NewEpochNumber(), // orders files ingested to L0 file->file_checksum, file->file_checksum_func_name, file->unique_id, 0, tail_size, file->user_defined_timestamps_persisted, "", ""); + f_metadata.file_open_metadata = file->file_open_metadata; f_metadata.temperature = file->file_temperature; f_metadata.marked_for_compaction = marked_for_compaction; // Extract min/max timestamps from table properties for UDT support. diff --git a/db/external_sst_file_ingestion_job.h b/db/external_sst_file_ingestion_job.h index d9ecf43da1b4..4a5f6b4838be 100644 --- a/db/external_sst_file_ingestion_job.h +++ b/db/external_sst_file_ingestion_job.h @@ -168,6 +168,8 @@ struct IngestedFileInfo : public KeyRangeInfo { std::string file_checksum; // The name of checksum function that generate the checksum std::string file_checksum_func_name; + // Opaque filesystem metadata that can accelerate future opens. + std::string file_open_metadata; // The temperature of the file to be ingested Temperature file_temperature = Temperature::kUnknown; // Unique id of the file to be ingested diff --git a/db/flush_job.cc b/db/flush_job.cc index 65ab08a2ecc8..dbedfced298d 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -1122,7 +1122,8 @@ Status FlushJob::WriteLevel0Table() { meta_.file_checksum, meta_.file_checksum_func_name, meta_.unique_id, meta_.compensated_range_deletion_size, meta_.tail_size, meta_.user_defined_timestamps_persisted, - meta_.min_timestamp, meta_.max_timestamp); + meta_.min_timestamp, meta_.max_timestamp, + meta_.file_open_metadata); edit_->SetBlobFileAdditions(std::move(blob_file_additions)); for (auto& addition : external_blob_file_additions_) { diff --git a/db/import_column_family_job.cc b/db/import_column_family_job.cc index 3033f1cf41e2..6a77c455cd94 100644 --- a/db/import_column_family_job.cc +++ b/db/import_column_family_job.cc @@ -198,9 +198,21 @@ Status ImportColumnFamilyJob::Run() { cfd_->GetLatestMutableCFOptions().bottommost_file_compaction_delay, cfd_->current()->version_set()->offpeak_time_option()); for (size_t j = 0; s.ok() && j < files_to_import_[i].size(); ++j) { - const auto& f = files_to_import_[i][j]; + auto& f = files_to_import_[i][j]; const auto& file_metadata = *metadatas_[i][j]; + if (db_options_.fast_sst_open) { + FileOptions metadata_file_options{env_options_}; + metadata_file_options.temperature = file_metadata.temperature; + metadata_file_options.file_checksum = file_metadata.file_checksum; + metadata_file_options.file_checksum_func_name = + file_metadata.file_checksum_func_name; + IOStatus metadata_io_s = MaybeGetFileOpenMetadata( + fs_.get(), f.internal_file_path, metadata_file_options, + &f.file_open_metadata); + metadata_io_s.PermitUncheckedError(); + } + uint64_t tail_size = FileMetaData::CalculateTailSize(f.fd.GetFileSize(), f.table_properties); @@ -214,7 +226,10 @@ Status ImportColumnFamilyJob::Run() { kUnknownFileChecksum, kUnknownFileChecksumFuncName, f.unique_id, 0, tail_size, static_cast( - f.table_properties.user_defined_timestamps_persisted)); + f.table_properties.user_defined_timestamps_persisted), + "", + "", + f.file_open_metadata); s = dummy_version_builder.Apply(&dummy_version_edit); } if (s.ok()) { diff --git a/db/import_column_family_test.cc b/db/import_column_family_test.cc index 0a6f9d6a3905..a6124ce88d79 100644 --- a/db/import_column_family_test.cc +++ b/db/import_column_family_test.cc @@ -15,6 +15,138 @@ namespace ROCKSDB_NAMESPACE { +class CloseDBOnExit { + public: + explicit CloseDBOnExit(DBTestBase* test) : test_(test) {} + ~CloseDBOnExit() { test_->Close(); } + + private: + DBTestBase* test_; +}; + +class ImportMetadataAwareRandomAccessFile : public FSRandomAccessFileWrapper { + public: + ImportMetadataAwareRandomAccessFile( + std::unique_ptr target, std::string metadata) + : FSRandomAccessFileWrapper(target.get()), + guard_(std::move(target)), + metadata_(std::move(metadata)) {} + + IOStatus GetFileOpenMetadata(std::string* metadata) override { + *metadata = metadata_; + return IOStatus::OK(); + } + + private: + std::unique_ptr guard_; + std::string metadata_; +}; + +class ImportFileOpenMetadataCapturingFS : public FileSystemWrapper { + public: + explicit ImportFileOpenMetadataCapturingFS( + const std::shared_ptr& base) + : FileSystemWrapper(base) {} + + static const char* kClassName() { + return "ImportFileOpenMetadataCapturingFS"; + } + const char* Name() const override { return kClassName(); } + + IOStatus NewRandomAccessFile(const std::string& fname, + const FileOptions& opts, + std::unique_ptr* result, + IODebugContext* dbg) override { + if (fname.find(".sst") != std::string::npos) { + std::lock_guard lock(mu_); + last_open_metadata_ = + opts.file_metadata != nullptr ? *opts.file_metadata : ""; + if (!last_open_metadata_.empty()) { + ++open_with_metadata_count_; + } + } + + std::unique_ptr file; + IOStatus s = target()->NewRandomAccessFile(fname, opts, &file, dbg); + if (!s.ok() || fname.find(".sst") == std::string::npos) { + *result = std::move(file); + return s; + } + + result->reset( + new CapturingRandomAccessFile(std::move(file), MetadataForFile(fname), + this)); + return s; + } + + std::string GetLastGeneratedMetadata() { + std::lock_guard lock(mu_); + return last_generated_metadata_; + } + + std::string GetLastOpenMetadata() { + std::lock_guard lock(mu_); + return last_open_metadata_; + } + + int GetMetadataRequestCount() { + std::lock_guard lock(mu_); + return metadata_request_count_; + } + + int GetOpenWithMetadataCount() { + std::lock_guard lock(mu_); + return open_with_metadata_count_; + } + + void Reset() { + std::lock_guard lock(mu_); + last_generated_metadata_.clear(); + last_open_metadata_.clear(); + metadata_request_count_ = 0; + open_with_metadata_count_ = 0; + } + + private: + class CapturingRandomAccessFile : public FSRandomAccessFileWrapper { + public: + CapturingRandomAccessFile(std::unique_ptr&& target, + std::string metadata, + ImportFileOpenMetadataCapturingFS* owner) + : FSRandomAccessFileWrapper(target.get()), + guard_(std::move(target)), + metadata_(std::move(metadata)), + owner_(owner) {} + + IOStatus GetFileOpenMetadata(std::string* metadata) override { + owner_->RecordMetadataRequest(metadata_); + *metadata = metadata_; + return IOStatus::OK(); + } + + private: + std::unique_ptr guard_; + std::string metadata_; + ImportFileOpenMetadataCapturingFS* owner_; + }; + + static std::string MetadataForFile(const std::string& fname) { + return "file-open-metadata:" + fname; + } + + void RecordMetadataRequest(const std::string& metadata) { + std::lock_guard lock(mu_); + last_generated_metadata_ = metadata; + ++metadata_request_count_; + } + + std::mutex mu_; + std::string last_generated_metadata_; + std::string last_open_metadata_; + int metadata_request_count_ = 0; + int open_with_metadata_count_ = 0; +}; + class ImportColumnFamilyTest : public DBTestBase { public: ImportColumnFamilyTest() @@ -452,6 +584,52 @@ TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherCF) { } } +TEST_F(ImportColumnFamilyTest, FastSstOpenAfterImportColumnFamily) { + auto capturing_fs = std::make_shared( + env_->GetFileSystem()); + std::unique_ptr env(new CompositeEnvWrapper(env_, capturing_fs)); + + Options options = CurrentOptions(); + options.env = env.get(); + options.fast_sst_open = true; + CreateAndReopenWithCF({"koko"}, options); + CloseDBOnExit close_db(this); + + for (int i = 0; i < 20; ++i) { + ASSERT_OK(Put(1, Key(i), Key(i) + "_value")); + } + ASSERT_OK(Flush(1)); + + Checkpoint* checkpoint = nullptr; + ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint)); + ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_, + &metadata_ptr_)); + ASSERT_NE(metadata_ptr_, nullptr); + delete checkpoint; + + ASSERT_OK(db_->CreateColumnFamilyWithImport(options, "toto", + ImportColumnFamilyOptions(), + *metadata_ptr_, &import_cfh_)); + ASSERT_NE(import_cfh_, nullptr); + ASSERT_GT(capturing_fs->GetMetadataRequestCount(), 0); + std::string expected_metadata = capturing_fs->GetLastGeneratedMetadata(); + ASSERT_FALSE(expected_metadata.empty()); + + handles_.push_back(import_cfh_); + import_cfh_ = nullptr; + delete metadata_ptr_; + metadata_ptr_ = nullptr; + + capturing_fs->Reset(); + ReopenWithColumnFamilies({"default", "koko", "toto"}, options); + + ASSERT_EQ(Key(0) + "_value", Get(2, Key(0))); + ASSERT_GT(capturing_fs->GetOpenWithMetadataCount(), 0); + ASSERT_EQ(expected_metadata, capturing_fs->GetLastOpenMetadata()); + + Close(); +} + TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherDB) { Options options = CurrentOptions(); CreateAndReopenWithCF({"koko"}, options); diff --git a/db/repair.cc b/db/repair.cc index 941d69dedc11..ffb5655f8523 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -714,7 +714,7 @@ class Repairer { meta.file_checksum_func_name, meta.unique_id, meta.compensated_range_deletion_size, meta.tail_size, meta.user_defined_timestamps_persisted, meta.min_timestamp, - meta.max_timestamp); + meta.max_timestamp, meta.file_open_metadata); } s = dummy_version_builder.Apply(&dummy_edit); if (s.ok()) { diff --git a/db/table_cache.cc b/db/table_cache.cc index d864444b9f34..acb0acd0e183 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -60,6 +60,19 @@ void AppendVarint64(IterKey* key, uint64_t v) { key->TrimAppend(key->Size(), buf, ptr - buf); } +void MaybeCaptureFileOpenMetadata(bool fast_sst_open, + FSRandomAccessFile* file, + std::string* file_open_metadata) { + if (!fast_sst_open || file == nullptr || file_open_metadata == nullptr) { + return; + } + file_open_metadata->clear(); + IOStatus io_s = file->GetFileOpenMetadata(file_open_metadata); + if (!io_s.ok()) { + file_open_metadata->clear(); + } +} + } // anonymous namespace const int kLoadConcurency = 128; @@ -95,7 +108,8 @@ Status TableCache::GetTableReader( HistogramImpl* file_read_hist, std::unique_ptr* table_reader, const MutableCFOptions& mutable_cf_options, bool skip_filters, int level, bool prefetch_index_and_filter_in_cache, - size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) { + size_t max_file_size_for_l0_meta_pin, Temperature file_temperature, + std::string* file_open_metadata) { std::string fname = TableFileName( ioptions_.cf_paths, file_meta.fd.GetNumber(), file_meta.fd.GetPathId()); std::unique_ptr file; @@ -103,6 +117,11 @@ Status TableCache::GetTableReader( fopts.temperature = file_temperature; fopts.file_checksum = file_meta.file_checksum; fopts.file_checksum_func_name = file_meta.file_checksum_func_name; + std::string attached_file_open_metadata; + if (ioptions_.fast_sst_open && !file_meta.file_open_metadata.empty()) { + attached_file_open_metadata = file_meta.file_open_metadata; + fopts.file_metadata = &attached_file_open_metadata; + } Status s = PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options); TEST_SYNC_POINT_CALLBACK("TableCache::GetTableReader:BeforeOpenFile", const_cast(&s)); @@ -127,6 +146,8 @@ Status TableCache::GetTableReader( } if (s.ok()) { + MaybeCaptureFileOpenMetadata(ioptions_.fast_sst_open, file.get(), + file_open_metadata); if (!sequential_mode && ioptions_.advise_random_on_open) { file->Hint(FSRandomAccessFile::kRandom); } @@ -181,7 +202,7 @@ Status TableCache::FindTable( const bool no_io, HistogramImpl* file_read_hist, bool skip_filters, int level, bool prefetch_index_and_filter_in_cache, size_t max_file_size_for_l0_meta_pin, Temperature file_temperature, - bool pin_table_handle) { + bool pin_table_handle, std::string* file_open_metadata) { assert(out_table_reader != nullptr && *out_table_reader == nullptr); assert(handle != nullptr && *handle == nullptr); PERF_TIMER_GUARD_WITH_CLOCK(find_table_nanos, ioptions_.clock); @@ -226,7 +247,8 @@ Status TableCache::FindTable( false /* sequential mode */, file_read_hist, &table_reader, mutable_cf_options, skip_filters, level, prefetch_index_and_filter_in_cache, - max_file_size_for_l0_meta_pin, file_temperature); + max_file_size_for_l0_meta_pin, file_temperature, + file_open_metadata); if (!s.ok()) { assert(table_reader == nullptr); RecordTick(ioptions_.stats, NO_FILE_ERRORS); @@ -280,7 +302,7 @@ InternalIterator* TableCache::NewIterator( const InternalKey* largest_compaction_key, bool allow_unprepared_value, const SequenceNumber* read_seqno, std::unique_ptr* range_del_iter, - bool maybe_pin_table_handle) { + bool maybe_pin_table_handle, std::string* file_open_metadata) { PERF_TIMER_GUARD(new_table_iterator_nanos); Status s; @@ -298,7 +320,8 @@ InternalIterator* TableCache::NewIterator( file_read_hist, skip_filters, level, true /* prefetch_index_and_filter_in_cache */, max_file_size_for_l0_meta_pin, file_meta.temperature, - maybe_pin_table_handle && should_pin_table_handles_); + maybe_pin_table_handle && should_pin_table_handles_, + file_open_metadata); InternalIterator* result = nullptr; if (s.ok()) { if (options.table_filter && diff --git a/db/table_cache.h b/db/table_cache.h index 4f1376686658..27094a5a485b 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -100,7 +100,8 @@ class TableCache { const InternalKey* largest_compaction_key, bool allow_unprepared_value, const SequenceNumber* range_del_read_seqno = nullptr, std::unique_ptr* range_del_iter = nullptr, - bool maybe_pin_table_handle = false); + bool maybe_pin_table_handle = false, + std::string* file_open_metadata = nullptr); // If a seek to internal key "k" in specified file finds an entry, // call get_context->SaveValue() repeatedly until @@ -197,7 +198,8 @@ class TableCache { bool prefetch_index_and_filter_in_cache = true, size_t max_file_size_for_l0_meta_pin = 0, Temperature file_temperature = Temperature::kUnknown, - bool pin_table_handle = false); + bool pin_table_handle = false, + std::string* file_open_metadata = nullptr); // Get the table properties of a given table. // @no_io: indicates if we should load table to the cache if it is not present @@ -277,7 +279,8 @@ class TableCache { bool skip_filters = false, int level = -1, bool prefetch_index_and_filter_in_cache = true, size_t max_file_size_for_l0_meta_pin = 0, - Temperature file_temperature = Temperature::kUnknown); + Temperature file_temperature = Temperature::kUnknown, + std::string* file_open_metadata = nullptr); // Update the max_covering_tombstone_seq in the GetContext for each key based // on the range deletions in the table diff --git a/db/version_edit.cc b/db/version_edit.cc index ea0a9b62be30..490a13531aa7 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -295,6 +295,10 @@ void VersionEdit::EncodeToNewFile4(const FileMetaData& f, int level, PutVarint32(dst, NewFileCustomTag::kFileChecksumFuncName); PutLengthPrefixedSlice(dst, Slice(f.file_checksum_func_name)); } + if (!f.file_open_metadata.empty()) { + PutVarint32(dst, NewFileCustomTag::kFileOpenMetadata); + PutLengthPrefixedSlice(dst, Slice(f.file_open_metadata)); + } if (f.fd.GetPathId() != 0) { PutVarint32(dst, NewFileCustomTag::kPathId); @@ -449,6 +453,9 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input, int& max_level, case kFileChecksumFuncName: f.file_checksum_func_name = field.ToString(); break; + case kFileOpenMetadata: + f.file_open_metadata = field.ToString(); + break; case kNeedCompaction: if (field.size() != 1) { return "need_compaction field wrong size"; @@ -980,6 +987,10 @@ std::string VersionEdit::DebugString(bool hex_key) const { r.append(Slice(f.file_checksum).ToString(true)); r.append(" file_checksum_func_name: "); r.append(f.file_checksum_func_name); + if (!f.file_open_metadata.empty()) { + r.append(" file_open_metadata: "); + r.append(Slice(f.file_open_metadata).ToString(true)); + } if (f.temperature != Temperature::kUnknown) { r.append(" temperature: "); // Maybe change to human readable format whenthe feature becomes @@ -1106,6 +1117,9 @@ std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const { jw << "EpochNumber" << f.epoch_number; jw << "FileChecksum" << Slice(f.file_checksum).ToString(true); jw << "FileChecksumFuncName" << f.file_checksum_func_name; + if (!f.file_open_metadata.empty()) { + jw << "FileOpenMetadata" << Slice(f.file_open_metadata).ToString(true); + } if (f.temperature != Temperature::kUnknown) { jw << "temperature" << std::to_string(static_cast(f.temperature)); } diff --git a/db/version_edit.h b/db/version_edit.h index ffd6012e8e2f..3b84a00a322a 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -112,6 +112,7 @@ enum NewFileCustomTag : uint32_t { kCompensatedRangeDeletionSize = 14, kTailSize = 15, kUserDefinedTimestampsPersisted = 16, + kFileOpenMetadata = 17, // If this bit for the custom tag is set, opening DB should fail if // we don't know this field. @@ -310,6 +311,10 @@ struct FileMetaData { // File checksum function name std::string file_checksum_func_name = kUnknownFileChecksumFuncName; + // Opaque file-system-provided metadata that can accelerate reopening the + // file later. + std::string file_open_metadata; + // SST unique id UniqueId64x2 unique_id{}; @@ -344,7 +349,8 @@ struct FileMetaData { const uint64_t _compensated_range_deletion_size, uint64_t _tail_size, bool _user_defined_timestamps_persisted, const std::string& _min_timestamp, - const std::string& _max_timestamp) + const std::string& _max_timestamp, + const std::string& _file_open_metadata = "") : fd(file, file_path_id, file_size, smallest_seq, largest_seq), smallest(smallest_key), largest(largest_key), @@ -357,6 +363,7 @@ struct FileMetaData { epoch_number(_epoch_number), file_checksum(_file_checksum), file_checksum_func_name(_file_checksum_func_name), + file_open_metadata(_file_open_metadata), unique_id(std::move(_unique_id)), tail_size(_tail_size), user_defined_timestamps_persisted(_user_defined_timestamps_persisted), @@ -445,8 +452,8 @@ struct FileMetaData { usage += sizeof(*this); #endif // ROCKSDB_MALLOC_USABLE_SIZE usage += smallest.size() + largest.size() + file_checksum.size() + - file_checksum_func_name.size() + min_timestamp.size() + - max_timestamp.size(); + file_checksum_func_name.size() + file_open_metadata.size() + + min_timestamp.size() + max_timestamp.size(); return usage; } @@ -799,7 +806,8 @@ class VersionEdit { const uint64_t compensated_range_deletion_size, uint64_t tail_size, bool user_defined_timestamps_persisted, const std::string& min_timestamp = "", - const std::string& max_timestamp = "") { + const std::string& max_timestamp = "", + const std::string& file_open_metadata = "") { assert(smallest_seqno <= largest_seqno); new_files_.emplace_back( level, @@ -809,7 +817,8 @@ class VersionEdit { oldest_blob_file_number, oldest_ancester_time, file_creation_time, epoch_number, file_checksum, file_checksum_func_name, unique_id, compensated_range_deletion_size, tail_size, - user_defined_timestamps_persisted, min_timestamp, max_timestamp)); + user_defined_timestamps_persisted, min_timestamp, max_timestamp, + file_open_metadata)); files_to_quarantine_.push_back(file); if (!HasLastSequence() || largest_seqno > GetLastSequence()) { SetLastSequence(largest_seqno); diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc index d5f6beee93cc..41a54d80bb8c 100644 --- a/db/version_edit_test.cc +++ b/db/version_edit_test.cc @@ -130,6 +130,27 @@ TEST_F(VersionEditTest, EncodeDecodeNewFile4) { ASSERT_FALSE(parsed.GetPersistUserDefinedTimestamps()); } +TEST_F(VersionEditTest, EncodeDecodeFileOpenMetadata) { + FileMetaData file; + file.fd = FileDescriptor(300, 0, 100, 500, 600); + file.smallest = InternalKey("foo", 500, kTypeValue); + file.largest = InternalKey("zoo", 600, kTypeDeletion); + file.epoch_number = 300; + file.file_open_metadata = "opaque-open-metadata"; + + VersionEdit edit; + edit.AddFile(3, file); + + std::string encoded; + ASSERT_TRUE(edit.EncodeTo(&encoded, 0 /* ts_sz */)); + + VersionEdit parsed; + ASSERT_OK(parsed.DecodeFrom(encoded)); + ASSERT_EQ(parsed.GetNewFiles().size(), 1U); + ASSERT_EQ(parsed.GetNewFiles()[0].second.file_open_metadata, + file.file_open_metadata); +} + TEST_F(VersionEditTest, EncodeDecodeNewFile4HandleFileBoundary) { static const uint64_t kBig = 1ull << 50; size_t ts_sz = 16; @@ -815,6 +836,7 @@ class SubcompactionProgressTest : public VersionEditTest { file.epoch_number = kTestEpochNumber; file.file_checksum = "checksum_" + std::to_string(file_number); file.file_checksum_func_name = kTestChecksumFuncName; + file.file_open_metadata = "open_metadata_" + std::to_string(file_number); file.marked_for_compaction = false; file.temperature = Temperature::kUnknown; return file; @@ -883,6 +905,7 @@ class SubcompactionProgressTest : public VersionEditTest { ASSERT_EQ(actual.epoch_number, expected.epoch_number); ASSERT_EQ(actual.file_checksum, expected.file_checksum); ASSERT_EQ(actual.file_checksum_func_name, expected.file_checksum_func_name); + ASSERT_EQ(actual.file_open_metadata, expected.file_open_metadata); ASSERT_EQ(actual.marked_for_compaction, expected.marked_for_compaction); ASSERT_EQ(actual.temperature, expected.temperature); } diff --git a/db/version_set.cc b/db/version_set.cc index faf06868e4fd..06faa0c7e38f 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1845,6 +1845,11 @@ Status Version::GetTableProperties(const ReadOptions& read_options, FileOptions fopts = file_options_; fopts.file_checksum = file_meta->file_checksum; fopts.file_checksum_func_name = file_meta->file_checksum_func_name; + std::string file_open_metadata; + if (ioptions.fast_sst_open && !file_meta->file_open_metadata.empty()) { + file_open_metadata = file_meta->file_open_metadata; + fopts.file_metadata = &file_open_metadata; + } s = ioptions.fs->NewRandomAccessFile(file_name, fopts, &file, nullptr); if (!s.ok()) { return s; @@ -7337,7 +7342,7 @@ Status VersionSet::WriteCurrentStateToManifest( f->file_checksum_func_name, f->unique_id, f->compensated_range_deletion_size, f->tail_size, f->user_defined_timestamps_persisted, f->min_timestamp, - f->max_timestamp); + f->max_timestamp, f->file_open_metadata); } } diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h index 6395ecee2d62..3576c898c56f 100644 --- a/db_stress_tool/db_stress_common.h +++ b/db_stress_tool/db_stress_common.h @@ -204,6 +204,7 @@ DECLARE_bool(sync); DECLARE_bool(use_fsync); DECLARE_uint64(stats_dump_period_sec); DECLARE_uint64(max_compaction_trigger_wakeup_seconds); +DECLARE_bool(fast_sst_open); DECLARE_uint64(bytes_per_sync); DECLARE_uint64(wal_bytes_per_sync); DECLARE_int32(kill_random_test); diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc index 881d223bc4ca..1d5a2c40d0ef 100644 --- a/db_stress_tool/db_stress_gflags.cc +++ b/db_stress_tool/db_stress_gflags.cc @@ -1371,6 +1371,9 @@ DEFINE_uint64( ROCKSDB_NAMESPACE::Options().max_compaction_trigger_wakeup_seconds, "Sets DB option max_compaction_trigger_wakeup_seconds."); +DEFINE_bool(fast_sst_open, ROCKSDB_NAMESPACE::Options().fast_sst_open, + "Sets DB option fast_sst_open."); + DEFINE_bool(verification_only, false, "If true, tests will only execute verification step"); extern "C" bool RocksDbIOUringEnable() { return true; } diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index dbfd4e0d2f40..3022e47979c6 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -4647,6 +4647,7 @@ void InitializeOptionsFromFlags( static_cast(FLAGS_stats_dump_period_sec); options.max_compaction_trigger_wakeup_seconds = FLAGS_max_compaction_trigger_wakeup_seconds; + options.fast_sst_open = FLAGS_fast_sst_open; options.ttl = FLAGS_compaction_ttl; options.enable_pipelined_write = FLAGS_enable_pipelined_write; options.enable_write_thread_adaptive_yield = diff --git a/file/file_util.cc b/file/file_util.cc index c44d799b8ce4..a754f47a8ad2 100644 --- a/file/file_util.cc +++ b/file/file_util.cc @@ -285,6 +285,27 @@ IOStatus GenerateOneFileChecksum( return IOStatus::OK(); } +IOStatus MaybeGetFileOpenMetadata(FileSystem* fs, const std::string& file_path, + const FileOptions& file_options, + std::string* file_open_metadata) { + assert(file_open_metadata != nullptr); + file_open_metadata->clear(); + + std::unique_ptr file; + FileOptions fopts = file_options; + fopts.file_metadata = nullptr; + IOStatus io_s = fs->NewRandomAccessFile(file_path, fopts, &file, nullptr); + if (!io_s.ok()) { + return io_s; + } + io_s = file->GetFileOpenMetadata(file_open_metadata); + if (io_s.IsNotSupported()) { + file_open_metadata->clear(); + return IOStatus::OK(); + } + return io_s; +} + Status DestroyDir(Env* env, const std::string& dir) { Status s; if (env->FileExists(dir).IsNotFound()) { diff --git a/file/file_util.h b/file/file_util.h index f460a30caa9b..ad1642bf1134 100644 --- a/file/file_util.h +++ b/file/file_util.h @@ -86,6 +86,10 @@ IOStatus GenerateOneFileChecksum( const ReadOptions& read_options, Statistics* stats, SystemClock* clock, const FileOptions& file_options); +IOStatus MaybeGetFileOpenMetadata(FileSystem* fs, const std::string& file_path, + const FileOptions& file_options, + std::string* file_open_metadata); + inline IOStatus PrepareIOFromReadOptions(const ReadOptions& ro, SystemClock* clock, IOOptions& opts, IODebugContext* dbg = nullptr) { diff --git a/include/rocksdb/file_system.h b/include/rocksdb/file_system.h index 1a169290dbce..a7374a60f0b4 100644 --- a/include/rocksdb/file_system.h +++ b/include/rocksdb/file_system.h @@ -214,6 +214,13 @@ struct FileOptions : EnvOptions { // that is forbidden for checking/auditing purposes. std::string file_checksum_func_name; + // This is used to pass file metadata that can be used by the file system + // to accelerate file opening. The content is opaque to RocksDB and is + // left to the file system to interpret. This is especially useful in the + // case of remote file systems to avoid expensive RPCs to retrieve the + // metadata. + std::string* file_metadata = nullptr; + FileOptions() : EnvOptions(), handoff_checksum_type(ChecksumType::kCRC32c) {} FileOptions(const DBOptions& opts) @@ -231,7 +238,8 @@ struct FileOptions : EnvOptions { handoff_checksum_type(opts.handoff_checksum_type), write_hint(opts.write_hint), file_checksum(opts.file_checksum), - file_checksum_func_name(opts.file_checksum_func_name) {} + file_checksum_func_name(opts.file_checksum_func_name), + file_metadata(opts.file_metadata) {} FileOptions& operator=(const FileOptions&) = default; }; @@ -1070,6 +1078,15 @@ class FSRandomAccessFile { return IOStatus::NotSupported("GetFileSize Not Supported"); } + // Returns metadata for the file that can be passed back later to the file + // system when reopening this file. This is optional. The implementation + // can return NotSupported. The metadata, if returned, is not mandatory + // for the file system to use when reopening. It can be ignored and the + // only downside is slower file open time. + virtual IOStatus GetFileOpenMetadata(std::string* /*metadata*/) { + return IOStatus::NotSupported("GetFileOpenMetadata not supported"); + } + // If you're adding methods here, remember to add them to // RandomAccessFileWrapper too. }; @@ -1812,6 +1829,10 @@ class FSRandomAccessFileWrapper : public FSRandomAccessFile { return target_->GetFileSize(result); } + virtual IOStatus GetFileOpenMetadata(std::string* metadata) override { + return target_->GetFileOpenMetadata(metadata); + } + private: std::unique_ptr guard_; FSRandomAccessFile* target_; diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 758b5628c2c4..f7bd052e766b 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1781,6 +1781,11 @@ struct DBOptions { // Default: Enabled in kCompactionStyleLevel mode. CompactionStyleSet calculate_sst_write_lifetime_hint_set = { CompactionStyle::kCompactionStyleLevel}; + + // When true, save file system metadata (if supported by the FS) for SST + // files added to the DB in the MANIFEST, and use it to accelerate + // reopening those files later. + bool fast_sst_open = false; // End EXPERIMENTAL }; diff --git a/options/db_options.cc b/options/db_options.cc index d6dd73022e21..37b20a9643ec 100644 --- a/options/db_options.cc +++ b/options/db_options.cc @@ -615,6 +615,10 @@ static std::unordered_map {offsetof(struct ImmutableDBOptions, wal_write_temperature), OptionType::kTemperature, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"fast_sst_open", + {offsetof(struct ImmutableDBOptions, fast_sst_open), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, }; const std::string OptionsHelper::kDBOptionsName = "DBOptions"; @@ -821,7 +825,8 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options) metadata_write_temperature(options.metadata_write_temperature), wal_write_temperature(options.wal_write_temperature), calculate_sst_write_lifetime_hint_set( - options.calculate_sst_write_lifetime_hint_set) { + options.calculate_sst_write_lifetime_hint_set), + fast_sst_open(options.fast_sst_open) { fs = env->GetFileSystem(); clock = env->GetSystemClock().get(); logger = info_log.get(); @@ -1006,6 +1011,8 @@ void ImmutableDBOptions::Dump(Logger* log) const { temperature_to_string[metadata_write_temperature].c_str()); ROCKS_LOG_HEADER(log, " Options.wal_write_temperature: %s", temperature_to_string[wal_write_temperature].c_str()); + ROCKS_LOG_HEADER(log, " Options.fast_sst_open: %d", + fast_sst_open); } bool ImmutableDBOptions::IsWalDirSameAsDBPath() const { diff --git a/options/db_options.h b/options/db_options.h index d1ea989cebfb..bc83679e9e98 100644 --- a/options/db_options.h +++ b/options/db_options.h @@ -106,6 +106,7 @@ struct ImmutableDBOptions { Temperature metadata_write_temperature; Temperature wal_write_temperature; CompactionStyleSet calculate_sst_write_lifetime_hint_set; + bool fast_sst_open; // Beginning convenience/helper objects that are not part of the base // DBOptions diff --git a/options/options_helper.cc b/options/options_helper.cc index 2223712f8d62..6d8df166cde4 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -206,6 +206,7 @@ void BuildDBOptions(const ImmutableDBOptions& immutable_db_options, options.compaction_service = immutable_db_options.compaction_service; options.calculate_sst_write_lifetime_hint_set = immutable_db_options.calculate_sst_write_lifetime_hint_set; + options.fast_sst_open = immutable_db_options.fast_sst_open; } ColumnFamilyOptions BuildColumnFamilyOptions( diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index ff24369ad517..08420e4bca84 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -486,6 +486,7 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { "follower_catchup_retry_wait_ms=789;" "metadata_write_temperature=kCold;" "wal_write_temperature=kHot;" + "fast_sst_open=true;" "background_close_inactive_wals=true;" "write_dbid_to_manifest=true;" "write_identity_file=true;" diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 50645f2efa1a..d3e2b5c54398 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -198,6 +198,7 @@ def apply_random_seed_per_iteration(): "nooverwritepercent": 1, "open_files": lambda: random.choice([-1, -1, 100, 500000]), "open_files_async": lambda: random.choice([0, 1]), + "fast_sst_open": lambda: random.choice([0, 1]), "optimize_filters_for_memory": lambda: random.randint(0, 1), "partition_filters": lambda: random.randint(0, 1), "partition_pinning": lambda: random.randint(0, 3), @@ -273,6 +274,7 @@ def apply_random_seed_per_iteration(): "use_multi_get_entity": lambda: random.choice([0] * 7 + [1]), "periodic_compaction_seconds": lambda: random.choice([0, 0, 1, 2, 10, 100, 1000]), "max_compaction_trigger_wakeup_seconds": lambda: random.choice([43200, 600, 30]), + "fast_sst_open": lambda: random.choice([0, 1]), "read_triggered_compaction_threshold": lambda: random.choice([0.0, 0.001, 0.01]), "daily_offpeak_time_utc": lambda: random.choice( ["", "", "00:00-23:59", "04:00-08:00", "23:30-03:15"]