builder.cc 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263
  1. // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. //
  6. // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
  7. // Use of this source code is governed by a BSD-style license that can be
  8. // found in the LICENSE file. See the AUTHORS file for names of contributors.
  9. #include "db/builder.h"
  10. #include <algorithm>
  11. #include <deque>
  12. #include <vector>
  13. #include "db/compaction/compaction_iterator.h"
  14. #include "db/dbformat.h"
  15. #include "db/event_helpers.h"
  16. #include "db/internal_stats.h"
  17. #include "db/merge_helper.h"
  18. #include "db/range_del_aggregator.h"
  19. #include "db/table_cache.h"
  20. #include "db/version_edit.h"
  21. #include "file/filename.h"
  22. #include "file/read_write_util.h"
  23. #include "file/writable_file_writer.h"
  24. #include "monitoring/iostats_context_imp.h"
  25. #include "monitoring/thread_status_util.h"
  26. #include "rocksdb/db.h"
  27. #include "rocksdb/env.h"
  28. #include "rocksdb/iterator.h"
  29. #include "rocksdb/options.h"
  30. #include "rocksdb/table.h"
  31. #include "table/block_based/block_based_table_builder.h"
  32. #include "table/format.h"
  33. #include "table/internal_iterator.h"
  34. #include "test_util/sync_point.h"
  35. #include "util/stop_watch.h"
  36. namespace ROCKSDB_NAMESPACE {
  37. class TableFactory;
  38. TableBuilder* NewTableBuilder(
  39. const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions,
  40. const InternalKeyComparator& internal_comparator,
  41. const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
  42. int_tbl_prop_collector_factories,
  43. uint32_t column_family_id, const std::string& column_family_name,
  44. WritableFileWriter* file, const CompressionType compression_type,
  45. uint64_t sample_for_compression, const CompressionOptions& compression_opts,
  46. int level, const bool skip_filters, const uint64_t creation_time,
  47. const uint64_t oldest_key_time, const uint64_t target_file_size,
  48. const uint64_t file_creation_time) {
  49. assert((column_family_id ==
  50. TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) ==
  51. column_family_name.empty());
  52. return ioptions.table_factory->NewTableBuilder(
  53. TableBuilderOptions(ioptions, moptions, internal_comparator,
  54. int_tbl_prop_collector_factories, compression_type,
  55. sample_for_compression, compression_opts,
  56. skip_filters, column_family_name, level,
  57. creation_time, oldest_key_time, target_file_size,
  58. file_creation_time),
  59. column_family_id, file);
  60. }
  61. Status BuildTable(
  62. const std::string& dbname, Env* env, FileSystem* fs,
  63. const ImmutableCFOptions& ioptions,
  64. const MutableCFOptions& mutable_cf_options, const FileOptions& file_options,
  65. TableCache* table_cache, InternalIterator* iter,
  66. std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
  67. range_del_iters,
  68. FileMetaData* meta, const InternalKeyComparator& internal_comparator,
  69. const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
  70. int_tbl_prop_collector_factories,
  71. uint32_t column_family_id, const std::string& column_family_name,
  72. std::vector<SequenceNumber> snapshots,
  73. SequenceNumber earliest_write_conflict_snapshot,
  74. SnapshotChecker* snapshot_checker, const CompressionType compression,
  75. uint64_t sample_for_compression, const CompressionOptions& compression_opts,
  76. bool paranoid_file_checks, InternalStats* internal_stats,
  77. TableFileCreationReason reason, EventLogger* event_logger, int job_id,
  78. const Env::IOPriority io_priority, TableProperties* table_properties,
  79. int level, const uint64_t creation_time, const uint64_t oldest_key_time,
  80. Env::WriteLifeTimeHint write_hint, const uint64_t file_creation_time) {
  81. assert((column_family_id ==
  82. TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) ==
  83. column_family_name.empty());
  84. // Reports the IOStats for flush for every following bytes.
  85. const size_t kReportFlushIOStatsEvery = 1048576;
  86. Status s;
  87. meta->fd.file_size = 0;
  88. iter->SeekToFirst();
  89. std::unique_ptr<CompactionRangeDelAggregator> range_del_agg(
  90. new CompactionRangeDelAggregator(&internal_comparator, snapshots));
  91. for (auto& range_del_iter : range_del_iters) {
  92. range_del_agg->AddTombstones(std::move(range_del_iter));
  93. }
  94. std::string fname = TableFileName(ioptions.cf_paths, meta->fd.GetNumber(),
  95. meta->fd.GetPathId());
  96. #ifndef ROCKSDB_LITE
  97. EventHelpers::NotifyTableFileCreationStarted(
  98. ioptions.listeners, dbname, column_family_name, fname, job_id, reason);
  99. #endif // !ROCKSDB_LITE
  100. TableProperties tp;
  101. if (iter->Valid() || !range_del_agg->IsEmpty()) {
  102. TableBuilder* builder;
  103. std::unique_ptr<WritableFileWriter> file_writer;
  104. // Currently we only enable dictionary compression during compaction to the
  105. // bottommost level.
  106. CompressionOptions compression_opts_for_flush(compression_opts);
  107. compression_opts_for_flush.max_dict_bytes = 0;
  108. compression_opts_for_flush.zstd_max_train_bytes = 0;
  109. {
  110. std::unique_ptr<FSWritableFile> file;
  111. #ifndef NDEBUG
  112. bool use_direct_writes = file_options.use_direct_writes;
  113. TEST_SYNC_POINT_CALLBACK("BuildTable:create_file", &use_direct_writes);
  114. #endif // !NDEBUG
  115. s = NewWritableFile(fs, fname, &file, file_options);
  116. if (!s.ok()) {
  117. EventHelpers::LogAndNotifyTableFileCreationFinished(
  118. event_logger, ioptions.listeners, dbname, column_family_name, fname,
  119. job_id, meta->fd, kInvalidBlobFileNumber, tp, reason, s);
  120. return s;
  121. }
  122. file->SetIOPriority(io_priority);
  123. file->SetWriteLifeTimeHint(write_hint);
  124. file_writer.reset(new WritableFileWriter(
  125. std::move(file), fname, file_options, env, ioptions.statistics,
  126. ioptions.listeners, ioptions.sst_file_checksum_func));
  127. builder = NewTableBuilder(
  128. ioptions, mutable_cf_options, internal_comparator,
  129. int_tbl_prop_collector_factories, column_family_id,
  130. column_family_name, file_writer.get(), compression,
  131. sample_for_compression, compression_opts_for_flush, level,
  132. false /* skip_filters */, creation_time, oldest_key_time,
  133. 0 /*target_file_size*/, file_creation_time);
  134. }
  135. MergeHelper merge(env, internal_comparator.user_comparator(),
  136. ioptions.merge_operator, nullptr, ioptions.info_log,
  137. true /* internal key corruption is not ok */,
  138. snapshots.empty() ? 0 : snapshots.back(),
  139. snapshot_checker);
  140. CompactionIterator c_iter(
  141. iter, internal_comparator.user_comparator(), &merge, kMaxSequenceNumber,
  142. &snapshots, earliest_write_conflict_snapshot, snapshot_checker, env,
  143. ShouldReportDetailedTime(env, ioptions.statistics),
  144. true /* internal key corruption is not ok */, range_del_agg.get());
  145. c_iter.SeekToFirst();
  146. for (; c_iter.Valid(); c_iter.Next()) {
  147. const Slice& key = c_iter.key();
  148. const Slice& value = c_iter.value();
  149. const ParsedInternalKey& ikey = c_iter.ikey();
  150. builder->Add(key, value);
  151. meta->UpdateBoundaries(key, value, ikey.sequence, ikey.type);
  152. // TODO(noetzli): Update stats after flush, too.
  153. if (io_priority == Env::IO_HIGH &&
  154. IOSTATS(bytes_written) >= kReportFlushIOStatsEvery) {
  155. ThreadStatusUtil::SetThreadOperationProperty(
  156. ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written));
  157. }
  158. }
  159. auto range_del_it = range_del_agg->NewIterator();
  160. for (range_del_it->SeekToFirst(); range_del_it->Valid();
  161. range_del_it->Next()) {
  162. auto tombstone = range_del_it->Tombstone();
  163. auto kv = tombstone.Serialize();
  164. builder->Add(kv.first.Encode(), kv.second);
  165. meta->UpdateBoundariesForRange(kv.first, tombstone.SerializeEndKey(),
  166. tombstone.seq_, internal_comparator);
  167. }
  168. // Finish and check for builder errors
  169. tp = builder->GetTableProperties();
  170. bool empty = builder->NumEntries() == 0 && tp.num_range_deletions == 0;
  171. s = c_iter.status();
  172. if (!s.ok() || empty) {
  173. builder->Abandon();
  174. } else {
  175. s = builder->Finish();
  176. }
  177. if (s.ok() && !empty) {
  178. uint64_t file_size = builder->FileSize();
  179. meta->fd.file_size = file_size;
  180. meta->marked_for_compaction = builder->NeedCompact();
  181. assert(meta->fd.GetFileSize() > 0);
  182. tp = builder->GetTableProperties(); // refresh now that builder is finished
  183. if (table_properties) {
  184. *table_properties = tp;
  185. }
  186. // Add the checksum information to file metadata.
  187. meta->file_checksum = builder->GetFileChecksum();
  188. meta->file_checksum_func_name = builder->GetFileChecksumFuncName();
  189. }
  190. delete builder;
  191. // Finish and check for file errors
  192. if (s.ok() && !empty) {
  193. StopWatch sw(env, ioptions.statistics, TABLE_SYNC_MICROS);
  194. s = file_writer->Sync(ioptions.use_fsync);
  195. }
  196. if (s.ok() && !empty) {
  197. s = file_writer->Close();
  198. }
  199. if (s.ok() && !empty) {
  200. // Verify that the table is usable
  201. // We set for_compaction to false and don't OptimizeForCompactionTableRead
  202. // here because this is a special case after we finish the table building
  203. // No matter whether use_direct_io_for_flush_and_compaction is true,
  204. // we will regrad this verification as user reads since the goal is
  205. // to cache it here for further user reads
  206. std::unique_ptr<InternalIterator> it(table_cache->NewIterator(
  207. ReadOptions(), file_options, internal_comparator, *meta,
  208. nullptr /* range_del_agg */,
  209. mutable_cf_options.prefix_extractor.get(), nullptr,
  210. (internal_stats == nullptr) ? nullptr
  211. : internal_stats->GetFileReadHist(0),
  212. TableReaderCaller::kFlush, /*arena=*/nullptr,
  213. /*skip_filter=*/false, level, /*smallest_compaction_key=*/nullptr,
  214. /*largest_compaction_key*/ nullptr));
  215. s = it->status();
  216. if (s.ok() && paranoid_file_checks) {
  217. for (it->SeekToFirst(); it->Valid(); it->Next()) {
  218. }
  219. s = it->status();
  220. }
  221. }
  222. }
  223. // Check for input iterator errors
  224. if (!iter->status().ok()) {
  225. s = iter->status();
  226. }
  227. if (!s.ok() || meta->fd.GetFileSize() == 0) {
  228. fs->DeleteFile(fname, IOOptions(), nullptr);
  229. }
  230. if (meta->fd.GetFileSize() == 0) {
  231. fname = "(nil)";
  232. }
  233. // Output to event logger and fire events.
  234. EventHelpers::LogAndNotifyTableFileCreationFinished(
  235. event_logger, ioptions.listeners, dbname, column_family_name, fname,
  236. job_id, meta->fd, meta->oldest_blob_file_number, tp, reason, s);
  237. return s;
  238. }
  239. } // namespace ROCKSDB_NAMESPACE