hash.cc 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201
  1. // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. //
  6. // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
  7. // Use of this source code is governed by a BSD-style license that can be
  8. // found in the LICENSE file. See the AUTHORS file for names of contributors.
  9. #include "util/hash.h"
  10. #include <string>
  11. #include "port/lang.h"
  12. #include "util/coding.h"
  13. #include "util/hash128.h"
  14. #include "util/math128.h"
  15. #include "util/xxhash.h"
  16. #include "util/xxph3.h"
  17. namespace ROCKSDB_NAMESPACE {
  18. uint64_t (*kGetSliceNPHash64UnseededFnPtr)(const Slice&) = &GetSliceHash64;
  19. uint32_t Hash(const char* data, size_t n, uint32_t seed) {
  20. // MurmurHash1 - fast but mediocre quality
  21. // https://github.com/aappleby/smhasher/wiki/MurmurHash1
  22. //
  23. const uint32_t m = 0xc6a4a793;
  24. const uint32_t r = 24;
  25. const char* limit = data + n;
  26. uint32_t h = static_cast<uint32_t>(seed ^ (n * m));
  27. // Pick up four bytes at a time
  28. while (data + 4 <= limit) {
  29. uint32_t w = DecodeFixed32(data);
  30. data += 4;
  31. h += w;
  32. h *= m;
  33. h ^= (h >> 16);
  34. }
  35. // Pick up remaining bytes
  36. switch (limit - data) {
  37. // Note: The original hash implementation used data[i] << shift, which
  38. // promotes the char to int and then performs the shift. If the char is
  39. // negative, the shift is undefined behavior in C++. The hash algorithm is
  40. // part of the format definition, so we cannot change it; to obtain the same
  41. // behavior in a legal way we just cast to uint32_t, which will do
  42. // sign-extension. To guarantee compatibility with architectures where chars
  43. // are unsigned we first cast the char to int8_t.
  44. case 3:
  45. h += static_cast<uint32_t>(static_cast<int8_t>(data[2])) << 16;
  46. FALLTHROUGH_INTENDED;
  47. case 2:
  48. h += static_cast<uint32_t>(static_cast<int8_t>(data[1])) << 8;
  49. FALLTHROUGH_INTENDED;
  50. case 1:
  51. h += static_cast<uint32_t>(static_cast<int8_t>(data[0]));
  52. h *= m;
  53. h ^= (h >> r);
  54. break;
  55. }
  56. return h;
  57. }
  58. // We are standardizing on a preview release of XXH3, because that's
  59. // the best available at time of standardizing.
  60. //
  61. // In testing (mostly Intel Skylake), this hash function is much more
  62. // thorough than Hash32 and is almost universally faster. Hash() only
  63. // seems faster when passing runtime-sized keys of the same small size
  64. // (less than about 24 bytes) thousands of times in a row; this seems
  65. // to allow the branch predictor to work some magic. XXH3's speed is
  66. // much less dependent on branch prediction.
  67. //
  68. // Hashing with a prefix extractor is potentially a common case of
  69. // hashing objects of small, predictable size. We could consider
  70. // bundling hash functions specialized for particular lengths with
  71. // the prefix extractors.
  72. uint64_t Hash64(const char* data, size_t n, uint64_t seed) {
  73. return XXPH3_64bits_withSeed(data, n, seed);
  74. }
  75. uint64_t Hash64(const char* data, size_t n) {
  76. // Same as seed = 0
  77. return XXPH3_64bits(data, n);
  78. }
  79. uint64_t GetSlicePartsNPHash64(const SliceParts& data, uint64_t seed) {
  80. // TODO(ajkr): use XXH3 streaming APIs to avoid the copy/allocation.
  81. size_t concat_len = 0;
  82. for (int i = 0; i < data.num_parts; ++i) {
  83. concat_len += data.parts[i].size();
  84. }
  85. std::string concat_data;
  86. concat_data.reserve(concat_len);
  87. for (int i = 0; i < data.num_parts; ++i) {
  88. concat_data.append(data.parts[i].data(), data.parts[i].size());
  89. }
  90. assert(concat_data.size() == concat_len);
  91. return NPHash64(concat_data.data(), concat_len, seed);
  92. }
  93. Unsigned128 Hash128(const char* data, size_t n, uint64_t seed) {
  94. auto h = XXH3_128bits_withSeed(data, n, seed);
  95. return (Unsigned128{h.high64} << 64) | (h.low64);
  96. }
  97. Unsigned128 Hash128(const char* data, size_t n) {
  98. // Same as seed = 0
  99. auto h = XXH3_128bits(data, n);
  100. return (Unsigned128{h.high64} << 64) | (h.low64);
  101. }
  102. void Hash2x64(const char* data, size_t n, uint64_t* high64, uint64_t* low64) {
  103. // Same as seed = 0
  104. auto h = XXH3_128bits(data, n);
  105. *high64 = h.high64;
  106. *low64 = h.low64;
  107. }
  108. void Hash2x64(const char* data, size_t n, uint64_t seed, uint64_t* high64,
  109. uint64_t* low64) {
  110. auto h = XXH3_128bits_withSeed(data, n, seed);
  111. *high64 = h.high64;
  112. *low64 = h.low64;
  113. }
  114. namespace {
  115. inline uint64_t XXH3_avalanche(uint64_t h64) {
  116. h64 ^= h64 >> 37;
  117. h64 *= 0x165667919E3779F9U;
  118. h64 ^= h64 >> 32;
  119. return h64;
  120. }
  121. inline uint64_t XXH3_unavalanche(uint64_t h64) {
  122. h64 ^= h64 >> 32;
  123. h64 *= 0x8da8ee41d6df849U; // inverse of 0x165667919E3779F9U
  124. h64 ^= h64 >> 37;
  125. return h64;
  126. }
  127. } // namespace
  128. void BijectiveHash2x64(uint64_t in_high64, uint64_t in_low64, uint64_t seed,
  129. uint64_t* out_high64, uint64_t* out_low64) {
  130. // Adapted from XXH3_len_9to16_128b
  131. const uint64_t bitflipl = /*secret part*/ 0x59973f0033362349U - seed;
  132. const uint64_t bitfliph = /*secret part*/ 0xc202797692d63d58U + seed;
  133. Unsigned128 tmp128 =
  134. Multiply64to128(in_low64 ^ in_high64 ^ bitflipl, 0x9E3779B185EBCA87U);
  135. uint64_t lo = Lower64of128(tmp128);
  136. uint64_t hi = Upper64of128(tmp128);
  137. lo += 0x3c0000000000000U; // (len - 1) << 54
  138. in_high64 ^= bitfliph;
  139. hi += in_high64 + (Lower32of64(in_high64) * uint64_t{0x85EBCA76});
  140. lo ^= EndianSwapValue(hi);
  141. tmp128 = Multiply64to128(lo, 0xC2B2AE3D27D4EB4FU);
  142. lo = Lower64of128(tmp128);
  143. hi = Upper64of128(tmp128) + (hi * 0xC2B2AE3D27D4EB4FU);
  144. *out_low64 = XXH3_avalanche(lo);
  145. *out_high64 = XXH3_avalanche(hi);
  146. }
  147. void BijectiveUnhash2x64(uint64_t in_high64, uint64_t in_low64, uint64_t seed,
  148. uint64_t* out_high64, uint64_t* out_low64) {
  149. // Inverted above (also consulting XXH3_len_9to16_128b)
  150. const uint64_t bitflipl = /*secret part*/ 0x59973f0033362349U - seed;
  151. const uint64_t bitfliph = /*secret part*/ 0xc202797692d63d58U + seed;
  152. uint64_t lo = XXH3_unavalanche(in_low64);
  153. uint64_t hi = XXH3_unavalanche(in_high64);
  154. lo *= 0xba79078168d4baf; // inverse of 0xC2B2AE3D27D4EB4FU
  155. hi -= Upper64of128(Multiply64to128(lo, 0xC2B2AE3D27D4EB4FU));
  156. hi *= 0xba79078168d4baf; // inverse of 0xC2B2AE3D27D4EB4FU
  157. lo ^= EndianSwapValue(hi);
  158. lo -= 0x3c0000000000000U;
  159. lo *= 0x887493432badb37U; // inverse of 0x9E3779B185EBCA87U
  160. hi -= Upper64of128(Multiply64to128(lo, 0x9E3779B185EBCA87U));
  161. uint32_t tmp32 = Lower32of64(hi) * 0xb6c92f47; // inverse of 0x85EBCA77
  162. hi -= tmp32;
  163. hi = (hi & 0xFFFFFFFF00000000U) -
  164. ((tmp32 * uint64_t{0x85EBCA76}) & 0xFFFFFFFF00000000U) + tmp32;
  165. hi ^= bitfliph;
  166. lo ^= hi ^ bitflipl;
  167. *out_high64 = hi;
  168. *out_low64 = lo;
  169. }
  170. void BijectiveHash2x64(uint64_t in_high64, uint64_t in_low64,
  171. uint64_t* out_high64, uint64_t* out_low64) {
  172. BijectiveHash2x64(in_high64, in_low64, /*seed*/ 0, out_high64, out_low64);
  173. }
  174. void BijectiveUnhash2x64(uint64_t in_high64, uint64_t in_low64,
  175. uint64_t* out_high64, uint64_t* out_low64) {
  176. BijectiveUnhash2x64(in_high64, in_low64, /*seed*/ 0, out_high64, out_low64);
  177. }
  178. } // namespace ROCKSDB_NAMESPACE