crc32c_arm64.cc 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213
  1. // Copyright (c) 2018, Arm Limited and affiliates. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. #include "util/crc32c_arm64.h"
  6. #if defined(HAVE_ARM64_CRC)
  7. #if defined(__linux__)
  8. #include <asm/hwcap.h>
  9. #endif
  10. #if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT)
  11. #include <sys/auxv.h>
  12. #endif
  13. #ifndef HWCAP_CRC32
  14. #define HWCAP_CRC32 (1 << 7)
  15. #endif
  16. #ifndef HWCAP_PMULL
  17. #define HWCAP_PMULL (1 << 4)
  18. #endif
  19. #if defined(__APPLE__)
  20. #include <sys/sysctl.h>
  21. #endif
  22. #if defined(__OpenBSD__)
  23. #include <machine/armreg.h>
  24. #include <machine/cpu.h>
  25. #include <sys/sysctl.h>
  26. #include <sys/types.h>
  27. #endif
  #ifdef HAVE_ARM64_CRYPTO
  /*
   * One step of the three-lane CRC32C computation: feeds the 64-bit word at
   * index (IDX) of each lane into that lane's running CRC, covering
   * 8 * 3 = 24 bytes per expansion.
   *
   * Lane 0 starts at buf64, lane 1 at buf64 + BLK_LENGTH and lane 2 at
   * buf64 + 2 * BLK_LENGTH. The expansion site must provide crc0, crc1, crc2,
   * buf64 and BLK_LENGTH. Expands to three bare statements (no do/while
   * wrapper), so it is only meant to be used inside CRC32C7X24BYTES.
   */
  #define CRC32C24BYTES(IDX)                                   \
    crc1 = crc32c_u64(crc1, buf64[BLK_LENGTH + (IDX)]);        \
    crc2 = crc32c_u64(crc2, buf64[BLK_LENGTH * 2 + (IDX)]);    \
    crc0 = crc32c_u64(crc0, buf64[(IDX)]);

  /*
   * Seven consecutive CRC32C24BYTES steps, i.e. 24 * 7 = 168 bytes per
   * expansion. (GROUP) selects which run of seven word indices is processed:
   * indices (GROUP) * 7 through (GROUP) * 7 + 6 in every lane.
   */
  #define CRC32C7X24BYTES(GROUP)           \
    do {                                   \
      CRC32C24BYTES((GROUP) * 7 + 0)       \
      CRC32C24BYTES((GROUP) * 7 + 1)       \
      CRC32C24BYTES((GROUP) * 7 + 2)       \
      CRC32C24BYTES((GROUP) * 7 + 3)       \
      CRC32C24BYTES((GROUP) * 7 + 4)       \
      CRC32C24BYTES((GROUP) * 7 + 5)       \
      CRC32C24BYTES((GROUP) * 7 + 6)       \
    } while (0)
  #endif
  46. extern bool pmull_runtime_flag;
  /*
   * Runtime probe for the CRC32 instructions.
   *
   * Returns 1 when the platform-specific query reports CRC32 support and 0
   * otherwise. Platforms without any of the probes below always report 0.
   */
  uint32_t crc32c_runtime_check(void) {
  #if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT)
    /* Read the hardware capability bits from the auxiliary vector. */
    const uint64_t hwcap = getauxval(AT_HWCAP);
    return (hwcap & HWCAP_CRC32) != 0;
  #elif defined(__FreeBSD__)
    /* hwcap stays 0 if the lookup fails, which reads as "not supported". */
    uint64_t hwcap = 0;
    elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
    return (hwcap & HWCAP_CRC32) != 0;
  #elif defined(__APPLE__)
    int supported;
    size_t supported_size = sizeof(supported);
    if (sysctlbyname("hw.optional.armv8_crc32", &supported, &supported_size,
                     NULL, 0) == -1) {
      return 0;
    }
    return supported == 1;
  #elif defined(__OpenBSD__)
    /* Fetch ID_AA64ISAR0 via sysctl and inspect its CRC32 field. */
    const int isar0_mib[] = {CTL_MACHDEP, CPU_ID_AA64ISAR0};
    uint64_t isar0;
    size_t isar0_size = sizeof(isar0);
    if (sysctl(isar0_mib, 2, &isar0, &isar0_size, NULL, 0) == -1) {
      return 0;
    }
    if (ID_AA64ISAR0_CRC32(isar0) >= ID_AA64ISAR0_CRC32_BASE) {
      return 1;
    }
    return 0;
  #else
    return 0;
  #endif
  }
  /*
   * Runtime probe for the PMULL (polynomial multiply) instructions.
   *
   * Returns true when the platform-specific query reports PMULL support.
   * Apple builds always report true; platforms without any of the probes
   * below always report false.
   */
  bool crc32c_pmull_runtime_check(void) {
  #if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT)
    /* Read the hardware capability bits from the auxiliary vector. */
    const uint64_t hwcap = getauxval(AT_HWCAP);
    return (hwcap & HWCAP_PMULL) != 0;
  #elif defined(__FreeBSD__)
    /* hwcap stays 0 if the lookup fails, which reads as "not supported". */
    uint64_t hwcap = 0;
    elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
    return (hwcap & HWCAP_PMULL) != 0;
  #elif defined(__APPLE__)
    return true;
  #elif defined(__OpenBSD__)
    /* Fetch ID_AA64ISAR0 via sysctl and inspect its AES field. */
    const int isar0_mib[] = {CTL_MACHDEP, CPU_ID_AA64ISAR0};
    uint64_t isar0;
    size_t isar0_size = sizeof(isar0);
    if (sysctl(isar0_mib, 2, &isar0, &isar0_size, NULL, 0) == -1) {
      return false;
    }
    return ID_AA64ISAR0_AES(isar0) >= ID_AA64ISAR0_AES_PMULL;
  #else
    return false;
  #endif
  }
  98. #ifdef ROCKSDB_UBSAN_RUN
  99. #if defined(__clang__)
  100. __attribute__((__no_sanitize__("alignment")))
  101. #elif defined(__GNUC__)
  102. __attribute__((__no_sanitize_undefined__))
  103. #endif
  104. #endif
  105. uint32_t
  106. crc32c_arm64(uint32_t crc, unsigned char const *data, size_t len) {
  107. const uint8_t *buf8;
  108. const uint64_t *buf64 = (uint64_t *)data;
  109. int length = (int)len;
  110. crc ^= 0xffffffff;
  111. /*
  112. * Pmull runtime check here.
  113. * Raspberry Pi supports crc32 but doesn't support pmull.
  114. * Skip Crc32c Parallel computation if no crypto extension available.
  115. */
  116. if (pmull_runtime_flag) {
  117. /* Macro (HAVE_ARM64_CRYPTO) is used for compiling check */
  118. #ifdef HAVE_ARM64_CRYPTO
  119. /* Crc32c Parallel computation
  120. * Algorithm comes from Intel whitepaper:
  121. * crc-iscsi-polynomial-crc32-instruction-paper
  122. *
  123. * Input data is divided into three equal-sized blocks
  124. * Three parallel blocks (crc0, crc1, crc2) for 1024 Bytes
  125. * One Block: 42(BLK_LENGTH) * 8(step length: crc32c_u64) bytes
  126. */
  127. #define BLK_LENGTH 42
  128. while (length >= 1024) {
  129. uint64_t t0, t1;
  130. uint32_t crc0 = 0, crc1 = 0, crc2 = 0;
  131. /* Parallel Param:
  132. * k0 = CRC32(x ^ (42 * 8 * 8 * 2 - 1));
  133. * k1 = CRC32(x ^ (42 * 8 * 8 - 1));
  134. */
  135. uint32_t k0 = 0xe417f38a, k1 = 0x8f158014;
  136. /* Prefetch data for following block to avoid cache miss */
  137. PREF1KL1((uint8_t *)buf64, 1024);
  138. /* First 8 byte for better pipelining */
  139. crc0 = crc32c_u64(crc, *buf64++);
  140. /* 3 blocks crc32c parallel computation
  141. * Macro unfolding to compute parallelly
  142. * 168 * 6 = 1008 (bytes)
  143. */
  144. CRC32C7X24BYTES(0);
  145. CRC32C7X24BYTES(1);
  146. CRC32C7X24BYTES(2);
  147. CRC32C7X24BYTES(3);
  148. CRC32C7X24BYTES(4);
  149. CRC32C7X24BYTES(5);
  150. buf64 += (BLK_LENGTH * 3);
  151. /* Last 8 bytes */
  152. crc = crc32c_u64(crc2, *buf64++);
  153. t0 = (uint64_t)vmull_p64(crc0, k0);
  154. t1 = (uint64_t)vmull_p64(crc1, k1);
  155. /* Merge (crc0, crc1, crc2) -> crc */
  156. crc1 = crc32c_u64(0, t1);
  157. crc ^= crc1;
  158. crc0 = crc32c_u64(0, t0);
  159. crc ^= crc0;
  160. length -= 1024;
  161. }
  162. if (length == 0) return crc ^ (0xffffffffU);
  163. #endif
  164. } // if Pmull runtime check here
  165. buf8 = (const uint8_t *)buf64;
  166. while (length >= 8) {
  167. crc = crc32c_u64(crc, *(const uint64_t *)buf8);
  168. buf8 += 8;
  169. length -= 8;
  170. }
  171. /* The following is more efficient than the straight loop */
  172. if (length >= 4) {
  173. crc = crc32c_u32(crc, *(const uint32_t *)buf8);
  174. buf8 += 4;
  175. length -= 4;
  176. }
  177. if (length >= 2) {
  178. crc = crc32c_u16(crc, *(const uint16_t *)buf8);
  179. buf8 += 2;
  180. length -= 2;
  181. }
  182. if (length >= 1) crc = crc32c_u8(crc, *buf8);
  183. crc ^= 0xffffffff;
  184. return crc;
  185. }
  186. #endif