crc32c_arm64.cc 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129
  1. // Copyright (c) 2018, Arm Limited and affiliates. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. #include "util/crc32c_arm64.h"
  6. #if defined(__linux__) && defined(HAVE_ARM64_CRC)
  7. #include <asm/hwcap.h>
  8. #include <sys/auxv.h>
  9. #ifndef HWCAP_CRC32
  10. #define HWCAP_CRC32 (1 << 7)
  11. #endif
  12. #ifdef HAVE_ARM64_CRYPTO
  13. /* unfolding to compute 8 * 3 = 24 bytes parallelly */
  14. #define CRC32C24BYTES(ITR) \
  15. crc1 = crc32c_u64(crc1, *(buf64 + BLK_LENGTH + (ITR))); \
  16. crc2 = crc32c_u64(crc2, *(buf64 + BLK_LENGTH * 2 + (ITR))); \
  17. crc0 = crc32c_u64(crc0, *(buf64 + (ITR)));
  18. /* unfolding to compute 24 * 7 = 168 bytes parallelly */
  19. #define CRC32C7X24BYTES(ITR) \
  20. do { \
  21. CRC32C24BYTES((ITR)*7 + 0) \
  22. CRC32C24BYTES((ITR)*7 + 1) \
  23. CRC32C24BYTES((ITR)*7 + 2) \
  24. CRC32C24BYTES((ITR)*7 + 3) \
  25. CRC32C24BYTES((ITR)*7 + 4) \
  26. CRC32C24BYTES((ITR)*7 + 5) \
  27. CRC32C24BYTES((ITR)*7 + 6) \
  28. } while (0)
  29. #endif
  30. uint32_t crc32c_runtime_check(void) {
  31. uint64_t auxv = getauxval(AT_HWCAP);
  32. return (auxv & HWCAP_CRC32) != 0;
  33. }
  34. uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data,
  35. unsigned len) {
  36. const uint8_t *buf8;
  37. const uint64_t *buf64 = (uint64_t *)data;
  38. int length = (int)len;
  39. crc ^= 0xffffffff;
  40. #ifdef HAVE_ARM64_CRYPTO
  41. /* Crc32c Parallel computation
  42. * Algorithm comes from Intel whitepaper:
  43. * crc-iscsi-polynomial-crc32-instruction-paper
  44. *
  45. * Input data is divided into three equal-sized blocks
  46. * Three parallel blocks (crc0, crc1, crc2) for 1024 Bytes
  47. * One Block: 42(BLK_LENGTH) * 8(step length: crc32c_u64) bytes
  48. */
  49. #define BLK_LENGTH 42
  50. while (length >= 1024) {
  51. uint64_t t0, t1;
  52. uint32_t crc0 = 0, crc1 = 0, crc2 = 0;
  53. /* Parallel Param:
  54. * k0 = CRC32(x ^ (42 * 8 * 8 * 2 - 1));
  55. * k1 = CRC32(x ^ (42 * 8 * 8 - 1));
  56. */
  57. uint32_t k0 = 0xe417f38a, k1 = 0x8f158014;
  58. /* Prefetch data for following block to avoid cache miss */
  59. PREF1KL1((uint8_t *)buf64, 1024);
  60. /* First 8 byte for better pipelining */
  61. crc0 = crc32c_u64(crc, *buf64++);
  62. /* 3 blocks crc32c parallel computation
  63. * Macro unfolding to compute parallelly
  64. * 168 * 6 = 1008 (bytes)
  65. */
  66. CRC32C7X24BYTES(0);
  67. CRC32C7X24BYTES(1);
  68. CRC32C7X24BYTES(2);
  69. CRC32C7X24BYTES(3);
  70. CRC32C7X24BYTES(4);
  71. CRC32C7X24BYTES(5);
  72. buf64 += (BLK_LENGTH * 3);
  73. /* Last 8 bytes */
  74. crc = crc32c_u64(crc2, *buf64++);
  75. t0 = (uint64_t)vmull_p64(crc0, k0);
  76. t1 = (uint64_t)vmull_p64(crc1, k1);
  77. /* Merge (crc0, crc1, crc2) -> crc */
  78. crc1 = crc32c_u64(0, t1);
  79. crc ^= crc1;
  80. crc0 = crc32c_u64(0, t0);
  81. crc ^= crc0;
  82. length -= 1024;
  83. }
  84. if (length == 0) return crc ^ (0xffffffffU);
  85. #endif
  86. buf8 = (const uint8_t *)buf64;
  87. while (length >= 8) {
  88. crc = crc32c_u64(crc, *(const uint64_t *)buf8);
  89. buf8 += 8;
  90. length -= 8;
  91. }
  92. /* The following is more efficient than the straight loop */
  93. if (length >= 4) {
  94. crc = crc32c_u32(crc, *(const uint32_t *)buf8);
  95. buf8 += 4;
  96. length -= 4;
  97. }
  98. if (length >= 2) {
  99. crc = crc32c_u16(crc, *(const uint16_t *)buf8);
  100. buf8 += 2;
  101. length -= 2;
  102. }
  103. if (length >= 1) crc = crc32c_u8(crc, *buf8);
  104. crc ^= 0xffffffff;
  105. return crc;
  106. }
  107. #endif