mirror of
https://github.com/claunia/findcrcs.git
synced 2025-12-16 18:54:25 +00:00
Added version 0.2 from V.
This commit is contained in:
73
crcutil-1.0/code/base_types.h
Normal file
73
crcutil-1.0/code/base_types.h
Normal file
@@ -0,0 +1,73 @@
|
||||
// Copyright 2010 Google Inc. All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Defines 8/16/32/64-bit integer types.
|
||||
//
|
||||
// Either uint64 or uint32 will map to size_t.
|
||||
// This way, specialized variants of CRC implementation
|
||||
// parameterized by "size_t" will be reused when
|
||||
// parameterized by "uint64" or "uint32".
|
||||
// In their turn, specialized versions are parameterized
|
||||
// by "size_t" so that one version of the code is optimal
|
||||
// both on 32-bit and 64-bit platforms.
|
||||
|
||||
#ifndef CRCUTIL_BASE_TYPES_H_
|
||||
#define CRCUTIL_BASE_TYPES_H_
|
||||
|
||||
#include "std_headers.h" // size_t, ptrdiff_t
|
||||
|
||||
namespace crcutil {
|
||||
|
||||
// Compile-time type selector.
//
// ChooseFirstIfSame<A, B>::Type is A when sizeof(A) == sizeof(B),
// and B otherwise.
template<typename A, typename B> class ChooseFirstIfSame {
 public:
  // Generic case ("pick_first" is false): the second type is selected.
  template<bool pick_first, typename First, typename Second>
  class ChooseFirstIfTrue {
   public:
    typedef Second Type;
  };

  // Partial specialization for "pick_first" == true: the first type wins.
  template<typename First, typename Second>
  class ChooseFirstIfTrue<true, First, Second> {
   public:
    typedef First Type;
  };

  typedef typename ChooseFirstIfTrue<(sizeof(A) == sizeof(B)), A, B>::Type
      Type;
};
|
||||
|
||||
typedef unsigned char uint8;
|
||||
typedef signed char int8;
|
||||
|
||||
typedef unsigned short uint16;
|
||||
typedef short int16;
|
||||
|
||||
typedef ChooseFirstIfSame<size_t, unsigned int>::Type uint32;
|
||||
typedef ChooseFirstIfSame<ptrdiff_t, int>::Type int32;
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
typedef ChooseFirstIfSame<size_t, unsigned __int64>::Type uint64;
|
||||
typedef ChooseFirstIfSame<ptrdiff_t, __int64>::Type int64;
|
||||
#define HAVE_UINT64 1
|
||||
#elif defined(__GNUC__)
|
||||
typedef ChooseFirstIfSame<size_t, unsigned long long>::Type uint64;
|
||||
typedef ChooseFirstIfSame<ptrdiff_t, long long>::Type int64;
|
||||
#define HAVE_UINT64 1
|
||||
#else
|
||||
// TODO: ensure that everything compiles and works when HAVE_UINT64 is false.
|
||||
// TODO: remove HAVE_UINT64 and use sizeof(uint64) instead?
|
||||
#define HAVE_UINT64 0
|
||||
typedef uint32 uint64;
|
||||
typedef int32 int64;
|
||||
#endif
|
||||
|
||||
} // namespace crcutil
|
||||
|
||||
#endif // CRCUTIL_BASE_TYPES_H_
|
||||
366
crcutil-1.0/code/crc32c_sse4.cc
Normal file
366
crcutil-1.0/code/crc32c_sse4.cc
Normal file
@@ -0,0 +1,366 @@
|
||||
// Copyright 2010 Google Inc. All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Implements CRC32C using Intel's SSE4 crc32 instruction.
|
||||
// Uses _mm_crc32_u64/32/8 intrinsics if CRCUTIL_USE_MM_CRC32 is not zero,
|
||||
// emulates intrinsics via CRC_WORD/CRC_BYTE otherwise.
|
||||
|
||||
#include "crc32c_sse4.h"
|
||||
|
||||
#if HAVE_I386 || HAVE_AMD64
|
||||
|
||||
namespace crcutil {
|
||||
|
||||
#define UPDATE_STRIPE_CRCS(index, block_size, num_stripes) do { \
|
||||
CRC_UPDATE_WORD(crc0, \
|
||||
reinterpret_cast<const size_t *>(src + \
|
||||
0 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \
|
||||
CRC_UPDATE_WORD(crc1, \
|
||||
reinterpret_cast<const size_t *>(src + \
|
||||
1 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \
|
||||
CRC_UPDATE_WORD(crc2, \
|
||||
reinterpret_cast<const size_t *>(src + \
|
||||
2 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \
|
||||
if (num_stripes > 3) { \
|
||||
CRC_UPDATE_WORD(crc3, \
|
||||
reinterpret_cast<const size_t *>(src + \
|
||||
3 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
// Multiplies "crc" by "x**(8 * STRIPE_SIZE(block_size))"
|
||||
// using appropriate multiplication table(s).
|
||||
//
|
||||
#if 0
|
||||
|
||||
// This variant is for illustration purposes only.
|
||||
// Actual implementation below:
|
||||
// 1. Splits the computation into 2 data-independent paths
|
||||
// by independently multiplying lower and upper halves
|
||||
// of "crc0" in interleaved manner, and combining the
|
||||
// results in the end.
|
||||
// 2. Removing redundant "crc0 = 0" etc. in the beginning.
|
||||
// 3. Removing redundant shifts of "tmp0" and "tmp1" in the last round.
|
||||
#define MULTIPLY_CRC(crc0, block_size, num_stripes) do { \
|
||||
size_t tmp0 = crc0; \
|
||||
crc0 = 0; \
|
||||
for (size_t i = 0; i < kNumTables; ++i) { \
|
||||
crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
|
||||
[i][tmp0 & (kTableEntries - 1)]; \
|
||||
tmp0 >>= kTableEntryBits; \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#else
|
||||
|
||||
#define MULTIPLY_CRC(crc0, block_size, num_stripes) do { \
|
||||
size_t tmp0 = crc0; \
|
||||
size_t tmp1 = crc0 >> (kTableEntryBits * kNumTablesHalfHi); \
|
||||
crc0 = CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
|
||||
[0][tmp0 & (kTableEntries - 1)]; \
|
||||
tmp0 >>= kTableEntryBits; \
|
||||
size_t crc1 = CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
|
||||
[kNumTablesHalfHi][tmp1 & (kTableEntries - 1)]; \
|
||||
tmp1 >>= kTableEntryBits; \
|
||||
for (size_t i = 1; i < kNumTablesHalfLo - 1; ++i) { \
|
||||
crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
|
||||
[i][tmp0 & (kTableEntries - 1)]; \
|
||||
tmp0 >>= kTableEntryBits; \
|
||||
crc1 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
|
||||
[i + kNumTablesHalfHi][tmp1 & (kTableEntries - 1)]; \
|
||||
tmp1 >>= kTableEntryBits; \
|
||||
} \
|
||||
crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
|
||||
[kNumTablesHalfLo - 1][tmp0 & (kTableEntries - 1)]; \
|
||||
if (kNumTables & 1) { \
|
||||
tmp0 >>= kTableEntryBits; \
|
||||
} \
|
||||
crc1 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
|
||||
[kNumTables - 1][tmp1]; \
|
||||
if (kNumTables & 1) { \
|
||||
crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
|
||||
[kNumTablesHalfLo][tmp0 & (kTableEntries - 1)]; \
|
||||
} \
|
||||
crc0 ^= crc1; \
|
||||
} while (0)
|
||||
|
||||
#endif
|
||||
|
||||
// Given CRCs (crc0, crc1, etc.) of consecutive
|
||||
// stripes of STRIPE_SIZE(block_size) bytes each,
|
||||
// produces CRC of concatenated stripes.
|
||||
#define COMBINE_STRIPE_CRCS(block_size, num_stripes) do { \
|
||||
MULTIPLY_CRC(crc0, block_size, num_stripes); \
|
||||
crc0 ^= crc1; \
|
||||
MULTIPLY_CRC(crc0, block_size, num_stripes); \
|
||||
crc0 ^= crc2; \
|
||||
if (num_stripes > 3) { \
|
||||
MULTIPLY_CRC(crc0, block_size, num_stripes); \
|
||||
crc0 ^= crc3; \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
// Processes input BLOCK_SIZE(block) bytes per iteration
|
||||
// by splitting a block of BLOCK_SIZE(block) bytes into N
|
||||
// equally-sized stripes of STRIPE_SIZE(block_size) each,
|
||||
// computing CRC of each stripe, and concatenating stripe CRCs.
|
||||
#define PROCESS_BLOCK(block_size, num_stripes) do { \
|
||||
while (bytes >= CRC32C_SSE4_BLOCK_SIZE(block_size, num_stripes)) { \
|
||||
Crc crc1 = 0; \
|
||||
Crc crc2 = 0; \
|
||||
Crc crc3; \
|
||||
if (num_stripes > 3) crc3 = 0; \
|
||||
{ \
|
||||
const uint8 *stripe_end = src + \
|
||||
(CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) / \
|
||||
kUnrolledLoopBytes) * kUnrolledLoopBytes; \
|
||||
do { \
|
||||
UPDATE_STRIPE_CRCS(0, block_size, num_stripes); \
|
||||
UPDATE_STRIPE_CRCS(1, block_size, num_stripes); \
|
||||
UPDATE_STRIPE_CRCS(2, block_size, num_stripes); \
|
||||
UPDATE_STRIPE_CRCS(3, block_size, num_stripes); \
|
||||
UPDATE_STRIPE_CRCS(4, block_size, num_stripes); \
|
||||
UPDATE_STRIPE_CRCS(5, block_size, num_stripes); \
|
||||
UPDATE_STRIPE_CRCS(6, block_size, num_stripes); \
|
||||
UPDATE_STRIPE_CRCS(7, block_size, num_stripes); \
|
||||
src += kUnrolledLoopBytes; \
|
||||
} while (src < stripe_end); \
|
||||
if ((CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) % \
|
||||
kUnrolledLoopBytes) != 0) { \
|
||||
stripe_end += \
|
||||
CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) % \
|
||||
kUnrolledLoopBytes; \
|
||||
do { \
|
||||
UPDATE_STRIPE_CRCS(0, block_size, num_stripes); \
|
||||
src += sizeof(size_t); \
|
||||
} while (src < stripe_end); \
|
||||
} \
|
||||
} \
|
||||
COMBINE_STRIPE_CRCS(block_size, num_stripes); \
|
||||
src += CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) * \
|
||||
((num_stripes) - 1); \
|
||||
bytes = static_cast<size_t>(end - src); \
|
||||
} \
|
||||
no_more_##block_size##_##num_stripes:; \
|
||||
} while (0)
|
||||
|
||||
size_t Crc32cSSE4::Crc32c(const void *data, size_t bytes, Crc crc0) const {
|
||||
const uint8 *src = static_cast<const uint8 *>(data);
|
||||
const uint8 *end = src + bytes;
|
||||
crc0 ^= Base().Canonize();
|
||||
|
||||
// If we don't have too much data to process,
|
||||
// do not waste time trying to align input etc.
|
||||
// Noticeably improves performance on small inputs.
|
||||
if (bytes < 4 * sizeof(size_t)) goto less_than_4_size_t;
|
||||
if (bytes < 8 * sizeof(size_t)) goto less_than_8_size_t;
|
||||
if (bytes < 16 * sizeof(size_t)) goto less_than_16_size_t;
|
||||
|
||||
#define PROCESS_TAIL_IF_SMALL(block_size, num_stripes) do { \
|
||||
if (bytes < CRC32C_SSE4_BLOCK_SIZE(block_size, num_stripes)) { \
|
||||
goto no_more_##block_size##_##num_stripes; \
|
||||
} \
|
||||
} while (0)
|
||||
#define NOOP(block_size, num_stripes)
|
||||
|
||||
CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_ASCENDING(PROCESS_TAIL_IF_SMALL,
|
||||
NOOP,
|
||||
NOOP);
|
||||
|
||||
#undef PROCESS_TAIL_IF_SMALL
|
||||
|
||||
|
||||
// Do not use ALIGN_ON_WORD_BOUNDARY_IF_NEEDED() here because:
|
||||
// 1. It uses CRC_BYTE() which won't work.
|
||||
// 2. Its threshold may be incorrect because Crc32 that uses
|
||||
// native CPU crc32 instruction is much faster than
|
||||
// generic table-based CRC computation.
|
||||
//
|
||||
// In case of X5550 CPU, break even point is at 2KB -- exactly.
|
||||
if (bytes >= 2 * 1024) {
|
||||
while ((reinterpret_cast<size_t>(src) & (sizeof(Word) - 1)) != 0) {
|
||||
if (src >= end) {
|
||||
return (crc0 ^ Base().Canonize());
|
||||
}
|
||||
CRC_UPDATE_BYTE(crc0, src[0]);
|
||||
src += 1;
|
||||
}
|
||||
bytes = static_cast<size_t>(end - src);
|
||||
}
|
||||
if (src >= end) {
|
||||
return (crc0 ^ Base().Canonize());
|
||||
}
|
||||
|
||||
// Quickly skip processing of too large blocks
|
||||
// Noticeably improves performance on small inputs.
|
||||
#define SKIP_BLOCK_IF_NEEDED(block_size, num_stripes) do { \
|
||||
if (bytes < CRC32C_SSE4_BLOCK_SIZE(block_size, num_stripes)) { \
|
||||
goto no_more_##block_size##_##num_stripes; \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_ASCENDING(NOOP,
|
||||
SKIP_BLOCK_IF_NEEDED,
|
||||
SKIP_BLOCK_IF_NEEDED);
|
||||
|
||||
#undef SKIP_BLOCK_IF_NEEDED
|
||||
|
||||
// Process data in all blocks.
|
||||
CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_DESCENDING(PROCESS_BLOCK,
|
||||
PROCESS_BLOCK,
|
||||
PROCESS_BLOCK);
|
||||
|
||||
// Finish the tail word-by-word and then byte-by-byte.
|
||||
#define CRC_UPDATE_WORD_4(index) do { \
|
||||
CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index]); \
|
||||
CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index + 1]); \
|
||||
CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index + 2]); \
|
||||
CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index + 3]); \
|
||||
} while (0)
|
||||
|
||||
if (bytes >= 4 * 4 * sizeof(size_t)) {
|
||||
end -= 4 * 4 * sizeof(size_t);
|
||||
do {
|
||||
CRC_UPDATE_WORD_4(4 * 0);
|
||||
CRC_UPDATE_WORD_4(4 * 1);
|
||||
CRC_UPDATE_WORD_4(4 * 2);
|
||||
CRC_UPDATE_WORD_4(4 * 3);
|
||||
src += 4 * 4 * sizeof(size_t);
|
||||
} while (src <= end);
|
||||
end += 4 * 4 * sizeof(size_t);
|
||||
bytes = static_cast<size_t>(end - src);
|
||||
}
|
||||
less_than_16_size_t:
|
||||
|
||||
if (bytes >= 4 * 2 * sizeof(size_t)) {
|
||||
CRC_UPDATE_WORD_4(4 * 0);
|
||||
CRC_UPDATE_WORD_4(4 * 1);
|
||||
src += 4 * 2 * sizeof(size_t);
|
||||
bytes -= 4 * 2 * sizeof(size_t);
|
||||
}
|
||||
less_than_8_size_t:
|
||||
|
||||
if (bytes >= 4 * sizeof(size_t)) {
|
||||
CRC_UPDATE_WORD_4(0);
|
||||
src += 4 * sizeof(size_t);
|
||||
bytes -= 4 * sizeof(size_t);
|
||||
}
|
||||
less_than_4_size_t:
|
||||
|
||||
if (bytes >= 1 * sizeof(size_t)) {
|
||||
end -= 1 * sizeof(size_t);
|
||||
do {
|
||||
CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[0]);
|
||||
src += 1 * sizeof(size_t);
|
||||
} while (src <= end);
|
||||
end += 1 * sizeof(size_t);
|
||||
}
|
||||
|
||||
while (src < end) {
|
||||
CRC_UPDATE_BYTE(crc0, src[0]);
|
||||
src += 1;
|
||||
}
|
||||
|
||||
return (crc0 ^ Base().Canonize());
|
||||
}
|
||||
|
||||
|
||||
void Crc32cSSE4::Init(bool constant) {
|
||||
base_.Init(FixedGeneratingPolynomial(), FixedDegree(), constant);
|
||||
|
||||
#define INIT_MUL_TABLE(block_size, num_stripes) do { \
|
||||
size_t multiplier = \
|
||||
Base().Xpow8N(CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes)); \
|
||||
for (size_t table = 0; table < kNumTables; ++table) { \
|
||||
for (size_t entry = 0; entry < kTableEntries; ++entry) { \
|
||||
size_t value = static_cast<uint32>(entry << (kTableEntryBits * table)); \
|
||||
CRC32C_SSE4_MUL_TABLE(block_size, num_stripes)[table][entry] = \
|
||||
static_cast<Entry>(Base().Multiply(value, multiplier)); \
|
||||
} \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
CRC32C_SSE4_ENUMERATE_ALL_BLOCKS(INIT_MUL_TABLE);
|
||||
|
||||
#undef INIT_MUL_TABLE
|
||||
|
||||
#if !CRCUTIL_USE_MM_CRC32
|
||||
for (size_t j = 0; j < sizeof(Word); ++j) {
|
||||
Crc k = Base().XpowN((sizeof(Word) - 1 - j) * 8 + 32);
|
||||
for (size_t i = 0; i < 256; ++i) {
|
||||
crc_word_[j][i] = Base().MultiplyUnnormalized(i, 8, k);
|
||||
}
|
||||
}
|
||||
#endif // !CRCUTIL_USE_MM_CRC32
|
||||
}
|
||||
|
||||
|
||||
// Returns true iff the CPU reports SSE4.2 support (and therefore the crc32
// instruction), which CPUID leaf 1 signals in bit 20 of ECX.
//
// Returns false when built with a compiler other than MSVC or GCC-compatible
// ones targeting x86/x86-64.
bool Crc32cSSE4::IsSSE42Available() {
#if defined(_MSC_VER)
  // __cpuid() stores EAX, EBX, ECX, EDX in cpu_info[0..3] respectively,
  // so the SSE4.2 flag (ECX bit 20) lives in cpu_info[2], not cpu_info[3].
  int cpu_info[4];
  __cpuid(cpu_info, 1);
  return ((cpu_info[2] & (1 << 20)) != 0);
#elif defined(__GNUC__) && (HAVE_AMD64 || HAVE_I386)
  // Not using "cpuid.h" intentionally: it is missing from
  // too many installations.
  uint32 eax;
  uint32 ecx;
  uint32 edx;
  __asm__ volatile(
#if HAVE_I386 && defined(__PIC__)
    // EBX holds the PIC base register on i386, so it is saved and restored
    // by hand. "{%%}" emits the '%' register prefix in AT&T syntax only,
    // so the template assembles under both -masm=att and -masm=intel.
    "push {%%}ebx\n"
    "cpuid\n"
    "pop {%%}ebx\n"
#else
    "cpuid\n"
#endif  // HAVE_I386 && defined(__PIC__)
    : "=a" (eax), "=c" (ecx), "=d" (edx)
    : "a" (1), "2" (0)
#if !(HAVE_I386 && defined(__PIC__))
    // In the PIC case EBX is preserved by the push/pop pair above; listing
    // it as clobbered there is unnecessary and rejected by older GCC.
    : "%ebx"
#endif  // !(HAVE_I386 && defined(__PIC__))
  );
  return ((ecx & (1 << 20)) != 0);
#else
  return false;
#endif
}
|
||||
|
||||
|
||||
void RollingCrc32cSSE4::Init(const Crc32cSSE4 &crc,
|
||||
size_t roll_window_bytes,
|
||||
const Crc &start_value) {
|
||||
crc_ = &crc;
|
||||
roll_window_bytes_ = roll_window_bytes;
|
||||
start_value_ = start_value;
|
||||
|
||||
Crc add = crc.Base().Canonize() ^ start_value;
|
||||
add = crc.Base().Multiply(add, crc.Base().Xpow8N(roll_window_bytes));
|
||||
add ^= crc.Base().Canonize();
|
||||
Crc mul = crc.Base().One() ^ crc.Base().Xpow8N(1);
|
||||
add = crc.Base().Multiply(add, mul);
|
||||
|
||||
mul = crc.Base().XpowN(8 * roll_window_bytes + crc.Base().Degree());
|
||||
for (size_t i = 0; i < 256; ++i) {
|
||||
out_[i] = static_cast<Entry>(
|
||||
crc.Base().MultiplyUnnormalized(
|
||||
static_cast<Crc>(i), 8, mul) ^ add);
|
||||
}
|
||||
|
||||
#if !CRCUTIL_USE_MM_CRC32
|
||||
memcpy(crc_word_, crc_->crc_word_, sizeof(crc_word_));
|
||||
#endif // !CRCUTIL_USE_MM_CRC32
|
||||
}
|
||||
|
||||
} // namespace crcutil
|
||||
|
||||
#endif // HAVE_I386 || HAVE_AMD64
|
||||
252
crcutil-1.0/code/crc32c_sse4.h
Normal file
252
crcutil-1.0/code/crc32c_sse4.h
Normal file
@@ -0,0 +1,252 @@
|
||||
// Copyright 2010 Google Inc. All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Implements CRC32C using Intel's SSE4 crc32 instruction.
|
||||
// Uses _mm_crc32_u64/32/8 intrinsics if CRCUTIL_USE_MM_CRC32 is not zero,
|
||||
// emulates intrinsics via CRC_WORD/CRC_BYTE otherwise.
|
||||
|
||||
#ifndef CRCUTIL_CRC32C_SSE4_H_
|
||||
#define CRCUTIL_CRC32C_SSE4_H_
|
||||
|
||||
#include "gf_util.h" // base types, gf_util class, etc.
|
||||
#include "crc32c_sse4_intrin.h" // _mm_crc32_u* intrinsics
|
||||
|
||||
#if HAVE_I386 || HAVE_AMD64
|
||||
|
||||
#if CRCUTIL_USE_MM_CRC32
|
||||
|
||||
#if HAVE_I386
|
||||
#define CRC_UPDATE_WORD(crc, value) (crc = _mm_crc32_u32(crc, (value)))
|
||||
#else
|
||||
#define CRC_UPDATE_WORD(crc, value) (crc = _mm_crc32_u64(crc, (value)))
|
||||
#endif // HAVE_I386
|
||||
|
||||
#define CRC_UPDATE_BYTE(crc, value) \
|
||||
(crc = _mm_crc32_u8(static_cast<uint32>(crc), static_cast<uint8>(value)))
|
||||
|
||||
#else
|
||||
|
||||
#include "generic_crc.h"
|
||||
|
||||
#define CRC_UPDATE_WORD(crc, value) do { \
|
||||
size_t buf = (value); \
|
||||
CRC_WORD(this, crc, buf); \
|
||||
} while (0)
|
||||
#define CRC_UPDATE_BYTE(crc, value) do { \
|
||||
CRC_BYTE(this, crc, (value)); \
|
||||
} while (0)
|
||||
|
||||
#endif // CRCUTIL_USE_MM_CRC32
|
||||
|
||||
namespace crcutil {
|
||||
|
||||
#pragma pack(push, 16)
|
||||
|
||||
// Since the same pieces should be parameterized in many different places
|
||||
// and we do not want to introduce a mistake which is rather hard to find,
|
||||
// use a macro to enumerate all block sizes.
|
||||
//
|
||||
// Block sizes and number of stripes were tuned for best performance.
|
||||
//
|
||||
// All constants should be literal constants (too lazy to fix the macro).
|
||||
//
|
||||
// The use of different "macro_first", "macro", and "macro_last"
|
||||
// allows generation of different code for smallest, in between,
|
||||
// and largest block sizes.
|
||||
//
|
||||
// This macro shall be kept in sync with
|
||||
// CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_DESCENDING.
|
||||
// Failure to do so will cause compile-time error.
|
||||
#define CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_ASCENDING( \
|
||||
macro_smallest, macro, macro_largest) \
|
||||
macro_smallest(512, 3); \
|
||||
macro(1024, 3); \
|
||||
macro(4096, 3); \
|
||||
macro_largest(32768, 3)
|
||||
|
||||
// This macro shall be kept in sync with
|
||||
// CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_ASCENDING.
|
||||
// Failure to do so will cause compile-time error.
|
||||
#define CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_DESCENDING( \
|
||||
macro_smallest, macro, macro_largest) \
|
||||
macro_largest(32768, 3); \
|
||||
macro(4096, 3); \
|
||||
macro(1024, 3); \
|
||||
macro_smallest(512, 3)
|
||||
|
||||
// Enumerates all block sizes.
|
||||
#define CRC32C_SSE4_ENUMERATE_ALL_BLOCKS(macro) \
|
||||
CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_ASCENDING(macro, macro, macro)
|
||||
|
||||
#define CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) \
|
||||
(((block_size) / (num_stripes)) & ~(sizeof(size_t) - 1))
|
||||
|
||||
#define CRC32C_SSE4_BLOCK_SIZE(block_size, num_stripes) \
|
||||
(CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) * (num_stripes))
|
||||
|
||||
// Expands to the name of the multiplication table that belongs to the given
// (block_size, num_stripes) pair, e.g. mul_table_512_3_.
// Both arguments must be literal constants because they are token-pasted
// into the identifier; including num_stripes keeps the names distinct when
// two configurations share a block size.
#define CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
    mul_table_##block_size##_##num_stripes##_
|
||||
|
||||
class RollingCrc32cSSE4;
|
||||
|
||||
class Crc32cSSE4 {
|
||||
public:
|
||||
// Exports Crc, TableEntry, and Word (needed by RollingCrc).
|
||||
typedef size_t Crc;
|
||||
typedef Crc Word;
|
||||
typedef Crc TableEntry;
|
||||
|
||||
Crc32cSSE4() {}
|
||||
|
||||
// Initializes the tables for the fixed CRC32C generating polynomial.
|
||||
// If "canonical" is true, crc value will be XOR'ed with (-1) before and
|
||||
// after actual CRC computation.
|
||||
explicit Crc32cSSE4(bool canonical) {
|
||||
Init(canonical);
|
||||
}
|
||||
void Init(bool canonical);
|
||||
|
||||
// Initializes the tables given generating polynomial of degree.
|
||||
// If "canonical" is true, crc value will be XOR'ed with (-1) before and
|
||||
// after actual CRC computation.
|
||||
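// Provided for compatibility with GenericCrc: the tables are initialized
// only when the arguments match FixedGeneratingPolynomial() and
// FixedDegree(); any other polynomial or degree is silently ignored.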
|
||||
Crc32cSSE4(const Crc &generating_polynomial,
|
||||
size_t degree,
|
||||
bool canonical) {
|
||||
Init(generating_polynomial, degree, canonical);
|
||||
}
|
||||
void Init(const Crc &generating_polynomial,
|
||||
size_t degree,
|
||||
bool canonical) {
|
||||
if (generating_polynomial == FixedGeneratingPolynomial() &&
|
||||
degree == FixedDegree()) {
|
||||
Init(canonical);
|
||||
}
|
||||
}
|
||||
|
||||
// Returns fixed generating polynomial the class implements.
|
||||
static Crc FixedGeneratingPolynomial() {
|
||||
return 0x82f63b78;
|
||||
}
|
||||
|
||||
// Returns degree of fixed generating polynomial the class implements.
|
||||
static Crc FixedDegree() {
|
||||
return 32;
|
||||
}
|
||||
|
||||
// Returns base class.
|
||||
const GfUtil<Crc> &Base() const { return base_; }
|
||||
|
||||
// Computes CRC32C of "bytes" bytes at "data", starting from CRC value "crc".
|
||||
size_t CrcDefault(const void *data, size_t bytes, const Crc &crc) const {
|
||||
return Crc32c(data, bytes, crc);
|
||||
}
|
||||
|
||||
// Returns true iff crc32 instruction is available.
|
||||
static bool IsSSE42Available();
|
||||
|
||||
protected:
|
||||
// Actual implementation.
|
||||
size_t Crc32c(const void *data, size_t bytes, Crc crc) const;
|
||||
|
||||
enum {
|
||||
kTableEntryBits = 8,
|
||||
kTableEntries = 1 << kTableEntryBits,
|
||||
kNumTables = (32 + kTableEntryBits - 1) / kTableEntryBits,
|
||||
kNumTablesHalfLo = kNumTables / 2,
|
||||
kNumTablesHalfHi = (kNumTables + 1) / 2,
|
||||
|
||||
kUnrolledLoopCount = 8,
|
||||
kUnrolledLoopBytes = kUnrolledLoopCount * sizeof(size_t),
|
||||
};
|
||||
|
||||
// May be set to size_t or uint32, whichever is faster.
|
||||
typedef uint32 Entry;
|
||||
|
||||
#define DECLARE_MUL_TABLE(block_size, num_stripes) \
|
||||
Entry CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
|
||||
[kNumTables][kTableEntries]
|
||||
|
||||
CRC32C_SSE4_ENUMERATE_ALL_BLOCKS(DECLARE_MUL_TABLE);
|
||||
|
||||
#undef DECLARE_MUL_TABLE
|
||||
|
||||
GfUtil<Crc> base_;
|
||||
|
||||
#if !CRCUTIL_USE_MM_CRC32
|
||||
TableEntry crc_word_[sizeof(Word)][256];
|
||||
friend class RollingCrc32cSSE4;
|
||||
#endif // !CRCUTIL_USE_MM_CRC32
|
||||
} GCC_ALIGN_ATTRIBUTE(16);
|
||||
|
||||
class RollingCrc32cSSE4 {
|
||||
public:
|
||||
typedef Crc32cSSE4::Crc Crc;
|
||||
typedef Crc32cSSE4::TableEntry TableEntry;
|
||||
typedef Crc32cSSE4::Word Word;
|
||||
|
||||
RollingCrc32cSSE4() {}
|
||||
|
||||
// Initializes internal data structures.
|
||||
// Retains reference to "crc" instance -- it is used by Start().
|
||||
RollingCrc32cSSE4(const Crc32cSSE4 &crc,
|
||||
size_t roll_window_bytes,
|
||||
const Crc &start_value) {
|
||||
Init(crc, roll_window_bytes, start_value);
|
||||
}
|
||||
void Init(const Crc32cSSE4 &crc,
|
||||
size_t roll_window_bytes,
|
||||
const Crc &start_value);
|
||||
|
||||
// Computes crc of "roll_window_bytes" using
|
||||
// "start_value" of "crc" (see Init()).
|
||||
Crc Start(const void *data) const {
|
||||
return crc_->CrcDefault(data, roll_window_bytes_, start_value_);
|
||||
}
|
||||
|
||||
// Computes CRC of "roll_window_bytes" starting in next position.
|
||||
Crc Roll(const Crc &old_crc, size_t byte_out, size_t byte_in) const {
|
||||
Crc crc = old_crc;
|
||||
CRC_UPDATE_BYTE(crc, byte_in);
|
||||
crc ^= out_[byte_out];
|
||||
return crc;
|
||||
}
|
||||
|
||||
// Returns start value.
|
||||
Crc StartValue() const { return start_value_; }
|
||||
|
||||
// Returns length of roll window.
|
||||
size_t WindowBytes() const { return roll_window_bytes_; }
|
||||
|
||||
protected:
|
||||
typedef Crc Entry;
|
||||
Entry out_[256];
|
||||
|
||||
// Used only by Start().
|
||||
Crc start_value_;
|
||||
const Crc32cSSE4 *crc_;
|
||||
size_t roll_window_bytes_;
|
||||
|
||||
#if !CRCUTIL_USE_MM_CRC32
|
||||
TableEntry crc_word_[sizeof(Word)][256];
|
||||
#endif // !CRCUTIL_USE_MM_CRC32
|
||||
} GCC_ALIGN_ATTRIBUTE(16);
|
||||
|
||||
#pragma pack(pop)
|
||||
|
||||
} // namespace crcutil
|
||||
|
||||
#endif // HAVE_I386 || HAVE_AMD64
|
||||
|
||||
#endif // CRCUTIL_CRC32C_SSE4_H_
|
||||
99
crcutil-1.0/code/crc32c_sse4_intrin.h
Normal file
99
crcutil-1.0/code/crc32c_sse4_intrin.h
Normal file
@@ -0,0 +1,99 @@
|
||||
// Copyright 2010 Google Inc. All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Provides _mm_crc32_u64/32/8 intrinsics.
|
||||
|
||||
#ifndef CRCUTIL_CRC32C_SSE4_INTRIN_H_
|
||||
#define CRCUTIL_CRC32C_SSE4_INTRIN_H_
|
||||
|
||||
#include "platform.h"
|
||||
#include "base_types.h"
|
||||
|
||||
#if CRCUTIL_USE_MM_CRC32 && (HAVE_I386 || HAVE_AMD64)
|
||||
|
||||
#if defined(_MSC_VER) || defined(__SSE4_2__)
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(push)
|
||||
// '_M_IA64' is not defined as a preprocessor macro
|
||||
#pragma warning(disable: 4668)
|
||||
#endif // defined(_MSC_VER)
|
||||
|
||||
#include <nmmintrin.h>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(pop)
|
||||
#endif // defined(_MSC_VER)
|
||||
|
||||
#elif GCC_VERSION_AVAILABLE(4, 5) && !defined(CRCUTIL_FORCE_ASM_CRC32C)
|
||||
// Allow the use of _mm_crc32_u* intrinsic when CRCUTIL_USE_MM_CRC32
|
||||
// is set irrespective of "-msse*" settings. This way, the sources
|
||||
// may be compiled with "-msse2 -mcrc32" and work on older CPUs,
|
||||
// while taking full advantage of "crc32" instruction on newer
|
||||
// CPUs (requires dynamic CPU detection). See "interface.cc".
|
||||
//
|
||||
// If neither -msse4 or -mcrc32 is provided and CRCUTIL_USE_MM_CRC32 is set
|
||||
// and CRCUTIL_FORCE_ASM_CRC32C is not set, compile-time error will happen.
|
||||
// Why? Because GCC disables __builtin_ia32_crc32* intrinsics when compiled
|
||||
// without -msse4 or -mcrc32. -msse4 could be detected at compile time by checking
|
||||
// whether __SSE4_2__ is defined, but there is no way to tell whether the
|
||||
// sources are compiled with -mcrc32.
|
||||
|
||||
extern __inline unsigned int __attribute__((
|
||||
__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_crc32_u8(unsigned int __C, unsigned char __V) {
|
||||
return __builtin_ia32_crc32qi(__C, __V);
|
||||
}
|
||||
#ifdef __x86_64__
|
||||
extern __inline unsigned long long __attribute__((
|
||||
__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_crc32_u64(unsigned long long __C, unsigned long long __V) {
|
||||
return __builtin_ia32_crc32di(__C, __V);
|
||||
}
|
||||
#else
|
||||
extern __inline unsigned int __attribute__((
|
||||
__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_crc32_u32(unsigned int __C, unsigned int __V) {
|
||||
return __builtin_ia32_crc32si (__C, __V);
|
||||
}
|
||||
#endif // __x86_64__
|
||||
|
||||
#else
|
||||
|
||||
// GCC 4.4.x and earlier: use inline asm.
|
||||
|
||||
namespace crcutil {
|
||||
|
||||
__forceinline uint64 _mm_crc32_u64(uint64 crc, uint64 value) {
|
||||
asm("crc32q %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value));
|
||||
return crc;
|
||||
}
|
||||
|
||||
// Inline-assembly replacement for the _mm_crc32_u32 intrinsic: updates the
// 32-bit "crc" with the 32-bit "value" using the "crc32l" instruction and
// returns the new CRC.
//
// "value" is 32 bits wide to match the operand size of "crc32l"; a 64-bit
// operand placed in a register would be printed with its 64-bit name and
// would not match the instruction suffix.
__forceinline uint32 _mm_crc32_u32(uint32 crc, uint32 value) {
  asm("crc32l %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value));
  return crc;
}
|
||||
|
||||
__forceinline uint32 _mm_crc32_u8(uint32 crc, uint8 value) {
|
||||
asm("crc32b %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value));
|
||||
return crc;
|
||||
}
|
||||
|
||||
} // namespace crcutil
|
||||
|
||||
#endif
|
||||
|
||||
#endif // CRCUTIL_USE_MM_CRC32 && (HAVE_I386 || HAVE_AMD64)
|
||||
|
||||
#endif // CRCUTIL_CRC32C_SSE4_INTRIN_H_
|
||||
68
crcutil-1.0/code/crc_casts.h
Normal file
68
crcutil-1.0/code/crc_casts.h
Normal file
@@ -0,0 +1,68 @@
|
||||
// Copyright 2010 Google Inc. All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Casting between integers and compound CRC types.
|
||||
|
||||
#ifndef CRCUTIL_CRC_CASTS_H_
|
||||
#define CRCUTIL_CRC_CASTS_H_
|
||||
|
||||
#include "base_types.h" // uint8, uint64
|
||||
#include "platform.h" // __forceinline
|
||||
|
||||
namespace crcutil {
|
||||
|
||||
// Downcasts a value of (oftentimes larger) Crc type to (smaller base integer)
|
||||
// Result type, enabling specialized downcasts implemented by "large integer"
|
||||
// classes (e.g. uint128_sse2).
|
||||
template<typename Crc, typename Result>
|
||||
__forceinline Result Downcast(const Crc &x) {
|
||||
return static_cast<Result>(x);
|
||||
}
|
||||
|
||||
// Extracts 8 least significant bits from a value of Crc type.
|
||||
#define TO_BYTE(x) Downcast<Crc, uint8>(x)
|
||||
|
||||
// Converts a pair of uint64 bit values into single value of CRC type.
|
||||
// It is caller's responsibility to ensure that the input is correct.
|
||||
template<typename Crc>
|
||||
__forceinline Crc CrcFromUint64(uint64 lo, uint64 hi = 0) {
|
||||
if (sizeof(Crc) <= sizeof(lo)) {
|
||||
return static_cast<Crc>(lo);
|
||||
} else {
|
||||
// static_cast to keep compiler happy.
|
||||
Crc result = static_cast<Crc>(hi);
|
||||
result = SHIFT_LEFT_SAFE(result, 8 * sizeof(lo));
|
||||
result ^= lo;
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
// Converts Crc value to a pair of uint64 values.
|
||||
template<typename Crc>
|
||||
__forceinline void Uint64FromCrc(const Crc &crc,
|
||||
uint64 *lo, uint64 *hi = NULL) {
|
||||
if (sizeof(*lo) >= sizeof(crc)) {
|
||||
*lo = Downcast<Crc, uint64>(crc);
|
||||
if (hi != NULL) {
|
||||
*hi = 0;
|
||||
}
|
||||
} else {
|
||||
*lo = Downcast<Crc, uint64>(crc);
|
||||
*hi = Downcast<Crc, uint64>(SHIFT_RIGHT_SAFE(crc, 8 * sizeof(lo)));
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace crcutil
|
||||
|
||||
#endif // CRCUTIL_CRC_CASTS_H_
|
||||
687
crcutil-1.0/code/generic_crc.h
Normal file
687
crcutil-1.0/code/generic_crc.h
Normal file
@@ -0,0 +1,687 @@
|
||||
// Copyright 2010 Google Inc. All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Defines GenericCrc class which implements arbitrary CRCs.
|
||||
//
|
||||
// Please read crc.pdf to understand how it all works.
|
||||
|
||||
#ifndef CRCUTIL_GENERIC_CRC_H_
|
||||
#define CRCUTIL_GENERIC_CRC_H_
|
||||
|
||||
#include "base_types.h" // uint8
|
||||
#include "crc_casts.h" // TO_BYTE(), Downcast<>.
|
||||
#include "gf_util.h" // GfUtil<Crc> class.
|
||||
#include "platform.h" // GCC_ALIGN_ATTRIBUTE(16)
|
||||
#include "uint128_sse2.h" // uint128_sse2 type (if necessary)
|
||||
|
||||
namespace crcutil {
|
||||
|
||||
#pragma pack(push, 16)
|
||||
|
||||
// Extends CRC by one byte.
|
||||
// Technically, if degree of a polynomial does not exceed 8,
|
||||
// right shift by 8 bits is not required, but who cares about CRC-8?
|
||||
#define CRC_BYTE(table, crc, byte) do { \
|
||||
crc = ((sizeof(crc) > 1) ? SHIFT_RIGHT_SAFE(crc, 8) : 0) ^ \
|
||||
table->crc_word_[sizeof(Word) - 1][TO_BYTE(crc) ^ (byte)]; \
|
||||
} while (0)
|
||||
|
||||
#define TABLE_ENTRY(table, byte, buf) \
|
||||
table[byte][Downcast<Word, uint8>(buf)]
|
||||
|
||||
#define TABLE_ENTRY_LAST(table, buf) \
|
||||
table[sizeof(Word) - 1][buf]
|
||||
|
||||
// Extends CRC by one word.
|
||||
#define CRC_WORD(table, crc, buf) do { \
|
||||
buf ^= Downcast<Crc, Word>(crc); \
|
||||
if (sizeof(crc) > sizeof(buf)) { \
|
||||
crc = SHIFT_RIGHT_SAFE(crc, sizeof(buf) * 8); \
|
||||
crc ^= TABLE_ENTRY(table->crc_word_, 0, buf); \
|
||||
} else { \
|
||||
crc = TABLE_ENTRY(table->crc_word_, 0, buf); \
|
||||
} \
|
||||
buf >>= 8; \
|
||||
for (size_t byte = 1; byte < sizeof(buf) - 1; ++byte) { \
|
||||
crc ^= TABLE_ENTRY(table->crc_word_, byte, buf); \
|
||||
buf >>= 8; \
|
||||
} \
|
||||
crc ^= TABLE_ENTRY_LAST(table->crc_word_, buf); \
|
||||
} while (0)
|
||||
|
||||
// Process beginning of data block byte by byte until source pointer
|
||||
// becomes perfectly aligned on Word boundary.
|
||||
#define ALIGN_ON_WORD_BOUNDARY(table, src, end, crc, Word) do { \
|
||||
while ((reinterpret_cast<size_t>(src) & (sizeof(Word) - 1)) != 0) { \
|
||||
if (src >= end) { \
|
||||
return (crc ^ table->Base().Canonize()); \
|
||||
} \
|
||||
CRC_BYTE(table, crc, *src); \
|
||||
src += 1; \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
|
||||
// On amd64, enforcing alignment is 2-4% slower on small (<= 64 bytes) blocks
|
||||
// but 6-10% faster on larger blocks (>= 2KB).
|
||||
// Break-even point (+-1%) is around 1KB (Q9650, E6600).
|
||||
//
|
||||
#define ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, table, src, end, crc, Word) \
|
||||
do { \
|
||||
if (sizeof(Word) > 8 || (bytes) > CRCUTIL_MIN_ALIGN_SIZE) { \
|
||||
ALIGN_ON_WORD_BOUNDARY(table, src, end, crc, Word); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(push)
|
||||
#pragma warning(disable: 4127) // conditional expression is constant
|
||||
#endif // defined(_MSC_VER)
|
||||
|
||||
// Forward declarations.
|
||||
template<typename CrcImplementation> class RollingCrc;
|
||||
|
||||
// Crc is the type used internally and to return values of N-bit CRC.
|
||||
// It should be at least as large as "TableEntry" and "Word" but
|
||||
// may be larger (e.g. for 16-bit CRC, TableEntry and Word may be
|
||||
// set to uint16 but Crc may be set to uint32).
|
||||
//
|
||||
// TableEntry is the type of values stored in the tables.
|
||||
// To implement N-bit CRC, TableEntry should be large enough
|
||||
// to store N bits.
|
||||
//
|
||||
// Word is the type used to read data sizeof(Word) at a time.
|
||||
// Ideally, it should be the "most suitable for given architecture"
|
||||
// integer type -- typically "size_t".
|
||||
//
|
||||
// kStride is the number of words processed in interleaved manner by
|
||||
// CrcMultiword() and CrcWordblock(). Shall be either 3 or 4.
|
||||
// Optimal value depends on hardware architecture (AMD64, ARM, etc).
|
||||
//
|
||||
template<typename _Crc, typename _TableEntry, typename _Word, int kStride>
|
||||
class GenericCrc {
|
||||
public:
|
||||
// Make Crc, TableEntry, and Word types visible (used by RollingCrc etc.)
|
||||
typedef _Crc Crc;
|
||||
typedef _TableEntry TableEntry;
|
||||
typedef _Word Word;
|
||||
|
||||
GenericCrc() {}
|
||||
|
||||
// Initializes the tables given a generating polynomial and its degree.
|
||||
// If "canonical" is true, crc value will be XOR'ed with (-1) before and
|
||||
// after actual CRC computation.
|
||||
GenericCrc(const Crc &generating_polynomial, size_t degree, bool canonical) {
|
||||
Init(generating_polynomial, degree, canonical);
|
||||
}
|
||||
void Init(const Crc &generating_polynomial, size_t degree, bool canonical) {
|
||||
base_.Init(generating_polynomial, degree, canonical);
|
||||
|
||||
// Instead of computing
|
||||
// table[j][i] = MultiplyUnnormalized(i, 8, k),
|
||||
// for all i = 0...255, we may notice that
|
||||
// if i = 2**n then for all m = 1...(i-1)
|
||||
// MultiplyUnnormalized(i + m, 8, k) =
|
||||
// MultiplyUnnormalized(i ^ m, 8, k) =
|
||||
// MultiplyUnnormalized(i, 8, k) ^ MultiplyUnnormalized(m, 8, k) =
|
||||
// MultiplyUnnormalized(i, 8, k) ^ crc_word_interleaved[j][m] =
|
||||
// table[i] ^ table[m].
|
||||
#if 0
|
||||
for (size_t j = 0; j < sizeof(Word); ++j) {
|
||||
Crc k = Base().XpowN((sizeof(Word) * kStride - 1 - j) * 8 + degree);
|
||||
for (size_t i = 0; i < 256; ++i) {
|
||||
Crc temp = Base().MultiplyUnnormalized(static_cast<Crc>(i), 8, k);
|
||||
this->crc_word_interleaved_[j][i] = Downcast<Crc, TableEntry>(temp);
|
||||
}
|
||||
}
|
||||
#else
|
||||
for (size_t j = 0; j < sizeof(Word); ++j) {
|
||||
Crc k = Base().XpowN((sizeof(Word) * kStride - 1 - j) * 8 + degree);
|
||||
TableEntry *table = this->crc_word_interleaved_[j];
|
||||
table[0] = 0; // Init 0s entry -- multiply 0 by anything yields 0.
|
||||
for (size_t i = 1; i < 256; i <<= 1) {
|
||||
TableEntry value = Downcast<Crc, TableEntry>(
|
||||
Base().MultiplyUnnormalized(static_cast<Crc>(i), 8, k));
|
||||
table[i] = value;
|
||||
for (size_t m = 1; m < i; ++m) {
|
||||
table[i + m] = value ^ table[m];
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
for (size_t j = 0; j < sizeof(Word); ++j) {
|
||||
Crc k = Base().XpowN((sizeof(Word) - 1 - j) * 8 + degree);
|
||||
for (size_t i = 0; i < 256; ++i) {
|
||||
Crc temp = Base().MultiplyUnnormalized(static_cast<Crc>(i), 8, k);
|
||||
this->crc_word_[j][i] = Downcast<Crc, TableEntry>(temp);
|
||||
}
|
||||
}
|
||||
#else
|
||||
for (size_t j = 0; j < sizeof(Word); ++j) {
|
||||
Crc k = Base().XpowN((sizeof(Word) - 1 - j) * 8 + degree);
|
||||
TableEntry *table = this->crc_word_[j];
|
||||
table[0] = 0; // Init 0s entry -- multiply 0 by anything yields 0.
|
||||
for (size_t i = 1; i < 256; i <<= 1) {
|
||||
TableEntry value = Downcast<Crc, TableEntry>(
|
||||
Base().MultiplyUnnormalized(static_cast<Crc>(i), 8, k));
|
||||
table[i] = value;
|
||||
for (size_t m = 1; m < i; ++m) {
|
||||
table[i + m] = value ^ table[m];
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
// Default CRC implementation
|
||||
Crc CrcDefault(const void *data, size_t bytes, const Crc &start) const {
  // Picks the implementation at compile time based on the target CPU.
#if !(HAVE_AMD64 || HAVE_I386)
  // Very few CPUs have multiple ALUs and speculative execution
  // (Itanium is an exception), so sophisticated algorithms will
  // not perform better than the good old Sarwate algorithm.
  return CrcByteUnrolled(data, bytes, start);
#else
  // x86 / amd64: use the interleaved multiword implementation.
  return CrcMultiword(data, bytes, start);
#endif  // !(HAVE_AMD64 || HAVE_I386)
}
|
||||
|
||||
// Returns base class.
|
||||
const GfUtil<Crc> &Base() const { return base_; }
|
||||
|
||||
protected:
|
||||
// Canonical, byte-by-byte CRC computation.
|
||||
Crc CrcByte(const void *data, size_t bytes, const Crc &start) const {
  // Reference implementation: folds the input into the CRC one byte at
  // a time. The start value and the result are both XOR-ed with
  // Base().Canonize().
  const uint8 *cursor = static_cast<const uint8 *>(data);
  const uint8 *const limit = cursor + bytes;
  Crc crc = start ^ Base().Canonize();
  while (cursor < limit) {
    CRC_BYTE(this, crc, *cursor);
    ++cursor;
  }
  return (crc ^ Base().Canonize());
}
|
||||
|
||||
// Byte-by-byte CRC with main loop unrolled.
|
||||
Crc CrcByteUnrolled(const void *data, size_t bytes, const Crc &start) const {
  // Byte-at-a-time CRC with the main loop unrolled 4 times.
  //   data  - input buffer (may be NULL only when bytes == 0).
  //   bytes - number of bytes to process.
  //   start - starting CRC value.
  // Returns the CRC of the buffer; an empty buffer yields "start".
  if (bytes == 0) {
    return start;
  }

  const uint8 *src = static_cast<const uint8 *>(data);
  Crc crc = start ^ Base().Canonize();

  // Loops are driven by byte counts rather than by an "end - 3" pointer:
  // for inputs shorter than 3 bytes that pointer would lie before the
  // start of the buffer, which is undefined behaviour.
  for (size_t quads = bytes >> 2; quads != 0; --quads, src += 4) {
    PREFETCH(src);
    CRC_BYTE(this, crc, src[0]);
    CRC_BYTE(this, crc, src[1]);
    CRC_BYTE(this, crc, src[2]);
    CRC_BYTE(this, crc, src[3]);
  }

  // Compute CRC of the remaining 0..3 bytes.
  for (size_t tail = bytes & 3; tail != 0; --tail, ++src) {
    CRC_BYTE(this, crc, *src);
  }

  return (crc ^ Base().Canonize());
}
|
||||
|
||||
// Byte-by-byte CRC computation that reads the input sizeof(Crc) bytes at a time.
|
||||
Crc CrcByteWord(const void *data, size_t bytes, const Crc &start) const {
  // Reads the input sizeof(Crc) bytes at a time, XORs each chunk into the
  // running CRC and then advances the CRC by sizeof(Crc) zero bytes using
  // the byte-wide table.
  //   data  - input buffer.
  //   bytes - number of bytes to process.
  //   start - starting CRC value.
  // Returns the CRC of the buffer.
  const uint8 *src = static_cast<const uint8 *>(data);
  const uint8 *end = src + bytes;
  Crc crc0 = start ^ Base().Canonize();

  // May return from this function if the data ends before alignment.
  ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, Crc);
  if (src >= end) {
    return (crc0 ^ Base().Canonize());
  }

  // Track the remaining byte count instead of temporarily moving "end"
  // backwards: on short buffers "end - N" would point before the buffer,
  // which is undefined behaviour.
  size_t remaining = static_cast<size_t>(end - src);

  // Process 4*sizeof(Crc) bytes at a time.
  for (; remaining >= 4 * sizeof(Crc);
       remaining -= 4 * sizeof(Crc), src += 4 * sizeof(Crc)) {
    for (size_t i = 0; i < 4; ++i) {
      crc0 ^= reinterpret_cast<const Crc *>(src)[i];
      if (i == 0) {
        PREFETCH(src);
      }
      for (size_t byte = 0; byte < sizeof(crc0); ++byte) {
        CRC_BYTE(this, crc0, 0);
      }
    }
  }

  // Process sizeof(Crc) bytes at a time.
  for (; remaining >= sizeof(Crc);
       remaining -= sizeof(Crc), src += sizeof(Crc)) {
    crc0 ^= reinterpret_cast<const Crc *>(src)[0];
    for (size_t byte = 0; byte < sizeof(crc0); ++byte) {
      CRC_BYTE(this, crc0, 0);
    }
  }

  // Compute CRC of remaining bytes.
  for (; remaining != 0; --remaining, ++src) {
    CRC_BYTE(this, crc0, *src);
  }

  return (crc0 ^ Base().Canonize());
}
|
||||
|
||||
// Faster, word-by-word CRC.
|
||||
Crc CrcWord(const void *data, size_t bytes, const Crc &start) const {
  // Word-at-a-time CRC: consumes the input one Word per table round
  // (CRC_WORD), four Words per iteration of the main loop.
  //   data  - input buffer.
  //   bytes - number of bytes to process.
  //   start - starting CRC value.
  // Returns the CRC of the buffer.
  const uint8 *src = static_cast<const uint8 *>(data);
  const uint8 *end = src + bytes;
  Crc crc0 = start ^ Base().Canonize();

  // May return from this function if the data ends before alignment.
  ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, Word);
  if (src >= end) {
    return (crc0 ^ Base().Canonize());
  }

  // Track the remaining byte count instead of temporarily moving "end"
  // backwards: on short buffers "end - N" would point before the buffer,
  // which is undefined behaviour.
  size_t remaining = static_cast<size_t>(end - src);

  // Process 4 * sizeof(Word) bytes at once.
  for (; remaining >= 4 * sizeof(Word);
       remaining -= 4 * sizeof(Word), src += 4 * sizeof(Word)) {
    const Word *words = reinterpret_cast<const Word *>(src);
    Word buf0 = words[0];
    PREFETCH(src);
    CRC_WORD(this, crc0, buf0);
    buf0 = words[1];
    CRC_WORD(this, crc0, buf0);
    buf0 = words[2];
    CRC_WORD(this, crc0, buf0);
    buf0 = words[3];
    CRC_WORD(this, crc0, buf0);
  }

  // Process sizeof(Word) bytes at a time.
  for (; remaining >= sizeof(Word);
       remaining -= sizeof(Word), src += sizeof(Word)) {
    Word buf0 = reinterpret_cast<const Word *>(src)[0];
    CRC_WORD(this, crc0, buf0);
  }

  // Compute CRC of remaining bytes.
  for (; remaining != 0; --remaining, ++src) {
    CRC_BYTE(this, crc0, *src);
  }

  return (crc0 ^ Base().Canonize());
}
|
||||
|
||||
#define REPEAT_FROM_1(macro) \
|
||||
macro(1); \
|
||||
macro(2); \
|
||||
macro(3); \
|
||||
macro(4); \
|
||||
macro(5); \
|
||||
macro(6); \
|
||||
macro(7);
|
||||
|
||||
#define REPEAT_FROM_0(macro) \
|
||||
macro(0); \
|
||||
REPEAT_FROM_1(macro)
|
||||
|
||||
// Faster, process adjacent blocks in parallel and concatenate CRCs.
|
||||
Crc CrcBlockword(const void *data, size_t bytes, const Crc &start) const {
|
||||
if (kStride < 2 || kStride > 8) {
|
||||
// Unsupported configuration;
|
||||
// fall back to something sensible.
|
||||
return CrcWord(data, bytes, start);
|
||||
}
|
||||
|
||||
const uint8 *src = static_cast<const uint8 *>(data);
|
||||
const uint8 *end = src + bytes;
|
||||
Crc crc0 = start ^ Base().Canonize();
|
||||
enum {
|
||||
// Add 16 to avoid false L1 cache collisions.
|
||||
kStripe = (15*1024 + 16) & ~(sizeof(Word) - 1),
|
||||
};
|
||||
|
||||
ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, Word);
|
||||
if (src >= end) {
|
||||
return (crc0 ^ Base().Canonize());
|
||||
}
|
||||
|
||||
end -= kStride * kStripe - 1;
|
||||
if (src < end) {
|
||||
Crc x_pow_8kStripe = Base().Xpow8N(kStripe);
|
||||
do {
|
||||
const uint8 *stripe_end = src + kStripe;
|
||||
|
||||
#define INIT_CRC(reg) \
|
||||
Crc crc##reg; \
|
||||
if (kStride >= reg) { \
|
||||
crc##reg = 0; \
|
||||
}
|
||||
REPEAT_FROM_1(INIT_CRC);
|
||||
#undef INIT_CRC
|
||||
|
||||
do {
|
||||
#define FIRST(reg) \
|
||||
Word buf##reg; \
|
||||
if (kStride > reg) { \
|
||||
buf##reg = reinterpret_cast<const Word *>(src + reg * kStripe)[0]; \
|
||||
buf##reg ^= Downcast<Crc, Word>(crc##reg); \
|
||||
if (sizeof(crc##reg) > sizeof(buf##reg)) { \
|
||||
crc##reg = SHIFT_RIGHT_SAFE(crc##reg, sizeof(buf##reg) * 8); \
|
||||
crc##reg ^= TABLE_ENTRY(this->crc_word_, 0, buf##reg); \
|
||||
} else { \
|
||||
crc##reg = TABLE_ENTRY(this->crc_word_, 0, buf##reg); \
|
||||
} \
|
||||
buf##reg >>= 8; \
|
||||
}
|
||||
REPEAT_FROM_0(FIRST);
|
||||
#undef FIRST
|
||||
|
||||
for (size_t byte = 1; byte < sizeof(buf0) - 1; ++byte) {
|
||||
#define NEXT(reg) do { \
|
||||
if (kStride > reg) { \
|
||||
crc##reg ^= TABLE_ENTRY(this->crc_word_, byte, buf##reg); \
|
||||
buf##reg >>= 8; \
|
||||
} \
|
||||
} while (0)
|
||||
REPEAT_FROM_0(NEXT);
|
||||
#undef NEXT
|
||||
}
|
||||
|
||||
#define LAST(reg) do { \
|
||||
if (kStride > reg) { \
|
||||
crc##reg ^= TABLE_ENTRY_LAST(this->crc_word_, buf##reg); \
|
||||
} \
|
||||
} while (0)
|
||||
REPEAT_FROM_0(LAST);
|
||||
#undef LAST
|
||||
|
||||
src += sizeof(Word);
|
||||
} while (src < stripe_end);
|
||||
|
||||
#if 0
|
||||
// The code is left for illustrative purposes only.
|
||||
#define COMBINE(reg) do { \
|
||||
if (reg > 0 && kStride > reg) { \
|
||||
crc0 = Base().ChangeStartValue(crc##reg, kStripe, 0, crc0); \
|
||||
} \
|
||||
} while (0)
|
||||
#else
|
||||
#define COMBINE(reg) do { \
|
||||
if (reg > 0 && kStride > reg) { \
|
||||
crc0 = crc##reg ^ Base().Multiply(crc0, x_pow_8kStripe); \
|
||||
} \
|
||||
} while (0)
|
||||
#endif
|
||||
REPEAT_FROM_0(COMBINE);
|
||||
#undef COMBINE
|
||||
|
||||
src += (kStride - 1) * kStripe;
|
||||
}
|
||||
while (src < end);
|
||||
}
|
||||
end += kStride * kStripe - 1;
|
||||
|
||||
// Process sizeof(Word) bytes at a time.
|
||||
end -= sizeof(Word) - 1;
|
||||
for (; src < end; src += sizeof(Word)) {
|
||||
Word buf0 = reinterpret_cast<const Word *>(src)[0];
|
||||
CRC_WORD(this, crc0, buf0);
|
||||
}
|
||||
end += sizeof(Word) - 1;
|
||||
|
||||
// Compute CRC of remaining bytes.
|
||||
for (;src < end; ++src) {
|
||||
CRC_BYTE(this, crc0, *src);
|
||||
}
|
||||
|
||||
return (crc0 ^ Base().Canonize());
|
||||
}
|
||||
|
||||
// Fastest, interleaved multi-byte CRC.
|
||||
Crc CrcMultiword(const void *data, size_t bytes, const Crc &start) const {
|
||||
if (kStride < 2 || kStride > 8) {
|
||||
// Unsupported configuration;
|
||||
// fall back to something sensible.
|
||||
return CrcWord(data, bytes, start);
|
||||
}
|
||||
|
||||
const uint8 *src = static_cast<const uint8 *>(data);
|
||||
const uint8 *end = src + bytes;
|
||||
Crc crc0 = start ^ Base().Canonize();
|
||||
|
||||
ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, Word);
|
||||
if (src >= end) {
|
||||
return (crc0 ^ Base().Canonize());
|
||||
}
|
||||
|
||||
// Process kStride Word registers at once;
|
||||
// need at least 2*kInterleaveBytes of data to start.
|
||||
end -= 2*kInterleaveBytes - 1;
|
||||
if (src < end) {
|
||||
Crc crc_carryover;
|
||||
if (sizeof(Crc) > sizeof(Word)) {
|
||||
// crc_carryover is used if and only if Crc is wider than Word.
|
||||
crc_carryover = 0;
|
||||
}
|
||||
#define INIT_CRC(reg) \
|
||||
Crc crc##reg; \
|
||||
if (reg > 0 && kStride > reg) { \
|
||||
crc##reg = 0; \
|
||||
}
|
||||
REPEAT_FROM_1(INIT_CRC);
|
||||
#undef INIT_CRC
|
||||
|
||||
#define INIT_BUF(reg) \
|
||||
Word buf##reg; \
|
||||
if (kStride > reg) { \
|
||||
buf##reg = reinterpret_cast<const Word *>(src)[reg]; \
|
||||
}
|
||||
REPEAT_FROM_0(INIT_BUF);
|
||||
#undef INIT_BUF
|
||||
|
||||
do {
|
||||
PREFETCH(src);
|
||||
src += kInterleaveBytes;
|
||||
|
||||
if (sizeof(Crc) > sizeof(Word)) {
|
||||
crc0 ^= crc_carryover;
|
||||
}
|
||||
|
||||
#define FIRST(reg, next_reg) do { \
|
||||
if (kStride > reg) { \
|
||||
buf##reg ^= Downcast<Crc, Word>(crc##reg); \
|
||||
if (sizeof(Crc) > sizeof(Word)) { \
|
||||
if (reg < kStride - 1) { \
|
||||
crc##next_reg ^= SHIFT_RIGHT_SAFE(crc##reg, 8 * sizeof(buf0)); \
|
||||
} else { \
|
||||
crc_carryover = SHIFT_RIGHT_SAFE(crc##reg, 8 * sizeof(buf0)); \
|
||||
} \
|
||||
} \
|
||||
crc##reg = TABLE_ENTRY(this->crc_word_interleaved_, 0, buf##reg); \
|
||||
buf##reg >>= 8; \
|
||||
} \
|
||||
} while (0)
|
||||
FIRST(0, 1);
|
||||
FIRST(1, 2);
|
||||
FIRST(2, 3);
|
||||
FIRST(3, 4);
|
||||
FIRST(4, 5);
|
||||
FIRST(5, 6);
|
||||
FIRST(6, 7);
|
||||
FIRST(7, 0);
|
||||
#undef FIRST
|
||||
|
||||
for (size_t byte = 1; byte < sizeof(Word) - 1; ++byte) {
|
||||
#define NEXT(reg) do { \
|
||||
if (kStride > reg) { \
|
||||
crc##reg ^= \
|
||||
TABLE_ENTRY(this->crc_word_interleaved_, byte, buf##reg); \
|
||||
buf##reg >>= 8; \
|
||||
} \
|
||||
} while(0)
|
||||
REPEAT_FROM_0(NEXT);
|
||||
#undef NEXT
|
||||
}
|
||||
|
||||
#define LAST(reg) do { \
|
||||
if (kStride > reg) { \
|
||||
crc##reg ^= TABLE_ENTRY_LAST(this->crc_word_interleaved_, buf##reg); \
|
||||
buf##reg = reinterpret_cast<const Word *>(src)[reg]; \
|
||||
} \
|
||||
} while(0)
|
||||
REPEAT_FROM_0(LAST);
|
||||
#undef LAST
|
||||
}
|
||||
while (src < end);
|
||||
|
||||
if (sizeof(Crc) > sizeof(Word)) {
|
||||
crc0 ^= crc_carryover;
|
||||
}
|
||||
|
||||
#define COMBINE(reg) do { \
|
||||
if (kStride > reg) { \
|
||||
if (reg != 0) { \
|
||||
crc0 ^= crc##reg; \
|
||||
} \
|
||||
CRC_WORD(this, crc0, buf##reg); \
|
||||
} \
|
||||
} while (0)
|
||||
REPEAT_FROM_0(COMBINE);
|
||||
#undef COMBINE
|
||||
|
||||
src += kInterleaveBytes;
|
||||
}
|
||||
end += 2*kInterleaveBytes - 1;
|
||||
|
||||
// Process sizeof(Word) bytes at once.
|
||||
end -= sizeof(Word) - 1;
|
||||
for (; src < end; src += sizeof(Word)) {
|
||||
Word buf0 = reinterpret_cast<const Word *>(src)[0];
|
||||
CRC_WORD(this, crc0, buf0);
|
||||
}
|
||||
end += sizeof(Word) - 1;
|
||||
|
||||
// Compute CRC of remaining bytes.
|
||||
for (;src < end; ++src) {
|
||||
CRC_BYTE(this, crc0, *src);
|
||||
}
|
||||
|
||||
return (crc0 ^ Base().Canonize());
|
||||
}
|
||||
|
||||
protected:
|
||||
enum {
|
||||
kInterleaveBytes = sizeof(Word) * kStride,
|
||||
};
|
||||
|
||||
// Multiplication tables used by CRCs.
|
||||
TableEntry crc_word_interleaved_[sizeof(Word)][256];
|
||||
TableEntry crc_word_[sizeof(Word)][256];
|
||||
|
||||
// Base class stored after CRC tables so that the most frequently
|
||||
// used table is at offset 0 and may be accessed faster.
|
||||
GfUtil<Crc> base_;
|
||||
|
||||
friend class RollingCrc< GenericCrc<Crc, TableEntry, Word, kStride> >;
|
||||
|
||||
private:
|
||||
// CrcMultiword on amd64 may run at 1.2 CPU cycles per byte which is
|
||||
// noticeably faster than CrcWord (2.2-2.6 cycles/byte depending on
|
||||
// hardware and compiler). However, there are problems with compilers.
|
||||
//
|
||||
// Test system: P45 chipset, Intel Q9650 CPU, 800MHz 4-4-4-12 memory.
|
||||
//
|
||||
// 64-bit compiler, <= 64-bit CRC, 64-bit tables, 64-bit reads:
|
||||
// CL 15.00.307291.1 C++ >1.2< CPU cycles/byte
|
||||
// ICL 11.1.051 -O3 C++ 1.5 CPU cycles/byte
|
||||
// GCC 4.5 -O3 C++ 2.0 CPU cycles/byte
|
||||
// GCC 4.x -O3 ASM >1.2< CPU cycles/byte
|
||||
//
|
||||
// 32-bit compiler, MMX used, <= 64-bit CRC, 64-bit tables, 64-bit reads
|
||||
// CL 15.00.307291.1 C++ 2.0 CPU cycles/byte
|
||||
// GCC 4.5 -O3 C++ 1.9 CPU cycles/byte
|
||||
// ICL 11.1.051 -S C++ 1.6 CPU cycles/byte
|
||||
// GCC 4.x -O3 ASM >1.3< CPU cycles/byte
|
||||
//
|
||||
// So, use inline ASM code for GCC for both i386 and amd64.
|
||||
|
||||
Crc CrcMultiwordI386Mmx(
|
||||
const void *data, size_t bytes, const Crc &start) const;
|
||||
Crc CrcMultiwordGccAmd64(
|
||||
const void *data, size_t bytes, const Crc &start) const;
|
||||
Crc CrcMultiwordGccAmd64Sse2(
|
||||
const uint8 *src, const uint8 *end, const Crc &start) const;
|
||||
} GCC_ALIGN_ATTRIBUTE(16);
|
||||
|
||||
#undef REPEAT_FROM_0
|
||||
#undef REPEAT_FROM_1
|
||||
|
||||
|
||||
// Specialized variants.
|
||||
#if CRCUTIL_USE_ASM
|
||||
|
||||
#if (defined(__GNUC__) && (HAVE_AMD64 || (HAVE_I386 && HAVE_MMX)))
|
||||
|
||||
// Declare specialized functions.
|
||||
template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiword(
|
||||
const void *data, size_t bytes, const uint64 &start) const;
|
||||
|
||||
#if HAVE_AMD64 && HAVE_SSE2
|
||||
template<>
|
||||
uint128_sse2
|
||||
GenericCrc<uint128_sse2, uint128_sse2, uint64, 4>::CrcMultiword(
|
||||
const void *data, size_t bytes, const uint128_sse2 &start) const;
|
||||
#endif // HAVE_AMD64 && HAVE_SSE2
|
||||
|
||||
#elif defined(_MSC_FULL_VER) && _MSC_FULL_VER <= 150030729 && \
|
||||
(HAVE_I386 && HAVE_MMX)
|
||||
|
||||
// Work around bug in MSC (present at least in v. 15.00.30729.1)
|
||||
template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordI386Mmx(
|
||||
const void *data,
|
||||
size_t bytes,
|
||||
const uint64 &start) const;
|
||||
template<> __forceinline
uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiword(
    const void *data,
    size_t bytes,
    const uint64 &start) const {
  // The CRC_BYTE / TO_BYTE macros refer to types named Word and Crc.
  typedef uint64 Word;
  typedef uint64 Crc;

  // Anything longer than 12 bytes goes to the MMX implementation.
  if (bytes > 12) {
    return CrcMultiwordI386Mmx(data, bytes, start);
  }

  // Short inputs are handled right here, one byte at a time.
  const uint8 *cursor = static_cast<const uint8 *>(data);
  const uint8 *const limit = cursor + bytes;
  uint64 crc = start ^ Base().Canonize();
  while (cursor < limit) {
    CRC_BYTE(this, crc, *cursor);
    ++cursor;
  }
  return (crc ^ Base().Canonize());
}
|
||||
|
||||
#endif // (defined(__GNUC__) && (HAVE_AMD64 || (HAVE_I386 && HAVE_MMX)))
|
||||
|
||||
#endif // CRCUTIL_USE_ASM
|
||||
|
||||
|
||||
#pragma pack(pop)
|
||||
|
||||
} // namespace crcutil
|
||||
|
||||
#endif // CRCUTIL_GENERIC_CRC_H_
|
||||
304
crcutil-1.0/code/gf_util.h
Normal file
304
crcutil-1.0/code/gf_util.h
Normal file
@@ -0,0 +1,304 @@
|
||||
// Copyright 2010 Google Inc. All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Defines GfUtil template class which implements
|
||||
// 1. some useful operations in GF(2^n),
|
||||
// 2. CRC helper function (e.g. concatenation of CRCs) which are
|
||||
// not affected by specific implementation of CRC computation per se.
|
||||
//
|
||||
// Please read crc.pdf to understand how it all works.
|
||||
|
||||
#ifndef CRCUTIL_GF_UTIL_H_
|
||||
#define CRCUTIL_GF_UTIL_H_
|
||||
|
||||
#include "base_types.h" // uint8, uint64
|
||||
#include "crc_casts.h" // TO_BYTE()
|
||||
#include "platform.h" // GCC_ALIGN_ATTRIBUTE(16), SHIFT_*_SAFE
|
||||
|
||||
namespace crcutil {
|
||||
|
||||
#pragma pack(push, 16)
|
||||
|
||||
// "Crc" is the type used internally and to return values of N-bit CRC.
|
||||
template<typename Crc> class GfUtil {
|
||||
public:
|
||||
// Initializes the tables given generating polynomial of degree (degree).
|
||||
// If "canonical" is true, starting CRC value and computed CRC value will be
|
||||
// XOR-ed with 111...111.
|
||||
GfUtil() {}
|
||||
GfUtil(const Crc &generating_polynomial, size_t degree, bool canonical) {
|
||||
Init(generating_polynomial, degree, canonical);
|
||||
}
|
||||
void Init(const Crc &generating_polynomial, size_t degree, bool canonical) {
|
||||
Crc one = 1;
|
||||
one <<= degree - 1;
|
||||
this->generating_polynomial_ = generating_polynomial;
|
||||
this->crc_bytes_ = (degree + 7) >> 3;
|
||||
this->degree_ = degree;
|
||||
this->one_ = one;
|
||||
if (canonical) {
|
||||
this->canonize_ = one | (one - 1);
|
||||
} else {
|
||||
this->canonize_ = 0;
|
||||
}
|
||||
this->normalize_[0] = 0;
|
||||
this->normalize_[1] = generating_polynomial;
|
||||
|
||||
Crc k = one >> 1;
|
||||
for (size_t i = 0; i < sizeof(uint64) * 8; ++i) {
|
||||
this->x_pow_2n_[i] = k;
|
||||
k = Multiply(k, k);
|
||||
}
|
||||
|
||||
this->crc_of_crc_ = Multiply(this->canonize_,
|
||||
this->one_ ^ Xpow8N((degree + 7) >> 3));
|
||||
|
||||
FindLCD(Xpow8N(this->crc_bytes_), &this->x_pow_minus_W_);
|
||||
}
|
||||
|
||||
// Returns generating polynomial.
|
||||
Crc GeneratingPolynomial() const {
|
||||
return this->generating_polynomial_;
|
||||
}
|
||||
|
||||
// Returns number of bits in CRC (degree of generating polynomial).
|
||||
size_t Degree() const {
|
||||
return this->degree_;
|
||||
}
|
||||
|
||||
// Returns start/finish adjustment constant.
|
||||
Crc Canonize() const {
|
||||
return this->canonize_;
|
||||
}
|
||||
|
||||
// Returns normalized value of 1.
|
||||
Crc One() const {
|
||||
return this->one_;
|
||||
}
|
||||
|
||||
// Returns value of CRC(A, |A|, start_new) given known
|
||||
// crc=CRC(A, |A|, start_old) -- without touching the data.
|
||||
Crc ChangeStartValue(const Crc &crc, uint64 bytes,
                     const Crc &start_old,
                     const Crc &start_new) const {
  // The difference between the two start values, advanced over "bytes"
  // bytes of data, is exactly what has to be folded into the known CRC.
  Crc start_delta = start_new ^ start_old;
  Crc adjustment = Multiply(start_delta, Xpow8N(bytes));
  return (crc ^ adjustment);
}
|
||||
|
||||
// Returns CRC of concatenation of blocks A and B when CRCs
|
||||
// of blocks A and B are known -- without touching the data.
|
||||
//
|
||||
// To be precise, given CRC(A, |A|, startA) and CRC(B, |B|, 0),
|
||||
// returns CRC(AB, |AB|, startA).
|
||||
Crc Concatenate(const Crc &crc_A, const Crc &crc_B, uint64 bytes_B) const {
  // CRC(B) is taken to have been computed with start value 0; re-basing
  // it onto CRC(A) as its start value gives the CRC of AB.
  Crc start_B = 0;
  return ChangeStartValue(crc_B, bytes_B, start_B, crc_A);
}
|
||||
|
||||
// Returns CRC of sequence of zeroes -- without touching the data.
|
||||
Crc CrcOfZeroes(uint64 bytes, const Crc &start) const {
  // Strip the canonization from the start value, advance it over
  // "bytes" zero bytes, then canonize the result again.
  Crc raw_start = start ^ this->canonize_;
  Crc advanced = Multiply(raw_start, Xpow8N(bytes));
  return (advanced ^ this->canonize_);
}
|
||||
|
||||
// Given CRC of a message, stores extra (degree + 7)/8 bytes after
|
||||
// the message so that CRC(message+extra, start) = result.
|
||||
// Does not change CRC start value (use ChangeStartValue for that).
|
||||
// Returns number of stored bytes.
|
||||
size_t StoreComplementaryCrc(void *dst,
                             const Crc &message_crc,
                             const Crc &result) const {
  // Computes the (degree + 7)/8 bytes that, appended to the message,
  // make the CRC of message+extra equal to "result", and writes them
  // to "dst" least significant byte first.
  //   dst         - output buffer with room for (degree + 7)/8 bytes.
  //   message_crc - CRC of the message computed so far.
  //   result      - desired CRC of the message followed by the extra bytes.
  // Returns the number of bytes stored.
  Crc crc0 = Multiply(result ^ this->canonize_, this->x_pow_minus_W_);
  crc0 ^= message_crc ^ this->canonize_;

  // Byte serialization is shared with StoreCrc() instead of being
  // duplicated here.
  return StoreCrc(dst, crc0);
}
|
||||
|
||||
// Stores given CRC of a message as (degree + 7)/8 bytes filled
|
||||
// with 0s to the right. Returns number of stored bytes.
|
||||
// CRC of the message and stored CRC is a constant value returned
|
||||
// by CrcOfCrc() -- it does not depend on contents of the message.
|
||||
size_t StoreCrc(void *dst, const Crc &crc) const {
  // Writes the crc_bytes_ low-order bytes of "crc" to "dst",
  // least significant byte first, and reports how many were written.
  const size_t count = this->crc_bytes_;
  uint8 *out = reinterpret_cast<uint8 *>(dst);
  Crc pending = crc;
  for (uint8 *const stop = out + count; out != stop; ++out) {
    *out = TO_BYTE(pending);
    pending >>= 8;
  }
  return count;
}
|
||||
|
||||
// Returns expected CRC value of CRC(Message,CRC(Message))
|
||||
// when CRC is stored after the message. This value is fixed
|
||||
// and does not depend on the message or CRC start value.
|
||||
Crc CrcOfCrc() const {
|
||||
return this->crc_of_crc_;
|
||||
}
|
||||
|
||||
// Returns ((a * b) mod P) where "a" and "b" are of degree <= (D-1).
|
||||
Crc Multiply(const Crc &aa, const Crc &bb) const {
  Crc multiplier = aa;
  Crc multiplicand = bb;

  // (v ^ (v - 1)) covers the lowest set bit of v and everything below it.
  // Keep as "multiplier" the operand for which that mask is larger: the
  // loop below runs until the multiplier is shifted out completely, and
  // that operand needs fewer shifts. A zero operand yields an all-ones
  // mask, so it always ends up as the multiplier.
  if ((multiplier ^ (multiplier - 1)) <
      (multiplicand ^ (multiplicand - 1))) {
    Crc swapped = multiplier;
    multiplier = multiplicand;
    multiplicand = swapped;
  }

  if (multiplier == 0) {
    return multiplier;
  }

  const Crc top_bit = this->one_;
  Crc product = 0;
  while (multiplier != 0) {
    // Consume the term at the "one" position, if present.
    if ((multiplier & top_bit) != 0) {
      product ^= multiplicand;
      multiplier ^= top_bit;
    }
    // Advance the multiplicand by one power of x; normalize_[] folds the
    // generating polynomial back in when a set bit is shifted out.
    multiplicand = (multiplicand >> 1) ^
        this->normalize_[Downcast<Crc, size_t>(multiplicand & 1)];
    multiplier <<= 1;
  }

  return product;
}
|
||||
|
||||
// Returns ((unnorm * m) mod P) where degree of m is <= (D-1)
|
||||
// and degree of value "unnorm" is provided explicitly.
|
||||
Crc MultiplyUnnormalized(const Crc &unnorm, size_t degree,
                         const Crc &m) const {
  // Mask selecting the degree_ low-order bits of a value.
  Crc chunk_mask = this->one_ | (this->one_ - 1);

  Crc pending = unnorm;
  Crc accumulated = 0;

  // While the operand is wider than the CRC, peel off degree_ bits at a
  // time and multiply each chunk by m * x**(remaining degree).
  for (; degree > this->degree_; pending >>= this->degree_) {
    degree -= this->degree_;
    Crc chunk = pending & chunk_mask;
    Crc scaled_m = Multiply(m, XpowN(degree));
    accumulated ^= Multiply(chunk, scaled_m);
  }

  // What is left fits into the CRC width: shift it up by the unused
  // (degree_ - degree) bits and multiply directly.
  accumulated ^= Multiply(pending << (this->degree_ - degree), m);
  return accumulated;
}
|
||||
|
||||
// returns ((x ** n) mod P).
|
||||
Crc XpowN(uint64 n) const {
  // Square-and-multiply over the precomputed x ** (2 ** i) table:
  // every set bit i of n contributes the factor x_pow_2n_[i].
  Crc power = this->one_;
  size_t bit = 0;
  while (n != 0) {
    if ((n & 1) != 0) {
      power = Multiply(power, this->x_pow_2n_[bit]);
    }
    n >>= 1;
    ++bit;
  }
  return power;
}
|
||||
|
||||
// Returns (x ** (8 * n) mod P).
|
||||
Crc Xpow8N(uint64 n) const {
|
||||
return XpowN(n << 3);
|
||||
}
|
||||
|
||||
// Returns remainder (A mod B) and sets *q = (A/B) of division
|
||||
// of two polynomials:
|
||||
// A = dividend + dividend_x_pow_D_coef * x**degree,
|
||||
// B = divisor.
|
||||
Crc Divide(const Crc &dividend0, int dividend_x_pow_D_coef,
           const Crc &divisor0, Crc *q) const {
  // Polynomial division over GF(2).
  //   dividend0             - terms of the dividend below x**degree.
  //   dividend_x_pow_D_coef - non-zero if the dividend also has an
  //                           x**degree term, which does not fit in Crc.
  //   divisor0              - the divisor; must be non-zero, otherwise the
  //                           alignment loop below never terminates.
  //   q                     - receives the quotient.
  // Returns the remainder.
  Crc divisor = divisor0;
  Crc dividend = dividend0;
  Crc quotient = 0;
  Crc coef = this->one_;

  // Shift the divisor right until its bit 0 is set, moving "coef" down
  // in lockstep from the "one" position.
  while ((divisor & 1) == 0) {
    divisor >>= 1;
    coef >>= 1;
  }

  // Account for the x**degree term of the dividend, which cannot be
  // represented in "dividend" itself.
  if (dividend_x_pow_D_coef) {
    quotient = coef >> 1;
    dividend ^= divisor >> 1;
  }

  // Long division: visit dividend bits starting at bit 0 and cancel each
  // set one with the divisor, which is shifted left together with "coef".
  // Stops once "coef" is back at the "one" position.
  Crc x_pow_degree_b = 1;
  for (;;) {
    if ((dividend & x_pow_degree_b) != 0) {
      dividend ^= divisor;
      quotient ^= coef;
    }
    if (coef == this->one_) {
      break;
    }
    coef <<= 1;
    x_pow_degree_b <<= 1;
    divisor <<= 1;
  }

  *q = quotient;
  return dividend;
}
|
||||
|
||||
// Extended Euclid's algorithm -- for given A finds LCD(A, P) and
|
||||
// value B such that (A * B) mod P = LCD(A, P).
|
||||
Crc FindLCD(const Crc &A, Crc *B) const {
  // Trivial inputs: 0 and one_ are returned unchanged, both as the result
  // and as the cofactor.
  if (A == 0 || A == this->one_) {
    *B = A;
    return A;
  }

  // Remainder sequence. It starts with the generating polynomial, which is
  // really (generating_polynomial_ + x**degree): the extra top term is passed
  // to Divide() as a separate flag, and only on the first division.
  Crc prev_rem = this->generating_polynomial_;
  Crc cur_rem = A;

  // Cofactor sequence that is updated alongside the remainders.
  Crc prev_cof = 0;
  Crc cur_cof = this->one_;

  for (int top_term = 1; ; top_term = 0) {
    Crc quotient;
    Crc remainder = Divide(prev_rem, top_term, cur_rem, &quotient);
    if (remainder == 0) {
      // cur_rem divides prev_rem exactly: the iteration is finished.
      break;
    }

    Crc next_cof = prev_cof ^ Multiply(quotient, cur_cof);

    prev_rem = cur_rem;
    cur_rem = remainder;

    prev_cof = cur_cof;
    cur_cof = next_cof;
  }

  *B = cur_cof;
  return cur_rem;
}
|
||||
|
||||
protected:
|
||||
Crc canonize_;
|
||||
Crc x_pow_2n_[sizeof(uint64) * 8];
|
||||
Crc generating_polynomial_;
|
||||
Crc one_;
|
||||
Crc x_pow_minus_W_;
|
||||
Crc crc_of_crc_;
|
||||
Crc normalize_[2];
|
||||
size_t crc_bytes_;
|
||||
size_t degree_;
|
||||
} GCC_ALIGN_ATTRIBUTE(16);
|
||||
|
||||
#pragma pack(pop)
|
||||
|
||||
} // namespace crcutil
|
||||
|
||||
#endif // CRCUTIL_GF_UTIL_H_
|
||||
291
crcutil-1.0/code/multiword_128_64_gcc_amd64_sse2.cc
Normal file
291
crcutil-1.0/code/multiword_128_64_gcc_amd64_sse2.cc
Normal file
@@ -0,0 +1,291 @@
|
||||
// Copyright 2010 Google Inc. All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Implements 128-bit multiword CRC for GCC on AMD64 using SSE2.
|
||||
//
|
||||
// Small comment: the trick described in
|
||||
// http://software.intel.com/en-us/articles/fast-simd-integer-move-for-the-intel-pentiumr-4-processor
|
||||
// (replace "movdqa dst, src" with "pshufd $0xE4, src, dst")
|
||||
// did not work: execution time increased from
|
||||
// 1.8 CPU cycles/byte to 2.1 CPU cycles/byte.
|
||||
// So it may be good idea on P4 but it's not on newer CPUs.
|
||||
//
|
||||
// movaps/xorps vs. movdqa/pxor did not make any difference.
|
||||
|
||||
#include "generic_crc.h"
|
||||
#include "uint128_sse2.h"
|
||||
|
||||
#if defined(__GNUC__) && CRCUTIL_USE_ASM && HAVE_AMD64 && HAVE_SSE2
|
||||
|
||||
namespace crcutil {
|
||||
|
||||
template<> uint128_sse2
|
||||
GenericCrc<uint128_sse2, uint128_sse2, uint64, 4>::CrcMultiwordGccAmd64Sse2(
|
||||
const uint8 *src, const uint8 *end, const uint128_sse2 &start) const;
|
||||
|
||||
template<>
|
||||
uint128_sse2 GenericCrc<uint128_sse2, uint128_sse2, uint64, 4>::CrcMultiword(
|
||||
const void *data, size_t bytes, const uint128_sse2 &start) const {
|
||||
const uint8 *src = static_cast<const uint8 *>(data);
|
||||
uint128_sse2 crc = start ^ this->Base().Canonize();
|
||||
const uint8 *end = src + bytes;
|
||||
if (bytes <= 7) {
|
||||
for (; src < end; ++src) {
|
||||
CRC_BYTE(this, crc, *src);
|
||||
}
|
||||
return (crc ^ this->Base().Canonize());
|
||||
}
|
||||
|
||||
ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc, uint64);
|
||||
if (src >= end) {
|
||||
return (crc ^ this->Base().Canonize());
|
||||
}
|
||||
|
||||
return CrcMultiwordGccAmd64Sse2(src, end, crc);
|
||||
}
|
||||
|
||||
// Assembly fragment (AT&T syntax, GCC named operands) that folds the 64-bit
// word held in %[buf0] into the 128-bit CRC held in %[crc0]:
//   1. the low 64 bits of crc0 are moved to tmp0 and XORed into buf0;
//   2. crc0 is shifted right by 8 bytes (psrldq $8);
//   3. for each of the 8 bytes of buf0, lowest first, the 16-byte entry
//      table_word[k][byte] is XORed into crc0. The byte index is doubled
//      (addq) and then scaled by 8 in the addressing mode, i.e. multiplied
//      by 16; consecutive per-byte tables are 256*16 bytes apart.
// Expects operands crc0, buf0, tmp0, tmp1 and table_word to be declared by
// the enclosing asm statement; buf0, tmp0 and tmp1 are destroyed.
// SSE2_MOVQ is defined elsewhere -- presumably the xmm-to-GPR 64-bit move
// mnemonic; confirm in its header.
#define CRC_WORD_ASM() \
|
||||
SSE2_MOVQ " %[crc0], %[tmp0]\n" \
|
||||
"xorq %[tmp0], %[buf0]\n" \
|
||||
"psrldq $8, %[crc0]\n" \
|
||||
"movzbq %b[buf0], %[tmp0]\n" \
|
||||
"shrq $8, %[buf0]\n" \
|
||||
"addq %[tmp0], %[tmp0]\n" \
|
||||
"pxor (%[table_word], %[tmp0], 8), %[crc0]\n" \
|
||||
"movzbq %b[buf0], %[tmp1]\n" \
|
||||
"shrq $8, %[buf0]\n" \
|
||||
"addq %[tmp1], %[tmp1]\n" \
|
||||
"pxor 1*256*16(%[table_word], %[tmp1], 8), %[crc0]\n" \
|
||||
"movzbq %b[buf0], %[tmp0]\n" \
|
||||
"shrq $8, %[buf0]\n" \
|
||||
"addq %[tmp0], %[tmp0]\n" \
|
||||
"pxor 2*256*16(%[table_word], %[tmp0], 8), %[crc0]\n" \
|
||||
"movzbq %b[buf0], %[tmp1]\n" \
|
||||
"shrq $8, %[buf0]\n" \
|
||||
"addq %[tmp1], %[tmp1]\n" \
|
||||
"pxor 3*256*16(%[table_word], %[tmp1], 8), %[crc0]\n" \
|
||||
"movzbq %b[buf0], %[tmp0]\n" \
|
||||
"shrq $8, %[buf0]\n" \
|
||||
"addq %[tmp0], %[tmp0]\n" \
|
||||
"pxor 4*256*16(%[table_word], %[tmp0], 8), %[crc0]\n" \
|
||||
"movzbq %b[buf0], %[tmp1]\n" \
|
||||
"shrq $8, %[buf0]\n" \
|
||||
"addq %[tmp1], %[tmp1]\n" \
|
||||
"pxor 5*256*16(%[table_word], %[tmp1], 8), %[crc0]\n" \
|
||||
"movzbq %b[buf0], %[tmp0]\n" \
|
||||
"shrq $8, %[buf0]\n" \
|
||||
"addq %[tmp0], %[tmp0]\n" \
|
||||
"pxor 6*256*16(%[table_word], %[tmp0], 8), %[crc0]\n" \
|
||||
"addq %[buf0], %[buf0]\n" \
|
||||
"pxor 7*256*16(%[table_word], %[buf0], 8), %[crc0]\n"
|
||||
|
||||
// SSE2 assembly implementation of the 128-bit multiword CRC.
//
//   src, end - byte range to process.
//   start    - CRC state to continue from; the dispatcher passes it already
//              XORed with Canonize(), and this function XORs Canonize() into
//              the value it returns.
//
// Structure of the assembly:
//   main loop (labels 1..2) - taken while at least 64 bytes remain. Four
//       consecutive 64-bit words are processed per iteration, each with its
//       own accumulator (crc0..crc3) and the interleaved table. The high 64
//       bits of each accumulator are XORed into the next one
//       (crc0->crc1->crc2->crc3); those of crc3 are kept in crc_carryover and
//       XORed into crc0 at the start of the next iteration. After the loop
//       the four words that were already loaded are folded into crc0 one by
//       one with CRC_WORD_ASM() (non-interleaved table).
//   word loop (labels 3..4) - one 64-bit word at a time while at least
//       8 bytes remain.
//   byte loop (labels 5..6) - remaining bytes.
// "end" is biased by -63, then -7, then restored, so that each loop can use
// a plain "cmpq src, end; ja" test.
template<> uint128_sse2
GenericCrc<uint128_sse2, uint128_sse2, uint64, 4>::CrcMultiwordGccAmd64Sse2(
    const uint8 *src, const uint8 *end, const uint128_sse2 &start) const {
  __m128i crc0 = start;
  __m128i crc1;
  __m128i crc2;
  __m128i crc3;
  __m128i crc_carryover;

  uint64 buf0;
  uint64 buf1;
  uint64 buf2;
  uint64 buf3;

  uint64 tmp0;
  uint64 tmp1;

  asm(
    "sub $2*4*8 - 1, %[end]\n"
    "cmpq %[src], %[end]\n"
    "jbe 2f\n"

    "pxor %[crc1], %[crc1]\n"
    "pxor %[crc2], %[crc2]\n"
    "pxor %[crc3], %[crc3]\n"
    "pxor %[crc_carryover], %[crc_carryover]\n"
    "movq (%[src]), %[buf0]\n"
    "movq 1*8(%[src]), %[buf1]\n"
    "movq 2*8(%[src]), %[buf2]\n"
    "movq 3*8(%[src]), %[buf3]\n"

    "1:\n"
#if HAVE_SSE && CRCUTIL_PREFETCH_WIDTH > 0
    "prefetcht0 " TO_STRING(CRCUTIL_PREFETCH_WIDTH) "(%[src])\n"
#endif
#if GCC_VERSION_AVAILABLE(4, 5)
    // Bug in GCC 4.2.4?
    "add $4*8, %[src]\n"
#else
    "lea 4*8(%[src]), %[src]\n"
#endif
    "pxor %[crc_carryover], %[crc0]\n"

    SSE2_MOVQ " %[crc0], %[tmp0]\n"
    "psrldq $8, %[crc0]\n"
    "xorq %[tmp0], %[buf0]\n"
    "movzbq %b[buf0], %[tmp0]\n"
    "pxor %[crc0], %[crc1]\n"
    "addq %[tmp0], %[tmp0]\n"
    "shrq $8, %[buf0]\n"
    "movdqa (%[table], %[tmp0], 8), %[crc0]\n"

    SSE2_MOVQ " %[crc1], %[tmp1]\n"
    "psrldq $8, %[crc1]\n"
    "xorq %[tmp1], %[buf1]\n"
    "movzbq %b[buf1], %[tmp1]\n"
    "pxor %[crc1], %[crc2]\n"
    "addq %[tmp1], %[tmp1]\n"
    "shrq $8, %[buf1]\n"
    "movdqa (%[table], %[tmp1], 8), %[crc1]\n"

    SSE2_MOVQ " %[crc2], %[tmp0]\n"
    "psrldq $8, %[crc2]\n"
    "xorq %[tmp0], %[buf2]\n"
    "movzbq %b[buf2], %[tmp0]\n"
    "pxor %[crc2], %[crc3]\n"
    "addq %[tmp0], %[tmp0]\n"
    "shrq $8, %[buf2]\n"
    "movdqa (%[table], %[tmp0], 8), %[crc2]\n"

    SSE2_MOVQ " %[crc3], %[tmp1]\n"
    "psrldq $8, %[crc3]\n"
    "xorq %[tmp1], %[buf3]\n"
    "movzbq %b[buf3], %[tmp1]\n"
    "movdqa %[crc3], %[crc_carryover]\n"
    "addq %[tmp1], %[tmp1]\n"
    "shrq $8, %[buf3]\n"
    "movdqa (%[table], %[tmp1], 8), %[crc3]\n"

#define XOR(byte) \
    "movzbq %b[buf0], %[tmp0]\n" \
    "shrq $8, %[buf0]\n" \
    "addq %[tmp0], %[tmp0]\n" \
    "pxor " #byte "*256*16(%[table], %[tmp0], 8), %[crc0]\n" \
    "movzbq %b[buf1], %[tmp1]\n" \
    "shrq $8, %[buf1]\n" \
    "addq %[tmp1], %[tmp1]\n" \
    "pxor " #byte "*256*16(%[table], %[tmp1], 8), %[crc1]\n" \
    "movzbq %b[buf2], %[tmp0]\n" \
    "shrq $8, %[buf2]\n" \
    "addq %[tmp0], %[tmp0]\n" \
    "pxor " #byte "*256*16(%[table], %[tmp0], 8), %[crc2]\n" \
    "movzbq %b[buf3], %[tmp1]\n" \
    "shrq $8, %[buf3]\n" \
    "addq %[tmp1], %[tmp1]\n" \
    "pxor " #byte "*256*16(%[table], %[tmp1], 8), %[crc3]\n"

    XOR(1)
    XOR(2)
    XOR(3)
    XOR(4)
    XOR(5)
    XOR(6)
#undef XOR

    "addq %[buf0], %[buf0]\n"
    "pxor 7*256*16(%[table], %[buf0], 8), %[crc0]\n"
    "movq (%[src]), %[buf0]\n"

    "addq %[buf1], %[buf1]\n"
    "pxor 7*256*16(%[table], %[buf1], 8), %[crc1]\n"
    "movq 1*8(%[src]), %[buf1]\n"

    "addq %[buf2], %[buf2]\n"
    "pxor 7*256*16(%[table], %[buf2], 8), %[crc2]\n"
    "movq 2*8(%[src]), %[buf2]\n"

    "addq %[buf3], %[buf3]\n"
    "pxor 7*256*16(%[table], %[buf3], 8), %[crc3]\n"
    "movq 3*8(%[src]), %[buf3]\n"

    "cmpq %[src], %[end]\n"
    "ja 1b\n"

    "pxor %[crc_carryover], %[crc0]\n"
    CRC_WORD_ASM()

    "pxor %[crc1], %[crc0]\n"
    "movq %[buf1], %[buf0]\n"
    CRC_WORD_ASM()

    "pxor %[crc2], %[crc0]\n"
    "movq %[buf2], %[buf0]\n"
    CRC_WORD_ASM()

    "pxor %[crc3], %[crc0]\n"
    "movq %[buf3], %[buf0]\n"
    CRC_WORD_ASM()

    "add $4*8, %[src]\n"
    "2:\n"

    "add $2*4*8 - 8, %[end]\n"

    "cmpq %[src], %[end]\n"
    "jbe 4f\n"
    "3:\n"
    "movq (%[src]), %[buf0]\n"
    "addq $8, %[src]\n"
    CRC_WORD_ASM()
    "cmpq %[src], %[end]\n"
    "ja 3b\n"

    "4:\n"
    "add $7, %[end]\n"

    "cmpq %[src], %[end]\n"
    "jbe 6f\n"

    "5:\n"
    "movzbq (%[src]), %[buf0]\n"
    "add $1, %[src]\n"
    SSE2_MOVQ " %[crc0], %[tmp0]\n"
    "movzx %b[tmp0], %[tmp0]\n"
    "psrldq $1, %[crc0]\n"
    "xor %[buf0], %[tmp0]\n"
    "addq %[tmp0], %[tmp0]\n"
    "pxor 7*256*16(%[table_word], %[tmp0], 8), %[crc0]\n"

    "cmpq %[src], %[end]\n"
    "ja 5b\n"

    "6:\n"

    : // outputs
      [src] "+r" (src),
      [end] "+r" (end),
      [crc0] "+x" (crc0),
      [crc1] "=&x" (crc1),
      [crc2] "=&x" (crc2),
      [crc3] "=&x" (crc3),
      [crc_carryover] "=&x" (crc_carryover),
      [buf0] "=&r" (buf0),
      [buf1] "=&r" (buf1),
      [buf2] "=&r" (buf2),
      [buf3] "=&r" (buf3),
      [tmp0] "=&r" (tmp0),
      [tmp1] "=&r" (tmp1)

    : // inputs
      [table_word] "r" (this->crc_word_),
      [table] "r" (this->crc_word_interleaved_)

    : // clobbers
      // The code dereferences src and both table pointers, none of which is
      // described to the compiler as a memory operand, so the compiler must
      // be told that memory is accessed; the arithmetic also changes flags.
      "memory",
      "cc");

  return (this->Base().Canonize() ^ crc0);
}
|
||||
|
||||
} // namespace crcutil
|
||||
|
||||
#endif // defined(__GNUC__) && CRCUTIL_USE_ASM && HAVE_AMD64 && HAVE_SSE2
|
||||
304
crcutil-1.0/code/multiword_64_64_cl_i386_mmx.cc
Normal file
304
crcutil-1.0/code/multiword_64_64_cl_i386_mmx.cc
Normal file
@@ -0,0 +1,304 @@
|
||||
// Copyright 2010 Google Inc. All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Implements 64-bit multiword CRC for Microsoft and Intel compilers
|
||||
// using MMX instructions (i386).
|
||||
|
||||
#include "generic_crc.h"
|
||||
|
||||
#if CRCUTIL_USE_ASM && HAVE_I386 && HAVE_MMX && defined(_MSC_VER)
|
||||
|
||||
namespace crcutil {
|
||||
|
||||
// MSVC inline-assembly fragment (Intel syntax: destination first) that folds
// the 64-bit word in BUF0 into the 64-bit CRC in CRC0:
//   1. BUF0 ^= CRC0;
//   2. the low 32 bits of BUF0 go to TMP0, then BUF0 is shifted right by 32
//      so that the second "movd" picks up the high half;
//   3. for each half, three bytes are extracted through TMP0L/TEMP with
//      8-bit shifts and the fourth is whatever is left in TMP0; byte k
//      selects the 8-byte entry [TABLE + byte * 8 + k * 256 * 8]. The first
//      lookup overwrites CRC0 (movq), the other seven are XORed in.
// CRC0, BUF0, TMP0, TMP0L, TEMP and TABLE are register aliases that must be
// #defined at the point of expansion. BUF0, TMP0 and TEMP are destroyed.
#define CRC_WORD_MMX() \
|
||||
__asm pxor BUF0, CRC0 \
|
||||
__asm movd TMP0, BUF0 \
|
||||
__asm psrlq BUF0, 32 \
|
||||
__asm movzx TEMP, TMP0L \
|
||||
__asm shr TMP0, 8 \
|
||||
__asm movq CRC0, [TABLE + TEMP * 8] \
|
||||
__asm movzx TEMP, TMP0L \
|
||||
__asm shr TMP0, 8 \
|
||||
__asm pxor CRC0, [TABLE + TEMP * 8 + 1 * 256 * 8] \
|
||||
__asm movzx TEMP, TMP0L \
|
||||
__asm shr TMP0, 8 \
|
||||
__asm pxor CRC0, [TABLE + TEMP * 8 + 2 * 256 * 8] \
|
||||
__asm pxor CRC0, [TABLE + TMP0 * 8 + 3 * 256 * 8] \
|
||||
__asm movd TMP0, BUF0 \
|
||||
__asm movzx TEMP, TMP0L \
|
||||
__asm shr TMP0, 8 \
|
||||
__asm pxor CRC0, [TABLE + TEMP * 8 + 4 * 256 * 8] \
|
||||
__asm movzx TEMP, TMP0L \
|
||||
__asm shr TMP0, 8 \
|
||||
__asm pxor CRC0, [TABLE + TEMP * 8 + 5 * 256 * 8] \
|
||||
__asm movzx TEMP, TMP0L \
|
||||
__asm shr TMP0, 8 \
|
||||
__asm pxor CRC0, [TABLE + TEMP * 8 + 6 * 256 * 8] \
|
||||
__asm pxor CRC0, [TABLE + TMP0 * 8 + 7 * 256 * 8]
|
||||
|
||||
// frame pointer register 'ebp' modified by inline assembly code
|
||||
#pragma warning(disable: 4731)
|
||||
|
||||
// MMX implementation of the 64-bit multiword CRC for MSVC-style inline
// assembly on i386.
//
//   data, bytes - input buffer.
//   start       - CRC value to continue from.
// Returns the updated CRC.
//
// Structure of the assembly:
//   main loop - taken while at least 64 bytes remain; four consecutive
//       64-bit words per iteration, one accumulator each (CRC0..CRC3),
//       using the interleaved table. The four words still held in
//       BUF0..BUF3 when the loop exits are folded into CRC0 one after the
//       other with CRC_WORD_MMX() and the plain word table.
//   word loop - one 64-bit word at a time while at least 8 bytes remain.
//   The final (< 8) bytes are processed in C++ after the asm block.
//
// Register notes:
//   * ebp is repurposed as the table pointer (TABLE), which is why it is
//     pushed/popped by hand; no C++ local is referenced between the point
//     where ebp is overwritten and the final "pop ebp".
//   * Inside the main loop all four general-purpose temporaries are busy, so
//     the loop bound lives on the stack and END is defined as [esp]. After
//     the loop it is popped back into TMP1 and END is redefined accordingly.
//   * The register alias macros are #undef'd after the asm block so that
//     generic names such as END, SRC or TABLE do not leak into the rest of
//     the translation unit.
template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordI386Mmx(
    const void *data,
    size_t bytes,
    const uint64 &start) const {
  const uint8 *src = static_cast<const uint8 *>(data);
  const uint8 *end = src + bytes;
  uint64 crc0 = start ^ this->Base().Canonize();

  ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, uint64);
  if (src >= end) {
    return (crc0 ^ this->Base().Canonize());
  }

#define CRC0 mm0
#define CRC1 mm1
#define CRC2 mm2
#define CRC3 mm3
#define BUF0 mm4
#define BUF1 mm5
#define BUF2 mm6
#define BUF3 mm7
#define TMP0 eax
#define TMP0L al
#define TMP0H ah
#define TMP1 ebx
#define TMP1L bl
#define TMP1H bh
#define TMP2 ecx
#define TMP2L cl
#define TMP2H ch
#define TMP3 edx
#define TMP3L dl
#define TMP3H dh
#define TEMP edi
#define SRC esi
#define END [esp]
#define TABLE ebp


  const uint64 *interleaved_table_address =
      &this->crc_word_interleaved_[0][0];
  const uint64 *word_table_address = &this->crc_word_[0][0];

  __asm {
    push ebp

    mov TMP0, interleaved_table_address

    movq CRC0, crc0
    mov SRC, src
    mov TMP1, end
    sub TMP1, 2*4*8 - 1
    cmp SRC, TMP1
    mov TABLE, word_table_address
    jae end_main_loop

    push TABLE
    mov TABLE, TMP0
    push TMP1

    pxor CRC1, CRC1
    pxor CRC2, CRC2
    pxor CRC3, CRC3

    movq BUF0, [SRC]
    movq BUF1, [SRC + 1 * 8]
    movq BUF2, [SRC + 2 * 8]
    movq BUF3, [SRC + 3 * 8]

 main_loop:
#if HAVE_SSE && CRCUTIL_PREFETCH_WIDTH > 0
    prefetcht0 [SRC + CRCUTIL_PREFETCH_WIDTH]
#endif
    add SRC, 32
    pxor BUF0, CRC0
    pxor BUF1, CRC1
    pxor BUF2, CRC2
    pxor BUF3, CRC3

    movd TMP0, BUF0
    psrlq BUF0, 32
    movd TMP1, BUF1
    psrlq BUF1, 32
    movd TMP2, BUF2
    psrlq BUF2, 32
    movd TMP3, BUF3
    psrlq BUF3, 32

    movzx TEMP, TMP0L
    movq CRC0, [TABLE + TEMP * 8]
    movzx TEMP, TMP1L
    movq CRC1, [TABLE + TEMP * 8]
    movzx TEMP, TMP2L
    movq CRC2, [TABLE + TEMP * 8]
    movzx TEMP, TMP3L
    movq CRC3, [TABLE + TEMP * 8]

    movzx TEMP, TMP0H
    shr TMP0, 16
    pxor CRC0, [TABLE + TEMP * 8 + 1 * 256 * 8]
    movzx TEMP, TMP1H
    shr TMP1, 16
    pxor CRC1, [TABLE + TEMP * 8 + 1 * 256 * 8]
    movzx TEMP, TMP2H
    shr TMP2, 16
    pxor CRC2, [TABLE + TEMP * 8 + 1 * 256 * 8]
    movzx TEMP, TMP3H
    shr TMP3, 16
    pxor CRC3, [TABLE + TEMP * 8 + 1 * 256 * 8]

    movzx TEMP, TMP0L
    shr TMP0, 8
    pxor CRC0, [TABLE + TEMP * 8 + 2 * 256 * 8]
    movzx TEMP, TMP1L
    shr TMP1, 8
    pxor CRC1, [TABLE + TEMP * 8 + 2 * 256 * 8]
    movzx TEMP, TMP2L
    shr TMP2, 8
    pxor CRC2, [TABLE + TEMP * 8 + 2 * 256 * 8]
    movzx TEMP, TMP3L
    shr TMP3, 8
    pxor CRC3, [TABLE + TEMP * 8 + 2 * 256 * 8]

    pxor CRC0, [TABLE + TMP0 * 8 + 3 * 256 * 8]
    movd TMP0, BUF0
    pxor CRC1, [TABLE + TMP1 * 8 + 3 * 256 * 8]
    movd TMP1, BUF1
    pxor CRC2, [TABLE + TMP2 * 8 + 3 * 256 * 8]
    movd TMP2, BUF2
    pxor CRC3, [TABLE + TMP3 * 8 + 3 * 256 * 8]
    movd TMP3, BUF3

    movzx TEMP, TMP0L
    pxor CRC0, [TABLE + TEMP * 8 + 4 * 256 * 8]
    movzx TEMP, TMP1L
    pxor CRC1, [TABLE + TEMP * 8 + 4 * 256 * 8]
    movzx TEMP, TMP2L
    pxor CRC2, [TABLE + TEMP * 8 + 4 * 256 * 8]
    movzx TEMP, TMP3L
    pxor CRC3, [TABLE + TEMP * 8 + 4 * 256 * 8]

    movzx TEMP, TMP0H
    shr TMP0, 16
    pxor CRC0, [TABLE + TEMP * 8 + 5 * 256 * 8]
    movzx TEMP, TMP1H
    shr TMP1, 16
    pxor CRC1, [TABLE + TEMP * 8 + 5 * 256 * 8]
    movzx TEMP, TMP2H
    shr TMP2, 16
    pxor CRC2, [TABLE + TEMP * 8 + 5 * 256 * 8]
    movzx TEMP, TMP3H
    shr TMP3, 16
    pxor CRC3, [TABLE + TEMP * 8 + 5 * 256 * 8]

    movzx TEMP, TMP0L
    shr TMP0, 8
    pxor CRC0, [TABLE + TEMP * 8 + 6 * 256 * 8]
    movzx TEMP, TMP1L
    shr TMP1, 8
    pxor CRC1, [TABLE + TEMP * 8 + 6 * 256 * 8]
    movzx TEMP, TMP2L
    shr TMP2, 8
    pxor CRC2, [TABLE + TEMP * 8 + 6 * 256 * 8]
    movzx TEMP, TMP3L
    shr TMP3, 8
    pxor CRC3, [TABLE + TEMP * 8 + 6 * 256 * 8]

    pxor CRC0, [TABLE + TMP0 * 8 + 7 * 256 * 8]
    movq BUF0, [SRC]
    pxor CRC1, [TABLE + TMP1 * 8 + 7 * 256 * 8]
    movq BUF1, [SRC + 1 * 8]
    pxor CRC2, [TABLE + TMP2 * 8 + 7 * 256 * 8]
    movq BUF2, [SRC + 2 * 8]
    pxor CRC3, [TABLE + TMP3 * 8 + 7 * 256 * 8]
    movq BUF3, [SRC + 3 * 8]

    cmp END, SRC
    ja main_loop

#undef END
#define END TMP1
    pop END
    pop TABLE
    add SRC, 32

    CRC_WORD_MMX()

    pxor BUF1, CRC1
    movq BUF0, BUF1
    CRC_WORD_MMX()

    pxor BUF2, CRC2
    movq BUF0, BUF2
    CRC_WORD_MMX()

    pxor BUF3, CRC3
    movq BUF0, BUF3
    CRC_WORD_MMX()

 end_main_loop:
    add END, 2*4*8 - 8
    cmp SRC, END
    jae end_word_loop

 word_loop:
    movq BUF0, [SRC]
    add SRC, 8
    CRC_WORD_MMX()
    cmp END, SRC
    ja word_loop
 end_word_loop:

#if 0  // Plain C version is faster?
    add END, 7
    cmp SRC, END
    jae end_byte_loop

 byte_loop:
    movd TMP0, CRC0
    movzx TEMP, byte ptr [SRC]
    movzx TMP0, TMP0L
    psrlq CRC0, 8
    xor TEMP, TMP0
    add SRC, 1
    pxor CRC0, [TABLE + TEMP*8 + 7*256*8]
    cmp END, SRC
    ja byte_loop
 end_byte_loop:
#endif

    pop ebp

    mov src, SRC
    movq crc0, CRC0

    emms
  }

  // The register aliases are only meaningful inside the asm block above.
#undef CRC0
#undef CRC1
#undef CRC2
#undef CRC3
#undef BUF0
#undef BUF1
#undef BUF2
#undef BUF3
#undef TMP0
#undef TMP0L
#undef TMP0H
#undef TMP1
#undef TMP1L
#undef TMP1H
#undef TMP2
#undef TMP2L
#undef TMP2H
#undef TMP3
#undef TMP3L
#undef TMP3H
#undef TEMP
#undef SRC
#undef END
#undef TABLE

#if 1
  // Compute CRC of remaining bytes.
  for (;src < end; ++src) {
    CRC_BYTE(this, crc0, *src);
  }
#endif

  return (crc0 ^ this->Base().Canonize());
}
|
||||
|
||||
|
||||
} // namespace crcutil
|
||||
|
||||
#endif // CRCUTIL_USE_ASM && HAVE_I386 && HAVE_MMX && defined(_MSC_VER)
|
||||
298
crcutil-1.0/code/multiword_64_64_gcc_amd64_asm.cc
Normal file
298
crcutil-1.0/code/multiword_64_64_gcc_amd64_asm.cc
Normal file
@@ -0,0 +1,298 @@
|
||||
// Copyright 2010 Google Inc. All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Implements multiword CRC for GCC on AMD64.
|
||||
//
|
||||
// According to "Software Optimization Guide for AMD Family 10h Processors"
|
||||
// http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/40546.pdf
|
||||
// instead of
|
||||
// movzbq %al, %rsi
|
||||
// shrq $8, %rax
|
||||
// [use %rsi]
|
||||
// movzbq %al, %rsi
|
||||
// shrq $8, %rax
|
||||
// [use %rsi]
|
||||
// it is better to use 32-bit registers
|
||||
// (high 32 bits will be cleared on assignment), i.e.
|
||||
// movzbl %al, %esi
|
||||
// [use %rsi]
|
||||
// movzbl %ah, %esi
|
||||
// shrq $16, %rax
|
||||
// [use %rsi]
|
||||
// Makes instructions shorter and removes one shift
|
||||
// (the latter is not such a big deal as its execution time
|
||||
// is nicely masked by [use %rsi] instruction).
|
||||
//
|
||||
// Performance difference:
|
||||
// About 10% degradation on bytes = 8 .. 16
|
||||
// (clobbering registers that should be saved)
|
||||
// Break even at 32 bytes.
|
||||
// 3% improvement starting from 64 bytes.
|
||||
|
||||
#include "generic_crc.h"
|
||||
|
||||
#if defined(__GNUC__) && CRCUTIL_USE_ASM && HAVE_AMD64
|
||||
|
||||
namespace crcutil {
|
||||
|
||||
template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordGccAmd64(
|
||||
const void *data, size_t bytes, const uint64 &start) const;
|
||||
|
||||
// Entry point for the 64-bit CRC on AMD64. Inputs shorter than six words are
// handled right here in C++ (up to five whole words, then single bytes);
// everything else goes to the assembly routine.
template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiword(
    const void *data,
    size_t bytes,
    const uint64 &start) const {
  if (bytes > 6 * sizeof(Word) - 1) {
    return this->CrcMultiwordGccAmd64(data, bytes, start);
  }

  const uint8 *src = static_cast<const uint8 *>(data);
  const uint8 *end = src + bytes;
  uint64 crc = start ^ this->Base().Canonize();

#define PROCESS_ONE_WORD() do { \
  Word buf = reinterpret_cast<const Word *>(src)[0]; \
  CRC_WORD(this, crc, buf); \
  src += sizeof(Word); \
} while (0)

  // The guard above limits the number of whole words to 0..5. Enter the
  // chain at the matching case and fall through, so that exactly that many
  // words are consumed without any loop overhead.
  switch (bytes / sizeof(Word)) {
    case 5:
      PROCESS_ONE_WORD();
      // fall through
    case 4:
      PROCESS_ONE_WORD();
      // fall through
    case 3:
      PROCESS_ONE_WORD();
      // fall through
    case 2:
      PROCESS_ONE_WORD();
      // fall through
    case 1:
      PROCESS_ONE_WORD();
      break;
    default:
      break;
  }

  // Whatever is left is shorter than one word.
  while (src < end) {
    CRC_BYTE(this, crc, *src);
    ++src;
  }
  return (crc ^ this->Base().Canonize());
}
|
||||
|
||||
// Fixed register assignments for the assembly below, spelled as string
// literals so that they can be pasted between the pieces of an asm template.
// The doubled '%' is how a literal '%' is written inside a GCC extended-asm
// template. Every register named here must also appear in the clobber list
// of the asm statement that uses these macros.
//
// TMP0/TMP0W: scratch index register (64-bit name / 32-bit name).
#define TMP0 "%%rsi"
|
||||
#define TMP0W "%%esi"
|
||||
|
||||
// BUFn: data word n; BUFnL / BUFnH: its lowest and second-lowest byte.
#define BUF0 "%%rax"
|
||||
#define BUF0L "%%al"
|
||||
#define BUF0H "%%ah"
|
||||
|
||||
#define BUF1 "%%rbx"
|
||||
#define BUF1L "%%bl"
|
||||
#define BUF1H "%%bh"
|
||||
|
||||
#define BUF2 "%%rcx"
|
||||
#define BUF2L "%%cl"
|
||||
#define BUF2H "%%ch"
|
||||
|
||||
#define BUF3 "%%rdx"
|
||||
#define BUF3L "%%dl"
|
||||
#define BUF3H "%%dh"
|
||||
|
||||
// Assembly fragment (AT&T syntax) that folds the 64-bit word in BUF0 into
// the 64-bit CRC in %[crc0]:
//   1. BUF0 ^= crc0;
//   2. the eight bytes of BUF0 are consumed two at a time -- low byte via
//      BUF0L, next byte via BUF0H, then BUF0 is shifted right by 16 -- and
//      byte k selects the 8-byte entry at k*256*8(table_word, byte, 8).
//      The first lookup overwrites crc0 (movq), the other seven are XORed in.
// Writing the 32-bit TMP0W clears the upper half of TMP0, which is why the
// 64-bit TMP0 can be used as the index right afterwards.
// Requires operands crc0 and table_word; destroys BUF0 and TMP0.
#define CRC_WORD_ASM() \
|
||||
"xorq %[crc0], " BUF0 "\n" \
|
||||
"movzbq " BUF0L ", " TMP0 "\n" \
|
||||
"movq (%[table_word], " TMP0 ", 8), %[crc0]\n" \
|
||||
"movzbl " BUF0H ", " TMP0W "\n" \
|
||||
"shrq $16, " BUF0 "\n" \
|
||||
"xorq 1*256*8(%[table_word], " TMP0 ", 8), %[crc0]\n" \
|
||||
"movzbq " BUF0L ", " TMP0 "\n" \
|
||||
"xorq 2*256*8(%[table_word], " TMP0 ", 8), %[crc0]\n" \
|
||||
"movzbl " BUF0H ", " TMP0W "\n" \
|
||||
"shrq $16, " BUF0 "\n" \
|
||||
"xorq 3*256*8(%[table_word], " TMP0 ", 8), %[crc0]\n" \
|
||||
"movzbq " BUF0L ", " TMP0 "\n" \
|
||||
"xorq 4*256*8(%[table_word], " TMP0 ", 8), %[crc0]\n" \
|
||||
"movzbl " BUF0H ", " TMP0W "\n" \
|
||||
"shrq $16, " BUF0 "\n" \
|
||||
"xorq 5*256*8(%[table_word], " TMP0 ", 8), %[crc0]\n" \
|
||||
"movzbq " BUF0L ", " TMP0 "\n" \
|
||||
"xorq 6*256*8(%[table_word], " TMP0 ", 8), %[crc0]\n" \
|
||||
"movzbl " BUF0H ", " TMP0W "\n" \
|
||||
"xorq 7*256*8(%[table_word], " TMP0 ", 8), %[crc0]\n"
|
||||
|
||||
// AMD64 assembly implementation of the 64-bit multiword CRC.
//
//   data, bytes - input buffer.
//   start       - CRC value to continue from.
// Returns the updated CRC.
//
// Structure of the assembly:
//   main loop (labels 1..2) - taken while at least 64 bytes remain; four
//       consecutive 64-bit words per iteration, each with its own
//       accumulator (crc0..crc3) and the interleaved table. The four words
//       still held in BUF0..BUF3 when the loop exits are folded into crc0
//       one after the other with CRC_WORD_ASM() (plain word table).
//   word loop (labels 3..4) - one 64-bit word at a time while at least
//       8 bytes remain.
//   byte loop (labels 5..6) - remaining bytes.
// "end" is biased by -63, then -7, then restored, so that each loop can use
// a plain "cmpq src, end; ja" test.
template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordGccAmd64(
    const void *data, size_t bytes, const uint64 &start) const {
  const uint8 *src = static_cast<const uint8 *>(data);
  const uint8 *end = src + bytes;
  uint64 crc0 = start ^ this->Base().Canonize();

  ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, uint64);
  if (src >= end) {
    return (crc0 ^ this->Base().Canonize());
  }

  uint64 crc1;
  uint64 crc2;
  uint64 crc3;

  asm(
    "sub $2*4*8 - 1, %[end]\n"
    "cmpq %[src], %[end]\n"
    "jbe 2f\n"
    "xorq %[crc1], %[crc1]\n"
    "movq (%[src]), " BUF0 "\n"
    "movq 1*8(%[src]), " BUF1 "\n"
    "movq 2*8(%[src]), " BUF2 "\n"
    "movq 3*8(%[src]), " BUF3 "\n"
    "movq %[crc1], %[crc2]\n"
    "movq %[crc1], %[crc3]\n"

    "1:\n"
#if HAVE_SSE && CRCUTIL_PREFETCH_WIDTH > 0
    "prefetcht0 " TO_STRING(CRCUTIL_PREFETCH_WIDTH) "(%[src])\n"
#endif  // HAVE_SSE
    "add $4*8, %[src]\n"

    // Set buffer data.
    "xorq %[crc0], " BUF0 "\n"
    "xorq %[crc1], " BUF1 "\n"
    "xorq %[crc2], " BUF2 "\n"
    "xorq %[crc3], " BUF3 "\n"

    // LOAD crc of byte 0 and shift buffers.
    "movzbl " BUF0L ", " TMP0W "\n"
    "movq (%[table], " TMP0 ", 8), %[crc0]\n"
    "movzbl " BUF1L ", " TMP0W "\n"
    "movq (%[table], " TMP0 ", 8), %[crc1]\n"
    "movzbl " BUF2L ", " TMP0W "\n"
    "movq (%[table], " TMP0 ", 8), %[crc2]\n"
    "movzbl " BUF3L ", " TMP0W "\n"
    "movq (%[table], " TMP0 ", 8), %[crc3]\n"

    // XOR1: table lookup on the current low byte of every buffer.
#define XOR1(byte1) \
    "movzbl " BUF0L ", " TMP0W "\n" \
    "xorq " #byte1 "*256*8(%[table], " TMP0 ", 8), %[crc0]\n" \
    "movzbl " BUF1L ", " TMP0W "\n" \
    "xorq " #byte1 "*256*8(%[table], " TMP0 ", 8), %[crc1]\n" \
    "movzbl " BUF2L ", " TMP0W "\n" \
    "xorq " #byte1 "*256*8(%[table], " TMP0 ", 8), %[crc2]\n" \
    "movzbl " BUF3L ", " TMP0W "\n" \
    "xorq " #byte1 "*256*8(%[table], " TMP0 ", 8), %[crc3]\n"

    // XOR2: table lookup on the second byte of every buffer, then drop the
    // two bytes consumed so far.
#define XOR2(byte2) \
    "movzbl " BUF0H ", " TMP0W "\n" \
    "shrq $16, " BUF0 "\n" \
    "xorq " #byte2 "*256*8(%[table], " TMP0 ", 8), %[crc0]\n" \
    "movzbl " BUF1H ", " TMP0W "\n" \
    "shrq $16, " BUF1 "\n" \
    "xorq " #byte2 "*256*8(%[table], " TMP0 ", 8), %[crc1]\n" \
    "movzbl " BUF2H ", " TMP0W "\n" \
    "shrq $16, " BUF2 "\n" \
    "xorq " #byte2 "*256*8(%[table], " TMP0 ", 8), %[crc2]\n" \
    "movzbl " BUF3H ", " TMP0W "\n" \
    "shrq $16, " BUF3 "\n" \
    "xorq " #byte2 "*256*8(%[table], " TMP0 ", 8), %[crc3]\n"

    XOR2(1)
    XOR1(2)
    XOR2(3)
    XOR1(4)
    XOR2(5)
    XOR1(6)
#undef XOR1
#undef XOR2

    // Update CRC registers and load buffers.
    "movzbl " BUF0H ", " TMP0W "\n"
    "xorq 7*256*8(%[table], " TMP0 ", 8), %[crc0]\n"
    "movq (%[src]), " BUF0 "\n"
    "movzbl " BUF1H ", " TMP0W "\n"
    "xorq 7*256*8(%[table], " TMP0 ", 8), %[crc1]\n"
    "movq 1*8(%[src]), " BUF1 "\n"
    "movzbl " BUF2H ", " TMP0W "\n"
    "xorq 7*256*8(%[table], " TMP0 ", 8), %[crc2]\n"
    "movq 2*8(%[src]), " BUF2 "\n"
    "movzbl " BUF3H ", " TMP0W "\n"
    "xorq 7*256*8(%[table], " TMP0 ", 8), %[crc3]\n"
    "movq 3*8(%[src]), " BUF3 "\n"

    "cmpq %[src], %[end]\n"
    "ja 1b\n"

    CRC_WORD_ASM()

    "xorq %[crc1], " BUF1 "\n"
    "movq " BUF1 ", " BUF0 "\n"
    CRC_WORD_ASM()

    "xorq %[crc2], " BUF2 "\n"
    "movq " BUF2 ", " BUF0 "\n"
    CRC_WORD_ASM()

    "xorq %[crc3], " BUF3 "\n"
    "movq " BUF3 ", " BUF0 "\n"
    CRC_WORD_ASM()

    "add $4*8, %[src]\n"

    "2:\n"
    "add $2*4*8 - 8, %[end]\n"
    "cmpq %[src], %[end]\n"
    "jbe 4f\n"

    "3:\n"
    "movq (%[src]), " BUF0 "\n"
    "add $8, %[src]\n"
    CRC_WORD_ASM()
    "cmpq %[src], %[end]\n"
    "ja 3b\n"

    "4:\n"
    "add $7, %[end]\n"

    "cmpq %[src], %[end]\n"
    "jbe 6f\n"

    "5:\n"
    "movzbq (%[src]), " BUF0 "\n"
    "movzbq %b[crc0], " TMP0 "\n"
    "shrq $8, %[crc0]\n"
    "xorq " BUF0 ", " TMP0 "\n"
    "add $1, %[src]\n"
    "xorq 7*256*8(%[table_word], " TMP0 ", 8), %[crc0]\n"
    "cmpq %[src], %[end]\n"
    "ja 5b\n"

    "6:\n"


    : // outputs
      [src] "+r" (src),
      [end] "+r" (end),
      [crc0] "+r" (crc0),
      [crc1] "=&r" (crc1),
      [crc2] "=&r" (crc2),
      [crc3] "=&r" (crc3)

    : // inputs
      [table] "r" (&this->crc_word_interleaved_[0][0]),
      [table_word] "r" (&this->crc_word_[0][0])

    : // clobbers
      "%rax",  // BUF0
      "%rbx",  // BUF1
      "%rcx",  // BUF2
      "%rdx",  // BUF3
      "%rsi",  // TMP0
      // The code dereferences src and both table pointers, none of which is
      // described to the compiler as a memory operand, so the compiler must
      // be told that memory is accessed; the arithmetic also changes flags.
      "memory",
      "cc"
  );

  return (crc0 ^ this->Base().Canonize());
}
|
||||
|
||||
} // namespace crcutil
|
||||
|
||||
#endif // defined(__GNUC__) && HAVE_AMD64 && CRCUTIL_USE_ASM
|
||||
261
crcutil-1.0/code/multiword_64_64_gcc_i386_mmx.cc
Normal file
261
crcutil-1.0/code/multiword_64_64_gcc_i386_mmx.cc
Normal file
@@ -0,0 +1,261 @@
|
||||
// Copyright 2010 Google Inc. All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Implements multiword CRC for GCC on i386.
|
||||
|
||||
#include "generic_crc.h"
|
||||
|
||||
#if defined(__GNUC__) && CRCUTIL_USE_ASM && HAVE_I386 && HAVE_MMX
|
||||
|
||||
namespace crcutil {
|
||||
|
||||
template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordI386Mmx(
|
||||
const void *data, size_t bytes, const uint64 &start)
|
||||
const GCC_OMIT_FRAME_POINTER;
|
||||
|
||||
// Entry point for the 64-bit CRC on i386: anything longer than 7 bytes is
// delegated to the MMX routine, shorter inputs are processed byte by byte.
template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiword(
    const void *data, size_t bytes, const uint64 &start) const {
  if (bytes > 7) {
    return CrcMultiwordI386Mmx(data, bytes, start);
  }

  const uint8 *src = static_cast<const uint8 *>(data);
  const uint8 *end = src + bytes;
  uint64 crc = start ^ this->Base().Canonize();
  while (src < end) {
    CRC_BYTE(this, crc, *src);
    ++src;
  }
  return (crc ^ this->Base().Canonize());
}
|
||||
|
||||
#define CRC_WORD_MMX() \
|
||||
"pxor %[crc0], %[buf0]\n" \
|
||||
"movd %[buf0], %[tmp0]\n" \
|
||||
"psrlq $32, %[buf0]\n" \
|
||||
"movzbl %b[tmp0], %[temp]\n" \
|
||||
"shrl $8, %[tmp0]\n" \
|
||||
"movq (%[table], %[temp], 8), %[crc0]\n" \
|
||||
"movzbl %b[tmp0], %[temp]\n" \
|
||||
"shrl $8, %[tmp0]\n" \
|
||||
"pxor 1*256*8(%[table], %[temp], 8), %[crc0]\n" \
|
||||
"movzbl %b[tmp0], %[temp]\n" \
|
||||
"shrl $8, %[tmp0]\n" \
|
||||
"pxor 2*256*8(%[table], %[temp], 8), %[crc0]\n" \
|
||||
"pxor 3*256*8(%[table], %[tmp0], 8), %[crc0]\n" \
|
||||
"movd %[buf0], %[tmp0]\n" \
|
||||
"movzbl %b[tmp0], %[temp]\n" \
|
||||
"shrl $8, %[tmp0]\n" \
|
||||
"pxor 4*256*8(%[table], %[temp], 8), %[crc0]\n" \
|
||||
"movzbl %b[tmp0], %[temp]\n" \
|
||||
"shrl $8, %[tmp0]\n" \
|
||||
"pxor 5*256*8(%[table], %[temp], 8), %[crc0]\n" \
|
||||
"movzbl %b[tmp0], %[temp]\n" \
|
||||
"shrl $8, %[tmp0]\n" \
|
||||
"pxor 6*256*8(%[table], %[temp], 8), %[crc0]\n" \
|
||||
"pxor 7*256*8(%[table], %[tmp0], 8), %[crc0]\n"
|
||||
|
||||
// Computes CRC of "bytes" bytes of "data" starting from "start"
// using MMX registers and GCC inline assembly (32-bit x86 only).
//
// Layout of the assembly code:
//   label 1: while at least 64 bytes remain, consumes 32 bytes per
//            iteration as 4 independent 8-byte lanes (crc0..crc3)
//            using the interleaved table;
//            afterwards the 4 already loaded words are folded into crc0
//            one after another with CRC_WORD_MMX() and the word table.
//   label 3: consumes 8 bytes per iteration with CRC_WORD_MMX().
//   label 5: consumes the remaining bytes one at a time.
//
// "end" lives in memory ("+m") and is temporarily biased by the code
// (-63, then +56, then +7) so that each loop can use a plain compare.
template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordI386Mmx(
    const void *data, size_t bytes, const uint64 &start) const {
  const uint8 *src = static_cast<const uint8 *>(data);
  const uint8 *end = src + bytes;
  uint64 crc0 = start ^ this->Base().Canonize();

  ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, uint64);
  if (src >= end) {
    return (crc0 ^ this->Base().Canonize());
  }

  uint64 crc1;
  uint64 crc2;
  uint64 crc3;

  uint64 buf0;
  uint64 buf1;
  uint64 buf2;
  uint64 buf3;

  uint32 tmp0;
  uint32 tmp1;
  uint32 tmp2;
  uint32 tmp3;

  uint32 temp;

  void *table_ptr;
  const uint64 *table_interleaved = &this->crc_word_interleaved_[0][0];
  const uint64 *table_word = &this->crc_word_[0][0];

  asm(
    // %[end] is a memory operand: the operand size must be spelled out
    // ("subl"/"addl"), otherwise the instruction is ambiguous.
    "subl $2*4*8 - 1, %[end]\n"
    "cmpl %[src], %[end]\n"
    "jbe 2f\n"

    "pxor %[crc1], %[crc1]\n"
    "pxor %[crc2], %[crc2]\n"
    "pxor %[crc3], %[crc3]\n"
    "movq (%[src]), %[buf0]\n"
    "movq 1*8(%[src]), %[buf1]\n"
    "movq 2*8(%[src]), %[buf2]\n"
    "movq 3*8(%[src]), %[buf3]\n"

    "movl %[table_interleaved], %[table]\n"
    "1:\n"
#if HAVE_SSE && CRCUTIL_PREFETCH_WIDTH > 0
    "prefetcht0 " TO_STRING(CRCUTIL_PREFETCH_WIDTH) "(%[src])\n"
#endif
    "addl $0x20, %[src]\n"
    "pxor %[crc0], %[buf0]\n"
    "pxor %[crc1], %[buf1]\n"
    "pxor %[crc2], %[buf2]\n"
    "pxor %[crc3], %[buf3]\n"

    "movd %[buf0], %[tmp0]\n"
    "psrlq $32, %[buf0]\n"
    "movd %[buf1], %[tmp1]\n"
    "psrlq $32, %[buf1]\n"
    "movd %[buf2], %[tmp2]\n"
    "psrlq $32, %[buf2]\n"
    "movd %[buf3], %[tmp3]\n"
    "psrlq $32, %[buf3]\n"

    "movzbl %b[tmp0], %[temp]\n"
    "shrl $8, %[tmp0]\n"
    "movq (%[table], %[temp], 8), %[crc0]\n"
    "movzbl %b[tmp1], %[temp]\n"
    "shrl $8, %[tmp1]\n"
    "movq (%[table], %[temp], 8), %[crc1]\n"
    "movzbl %b[tmp2], %[temp]\n"
    "shrl $8, %[tmp2]\n"
    "movq (%[table], %[temp], 8), %[crc2]\n"
    "movzbl %b[tmp3], %[temp]\n"
    "shrl $8, %[tmp3]\n"
    "movq (%[table], %[temp], 8), %[crc3]\n"

#define XOR(byte) \
    "movzbl %b[tmp0], %[temp]\n" \
    "shrl $8, %[tmp0]\n" \
    "pxor " #byte "*256*8(%[table], %[temp], 8), %[crc0]\n" \
    "movzbl %b[tmp1], %[temp]\n" \
    "shrl $8, %[tmp1]\n" \
    "pxor " #byte "*256*8(%[table], %[temp], 8), %[crc1]\n" \
    "movzbl %b[tmp2], %[temp]\n" \
    "shrl $8, %[tmp2]\n" \
    "pxor " #byte "*256*8(%[table], %[temp], 8), %[crc2]\n" \
    "movzbl %b[tmp3], %[temp]\n" \
    "shrl $8, %[tmp3]\n" \
    "pxor " #byte "*256*8(%[table], %[temp], 8), %[crc3]\n"

    XOR(1)
    XOR(2)

    // After 3 shifts by 8, tmpN holds only the 4th byte: use it directly
    // as an index, then reload tmpN with the upper 32 bits of the word.
    "pxor 3*256*8(%[table], %[tmp0], 8), %[crc0]\n"
    "movd %[buf0], %[tmp0]\n"
    "pxor 3*256*8(%[table], %[tmp1], 8), %[crc1]\n"
    "movd %[buf1], %[tmp1]\n"
    "pxor 3*256*8(%[table], %[tmp2], 8), %[crc2]\n"
    "movd %[buf2], %[tmp2]\n"
    "pxor 3*256*8(%[table], %[tmp3], 8), %[crc3]\n"
    "movd %[buf3], %[tmp3]\n"

    XOR(4)
    XOR(5)
    XOR(6)

    // Last byte of each lane, interleaved with loading the next 4 words.
    "pxor 7*256*8(%[table], %[tmp0], 8), %[crc0]\n"
    "movq (%[src]), %[buf0]\n"
    "pxor 7*256*8(%[table], %[tmp1], 8), %[crc1]\n"
    "movq 1*8(%[src]), %[buf1]\n"
    "pxor 7*256*8(%[table], %[tmp2], 8), %[crc2]\n"
    "movq 2*8(%[src]), %[buf2]\n"
    "pxor 7*256*8(%[table], %[tmp3], 8), %[crc3]\n"
    "movq 3*8(%[src]), %[buf3]\n"
    "cmpl %[src], %[end]\n"
    "ja 1b\n"
#undef XOR

    // Fold the 4 lanes (and the 4 words already loaded) into crc0.
    "movl %[table_word], %[table]\n"
    CRC_WORD_MMX()

    "pxor %[crc1], %[buf1]\n"
    "movq %[buf1], %[buf0]\n"
    CRC_WORD_MMX()

    "pxor %[crc2], %[buf2]\n"
    "movq %[buf2], %[buf0]\n"
    CRC_WORD_MMX()

    "pxor %[crc3], %[buf3]\n"
    "movq %[buf3], %[buf0]\n"
    CRC_WORD_MMX()

    "add $4*8, %[src]\n"
    "2:\n"
    "movl %[table_word], %[table]\n"

    // end: (original end - 63) -> (original end - 7).
    "addl $2*4*8 - 8, %[end]\n"
    "cmpl %[src], %[end]\n"
    "jbe 4f\n"
    "3:\n"
    "movq (%[src]), %[buf0]\n"
    "addl $0x8, %[src]\n"
    CRC_WORD_MMX()
    "cmpl %[src], %[end]\n"
    "ja 3b\n"
    "4:\n"
    // end: (original end - 7) -> original end.
    "addl $7, %[end]\n"

    "cmpl %[src], %[end]\n"
    "jbe 6f\n"

    "5:\n"
    "movd %[crc0], %[tmp0]\n"
    "movzbl (%[src]), %[temp]\n"
    "movzbl %b[tmp0], %[tmp0]\n"
    "psrlq $8, %[crc0]\n"
    "xorl %[tmp0], %[temp]\n"
    "add $1, %[src]\n"
    "pxor 7*256*8(%[table], %[temp], 8), %[crc0]\n"
    "cmpl %[src], %[end]\n"
    "ja 5b\n"

    "6:\n"

    : // outputs
      [src] "+r" (src),
      [end] "+m" (end),
      [crc0] "+y" (crc0),
      [crc1] "=&y" (crc1),
      [crc2] "=&y" (crc2),
      [crc3] "=&y" (crc3),
      [buf0] "=&y" (buf0),
      [buf1] "=&y" (buf1),
      [buf2] "=&y" (buf2),
      [buf3] "=&y" (buf3),
      [tmp0] "=&q" (tmp0),
      [tmp1] "=&q" (tmp1),
      [tmp2] "=&q" (tmp2),
      [tmp3] "=&q" (tmp3),
      [temp] "=&r" (temp),
      [table] "=&r" (table_ptr)

    : // inputs
      [table_interleaved] "m" (table_interleaved),
      [table_word] "m" (table_word)

    : // clobbers
      // The code reads the input buffer and the CRC tables through
      // pointers that are not listed as memory operands, and it
      // modifies the condition flags.
      "memory", "cc");

  asm volatile("emms");

  return (crc0 ^ this->Base().Canonize());
}
|
||||
|
||||
} // namespace crcutil
|
||||
|
||||
#endif // defined(__GNUC__) && CRCUTIL_USE_ASM && HAVE_I386 && HAVE_MMX
|
||||
243
crcutil-1.0/code/multiword_64_64_intrinsic_i386_mmx.cc
Normal file
243
crcutil-1.0/code/multiword_64_64_intrinsic_i386_mmx.cc
Normal file
@@ -0,0 +1,243 @@
|
||||
// Copyright 2010 Google Inc. All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Implements 64-bit multiword CRC using MMX built-in functions.
|
||||
|
||||
#include "generic_crc.h"
|
||||
|
||||
#if CRCUTIL_USE_ASM && HAVE_I386 && HAVE_MMX
|
||||
|
||||
namespace crcutil {
|
||||
|
||||
template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordI386Mmx(
|
||||
const void *data, size_t bytes, const uint64 &start)
|
||||
const GCC_OMIT_FRAME_POINTER;
|
||||
|
||||
#if !defined(_MSC_VER)
|
||||
// Computes CRC of "bytes" bytes of "data" starting from "start".
// Blocks of 8 or more bytes are delegated to CrcMultiwordI386Mmx();
// shorter blocks are processed bytewise without touching MMX state.
template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiword(
    const void *data,
    size_t bytes,
    const uint64 &start) const {
  if (bytes > 7) {
    return CrcMultiwordI386Mmx(data, bytes, start);
  }

  const uint8 *cursor = static_cast<const uint8 *>(data);
  const uint8 *const limit = cursor + bytes;
  uint64 crc = start ^ Base().Canonize();
  while (cursor < limit) {
    CRC_BYTE(this, crc, *cursor);
    ++cursor;
  }
  return (crc ^ Base().Canonize());
}
|
||||
#else
|
||||
#pragma warning(push)
|
||||
// CL: uninitialized local variable 'crc1' used
|
||||
// Wrong: crc1 = XOR(crc1, crc1) sets it to 0.
|
||||
#pragma warning(disable: 4700)
|
||||
|
||||
#pragma warning(disable: 4619) // there is no warning number '592'
|
||||
|
||||
// ICL: variable "crc1" is used before its value is set
|
||||
// Wrong: crc1 = XOR(crc1, crc1) sets it to 0.
|
||||
#pragma warning(disable: 592)
|
||||
#endif // !defined(_MSC_VER)
|
||||
|
||||
#define MM64(adr) reinterpret_cast<const __m64 *>(adr)
|
||||
#define MM64_TABLE(byte) MM64(crc_word_interleaved_[byte])
|
||||
|
||||
#define CRC_WORD_MMX(this, crc, buf) do { \
|
||||
buf = _mm_xor_si64(buf, crc); \
|
||||
uint32 tmp = static_cast<uint32>(_mm_cvtsi64_si32(buf)); \
|
||||
buf = _mm_srli_si64(buf, 32); \
|
||||
crc = MM64(crc_word_[0])[TO_BYTE(tmp)]; \
|
||||
tmp >>= 8; \
|
||||
crc = _mm_xor_si64(crc, MM64(crc_word_[1])[TO_BYTE(tmp)]); \
|
||||
tmp >>= 8; \
|
||||
crc = _mm_xor_si64(crc, MM64(crc_word_[2])[TO_BYTE(tmp)]); \
|
||||
tmp >>= 8; \
|
||||
crc = _mm_xor_si64(crc, MM64(crc_word_[3])[tmp]); \
|
||||
tmp = static_cast<uint32>(_mm_cvtsi64_si32(buf)); \
|
||||
crc = _mm_xor_si64(crc, MM64(crc_word_[4])[TO_BYTE(tmp)]); \
|
||||
tmp >>= 8; \
|
||||
crc = _mm_xor_si64(crc, MM64(crc_word_[5])[TO_BYTE(tmp)]); \
|
||||
tmp >>= 8; \
|
||||
crc = _mm_xor_si64(crc, MM64(crc_word_[6])[TO_BYTE(tmp)]); \
|
||||
tmp >>= 8; \
|
||||
crc = _mm_xor_si64(crc, MM64(crc_word_[7])[tmp]); \
|
||||
} while (0)
|
||||
|
||||
// Computes CRC of "bytes" bytes of "data" starting from "start"
// using MMX intrinsics.
//
// Three stages:
//   1. While enough data remains, 4 words (4 * 8 bytes) are processed per
//      iteration as 4 independent lanes with the interleaved tables;
//      the lanes are then folded into one CRC with CRC_WORD_MMX().
//   2. Remaining whole 8-byte words are processed with CRC_WORD_MMX().
//   3. Remaining bytes are processed one at a time with CRC_BYTE().
// _mm_empty() is called at the end of every stage that used MMX registers.
template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordI386Mmx(
    const void *data, size_t bytes, const uint64 &start) const {
  const uint8 *src = static_cast<const uint8 *>(data);
  const uint8 *end = src + bytes;
  uint64 crc = start ^ Base().Canonize();

  ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc, uint64);
  if (src >= end) {
    return (crc ^ Base().Canonize());
  }

  // Process 4 registers of sizeof(uint64) bytes at once.
  bytes = static_cast<size_t>(end - src) & ~(4*8 - 1);
  if (bytes > 4*8) {
    const uint8 *stop = src + bytes - 4*8;
    union {
      __m64 m64;
      uint64 u64;
    } temp;
    __m64 crc0;
    // Lanes 1..3 start from zero. _mm_setzero_si64() is used instead of
    // xor-ing an uninitialized variable with itself: the result is the
    // same, but no indeterminate value is ever read and no
    // compiler-specific warning workaround is needed here.
    __m64 crc1 = _mm_setzero_si64();
    __m64 crc2 = _mm_setzero_si64();
    __m64 crc3 = _mm_setzero_si64();
    __m64 buf0 = MM64(src)[0];
    __m64 buf1 = MM64(src)[1];
    __m64 buf2 = MM64(src)[2];
    __m64 buf3 = MM64(src)[3];

    temp.u64 = crc;
    crc0 = temp.m64;

    do {
      PREFETCH(src);
      src += 4*8;

      buf0 = _mm_xor_si64(buf0, crc0);
      buf1 = _mm_xor_si64(buf1, crc1);
      buf2 = _mm_xor_si64(buf2, crc2);
      buf3 = _mm_xor_si64(buf3, crc3);

      // Lower 32 bits of every lane.
      uint32 tmp0 = static_cast<uint32>(_mm_cvtsi64_si32(buf0));
      uint32 tmp1 = static_cast<uint32>(_mm_cvtsi64_si32(buf1));
      uint32 tmp2 = static_cast<uint32>(_mm_cvtsi64_si32(buf2));
      uint32 tmp3 = static_cast<uint32>(_mm_cvtsi64_si32(buf3));

      // Keep upper 32 bits of every lane for the second half.
      buf0 = _mm_srli_si64(buf0, 32);
      buf1 = _mm_srli_si64(buf1, 32);
      buf2 = _mm_srli_si64(buf2, 32);
      buf3 = _mm_srli_si64(buf3, 32);

      crc0 = MM64_TABLE(0)[TO_BYTE(tmp0)];
      tmp0 >>= 8;
      crc1 = MM64_TABLE(0)[TO_BYTE(tmp1)];
      tmp1 >>= 8;
      crc2 = MM64_TABLE(0)[TO_BYTE(tmp2)];
      tmp2 >>= 8;
      crc3 = MM64_TABLE(0)[TO_BYTE(tmp3)];
      tmp3 >>= 8;

#define XOR(byte) do { \
      crc0 = _mm_xor_si64(crc0, MM64_TABLE(byte)[TO_BYTE(tmp0)]); \
      tmp0 >>= 8; \
      crc1 = _mm_xor_si64(crc1, MM64_TABLE(byte)[TO_BYTE(tmp1)]); \
      tmp1 >>= 8; \
      crc2 = _mm_xor_si64(crc2, MM64_TABLE(byte)[TO_BYTE(tmp2)]); \
      tmp2 >>= 8; \
      crc3 = _mm_xor_si64(crc3, MM64_TABLE(byte)[TO_BYTE(tmp3)]); \
      tmp3 >>= 8; \
} while (0)

      XOR(1);
      XOR(2);

      // Only one byte is left in tmpN: index with it directly,
      // then reload tmpN with the upper half of the lane.
      crc0 = _mm_xor_si64(crc0, MM64_TABLE(3)[tmp0]);
      tmp0 = static_cast<uint32>(_mm_cvtsi64_si32(buf0));
      crc1 = _mm_xor_si64(crc1, MM64_TABLE(3)[tmp1]);
      tmp1 = static_cast<uint32>(_mm_cvtsi64_si32(buf1));
      crc2 = _mm_xor_si64(crc2, MM64_TABLE(3)[tmp2]);
      tmp2 = static_cast<uint32>(_mm_cvtsi64_si32(buf2));
      crc3 = _mm_xor_si64(crc3, MM64_TABLE(3)[tmp3]);
      tmp3 = static_cast<uint32>(_mm_cvtsi64_si32(buf3));

      XOR(4);
      XOR(5);
      XOR(6);

#undef XOR

      // Last byte of every lane, interleaved with loading the next 4 words.
      crc0 = _mm_xor_si64(crc0, MM64_TABLE(sizeof(uint64) - 1)[tmp0]);
      buf0 = MM64(src)[0];
      crc1 = _mm_xor_si64(crc1, MM64_TABLE(sizeof(uint64) - 1)[tmp1]);
      buf1 = MM64(src)[1];
      crc2 = _mm_xor_si64(crc2, MM64_TABLE(sizeof(uint64) - 1)[tmp2]);
      buf2 = MM64(src)[2];
      crc3 = _mm_xor_si64(crc3, MM64_TABLE(sizeof(uint64) - 1)[tmp3]);
      buf3 = MM64(src)[3];
    }
    while (src < stop);

    // Fold the 4 lanes and the 4 words already loaded into crc0.
    CRC_WORD_MMX(this, crc0, buf0);
    buf1 = _mm_xor_si64(buf1, crc1);
    CRC_WORD_MMX(this, crc0, buf1);
    buf2 = _mm_xor_si64(buf2, crc2);
    CRC_WORD_MMX(this, crc0, buf2);
    buf3 = _mm_xor_si64(buf3, crc3);
    CRC_WORD_MMX(this, crc0, buf3);

    temp.m64 = crc0;
    crc = temp.u64;

    _mm_empty();

    src += 4*8;
  }

  // Process sizeof(uint64) bytes at once.
  bytes = static_cast<size_t>(end - src) & ~(sizeof(uint64) - 1);
  if (bytes > 0) {
    union {
      __m64 m64;
      uint64 u64;
    } temp;
    __m64 crc0;

    temp.u64 = crc;
    crc0 = temp.m64;

    for (const uint8 *stop = src + bytes; src < stop; src += sizeof(uint64)) {
      __m64 buf0 = MM64(src)[0];
      CRC_WORD_MMX(this, crc0, buf0);
    }

    temp.m64 = crc0;
    crc = temp.u64;

    _mm_empty();
  }

  // Compute CRC of remaining bytes.
  for (;src < end; ++src) {
    CRC_BYTE(this, crc, *src);
  }

  return (crc ^ Base().Canonize());
}
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(pop)
|
||||
#endif // defined(_MSC_VER)
|
||||
|
||||
} // namespace crcutil
|
||||
|
||||
#endif // CRCUTIL_USE_ASM && HAVE_I386 && HAVE_MMX
|
||||
245
crcutil-1.0/code/platform.h
Normal file
245
crcutil-1.0/code/platform.h
Normal file
@@ -0,0 +1,245 @@
|
||||
// Copyright 2010 Google Inc. All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Detects configuration and defines compiler-specific macros.
|
||||
// Also, sets user-defined CRCUTIL_USE_* macros to default values.
|
||||
|
||||
#ifndef CRCUTIL_PLATFORM_H_
|
||||
#define CRCUTIL_PLATFORM_H_
|
||||
|
||||
// Permanently disable some annoying warnings generated
|
||||
// by Microsoft CL when compiling Microsoft's headers.
|
||||
#include "std_headers.h"
|
||||
|
||||
// Use inline asm version of the code?
|
||||
#if !defined(CRCUTIL_USE_ASM)
|
||||
#define CRCUTIL_USE_ASM 1
|
||||
#endif // !defined(CRCUTIL_USE_ASM)
|
||||
|
||||
|
||||
#if !defined(HAVE_I386)
|
||||
#if defined(__i386__) || defined(_M_IX86)
|
||||
#define HAVE_I386 1
|
||||
#else
|
||||
#define HAVE_I386 0
|
||||
#endif // defined(__i386__) || defined(_M_IX86)
|
||||
#endif // !defined(HAVE_I386)
|
||||
|
||||
|
||||
#if !defined(HAVE_AMD64)
|
||||
#if defined(__amd64__) || defined(_M_AMD64)
|
||||
#define HAVE_AMD64 1
|
||||
#else
|
||||
#define HAVE_AMD64 0
|
||||
#endif // defined(__amd64__) || defined(_M_AMD64)
|
||||
#endif // !defined(HAVE_AMD64)
|
||||
|
||||
|
||||
#if HAVE_AMD64 || HAVE_I386
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(push)
|
||||
// '_M_IX86' is not defined as a preprocessor macro
|
||||
#pragma warning(disable: 4668)
|
||||
#include <intrin.h>
|
||||
#pragma warning(pop)
|
||||
#endif // defined(_MSC_VER)
|
||||
|
||||
|
||||
#if !defined(HAVE_MMX)
|
||||
#if defined(_MSC_VER) || (defined(__GNUC__) && defined(__MMX__))
|
||||
#define HAVE_MMX 1
|
||||
#else
|
||||
#define HAVE_MMX 0
|
||||
#endif // defined(_MSC_VER) || (defined(__GNUC__) && defined(__MMX__))
|
||||
#endif // !defined(HAVE_MMX)
|
||||
|
||||
|
||||
#if !defined(HAVE_SSE)
|
||||
#if defined(_MSC_VER) || (defined(__GNUC__) && defined(__SSE__))
|
||||
#include <xmmintrin.h>
|
||||
#define HAVE_SSE 1
|
||||
#else
|
||||
#define HAVE_SSE 0
|
||||
#endif // defined(_MSC_VER) || (defined(__GNUC__) && defined(__SSE__))
|
||||
#endif // !defined(HAVE_SSE)
|
||||
|
||||
|
||||
#if !defined(HAVE_SSE2)
|
||||
#if defined(_MSC_VER) || (defined(__GNUC__) && defined(__SSE2__))
|
||||
#include <emmintrin.h>
|
||||
#define HAVE_SSE2 1
|
||||
#else
|
||||
#define HAVE_SSE2 0
|
||||
#endif // defined(_MSC_VER) || (defined(__GNUC__) && defined(__SSE2__))
|
||||
#endif // !defined(HAVE_SSE2)
|
||||
|
||||
#else
|
||||
|
||||
#if !defined(HAVE_MMX)
|
||||
#define HAVE_MMX 0
|
||||
#endif // !defined(HAVE_MMX)
|
||||
|
||||
#if !defined(HAVE_SSE)
|
||||
#define HAVE_SSE 0
|
||||
#endif // !defined(HAVE_SSE)
|
||||
|
||||
#if !defined(HAVE_SSE2)
|
||||
#define HAVE_SSE2 0
|
||||
#endif // !defined(HAVE_SSE2)
|
||||
|
||||
#endif // HAVE_AMD64 || HAVE_I386
|
||||
|
||||
// Error checking
|
||||
#if HAVE_SSE && !HAVE_MMX
|
||||
#error SSE is available but not MMX?
|
||||
#endif // HAVE_SSE && !HAVE_MMX
|
||||
|
||||
#if HAVE_SSE2 && (!HAVE_SSE || !HAVE_MMX)
|
||||
#error SSE2 is available but not SSE or MMX?
|
||||
#endif // HAVE_SSE2 && (!HAVE_SSE || !HAVE_MMX)
|
||||
|
||||
|
||||
#if !defined(CRCUTIL_PREFETCH_WIDTH)
|
||||
// On newer X5550 CPU, heavily optimized CrcMultiword is 3% faster without
|
||||
// prefetch for inputs smaller than 8MB and less than 1% slower for 8MB and
|
||||
// larger blocks. On older Q9650 CPU, the code is 2-3% faster for inputs
|
||||
// smaller than 8MB, 4-5% slower when length >= 8MB.
|
||||
// Tested with prefetch length 256, 512, and 4096.
|
||||
//
|
||||
// At this moment there is no compelling reason to use prefetching.
|
||||
//
|
||||
#define CRCUTIL_PREFETCH_WIDTH 0
|
||||
#endif // !defined(CRCUTIL_PREFETCH_WIDTH)
|
||||
|
||||
|
||||
#if HAVE_SSE && CRCUTIL_PREFETCH_WIDTH > 0
|
||||
#define PREFETCH(src) \
|
||||
_mm_prefetch(reinterpret_cast<const char *>(src) + CRCUTIL_PREFETCH_WIDTH, \
|
||||
_MM_HINT_T0)
|
||||
#else
|
||||
#define PREFETCH(src)
|
||||
#endif // HAVE_SSE && CRCUTIL_PREFETCH_WIDTH > 0
|
||||
|
||||
|
||||
// If block size exceeds CRCUTIL_MIN_ALIGN_SIZE, align the data
|
||||
// before accessing it at word boundary. See generic_crc.cc,
|
||||
// ALIGN_ON_WORD_BOUNDARY_IF_NEEDED() macro.
|
||||
#if !defined(CRCUTIL_MIN_ALIGN_SIZE)
|
||||
#if HAVE_AMD64 || HAVE_I386
|
||||
#define CRCUTIL_MIN_ALIGN_SIZE (1024)
|
||||
#else
|
||||
#define CRCUTIL_MIN_ALIGN_SIZE 0
|
||||
#endif // HAVE_AMD64 || HAVE_I386
|
||||
#endif // !defined(CRCUTIL_MIN_ALIGN_SIZE)
|
||||
|
||||
|
||||
// Use _mm_crc32_u64/32/8 intrinsics?
|
||||
// If not, they will be implemented in software.
|
||||
#if !HAVE_I386 && !HAVE_AMD64
|
||||
|
||||
#undef CRCUTIL_USE_MM_CRC32
|
||||
#define CRCUTIL_USE_MM_CRC32 0
|
||||
|
||||
#else
|
||||
|
||||
#if !defined(CRCUTIL_USE_MM_CRC32)
|
||||
#if defined(_MSC_VER) || defined(__GNUC__)
|
||||
#define CRCUTIL_USE_MM_CRC32 1
|
||||
#else
|
||||
#define CRCUTIL_USE_MM_CRC32 0
|
||||
#endif // defined(_MSC_VER) || defined(__GNUC__)
|
||||
#endif // !defined(CRCUTIL_USE_MM_CRC32)
|
||||
|
||||
#endif // !HAVE_I386 && !HAVE_AMD64
|
||||
|
||||
|
||||
// Stringize -- always handy.
|
||||
#define TO_STRING_VALUE(arg) #arg
|
||||
#define TO_STRING(arg) TO_STRING_VALUE(arg)
|
||||
|
||||
|
||||
// Compilers give "right shift count >= width of type" warning even
|
||||
// though the shift happens only under appropriate "if".
|
||||
// SHIFT_RIGHT_NO_WARNING: shifts by "bits" when it is smaller than the
// width of "value", otherwise shifts by 0 (i.e. yields "value" itself).
// SHIFT_RIGHT_SAFE: same shift, but yields 0 when "bits" is too large.
#define SHIFT_RIGHT_NO_WARNING(value, bits) \
    ((value) >> (((8 * sizeof(value)) > (bits)) ? (bits) : 0))
#define SHIFT_RIGHT_SAFE(value, bits) \
    ((8 * sizeof(value)) > (bits) ? SHIFT_RIGHT_NO_WARNING(value, bits) : 0)

// Left-shift counterparts of the two macros above.
#define SHIFT_LEFT_NO_WARNING(value, bits) \
    ((value) << (((8 * sizeof(value)) > (bits)) ? (bits) : 0))
#define SHIFT_LEFT_SAFE(value, bits) \
    ((8 * sizeof(value)) > (bits) ? SHIFT_LEFT_NO_WARNING(value, bits) : 0)
|
||||
|
||||
// GCC-specific macros.
|
||||
//
|
||||
// GCC_VERSION_AVAILABLE(major, minor) is non-zero when the code is compiled
// by GCC (or a compiler defining __GNUC__) of version major.minor or newer,
// and 0 otherwise.
//
// The "defined(__GNUC__)" test is kept outside of the macro body:
// a "defined" operator produced by macro expansion inside #if has
// undefined behavior, and it is not valid in ordinary expressions at all.
#if defined(__GNUC__)
#define GCC_VERSION_AVAILABLE(major, minor) \
    (__GNUC__ > (major) || \
     (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
#else
#define GCC_VERSION_AVAILABLE(major, minor) 0
#endif // defined(__GNUC__)
|
||||
|
||||
|
||||
#if defined(__GNUC__)
|
||||
|
||||
// The GenericCrc tables must be properly aligned.
|
||||
// Penalty for misalignment? 50% performance degradation.
|
||||
// For 128-bit SSE2, the penalty is access violation.
|
||||
#define GCC_ALIGN_ATTRIBUTE(n) __attribute__((aligned(n)))
|
||||
|
||||
#if GCC_VERSION_AVAILABLE(4, 4)
|
||||
// If not marked as "omit frame pointer",
|
||||
// GCC won't be able to find enough registers.
|
||||
#define GCC_OMIT_FRAME_POINTER \
|
||||
__attribute__((__optimize__(2, "omit-frame-pointer")))
|
||||
#endif // GCC_VERSION_AVAILABLE(4, 4)
|
||||
|
||||
#if !defined(__forceinline)
|
||||
#define __forceinline __attribute__((__always_inline__)) inline
|
||||
#endif // !defined(__forceinline)
|
||||
|
||||
#if defined(__APPLE_CC__)
|
||||
// The version of GCC used by Mac OS X Xcode v 5664 does not understand
|
||||
// "movq xmm, r64" instruction and requires the use of "movd" (probably
|
||||
// because of the bug in GCC which treats "movq/movd xmm,r64 or r64,xmm"
|
||||
// the same).
|
||||
//
|
||||
// Leaving common sense aside, let's peek into Intel's instruction
|
||||
// reference manual. That's what description of MOVD command says:
|
||||
// MOVD xmm, r/m32 (opcode 66 0F 6E /r)
|
||||
// MOVD r/m32, xmm (opcode 66 0F 7E /r)
|
||||
// MOVQ xmm, r/m64 (opcode 66 REX.W 0F 6E /r)
|
||||
// MOVQ r/m64, xmm (opcode 66 REX.W 0F 7E /r)
|
||||
#define SSE2_MOVQ "movd"
|
||||
#else
|
||||
#define SSE2_MOVQ "movq"
|
||||
#endif // defined(__APPLE_CC__)
|
||||
|
||||
#endif // defined(__GNUC__)
|
||||
|
||||
|
||||
// Define compiler-specific macros that were not set yet.
|
||||
#if !defined(_MSC_VER) && !defined(__forceinline)
|
||||
#define __forceinline inline
|
||||
#endif // !defined(_MSC_VER) && !defined(__forceinline)
|
||||
|
||||
#if !defined(GCC_OMIT_FRAME_POINTER)
|
||||
#define GCC_OMIT_FRAME_POINTER
|
||||
#endif // !defined(GCC_OMIT_FRAME_POINTER)
|
||||
|
||||
#if !defined(GCC_ALIGN_ATTRIBUTE)
|
||||
#define GCC_ALIGN_ATTRIBUTE(n)
|
||||
#endif // !defined(GCC_ALIGN_ATTRIBUTE)
|
||||
|
||||
|
||||
#endif // CRCUTIL_PLATFORM_H_
|
||||
61
crcutil-1.0/code/protected_crc.h
Normal file
61
crcutil-1.0/code/protected_crc.h
Normal file
@@ -0,0 +1,61 @@
|
||||
// Copyright 2010 Google Inc. All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Protects CRC tables with its own CRC.
|
||||
// CRC tables get corrupted too, and if corruption is
|
||||
// not caught, data poisoning becomes a reality.
|
||||
|
||||
#ifndef CRCUTIL_PROTECTED_CRC_H_
|
||||
#define CRCUTIL_PROTECTED_CRC_H_
|
||||
|
||||
namespace crcutil {
|
||||
|
||||
#pragma pack(push, 16)
|
||||
|
||||
// Class CrcImplementation should not have virtual functions:
|
||||
// vptr is stored as the very first field, vptr value is defined
|
||||
// at runtime, so it is impossible to CRC(*this) once and
|
||||
// guarantee that this value will not change from run to run.
|
||||
//
|
||||
template<typename CrcImplementation> class ProtectedCrc
|
||||
: public CrcImplementation {
|
||||
public:
|
||||
typedef typename CrcImplementation::Crc Crc;
|
||||
|
||||
// Returns check value that the caller should compare
|
||||
// against pre-computed, trusted constant.
|
||||
//
|
||||
// Computing SelfCheckValue() after CRC initialization,
|
||||
// storing it in memory, and periodically checking against
|
||||
// stored value may not work: if CRC tables were initialized
|
||||
// incorrectly and/or had been corrupted during initialization,
|
||||
// CheckValue() will return garbage. Garbage in, garbage out.
|
||||
// Consequitive checks will not detect a problem, the application
|
||||
// will happily produce and save the data with corrupt CRC.
|
||||
//
|
||||
// The application should call SelfCheckValue() regularly:
|
||||
// 1. First and foremost, on every CRC mismatch.
|
||||
// 2. After CRC'ing the data but before sending it out or writing it.
|
||||
// 3. Worst case, every Nth CRC'ed byte or every Nth call to CRC.
|
||||
//
|
||||
Crc SelfCheckValue() const {
|
||||
return CrcDefault(this, sizeof(*this), 0);
|
||||
}
|
||||
} GCC_ALIGN_ATTRIBUTE(16);
|
||||
|
||||
#pragma pack(pop)
|
||||
|
||||
} // namespace crcutil
|
||||
|
||||
#endif // CRCUTIL_PROTECTED_CRC_H_
|
||||
106
crcutil-1.0/code/rolling_crc.h
Normal file
106
crcutil-1.0/code/rolling_crc.h
Normal file
@@ -0,0 +1,106 @@
|
||||
// Copyright 2010 Google Inc. All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Implements rolling CRC (e.g. for Rabin fingerprinting).
|
||||
|
||||
#ifndef CRCUTIL_ROLLING_CRC_H_
|
||||
#define CRCUTIL_ROLLING_CRC_H_
|
||||
|
||||
#include "base_types.h" // size_t, uint8
|
||||
#include "crc_casts.h" // TO_BYTE
|
||||
|
||||
namespace crcutil {
|
||||
|
||||
#pragma pack(push, 16)
|
||||
|
||||
// CrcImplementation should provide:
|
||||
// - typename Crc
|
||||
// - typename TableEntry
|
||||
// - typename Word
|
||||
// - Crc CrcDefault(const void *data, size_t bytes, const Crc &start)
|
||||
// - const GfUtil<Crc> &Base() const
|
||||
template<typename CrcImplementation> class RollingCrc {
|
||||
public:
|
||||
typedef typename CrcImplementation::Crc Crc;
|
||||
typedef typename CrcImplementation::TableEntry TableEntry;
|
||||
typedef typename CrcImplementation::Word Word;
|
||||
|
||||
RollingCrc() {}
|
||||
|
||||
// Initializes internal data structures.
|
||||
// Retains reference to "crc" instance -- it is used by Start().
|
||||
RollingCrc(const CrcImplementation &crc,
|
||||
size_t roll_window_bytes,
|
||||
const Crc &start_value) {
|
||||
Init(crc, roll_window_bytes, start_value);
|
||||
}
|
||||
|
||||
// Computes crc of "roll_window_bytes" using
|
||||
// "start_value" of "crc" (see Init()).
|
||||
Crc Start(const void *data) const {
|
||||
return crc_->CrcDefault(data, roll_window_bytes_, start_value_);
|
||||
}
|
||||
|
||||
// Computes CRC of "roll_window_bytes" starting in next position.
|
||||
Crc Roll(const Crc &old_crc, size_t byte_out, size_t byte_in) const {
|
||||
return (old_crc >> 8) ^ in_[TO_BYTE(old_crc) ^ byte_in] ^ out_[byte_out];
|
||||
}
|
||||
|
||||
// Initializes internal data structures.
|
||||
// Retains reference to "crc" instance -- it is used by Start().
|
||||
void Init(const CrcImplementation &crc,
|
||||
size_t roll_window_bytes,
|
||||
const Crc &start_value) {
|
||||
crc_ = &crc;
|
||||
roll_window_bytes_ = roll_window_bytes;
|
||||
start_value_ = start_value;
|
||||
|
||||
Crc add = crc.Base().Canonize() ^ start_value;
|
||||
add = crc.Base().Multiply(add, crc.Base().Xpow8N(roll_window_bytes));
|
||||
add ^= crc.Base().Canonize();
|
||||
Crc mul = crc.Base().One() ^ crc.Base().Xpow8N(1);
|
||||
add = crc.Base().Multiply(add, mul);
|
||||
|
||||
mul = crc.Base().XpowN(8 * roll_window_bytes + crc.Base().Degree());
|
||||
for (size_t i = 0; i < 256; ++i) {
|
||||
out_[i] = static_cast<TableEntry>(
|
||||
crc.Base().MultiplyUnnormalized(
|
||||
static_cast<Crc>(i), 8, mul) ^ add);
|
||||
}
|
||||
for (size_t i = 0; i < 256; ++i) {
|
||||
in_[i] = crc.crc_word_[sizeof(Word) - 1][i];
|
||||
}
|
||||
}
|
||||
|
||||
// Returns start value.
|
||||
Crc StartValue() const { return start_value_; }
|
||||
|
||||
// Returns length of roll window.
|
||||
size_t WindowBytes() const { return roll_window_bytes_; }
|
||||
|
||||
protected:
|
||||
TableEntry in_[256];
|
||||
TableEntry out_[256];
|
||||
|
||||
// Used only by Start().
|
||||
Crc start_value_;
|
||||
const CrcImplementation *crc_;
|
||||
size_t roll_window_bytes_;
|
||||
} GCC_ALIGN_ATTRIBUTE(16);
|
||||
|
||||
#pragma pack(pop)
|
||||
|
||||
} // namespace crcutil
|
||||
|
||||
#endif // CRCUTIL_ROLLING_CRC_H_
|
||||
51
crcutil-1.0/code/std_headers.h
Normal file
51
crcutil-1.0/code/std_headers.h
Normal file
@@ -0,0 +1,51 @@
|
||||
// Copyright 2010 Google Inc. All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Includes some standard C headers for size_t, memset, etc.
|
||||
//
|
||||
// Also, permanently disables a number of warnings produced
|
||||
// by Microsoft's compiler when it includes standard headers
|
||||
// (surprisingly, also by Microsoft).
|
||||
|
||||
#ifndef CRCUTIL_STD_HEADERS_H_
|
||||
#define CRCUTIL_STD_HEADERS_H_
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
// '4' bytes padding added after data member ...
|
||||
#pragma warning(disable:4820)
|
||||
|
||||
// unreferenced inline function has been removed ...
|
||||
#pragma warning(disable:4514)
|
||||
|
||||
// conditional expression is constant
|
||||
#pragma warning(disable: 4127)
|
||||
|
||||
// function ... not inlined
|
||||
#pragma warning(disable: 4710)
|
||||
|
||||
// function ... selected for automatic inline expansion
|
||||
#pragma warning(disable: 4711)
|
||||
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
|
||||
#endif // defined(_MSC_VER)
|
||||
|
||||
// #define _CSTDLIB_
|
||||
#include <stdio.h> // always handy
|
||||
#include <string.h> // memset
|
||||
#include <stdlib.h> // size_t, _rotl/_rotl64(MSC)
|
||||
#include <stddef.h> // ptrdiff_t (GNUC)
|
||||
#include <stdarg.h> // va_list
|
||||
|
||||
#endif // CRCUTIL_STD_HEADERS_H_
|
||||
310
crcutil-1.0/code/uint128_sse2.h
Normal file
310
crcutil-1.0/code/uint128_sse2.h
Normal file
@@ -0,0 +1,310 @@
|
||||
// Copyright 2010 Google Inc. All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Implements a limited set of 128-bit arithmetic operations
|
||||
// (the ones that are used by CRC) using SSE2 intrinsics.
|
||||
|
||||
#ifndef CRCUTIL_UINT128_SSE2_H_
|
||||
#define CRCUTIL_UINT128_SSE2_H_
|
||||
|
||||
#include "base_types.h"
|
||||
#include "crc_casts.h" // Downcast, CrcFromUint64, Uint64FromCrc
|
||||
#include "platform.h"
|
||||
|
||||
#if HAVE_SSE2
|
||||
|
||||
namespace crcutil {
|
||||
|
||||
// Specialized functions handling __m128i.
|
||||
// Downcast specialization for a raw __m128i: yields the low 64 bits
// of "value". Three implementations are selected at compile time.
template<> __forceinline uint64 Downcast(const __m128i &value) {
#if HAVE_AMD64 && defined(__GNUC__)
  // When compiled with -msse4, GCC 4.4.x replaces MOVQ with the SSE4
  // PEXTRQ instruction. GCC 4.5 stops doing that but converts through
  // memory instead. The difference is quite measurable, so MOVQ is
  // requested explicitly.
  uint64 low_bits;
  asm(SSE2_MOVQ " %[i128], %[u64]\n"
      : [u64] "=r" (low_bits)
      : [i128] "x" (value));
  return low_bits;
#elif HAVE_AMD64 && (!defined(_MSC_FULL_VER) || _MSC_FULL_VER > 150030729)
  return static_cast<uint64>(_mm_cvtsi128_si64(value));
#else
  // Store through memory because:
  //   - 64-bit CL 15.00.30729.1 -O2 generates incorrect code (tests fail);
  //   - _mm_cvtsi128_si64() is not available on i386.
  uint64 low_bits;
  _mm_storel_epi64(reinterpret_cast<__m128i *>(&low_bits), value);
  return low_bits;
#endif
}
|
||||
|
||||
|
||||
class uint128_sse2 {
|
||||
public:
|
||||
uint128_sse2() {}
|
||||
~uint128_sse2() {}
|
||||
|
||||
// Default casts to uint128_sse2 and assignment operator.
|
||||
__forceinline void operator =(uint64 value) {
|
||||
#if HAVE_AMD64 && defined(__GNUC__) && !GCC_VERSION_AVAILABLE(4, 5)
|
||||
// Prevent generation of SSE4 pinsrq insruction when
|
||||
// compiling with GCC 4.4.x with -msse4 flag.
|
||||
asm(SSE2_MOVQ " %[u64], %[i128]\n" : [i128] "=x" (x_) : [u64] "r" (value));
|
||||
#elif HAVE_AMD64
|
||||
x_ = _mm_cvtsi64_si128(static_cast<int64>(value));
|
||||
#else
|
||||
x_ = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&value));
|
||||
#endif
|
||||
}
|
||||
__forceinline uint128_sse2(uint64 x) {
|
||||
*this = x;
|
||||
}
|
||||
__forceinline uint128_sse2(const __m128i x) : x_(x) {
|
||||
}
|
||||
__forceinline operator __m128i() const {
|
||||
return x_;
|
||||
}
|
||||
__forceinline void operator =(const uint128_sse2 &x) {
|
||||
x_ = x.x_;
|
||||
}
|
||||
|
||||
// Extracts 64 less significant bits.
|
||||
__forceinline uint64 to_uint64() const {
|
||||
return Downcast<__m128i, uint64>(x_);
|
||||
}
|
||||
|
||||
// Comparisons.
|
||||
__forceinline bool operator ==(const uint128_sse2 &y) const {
|
||||
union {
|
||||
__m128i i128;
|
||||
uint64 u64[2];
|
||||
} t;
|
||||
t.i128 = _mm_xor_si128(x_, y.x_);
|
||||
return (t.u64[0] | t.u64[1]) == 0;
|
||||
}
|
||||
__forceinline bool operator ==(uint64 value) const {
|
||||
union {
|
||||
__m128i i128;
|
||||
uint64 u64[2];
|
||||
} t;
|
||||
t.i128 = x_;
|
||||
return (t.u64[0] == value && t.u64[1] == 0);
|
||||
}
|
||||
__forceinline bool operator !=(const uint128_sse2 &y) const {
|
||||
union {
|
||||
__m128i i128;
|
||||
uint64 u64[2];
|
||||
} t;
|
||||
t.i128 = _mm_xor_si128(x_, y.x_);
|
||||
return (t.u64[0] | t.u64[1]) != 0;
|
||||
}
|
||||
__forceinline bool operator !=(uint64 value) const {
|
||||
union {
|
||||
__m128i i128;
|
||||
uint64 u64[2];
|
||||
} t;
|
||||
t.i128 = x_;
|
||||
return (t.u64[0] != value || t.u64[1] != 0);
|
||||
}
|
||||
|
||||
__forceinline bool operator <(const uint128_sse2 &y) const {
|
||||
union {
|
||||
__m128i i128;
|
||||
uint64 u64[2];
|
||||
} xx, yy;
|
||||
xx.i128 = x_;
|
||||
yy.i128 = y.x_;
|
||||
return (xx.u64[0] < yy.u64[0] ||
|
||||
(xx.u64[0] == yy.u64[0] && xx.u64[1] < yy.u64[1]));
|
||||
}
|
||||
|
||||
// Bitwise logic operators.
|
||||
__forceinline uint128_sse2 operator ^(const uint128_sse2 &y) const {
|
||||
return _mm_xor_si128(x_, y.x_);
|
||||
}
|
||||
__forceinline uint128_sse2 operator &(const uint128_sse2 &y) const {
|
||||
return _mm_and_si128(x_, y.x_);
|
||||
}
|
||||
__forceinline uint128_sse2 operator |(const uint128_sse2 &y) const {
|
||||
return _mm_or_si128(x_, y.x_);
|
||||
}
|
||||
|
||||
__forceinline void operator ^=(const uint128_sse2 &y) {
|
||||
*this = *this ^ y.x_;
|
||||
}
|
||||
__forceinline void operator &=(const uint128_sse2 &y) {
|
||||
*this = *this & y.x_;
|
||||
}
|
||||
__forceinline void operator |=(const uint128_sse2 &y) {
|
||||
*this = *this | y.x_;
|
||||
}
|
||||
|
||||
// Arithmetic operators.
|
||||
__forceinline uint128_sse2 operator +(uint64 y) const {
|
||||
union {
|
||||
__m128i i128;
|
||||
uint64 u64[2];
|
||||
} temp;
|
||||
temp.i128 = x_;
|
||||
// a + b >= 2**64 iff
|
||||
// a + b > (2**64 - 1) iff
|
||||
// a > (2**64 - 1) - b iff
|
||||
// a > ~b
|
||||
if (temp.u64[0] > ~y) {
|
||||
temp.u64[1] += 1;
|
||||
}
|
||||
temp.u64[0] += y;
|
||||
return temp.i128;
|
||||
}
|
||||
__forceinline void operator +=(uint64 x) {
|
||||
*this = *this + x;
|
||||
}
|
||||
__forceinline uint128_sse2 operator -(uint64 y) const {
|
||||
union {
|
||||
__m128i i128;
|
||||
uint64 u64[2];
|
||||
} temp;
|
||||
temp.i128 = x_;
|
||||
if (temp.u64[0] < y) {
|
||||
temp.u64[1] -= 1;
|
||||
}
|
||||
temp.u64[0] -= y;
|
||||
return temp.i128;
|
||||
}
|
||||
__forceinline void operator -=(uint64 x) {
|
||||
*this = *this - x;
|
||||
}
|
||||
|
||||
// Bitwise logical shifts.
|
||||
__forceinline uint128_sse2 operator >>(const int bits) const {
|
||||
if (bits == 8) {
|
||||
return _mm_srli_si128(x_, 1);
|
||||
} else if (bits == 16) {
|
||||
return _mm_srli_si128(x_, 2);
|
||||
} else if (bits == 32) {
|
||||
return _mm_srli_si128(x_, 4);
|
||||
} else if (bits == 64) {
|
||||
return _mm_srli_si128(x_, 8);
|
||||
} else {
|
||||
return long_shift_right(bits);
|
||||
}
|
||||
}
|
||||
__forceinline uint128_sse2 operator >>(const size_t bits) const {
|
||||
return *this >> static_cast<int>(bits);
|
||||
}
|
||||
__forceinline void operator >>=(const int bits) {
|
||||
*this = *this >> bits;
|
||||
}
|
||||
__forceinline void operator >>=(const size_t bits) {
|
||||
*this = *this >> static_cast<int>(bits);
|
||||
}
|
||||
|
||||
__forceinline uint128_sse2 operator <<(int bits) const {
|
||||
if (bits == 8) {
|
||||
return _mm_slli_si128(x_, 1);
|
||||
} else if (bits == 16) {
|
||||
return _mm_slli_si128(x_, 2);
|
||||
} else if (bits == 32) {
|
||||
return _mm_slli_si128(x_, 4);
|
||||
} else if (bits == 64) {
|
||||
return _mm_slli_si128(x_, 8);
|
||||
} else {
|
||||
return long_shift_left(bits);
|
||||
}
|
||||
}
|
||||
__forceinline uint128_sse2 operator <<(size_t bits) const {
|
||||
return *this << static_cast<int>(bits);
|
||||
}
|
||||
__forceinline void operator <<=(int bits) {
|
||||
*this = *this << bits;
|
||||
}
|
||||
__forceinline void operator <<=(size_t bits) {
|
||||
*this = *this << static_cast<int>(bits);
|
||||
}
|
||||
|
||||
protected:
|
||||
__forceinline uint128_sse2 long_shift_right(int bits) const {
|
||||
union {
|
||||
__m128i i128;
|
||||
uint64 u64[2];
|
||||
} x;
|
||||
x.i128 = x_;
|
||||
for (; bits > 0; --bits) {
|
||||
x.u64[0] >>= 1;
|
||||
if (x.u64[1] & 1) {
|
||||
x.u64[0] |= static_cast<uint64>(1) << 63;
|
||||
}
|
||||
x.u64[1] >>= 1;
|
||||
}
|
||||
return x.i128;
|
||||
}
|
||||
|
||||
__forceinline uint128_sse2 long_shift_left(int bits) const {
|
||||
union {
|
||||
__m128i i128;
|
||||
int64 i64[2];
|
||||
} x;
|
||||
x.i128 = x_;
|
||||
for (; bits > 0; --bits) {
|
||||
x.i64[1] <<= 1;
|
||||
if (x.i64[0] < 0) {
|
||||
x.i64[1] |= 1;
|
||||
}
|
||||
x.i64[0] <<= 1;
|
||||
}
|
||||
return x.i128;
|
||||
}
|
||||
|
||||
__m128i x_;
|
||||
} GCC_ALIGN_ATTRIBUTE(16);
|
||||
|
||||
|
||||
// Specialized versions.
|
||||
// Downcast specialization: uint128_sse2 -> uint64, taken from
// to_uint64().
template<> __forceinline uint64 Downcast(const uint128_sse2 &x) {
  const uint64 low = x.to_uint64();
  return low;
}
|
||||
// Downcast specialization: uint128_sse2 -> uint32, by truncating the
// result of to_uint64().
template<> __forceinline uint32 Downcast(const uint128_sse2 &x) {
  const uint64 low = x.to_uint64();
  return static_cast<uint32>(low);
}
|
||||
// Downcast specialization: uint128_sse2 -> uint16, by truncating the
// result of to_uint64().
template<> __forceinline uint16 Downcast(const uint128_sse2 &x) {
  const uint64 low = x.to_uint64();
  return static_cast<uint16>(low);
}
|
||||
// Downcast specialization: uint128_sse2 -> uint8, by truncating the
// result of to_uint64().
template<> __forceinline uint8 Downcast(const uint128_sse2 &x) {
  const uint64 low = x.to_uint64();
  return static_cast<uint8>(low);
}
|
||||
|
||||
// CrcFromUint64 specialization: builds a uint128_sse2 by placing "lo"
// in 64-bit element [0] and "hi" in 64-bit element [1] of the register.
template<> __forceinline uint128_sse2 CrcFromUint64(uint64 lo, uint64 hi) {
  union {
    __m128i i128;
    uint64 u64[2];
  } halves;
  halves.u64[1] = hi;
  halves.u64[0] = lo;
  return halves.i128;
}
|
||||
|
||||
// Uint64FromCrc specialization: splits "crc" into its two 64-bit
// elements, writing element [0] to *lo and element [1] to *hi.
template<> __forceinline void Uint64FromCrc(const uint128_sse2 &crc,
                                            uint64 *lo, uint64 *hi) {
  union {
    __m128i i128;
    uint64 u64[2];
  } halves;
  halves.i128 = crc;  // Uses uint128_sse2's conversion to __m128i.
  *hi = halves.u64[1];
  *lo = halves.u64[0];
}
|
||||
|
||||
} // namespace crcutil
|
||||
|
||||
#endif // HAVE_SSE2
|
||||
|
||||
#endif // CRCUTIL_UINT128_SSE2_H_
|
||||
Reference in New Issue
Block a user