Added version 0.2 from V.

2015-05-24 23:22:36 +01:00
commit b5e1635fe3
74 changed files with 24753 additions and 0 deletions

View File

@@ -0,0 +1,73 @@
// Copyright 2010 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Defines 8/16/32/64-bit integer types.
//
// Either uint64 or uint32 will map to size_t.
// This way, specialized variants of the CRC implementation
// parameterized by "size_t" will be reused when
// parameterized by "uint64" or "uint32".
// In turn, the specialized versions are parameterized
// by "size_t" so that one version of the code is optimal
// both on 32-bit and 64-bit platforms.
#ifndef CRCUTIL_BASE_TYPES_H_
#define CRCUTIL_BASE_TYPES_H_
#include "std_headers.h" // size_t, ptrdiff_t
namespace crcutil {
template<typename A, typename B> class ChooseFirstIfSame {
public:
template<bool same_size, typename AA, typename BB> class ChooseFirstIfTrue {
public:
typedef AA Type;
};
template<typename AA, typename BB> class ChooseFirstIfTrue<false, AA, BB> {
public:
typedef BB Type;
};
typedef typename ChooseFirstIfTrue<sizeof(A) == sizeof(B), A, B>::Type Type;
};
typedef unsigned char uint8;
typedef signed char int8;
typedef unsigned short uint16;
typedef short int16;
typedef ChooseFirstIfSame<size_t, unsigned int>::Type uint32;
typedef ChooseFirstIfSame<ptrdiff_t, int>::Type int32;
#if defined(_MSC_VER)
typedef ChooseFirstIfSame<size_t, unsigned __int64>::Type uint64;
typedef ChooseFirstIfSame<ptrdiff_t, __int64>::Type int64;
#define HAVE_UINT64 1
#elif defined(__GNUC__)
typedef ChooseFirstIfSame<size_t, unsigned long long>::Type uint64;
typedef ChooseFirstIfSame<ptrdiff_t, long long>::Type int64;
#define HAVE_UINT64 1
#else
// TODO: ensure that everything compiles and works when HAVE_UINT64 is false.
// TODO: remove HAVE_UINT64 and use sizeof(uint64) instead?
#define HAVE_UINT64 0
typedef uint32 uint64;
typedef int32 int64;
#endif
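// Illustrative sketch (not part of the original file): a C++03-style
// compile-time check that the fixed-width typedefs above have the sizes
// their names promise; instantiation fails to compile if a size is wrong.
template<bool ok> struct BaseTypesSizeCheck;
template<> struct BaseTypesSizeCheck<true> { enum { kOk = 1 }; };
enum {
  kBaseTypesChecked = BaseTypesSizeCheck<sizeof(uint8) == 1 &&
                                         sizeof(uint16) == 2 &&
                                         sizeof(uint32) == 4>::kOk
};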
} // namespace crcutil
#endif // CRCUTIL_BASE_TYPES_H_

View File

@@ -0,0 +1,366 @@
// Copyright 2010 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Implements CRC32C using Intel's SSE4 crc32 instruction.
// Uses _mm_crc32_u64/32/8 intrinsics if CRCUTIL_USE_MM_CRC32 is not zero,
// emulates intrinsics via CRC_WORD/CRC_BYTE otherwise.
#include "crc32c_sse4.h"
#if HAVE_I386 || HAVE_AMD64
namespace crcutil {
#define UPDATE_STRIPE_CRCS(index, block_size, num_stripes) do { \
CRC_UPDATE_WORD(crc0, \
reinterpret_cast<const size_t *>(src + \
0 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \
CRC_UPDATE_WORD(crc1, \
reinterpret_cast<const size_t *>(src + \
1 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \
CRC_UPDATE_WORD(crc2, \
reinterpret_cast<const size_t *>(src + \
2 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \
if (num_stripes > 3) { \
CRC_UPDATE_WORD(crc3, \
reinterpret_cast<const size_t *>(src + \
3 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \
} \
} while (0)
// Multiplies "crc" by "x**(8 * STRIPE_SIZE(block_size)"
// using appropriate multiplication table(s).
//
#if 0
// This variant is for illustration purposes only.
// The actual implementation below:
// 1. Splits the computation into 2 data-independent paths
//    by independently multiplying the lower and upper halves
//    of "crc0" in an interleaved manner, combining the
//    results at the end.
// 2. Removes the redundant "crc0 = 0" etc. at the beginning.
// 3. Removes the redundant shifts of "tmp0" and "tmp1" in the last round.
#define MULTIPLY_CRC(crc0, block_size, num_stripes) do { \
size_t tmp0 = crc0; \
crc0 = 0; \
for (size_t i = 0; i < kNumTables; ++i) { \
crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
[i][tmp0 & (kTableEntries - 1)]; \
tmp0 >>= kTableEntryBits; \
} \
} while (0)
#else
#define MULTIPLY_CRC(crc0, block_size, num_stripes) do { \
size_t tmp0 = crc0; \
size_t tmp1 = crc0 >> (kTableEntryBits * kNumTablesHalfHi); \
crc0 = CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
[0][tmp0 & (kTableEntries - 1)]; \
tmp0 >>= kTableEntryBits; \
size_t crc1 = CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
[kNumTablesHalfHi][tmp1 & (kTableEntries - 1)]; \
tmp1 >>= kTableEntryBits; \
for (size_t i = 1; i < kNumTablesHalfLo - 1; ++i) { \
crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
[i][tmp0 & (kTableEntries - 1)]; \
tmp0 >>= kTableEntryBits; \
crc1 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
[i + kNumTablesHalfHi][tmp1 & (kTableEntries - 1)]; \
tmp1 >>= kTableEntryBits; \
} \
crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
[kNumTablesHalfLo - 1][tmp0 & (kTableEntries - 1)]; \
if (kNumTables & 1) { \
tmp0 >>= kTableEntryBits; \
} \
crc1 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
[kNumTables - 1][tmp1]; \
if (kNumTables & 1) { \
crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
[kNumTablesHalfLo][tmp0 & (kTableEntries - 1)]; \
} \
crc0 ^= crc1; \
} while (0)
#endif
// Given the CRCs (crc0, crc1, etc.) of consecutive
// stripes of STRIPE_SIZE(block_size) bytes each,
// produces the CRC of the concatenated stripes.
#define COMBINE_STRIPE_CRCS(block_size, num_stripes) do { \
MULTIPLY_CRC(crc0, block_size, num_stripes); \
crc0 ^= crc1; \
MULTIPLY_CRC(crc0, block_size, num_stripes); \
crc0 ^= crc2; \
if (num_stripes > 3) { \
MULTIPLY_CRC(crc0, block_size, num_stripes); \
crc0 ^= crc3; \
} \
} while (0)
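// Worked example (illustrative, ignoring start-value canonization):
// for 3 consecutive stripes A|B|C of S bytes each with stripe CRCs
// crc0, crc1, crc2, the macro above computes
//   CRC(A|B|C) = (crc0 * x**(8*S) ^ crc1) * x**(8*S) ^ crc2 (mod P),
// where each multiplication by x**(8*S) is one MULTIPLY_CRC() invocation.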
// Processes BLOCK_SIZE(block_size) input bytes per iteration
// by splitting a block of BLOCK_SIZE(block_size) bytes into "num_stripes"
// equally-sized stripes of STRIPE_SIZE(block_size) bytes each,
// computing the CRC of each stripe, and combining the stripe CRCs.
#define PROCESS_BLOCK(block_size, num_stripes) do { \
while (bytes >= CRC32C_SSE4_BLOCK_SIZE(block_size, num_stripes)) { \
Crc crc1 = 0; \
Crc crc2 = 0; \
Crc crc3; \
if (num_stripes > 3) crc3 = 0; \
{ \
const uint8 *stripe_end = src + \
(CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) / \
kUnrolledLoopBytes) * kUnrolledLoopBytes; \
do { \
UPDATE_STRIPE_CRCS(0, block_size, num_stripes); \
UPDATE_STRIPE_CRCS(1, block_size, num_stripes); \
UPDATE_STRIPE_CRCS(2, block_size, num_stripes); \
UPDATE_STRIPE_CRCS(3, block_size, num_stripes); \
UPDATE_STRIPE_CRCS(4, block_size, num_stripes); \
UPDATE_STRIPE_CRCS(5, block_size, num_stripes); \
UPDATE_STRIPE_CRCS(6, block_size, num_stripes); \
UPDATE_STRIPE_CRCS(7, block_size, num_stripes); \
src += kUnrolledLoopBytes; \
} while (src < stripe_end); \
if ((CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) % \
kUnrolledLoopBytes) != 0) { \
stripe_end += \
CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) % \
kUnrolledLoopBytes; \
do { \
UPDATE_STRIPE_CRCS(0, block_size, num_stripes); \
src += sizeof(size_t); \
} while (src < stripe_end); \
} \
} \
COMBINE_STRIPE_CRCS(block_size, num_stripes); \
src += CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) * \
((num_stripes) - 1); \
bytes = static_cast<size_t>(end - src); \
} \
no_more_##block_size##_##num_stripes:; \
} while (0)
size_t Crc32cSSE4::Crc32c(const void *data, size_t bytes, Crc crc0) const {
const uint8 *src = static_cast<const uint8 *>(data);
const uint8 *end = src + bytes;
crc0 ^= Base().Canonize();
// If there is not much data to process,
// do not waste time trying to align the input etc.
// Noticeably improves performance on small inputs.
if (bytes < 4 * sizeof(size_t)) goto less_than_4_size_t;
if (bytes < 8 * sizeof(size_t)) goto less_than_8_size_t;
if (bytes < 16 * sizeof(size_t)) goto less_than_16_size_t;
#define PROCESS_TAIL_IF_SMALL(block_size, num_stripes) do { \
if (bytes < CRC32C_SSE4_BLOCK_SIZE(block_size, num_stripes)) { \
goto no_more_##block_size##_##num_stripes; \
} \
} while (0)
#define NOOP(block_size, num_stripes)
CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_ASCENDING(PROCESS_TAIL_IF_SMALL,
NOOP,
NOOP);
#undef PROCESS_TAIL_IF_SMALL
// Do not use ALIGN_ON_WORD_BOUNDARY_IF_NEEDED() here because:
// 1. It uses CRC_BYTE() which won't work.
// 2. Its threshold may be incorrect because a CRC32 that uses the
//    native CPU crc32 instruction is much faster than generic
//    table-based CRC computation.
//
// On an Intel Xeon X5550 CPU, the break-even point is at exactly 2KB.
if (bytes >= 2 * 1024) {
while ((reinterpret_cast<size_t>(src) & (sizeof(Word) - 1)) != 0) {
if (src >= end) {
return (crc0 ^ Base().Canonize());
}
CRC_UPDATE_BYTE(crc0, src[0]);
src += 1;
}
bytes = static_cast<size_t>(end - src);
}
if (src >= end) {
return (crc0 ^ Base().Canonize());
}
// Quickly skip processing of blocks that are too large.
// Noticeably improves performance on small inputs.
#define SKIP_BLOCK_IF_NEEDED(block_size, num_stripes) do { \
if (bytes < CRC32C_SSE4_BLOCK_SIZE(block_size, num_stripes)) { \
goto no_more_##block_size##_##num_stripes; \
} \
} while (0)
CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_ASCENDING(NOOP,
SKIP_BLOCK_IF_NEEDED,
SKIP_BLOCK_IF_NEEDED);
#undef SKIP_BLOCK_IF_NEEDED
// Process data in all blocks.
CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_DESCENDING(PROCESS_BLOCK,
PROCESS_BLOCK,
PROCESS_BLOCK);
// Finish the tail word-by-word and then byte-by-byte.
#define CRC_UPDATE_WORD_4(index) do { \
CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index]); \
CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index + 1]); \
CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index + 2]); \
CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index + 3]); \
} while (0)
if (bytes >= 4 * 4 * sizeof(size_t)) {
end -= 4 * 4 * sizeof(size_t);
do {
CRC_UPDATE_WORD_4(4 * 0);
CRC_UPDATE_WORD_4(4 * 1);
CRC_UPDATE_WORD_4(4 * 2);
CRC_UPDATE_WORD_4(4 * 3);
src += 4 * 4 * sizeof(size_t);
} while (src <= end);
end += 4 * 4 * sizeof(size_t);
bytes = static_cast<size_t>(end - src);
}
less_than_16_size_t:
if (bytes >= 4 * 2 * sizeof(size_t)) {
CRC_UPDATE_WORD_4(4 * 0);
CRC_UPDATE_WORD_4(4 * 1);
src += 4 * 2 * sizeof(size_t);
bytes -= 4 * 2 * sizeof(size_t);
}
less_than_8_size_t:
if (bytes >= 4 * sizeof(size_t)) {
CRC_UPDATE_WORD_4(0);
src += 4 * sizeof(size_t);
bytes -= 4 * sizeof(size_t);
}
less_than_4_size_t:
if (bytes >= 1 * sizeof(size_t)) {
end -= 1 * sizeof(size_t);
do {
CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[0]);
src += 1 * sizeof(size_t);
} while (src <= end);
end += 1 * sizeof(size_t);
}
while (src < end) {
CRC_UPDATE_BYTE(crc0, src[0]);
src += 1;
}
return (crc0 ^ Base().Canonize());
}
void Crc32cSSE4::Init(bool constant) {
base_.Init(FixedGeneratingPolynomial(), FixedDegree(), constant);
#define INIT_MUL_TABLE(block_size, num_stripes) do { \
size_t multiplier = \
Base().Xpow8N(CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes)); \
for (size_t table = 0; table < kNumTables; ++table) { \
for (size_t entry = 0; entry < kTableEntries; ++entry) { \
size_t value = static_cast<uint32>(entry << (kTableEntryBits * table)); \
CRC32C_SSE4_MUL_TABLE(block_size, num_stripes)[table][entry] = \
static_cast<Entry>(Base().Multiply(value, multiplier)); \
} \
} \
} while (0)
CRC32C_SSE4_ENUMERATE_ALL_BLOCKS(INIT_MUL_TABLE);
#undef INIT_MUL_TABLE
#if !CRCUTIL_USE_MM_CRC32
for (size_t j = 0; j < sizeof(Word); ++j) {
Crc k = Base().XpowN((sizeof(Word) - 1 - j) * 8 + 32);
for (size_t i = 0; i < 256; ++i) {
crc_word_[j][i] = Base().MultiplyUnnormalized(i, 8, k);
}
}
#endif // !CRCUTIL_USE_MM_CRC32
}
bool Crc32cSSE4::IsSSE42Available() {
#if defined(_MSC_VER)
int cpu_info[4];
__cpuid(cpu_info, 1);
return ((cpu_info[2] & (1 << 20)) != 0);
#elif defined(__GNUC__) && (HAVE_AMD64 || HAVE_I386)
// Not using "cpuid.h" intentionally: it is missing from
// too many installations.
uint32 eax;
uint32 ecx;
uint32 edx;
__asm__ volatile(
#if HAVE_I386 && defined(__PIC__)
"push ebx\n"
"cpuid\n"
"pop ebx\n"
#else
"cpuid\n"
#endif // HAVE_I386 && defined(__PIC__)
: "=a" (eax), "=c" (ecx), "=d" (edx)
: "a" (1), "2" (0)
: "%ebx"
);
return ((ecx & (1 << 20)) != 0);
#else
return false;
#endif
}
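// Illustrative dispatch sketch (hypothetical helper, not part of the
// original source): check IsSSE42Available() once and use this class only
// when the CPU supports the crc32 instruction, e.g.:
//
//   static size_t Crc32cOf(const void *data, size_t bytes) {
//     static const Crc32cSSE4 hw(true);  // canonical CRC32C
//     if (Crc32cSSE4::IsSSE42Available()) {
//       return hw.CrcDefault(data, bytes, 0);
//     }
//     return TableBasedCrc32c(data, bytes);  // hypothetical fallback
//   }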
void RollingCrc32cSSE4::Init(const Crc32cSSE4 &crc,
size_t roll_window_bytes,
const Crc &start_value) {
crc_ = &crc;
roll_window_bytes_ = roll_window_bytes;
start_value_ = start_value;
Crc add = crc.Base().Canonize() ^ start_value;
add = crc.Base().Multiply(add, crc.Base().Xpow8N(roll_window_bytes));
add ^= crc.Base().Canonize();
Crc mul = crc.Base().One() ^ crc.Base().Xpow8N(1);
add = crc.Base().Multiply(add, mul);
mul = crc.Base().XpowN(8 * roll_window_bytes + crc.Base().Degree());
for (size_t i = 0; i < 256; ++i) {
out_[i] = static_cast<Entry>(
crc.Base().MultiplyUnnormalized(
static_cast<Crc>(i), 8, mul) ^ add);
}
#if !CRCUTIL_USE_MM_CRC32
memcpy(crc_word_, crc_->crc_word_, sizeof(crc_word_));
#endif // !CRCUTIL_USE_MM_CRC32
}
} // namespace crcutil
#endif // HAVE_I386 || HAVE_AMD64

View File

@@ -0,0 +1,252 @@
// Copyright 2010 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Implements CRC32C using Intel's SSE4 crc32 instruction.
// Uses _mm_crc32_u64/32/8 intrinsics if CRCUTIL_USE_MM_CRC32 is not zero,
// emulates intrinsics via CRC_WORD/CRC_BYTE otherwise.
#ifndef CRCUTIL_CRC32C_SSE4_H_
#define CRCUTIL_CRC32C_SSE4_H_
#include "gf_util.h" // base types, gf_util class, etc.
#include "crc32c_sse4_intrin.h" // _mm_crc32_u* intrinsics
#if HAVE_I386 || HAVE_AMD64
#if CRCUTIL_USE_MM_CRC32
#if HAVE_I386
#define CRC_UPDATE_WORD(crc, value) (crc = _mm_crc32_u32(crc, (value)))
#else
#define CRC_UPDATE_WORD(crc, value) (crc = _mm_crc32_u64(crc, (value)))
#endif // HAVE_I386
#define CRC_UPDATE_BYTE(crc, value) \
(crc = _mm_crc32_u8(static_cast<uint32>(crc), static_cast<uint8>(value)))
#else
#include "generic_crc.h"
#define CRC_UPDATE_WORD(crc, value) do { \
size_t buf = (value); \
CRC_WORD(this, crc, buf); \
} while (0)
#define CRC_UPDATE_BYTE(crc, value) do { \
CRC_BYTE(this, crc, (value)); \
} while (0)
#endif // CRCUTIL_USE_MM_CRC32
namespace crcutil {
#pragma pack(push, 16)
// Since the same pieces have to be parameterized in many different places,
// and a mistake there would be rather hard to find,
// use a macro to enumerate all block sizes.
//
// Block sizes and number of stripes were tuned for best performance.
//
// All constants should be literal constants (too lazy to fix the macro).
//
// The use of different "macro_first", "macro", and "macro_last"
// allows generation of different code for smallest, in between,
// and largest block sizes.
//
// This macro shall be kept in sync with
// CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_DESCENDING.
// Failure to do so will cause a compile-time error.
#define CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_ASCENDING( \
macro_smallest, macro, macro_largest) \
macro_smallest(512, 3); \
macro(1024, 3); \
macro(4096, 3); \
macro_largest(32768, 3)
// This macro shall be kept in sync with
// CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_ASCENDING.
// Failure to do so will cause a compile-time error.
#define CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_DESCENDING( \
macro_smallest, macro, macro_largest) \
macro_largest(32768, 3); \
macro(4096, 3); \
macro(1024, 3); \
macro_smallest(512, 3)
// Enumerates all block sizes.
#define CRC32C_SSE4_ENUMERATE_ALL_BLOCKS(macro) \
CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_ASCENDING(macro, macro, macro)
#define CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) \
(((block_size) / (num_stripes)) & ~(sizeof(size_t) - 1))
#define CRC32C_SSE4_BLOCK_SIZE(block_size, num_stripes) \
(CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) * (num_stripes))
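// Worked example (illustrative, assuming a 64-bit build where
// sizeof(size_t) == 8): CRC32C_SSE4_STRIPE_SIZE(1024, 3) =
// (1024 / 3) & ~7 = 341 & ~7 = 336, so
// CRC32C_SSE4_BLOCK_SIZE(1024, 3) = 336 * 3 = 1008 bytes per block.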
#define CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
mul_table_##block_size##_##num_stripes##_
class RollingCrc32cSSE4;
class Crc32cSSE4 {
public:
// Exports Crc, TableEntry, and Word (needed by RollingCrc).
typedef size_t Crc;
typedef Crc Word;
typedef Crc TableEntry;
Crc32cSSE4() {}
// Initializes the tables for the fixed CRC32C generating polynomial.
// If "canonical" is true, the crc value will be XOR'ed with (-1) before
// and after the actual CRC computation.
explicit Crc32cSSE4(bool canonical) {
Init(canonical);
}
void Init(bool canonical);
// Initializes the tables given a generating polynomial of the given degree.
// If "canonical" is true, the crc value will be XOR'ed with (-1) before
// and after the actual CRC computation.
// Provided for compatibility with GenericCrc.
Crc32cSSE4(const Crc &generating_polynomial,
size_t degree,
bool canonical) {
Init(generating_polynomial, degree, canonical);
}
void Init(const Crc &generating_polynomial,
size_t degree,
bool canonical) {
if (generating_polynomial == FixedGeneratingPolynomial() &&
degree == FixedDegree()) {
Init(canonical);
}
}
// Returns the fixed generating polynomial the class implements.
static Crc FixedGeneratingPolynomial() {
return 0x82f63b78;
}
// Returns the degree of the fixed generating polynomial the class implements.
static Crc FixedDegree() {
return 32;
}
// Returns base class.
const GfUtil<Crc> &Base() const { return base_; }
// Computes CRC32C.
size_t CrcDefault(const void *data, size_t bytes, const Crc &crc) const {
return Crc32c(data, bytes, crc);
}
// Returns true iff crc32 instruction is available.
static bool IsSSE42Available();
protected:
// Actual implementation.
size_t Crc32c(const void *data, size_t bytes, Crc crc) const;
enum {
kTableEntryBits = 8,
kTableEntries = 1 << kTableEntryBits,
kNumTables = (32 + kTableEntryBits - 1) / kTableEntryBits,
kNumTablesHalfLo = kNumTables / 2,
kNumTablesHalfHi = (kNumTables + 1) / 2,
kUnrolledLoopCount = 8,
kUnrolledLoopBytes = kUnrolledLoopCount * sizeof(size_t),
};
// May be set to size_t or uint32, whichever is faster.
typedef uint32 Entry;
#define DECLARE_MUL_TABLE(block_size, num_stripes) \
Entry CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
[kNumTables][kTableEntries]
CRC32C_SSE4_ENUMERATE_ALL_BLOCKS(DECLARE_MUL_TABLE);
#undef DECLARE_MUL_TABLE
GfUtil<Crc> base_;
#if !CRCUTIL_USE_MM_CRC32
TableEntry crc_word_[sizeof(Word)][256];
friend class RollingCrc32cSSE4;
#endif // !CRCUTIL_USE_MM_CRC32
} GCC_ALIGN_ATTRIBUTE(16);
class RollingCrc32cSSE4 {
public:
typedef Crc32cSSE4::Crc Crc;
typedef Crc32cSSE4::TableEntry TableEntry;
typedef Crc32cSSE4::Word Word;
RollingCrc32cSSE4() {}
// Initializes internal data structures.
// Retains reference to "crc" instance -- it is used by Start().
RollingCrc32cSSE4(const Crc32cSSE4 &crc,
size_t roll_window_bytes,
const Crc &start_value) {
Init(crc, roll_window_bytes, start_value);
}
void Init(const Crc32cSSE4 &crc,
size_t roll_window_bytes,
const Crc &start_value);
// Computes crc of "roll_window_bytes" using
// "start_value" of "crc" (see Init()).
Crc Start(const void *data) const {
return crc_->CrcDefault(data, roll_window_bytes_, start_value_);
}
// Computes CRC of "roll_window_bytes" starting in next position.
Crc Roll(const Crc &old_crc, size_t byte_out, size_t byte_in) const {
Crc crc = old_crc;
CRC_UPDATE_BYTE(crc, byte_in);
crc ^= out_[byte_out];
return crc;
}
// Returns start value.
Crc StartValue() const { return start_value_; }
// Returns length of roll window.
size_t WindowBytes() const { return roll_window_bytes_; }
protected:
typedef Crc Entry;
Entry out_[256];
// Used only by Start().
Crc start_value_;
const Crc32cSSE4 *crc_;
size_t roll_window_bytes_;
#if !CRCUTIL_USE_MM_CRC32
TableEntry crc_word_[sizeof(Word)][256];
#endif // !CRCUTIL_USE_MM_CRC32
} GCC_ALIGN_ATTRIBUTE(16);
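// Illustrative usage sketch (hypothetical helper, not part of the original
// API): computes the rolling CRC of every "window"-byte window of "data".
// Assumes size >= window and that "roller" was Init()'ed with the same
// window length.
template<typename Visitor>
inline void RollOverBuffer(const RollingCrc32cSSE4 &roller,
                           const uint8 *data, size_t size, size_t window,
                           Visitor visit) {
  RollingCrc32cSSE4::Crc crc = roller.Start(data);
  visit(crc);  // CRC of data[0 .. window-1]
  for (size_t i = window; i < size; ++i) {
    // Slide the window one byte: remove data[i - window], add data[i].
    crc = roller.Roll(crc, data[i - window], data[i]);
    visit(crc);  // CRC of data[i-window+1 .. i]
  }
}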
#pragma pack(pop)
} // namespace crcutil
#endif // HAVE_I386 || HAVE_AMD64
#endif // CRCUTIL_CRC32C_SSE4_H_

View File

@@ -0,0 +1,99 @@
// Copyright 2010 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Provides _mm_crc32_u64/32/8 intrinsics.
#ifndef CRCUTIL_CRC32C_SSE4_INTRIN_H_
#define CRCUTIL_CRC32C_SSE4_INTRIN_H_
#include "platform.h"
#include "base_types.h"
#if CRCUTIL_USE_MM_CRC32 && (HAVE_I386 || HAVE_AMD64)
#if defined(_MSC_VER) || defined(__SSE4_2__)
#if defined(_MSC_VER)
#pragma warning(push)
// '_M_IA64' is not defined as a preprocessor macro
#pragma warning(disable: 4668)
#endif // defined(_MSC_VER)
#include <nmmintrin.h>
#if defined(_MSC_VER)
#pragma warning(pop)
#endif // defined(_MSC_VER)
#elif GCC_VERSION_AVAILABLE(4, 5) && !defined(CRCUTIL_FORCE_ASM_CRC32C)
// Allow the use of _mm_crc32_u* intrinsic when CRCUTIL_USE_MM_CRC32
// is set irrespective of "-msse*" settings. This way, the sources
// may be compiled with "-msse2 -mcrc32" and work on older CPUs,
// while taking full advantage of "crc32" instruction on newer
// CPUs (requires dynamic CPU detection). See "interface.cc".
//
// If neither -msse4 nor -mcrc32 is provided, CRCUTIL_USE_MM_CRC32 is set,
// and CRCUTIL_FORCE_ASM_CRC32C is not set, a compile-time error will happen.
// Why? Because GCC disables __builtin_ia32_crc32* intrinsics when compiled
// without -msse4 or -mcrc32. The presence of -msse4 could be detected at
// compile time by checking whether __SSE4_2__ is defined, but there is no
// way to tell whether the sources are compiled with -mcrc32.
extern __inline unsigned int __attribute__((
__gnu_inline__, __always_inline__, __artificial__))
_mm_crc32_u8(unsigned int __C, unsigned char __V) {
return __builtin_ia32_crc32qi(__C, __V);
}
#ifdef __x86_64__
extern __inline unsigned long long __attribute__((
__gnu_inline__, __always_inline__, __artificial__))
_mm_crc32_u64(unsigned long long __C, unsigned long long __V) {
return __builtin_ia32_crc32di(__C, __V);
}
#else
extern __inline unsigned int __attribute__((
__gnu_inline__, __always_inline__, __artificial__))
_mm_crc32_u32(unsigned int __C, unsigned int __V) {
return __builtin_ia32_crc32si(__C, __V);
}
#endif // __x86_64__
#else
// GCC 4.4.x and earlier: use inline asm.
namespace crcutil {
__forceinline uint64 _mm_crc32_u64(uint64 crc, uint64 value) {
asm("crc32q %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value));
return crc;
}
__forceinline uint32 _mm_crc32_u32(uint32 crc, uint32 value) {
asm("crc32l %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value));
return crc;
}
__forceinline uint32 _mm_crc32_u8(uint32 crc, uint8 value) {
asm("crc32b %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value));
return crc;
}
} // namespace crcutil
#endif
#endif // CRCUTIL_USE_MM_CRC32 && (HAVE_I386 || HAVE_AMD64)
#endif // CRCUTIL_CRC32C_SSE4_INTRIN_H_

View File

@@ -0,0 +1,68 @@
// Copyright 2010 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Casting between integers and compound CRC types.
#ifndef CRCUTIL_CRC_CASTS_H_
#define CRCUTIL_CRC_CASTS_H_
#include "base_types.h" // uint8, uint64
#include "platform.h" // __forceinline
namespace crcutil {
// Downcasts a value of (oftentimes larger) Crc type to (smaller base integer)
// Result type, enabling specialized downcasts implemented by "large integer"
// classes (e.g. uint128_sse2).
template<typename Crc, typename Result>
__forceinline Result Downcast(const Crc &x) {
return static_cast<Result>(x);
}
// Extracts 8 least significant bits from a value of Crc type.
#define TO_BYTE(x) Downcast<Crc, uint8>(x)
// Converts a pair of uint64 values into a single value of Crc type.
// It is the caller's responsibility to ensure that the input is correct.
template<typename Crc>
__forceinline Crc CrcFromUint64(uint64 lo, uint64 hi = 0) {
if (sizeof(Crc) <= sizeof(lo)) {
return static_cast<Crc>(lo);
} else {
// static_cast to keep compiler happy.
Crc result = static_cast<Crc>(hi);
result = SHIFT_LEFT_SAFE(result, 8 * sizeof(lo));
result ^= lo;
return result;
}
}
// Converts Crc value to a pair of uint64 values.
template<typename Crc>
__forceinline void Uint64FromCrc(const Crc &crc,
uint64 *lo, uint64 *hi = NULL) {
if (sizeof(*lo) >= sizeof(crc)) {
*lo = Downcast<Crc, uint64>(crc);
if (hi != NULL) {
*hi = 0;
}
} else {
*lo = Downcast<Crc, uint64>(crc);
*hi = Downcast<Crc, uint64>(SHIFT_RIGHT_SAFE(crc, 8 * sizeof(*lo)));
}
}
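// Illustrative sketch (not part of the original file): checks that
// CrcFromUint64() and Uint64FromCrc() are inverses for a given value.
// Assumes Crc supports operator== (true for the integer types above).
template<typename Crc>
__forceinline bool CrcCastsRoundTrip(const Crc &crc) {
  uint64 lo;
  uint64 hi;
  Uint64FromCrc<Crc>(crc, &lo, &hi);
  return CrcFromUint64<Crc>(lo, hi) == crc;
}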
} // namespace crcutil
#endif // CRCUTIL_CRC_CASTS_H_

View File

@@ -0,0 +1,687 @@
// Copyright 2010 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Defines GenericCrc class which implements arbitrary CRCs.
//
// Please read crc.pdf to understand how it all works.
#ifndef CRCUTIL_GENERIC_CRC_H_
#define CRCUTIL_GENERIC_CRC_H_
#include "base_types.h" // uint8
#include "crc_casts.h" // TO_BYTE(), Downcast<>.
#include "gf_util.h" // GfUtil<Crc> class.
#include "platform.h" // GCC_ALIGN_ATTRIBUTE(16)
#include "uint128_sse2.h" // uint128_sse2 type (if necessary)
namespace crcutil {
#pragma pack(push, 16)
// Extends CRC by one byte.
// Technically, if the degree of the polynomial does not exceed 8,
// the right shift by 8 bits is not required, but who cares about CRC-8?
#define CRC_BYTE(table, crc, byte) do { \
crc = ((sizeof(crc) > 1) ? SHIFT_RIGHT_SAFE(crc, 8) : 0) ^ \
table->crc_word_[sizeof(Word) - 1][TO_BYTE(crc) ^ (byte)]; \
} while (0)
#define TABLE_ENTRY(table, byte, buf) \
table[byte][Downcast<Word, uint8>(buf)]
#define TABLE_ENTRY_LAST(table, buf) \
table[sizeof(Word) - 1][buf]
// Extends CRC by one word.
#define CRC_WORD(table, crc, buf) do { \
buf ^= Downcast<Crc, Word>(crc); \
if (sizeof(crc) > sizeof(buf)) { \
crc = SHIFT_RIGHT_SAFE(crc, sizeof(buf) * 8); \
crc ^= TABLE_ENTRY(table->crc_word_, 0, buf); \
} else { \
crc = TABLE_ENTRY(table->crc_word_, 0, buf); \
} \
buf >>= 8; \
for (size_t byte = 1; byte < sizeof(buf) - 1; ++byte) { \
crc ^= TABLE_ENTRY(table->crc_word_, byte, buf); \
buf >>= 8; \
} \
crc ^= TABLE_ENTRY_LAST(table->crc_word_, buf); \
} while (0)
// Process beginning of data block byte by byte until source pointer
// becomes perfectly aligned on Word boundary.
#define ALIGN_ON_WORD_BOUNDARY(table, src, end, crc, Word) do { \
while ((reinterpret_cast<size_t>(src) & (sizeof(Word) - 1)) != 0) { \
if (src >= end) { \
return (crc ^ table->Base().Canonize()); \
} \
CRC_BYTE(table, crc, *src); \
src += 1; \
} \
} while (0)
// On amd64, enforcing alignment is 2-4% slower on small (<= 64 bytes) blocks
// but 6-10% faster on larger blocks (>= 2KB).
// Break-even point (+-1%) is around 1KB (Q9650, E6600).
//
#define ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, table, src, end, crc, Word) \
do { \
if (sizeof(Word) > 8 || (bytes) > CRCUTIL_MIN_ALIGN_SIZE) { \
ALIGN_ON_WORD_BOUNDARY(table, src, end, crc, Word); \
} \
} while (0)
#if defined(_MSC_VER)
#pragma warning(push)
#pragma warning(disable: 4127) // conditional expression is constant
#endif // defined(_MSC_VER)
// Forward declarations.
template<typename CrcImplementation> class RollingCrc;
// Crc is the type used internally and to return values of N-bit CRC.
// It should be at least as large as "TableEntry" and "Word" but
// may be larger (e.g. for 16-bit CRC, TableEntry and Word may be
// set to uint16 but Crc may be set to uint32).
//
// TableEntry is the type of values stored in the tables.
// To implement N-bit CRC, TableEntry should be large enough
// to store N bits.
//
// Word is the type used to read data sizeof(Word) bytes at a time.
// Ideally, it should be the integer type "most suitable for the given
// architecture" -- typically "size_t".
//
// kStride is the number of words processed in an interleaved manner by
// CrcMultiword() and CrcBlockword(). Shall be either 3 or 4.
// Optimal value depends on hardware architecture (AMD64, ARM, etc).
//
template<typename _Crc, typename _TableEntry, typename _Word, int kStride>
class GenericCrc {
public:
// Make Crc, TableEntry, and Word types visible (used by RollingCrc etc.)
typedef _Crc Crc;
typedef _TableEntry TableEntry;
typedef _Word Word;
GenericCrc() {}
// Initializes the tables given a generating polynomial of the given degree.
// If "canonical" is true, the crc value will be XOR'ed with (-1) before
// and after the actual CRC computation.
GenericCrc(const Crc &generating_polynomial, size_t degree, bool canonical) {
Init(generating_polynomial, degree, canonical);
}
void Init(const Crc &generating_polynomial, size_t degree, bool canonical) {
base_.Init(generating_polynomial, degree, canonical);
// Instead of computing
//   table[j][i] = MultiplyUnnormalized(i, 8, k)
// for all i = 0...255, we may notice that
// if i = 2**n, then for all m = 1...(i-1):
//   table[j][i + m] = MultiplyUnnormalized(i + m, 8, k)
//                   = MultiplyUnnormalized(i ^ m, 8, k)
//                   = MultiplyUnnormalized(i, 8, k) ^
//                     MultiplyUnnormalized(m, 8, k)
//                   = table[j][i] ^ table[j][m].
#if 0
for (size_t j = 0; j < sizeof(Word); ++j) {
Crc k = Base().XpowN((sizeof(Word) * kStride - 1 - j) * 8 + degree);
for (size_t i = 0; i < 256; ++i) {
Crc temp = Base().MultiplyUnnormalized(static_cast<Crc>(i), 8, k);
this->crc_word_interleaved_[j][i] = Downcast<Crc, TableEntry>(temp);
}
}
#else
for (size_t j = 0; j < sizeof(Word); ++j) {
Crc k = Base().XpowN((sizeof(Word) * kStride - 1 - j) * 8 + degree);
TableEntry *table = this->crc_word_interleaved_[j];
table[0] = 0; // Init the 0 entry -- multiplying 0 by anything yields 0.
for (size_t i = 1; i < 256; i <<= 1) {
TableEntry value = Downcast<Crc, TableEntry>(
Base().MultiplyUnnormalized(static_cast<Crc>(i), 8, k));
table[i] = value;
for (size_t m = 1; m < i; ++m) {
table[i + m] = value ^ table[m];
}
}
}
#endif
#if 0
for (size_t j = 0; j < sizeof(Word); ++j) {
Crc k = Base().XpowN((sizeof(Word) - 1 - j) * 8 + degree);
for (size_t i = 0; i < 256; ++i) {
Crc temp = Base().MultiplyUnnormalized(static_cast<Crc>(i), 8, k);
this->crc_word_[j][i] = Downcast<Crc, TableEntry>(temp);
}
}
#else
for (size_t j = 0; j < sizeof(Word); ++j) {
Crc k = Base().XpowN((sizeof(Word) - 1 - j) * 8 + degree);
TableEntry *table = this->crc_word_[j];
table[0] = 0; // Init the 0 entry -- multiplying 0 by anything yields 0.
for (size_t i = 1; i < 256; i <<= 1) {
TableEntry value = Downcast<Crc, TableEntry>(
Base().MultiplyUnnormalized(static_cast<Crc>(i), 8, k));
table[i] = value;
for (size_t m = 1; m < i; ++m) {
table[i + m] = value ^ table[m];
}
}
}
#endif
}
// Default CRC implementation.
Crc CrcDefault(const void *data, size_t bytes, const Crc &start) const {
#if HAVE_AMD64 || HAVE_I386
return CrcMultiword(data, bytes, start);
#else
// Very few CPUs have multiple ALUs and speculative execution
// (Itanium is an exception), so sophisticated algorithms will
// not perform better than the good old Sarwate algorithm.
return CrcByteUnrolled(data, bytes, start);
#endif // HAVE_AMD64 || HAVE_I386
}
// Returns base class.
const GfUtil<Crc> &Base() const { return base_; }
protected:
// Canonical, byte-by-byte CRC computation.
Crc CrcByte(const void *data, size_t bytes, const Crc &start) const {
const uint8 *src = static_cast<const uint8 *>(data);
Crc crc = start ^ Base().Canonize();
for (const uint8 *end = src + bytes; src < end; ++src) {
CRC_BYTE(this, crc, *src);
}
return (crc ^ Base().Canonize());
}
// Byte-by-byte CRC with main loop unrolled.
Crc CrcByteUnrolled(const void *data, size_t bytes, const Crc &start) const {
if (bytes == 0) {
return start;
}
const uint8 *src = static_cast<const uint8 *>(data);
const uint8 *end = src + bytes;
Crc crc = start ^ Base().Canonize();
// Unroll loop 4 times.
end -= 3;
for (; src < end; src += 4) {
PREFETCH(src);
CRC_BYTE(this, crc, src[0]);
CRC_BYTE(this, crc, src[1]);
CRC_BYTE(this, crc, src[2]);
CRC_BYTE(this, crc, src[3]);
}
end += 3;
// Compute CRC of remaining bytes.
for (; src < end; ++src) {
CRC_BYTE(this, crc, *src);
}
return (crc ^ Base().Canonize());
}
// Reads sizeof(Crc) bytes at a time but computes the CRC byte by byte.
Crc CrcByteWord(const void *data, size_t bytes, const Crc &start) const {
const uint8 *src = static_cast<const uint8 *>(data);
const uint8 *end = src + bytes;
Crc crc0 = start ^ Base().Canonize();
ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, Crc);
if (src >= end) {
return (crc0 ^ Base().Canonize());
}
// Process 4*sizeof(Crc) bytes at a time.
end -= 4 * sizeof(Crc) - 1;
for (; src < end; src += 4 * sizeof(Crc)) {
for (size_t i = 0; i < 4; ++i) {
crc0 ^= reinterpret_cast<const Crc *>(src)[i];
if (i == 0) {
PREFETCH(src);
}
for (size_t byte = 0; byte < sizeof(crc0); ++byte) {
CRC_BYTE(this, crc0, 0);
}
}
}
end += 4 * sizeof(Crc) - 1;
// Process sizeof(Crc) bytes at a time.
end -= sizeof(Crc) - 1;
for (; src < end; src += sizeof(Crc)) {
crc0 ^= reinterpret_cast<const Crc *>(src)[0];
for (size_t byte = 0; byte < sizeof(crc0); ++byte) {
CRC_BYTE(this, crc0, 0);
}
}
end += sizeof(Crc) - 1;
// Compute CRC of remaining bytes.
for (; src < end; ++src) {
CRC_BYTE(this, crc0, *src);
}
return (crc0 ^ Base().Canonize());
}
// Faster, word-by-word CRC.
Crc CrcWord(const void *data, size_t bytes, const Crc &start) const {
const uint8 *src = static_cast<const uint8 *>(data);
const uint8 *end = src + bytes;
Crc crc0 = start ^ Base().Canonize();
ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, Word);
if (src >= end) {
return (crc0 ^ Base().Canonize());
}
// Process 4 * sizeof(Word) bytes at once.
end -= 4 * sizeof(Word) - 1;
for (; src < end; src += 4 * sizeof(Word)) {
Word buf0 = reinterpret_cast<const Word *>(src)[0];
PREFETCH(src);
CRC_WORD(this, crc0, buf0);
buf0 = reinterpret_cast<const Word *>(src)[1];
CRC_WORD(this, crc0, buf0);
buf0 = reinterpret_cast<const Word *>(src)[2];
CRC_WORD(this, crc0, buf0);
buf0 = reinterpret_cast<const Word *>(src)[3];
CRC_WORD(this, crc0, buf0);
}
end += 4 * sizeof(Word) - 1;
// Process sizeof(Word) bytes at a time.
end -= sizeof(Word) - 1;
for (; src < end; src += sizeof(Word)) {
Word buf0 = reinterpret_cast<const Word *>(src)[0];
CRC_WORD(this, crc0, buf0);
}
end += sizeof(Word) - 1;
// Compute CRC of remaining bytes.
for (; src < end; ++src) {
CRC_BYTE(this, crc0, *src);
}
return (crc0 ^ Base().Canonize());
}
#define REPEAT_FROM_1(macro) \
macro(1); \
macro(2); \
macro(3); \
macro(4); \
macro(5); \
macro(6); \
macro(7);
#define REPEAT_FROM_0(macro) \
macro(0); \
REPEAT_FROM_1(macro)
// Faster: processes adjacent blocks in parallel and concatenates their CRCs.
Crc CrcBlockword(const void *data, size_t bytes, const Crc &start) const {
if (kStride < 2 || kStride > 8) {
// Unsupported configuration;
// fall back to something sensible.
return CrcWord(data, bytes, start);
}
const uint8 *src = static_cast<const uint8 *>(data);
const uint8 *end = src + bytes;
Crc crc0 = start ^ Base().Canonize();
enum {
// Add 16 to avoid false L1 cache collisions.
kStripe = (15*1024 + 16) & ~(sizeof(Word) - 1),
};
ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, Word);
if (src >= end) {
return (crc0 ^ Base().Canonize());
}
end -= kStride * kStripe - 1;
if (src < end) {
Crc x_pow_8kStripe = Base().Xpow8N(kStripe);
do {
const uint8 *stripe_end = src + kStripe;
#define INIT_CRC(reg) \
Crc crc##reg; \
if (kStride >= reg) { \
crc##reg = 0; \
}
REPEAT_FROM_1(INIT_CRC);
#undef INIT_CRC
do {
#define FIRST(reg) \
Word buf##reg; \
if (kStride > reg) { \
buf##reg = reinterpret_cast<const Word *>(src + reg * kStripe)[0]; \
buf##reg ^= Downcast<Crc, Word>(crc##reg); \
if (sizeof(crc##reg) > sizeof(buf##reg)) { \
crc##reg = SHIFT_RIGHT_SAFE(crc##reg, sizeof(buf##reg) * 8); \
crc##reg ^= TABLE_ENTRY(this->crc_word_, 0, buf##reg); \
} else { \
crc##reg = TABLE_ENTRY(this->crc_word_, 0, buf##reg); \
} \
buf##reg >>= 8; \
}
REPEAT_FROM_0(FIRST);
#undef FIRST
for (size_t byte = 1; byte < sizeof(buf0) - 1; ++byte) {
#define NEXT(reg) do { \
if (kStride > reg) { \
crc##reg ^= TABLE_ENTRY(this->crc_word_, byte, buf##reg); \
buf##reg >>= 8; \
} \
} while (0)
REPEAT_FROM_0(NEXT);
#undef NEXT
}
#define LAST(reg) do { \
if (kStride > reg) { \
crc##reg ^= TABLE_ENTRY_LAST(this->crc_word_, buf##reg); \
} \
} while (0)
REPEAT_FROM_0(LAST);
#undef LAST
src += sizeof(Word);
} while (src < stripe_end);
#if 0
// This code is left for illustration purposes only.
#define COMBINE(reg) do { \
if (reg > 0 && kStride > reg) { \
crc0 = Base().ChangeStartValue(crc##reg, kStripe, 0, crc0); \
} \
} while (0)
#else
#define COMBINE(reg) do { \
if (reg > 0 && kStride > reg) { \
crc0 = crc##reg ^ Base().Multiply(crc0, x_pow_8kStripe); \
} \
} while (0)
#endif
REPEAT_FROM_0(COMBINE);
#undef COMBINE
src += (kStride - 1) * kStripe;
} while (src < end);
}
end += kStride * kStripe - 1;
// Process sizeof(Word) bytes at a time.
end -= sizeof(Word) - 1;
for (; src < end; src += sizeof(Word)) {
Word buf0 = reinterpret_cast<const Word *>(src)[0];
CRC_WORD(this, crc0, buf0);
}
end += sizeof(Word) - 1;
// Compute CRC of remaining bytes.
for (; src < end; ++src) {
CRC_BYTE(this, crc0, *src);
}
return (crc0 ^ Base().Canonize());
}
// Fastest, interleaved multi-byte CRC.
Crc CrcMultiword(const void *data, size_t bytes, const Crc &start) const {
if (kStride < 2 || kStride > 8) {
// Unsupported configuration;
// fall back to something sensible.
return CrcWord(data, bytes, start);
}
const uint8 *src = static_cast<const uint8 *>(data);
const uint8 *end = src + bytes;
Crc crc0 = start ^ Base().Canonize();
ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, Word);
if (src >= end) {
return (crc0 ^ Base().Canonize());
}
// Process kStride Word registers at once;
// should have at least 2*kInterleaveBytes of data to start.
end -= 2*kInterleaveBytes - 1;
if (src < end) {
Crc crc_carryover;
if (sizeof(Crc) > sizeof(Word)) {
// crc_carryover is used if and only if Crc is wider than Word.
crc_carryover = 0;
}
#define INIT_CRC(reg) \
Crc crc##reg; \
if (reg > 0 && kStride > reg) { \
crc##reg = 0; \
}
REPEAT_FROM_1(INIT_CRC);
#undef INIT_CRC
#define INIT_BUF(reg) \
Word buf##reg; \
if (kStride > reg) { \
buf##reg = reinterpret_cast<const Word *>(src)[reg]; \
}
REPEAT_FROM_0(INIT_BUF);
#undef INIT_BUF
do {
PREFETCH(src);
src += kInterleaveBytes;
if (sizeof(Crc) > sizeof(Word)) {
crc0 ^= crc_carryover;
}
#define FIRST(reg, next_reg) do { \
if (kStride > reg) { \
buf##reg ^= Downcast<Crc, Word>(crc##reg); \
if (sizeof(Crc) > sizeof(Word)) { \
if (reg < kStride - 1) { \
crc##next_reg ^= SHIFT_RIGHT_SAFE(crc##reg, 8 * sizeof(buf0)); \
} else { \
crc_carryover = SHIFT_RIGHT_SAFE(crc##reg, 8 * sizeof(buf0)); \
} \
} \
crc##reg = TABLE_ENTRY(this->crc_word_interleaved_, 0, buf##reg); \
buf##reg >>= 8; \
} \
} while (0)
FIRST(0, 1);
FIRST(1, 2);
FIRST(2, 3);
FIRST(3, 4);
FIRST(4, 5);
FIRST(5, 6);
FIRST(6, 7);
FIRST(7, 0);
#undef FIRST
for (size_t byte = 1; byte < sizeof(Word) - 1; ++byte) {
#define NEXT(reg) do { \
if (kStride > reg) { \
crc##reg ^= \
TABLE_ENTRY(this->crc_word_interleaved_, byte, buf##reg); \
buf##reg >>= 8; \
} \
} while (0)
REPEAT_FROM_0(NEXT);
#undef NEXT
}
#define LAST(reg) do { \
if (kStride > reg) { \
crc##reg ^= TABLE_ENTRY_LAST(this->crc_word_interleaved_, buf##reg); \
buf##reg = reinterpret_cast<const Word *>(src)[reg]; \
} \
} while (0)
REPEAT_FROM_0(LAST);
#undef LAST
} while (src < end);
if (sizeof(Crc) > sizeof(Word)) {
crc0 ^= crc_carryover;
}
#define COMBINE(reg) do { \
if (kStride > reg) { \
if (reg != 0) { \
crc0 ^= crc##reg; \
} \
CRC_WORD(this, crc0, buf##reg); \
} \
} while (0)
REPEAT_FROM_0(COMBINE);
#undef COMBINE
src += kInterleaveBytes;
}
end += 2*kInterleaveBytes - 1;
// Process sizeof(Word) bytes at once.
end -= sizeof(Word) - 1;
for (; src < end; src += sizeof(Word)) {
Word buf0 = reinterpret_cast<const Word *>(src)[0];
CRC_WORD(this, crc0, buf0);
}
end += sizeof(Word) - 1;
// Compute CRC of remaining bytes.
for (; src < end; ++src) {
CRC_BYTE(this, crc0, *src);
}
return (crc0 ^ Base().Canonize());
}
protected:
enum {
kInterleaveBytes = sizeof(Word) * kStride,
};
// Multiplication tables used by CRCs.
TableEntry crc_word_interleaved_[sizeof(Word)][256];
TableEntry crc_word_[sizeof(Word)][256];
// Base class stored after CRC tables so that the most frequently
// used table is at offset 0 and may be accessed faster.
GfUtil<Crc> base_;
friend class RollingCrc< GenericCrc<Crc, TableEntry, Word, kStride> >;
private:
// CrcMultiword on amd64 may run at 1.2 CPU cycles per byte which is
// noticeably faster than CrcWord (2.2-2.6 cycles/byte depending on
// hardware and compiler). However, there are problems with compilers.
//
// Test system: P45 chipset, Intel Q9650 CPU, 800MHz 4-4-4-12 memory.
//
// 64-bit compiler, <= 64-bit CRC, 64-bit tables, 64-bit reads:
// CL 15.00.307291.1 C++ >1.2< CPU cycles/byte
// ICL 11.1.051 -O3 C++ 1.5 CPU cycles/byte
// GCC 4.5 -O3 C++ 2.0 CPU cycles/byte
// GCC 4.x -O3 ASM >1.2< CPU cycles/byte
//
// 32-bit compiler, MMX used, <= 64-bit CRC, 64-bit tables, 64-bit reads
// CL 15.00.307291.1 C++ 2.0 CPU cycles/byte
// GCC 4.5 -O3 C++ 1.9 CPU cycles/byte
// ICL 11.1.051 -S C++ 1.6 CPU cycles/byte
// GCC 4.x -O3 ASM >1.3< CPU cycles/byte
//
// So, use inline ASM code for GCC for both i386 and amd64.
Crc CrcMultiwordI386Mmx(
const void *data, size_t bytes, const Crc &start) const;
Crc CrcMultiwordGccAmd64(
const void *data, size_t bytes, const Crc &start) const;
Crc CrcMultiwordGccAmd64Sse2(
const uint8 *src, const uint8 *end, const Crc &start) const;
} GCC_ALIGN_ATTRIBUTE(16);
#undef REPEAT_FROM_0
#undef REPEAT_FROM_1
// Specialized variants.
#if CRCUTIL_USE_ASM
#if (defined(__GNUC__) && (HAVE_AMD64 || (HAVE_I386 && HAVE_MMX)))
// Declare specialized functions.
template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiword(
const void *data, size_t bytes, const uint64 &start) const;
#if HAVE_AMD64 && HAVE_SSE2
template<>
uint128_sse2
GenericCrc<uint128_sse2, uint128_sse2, uint64, 4>::CrcMultiword(
const void *data, size_t bytes, const uint128_sse2 &start) const;
#endif // HAVE_AMD64 && HAVE_SSE2
#elif defined(_MSC_FULL_VER) && _MSC_FULL_VER <= 150030729 && \
(HAVE_I386 && HAVE_MMX)
// Work around bug in MSC (present at least in v. 15.00.30729.1)
template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordI386Mmx(
const void *data,
size_t bytes,
const uint64 &start) const;
template<> __forceinline
uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiword(
const void *data,
size_t bytes,
const uint64 &start) const {
typedef uint64 Word;
typedef uint64 Crc;
if (bytes <= 12) {
const uint8 *src = static_cast<const uint8 *>(data);
uint64 crc = start ^ Base().Canonize();
for (const uint8 *end = src + bytes; src < end; ++src) {
CRC_BYTE(this, crc, *src);
}
return (crc ^ Base().Canonize());
}
return CrcMultiwordI386Mmx(data, bytes, start);
}
#endif // (defined(__GNUC__) && (HAVE_AMD64 || (HAVE_I386 && HAVE_MMX)))
#endif // CRCUTIL_USE_ASM
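// Illustrative usage sketch (not part of the original file): a CRC-32C
// instance built on the generic engine, using the same polynomial constant
// that Crc32cSSE4::FixedGeneratingPolynomial() returns (0x82f63b78).
inline uint64 GenericCrc32cExample(const void *data, size_t bytes) {
  static const GenericCrc<uint64, uint64, uint64, 4> kCrc32c(
      0x82f63b78, 32, true);
  return kCrc32c.CrcDefault(data, bytes, 0);
}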
#pragma pack(pop)
} // namespace crcutil
#endif // CRCUTIL_GENERIC_CRC_H_

crcutil-1.0/code/gf_util.h
View File

@@ -0,0 +1,304 @@
// Copyright 2010 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Defines the GfUtil template class which implements
// 1. some useful operations in GF(2^n),
// 2. CRC helper functions (e.g. concatenation of CRCs) which are
//    not affected by the specific implementation of CRC computation per se.
//
// Please read crc.pdf to understand how it all works.
#ifndef CRCUTIL_GF_UTIL_H_
#define CRCUTIL_GF_UTIL_H_
#include "base_types.h" // uint8, uint64
#include "crc_casts.h" // TO_BYTE()
#include "platform.h" // GCC_ALIGN_ATTRIBUTE(16), SHIFT_*_SAFE
namespace crcutil {
#pragma pack(push, 16)
// "Crc" is the type used internally and to return values of N-bit CRC.
template<typename Crc> class GfUtil {
public:
// Initializes the tables given a generating polynomial of degree (degree).
// If "canonical" is true, the starting CRC value and the computed CRC
// value will be XOR-ed with 111...111.
GfUtil() {}
GfUtil(const Crc &generating_polynomial, size_t degree, bool canonical) {
Init(generating_polynomial, degree, canonical);
}
void Init(const Crc &generating_polynomial, size_t degree, bool canonical) {
Crc one = 1;
one <<= degree - 1;
this->generating_polynomial_ = generating_polynomial;
this->crc_bytes_ = (degree + 7) >> 3;
this->degree_ = degree;
this->one_ = one;
if (canonical) {
this->canonize_ = one | (one - 1);
} else {
this->canonize_ = 0;
}
this->normalize_[0] = 0;
this->normalize_[1] = generating_polynomial;
Crc k = one >> 1;
for (size_t i = 0; i < sizeof(uint64) * 8; ++i) {
this->x_pow_2n_[i] = k;
k = Multiply(k, k);
}
this->crc_of_crc_ = Multiply(this->canonize_,
this->one_ ^ Xpow8N((degree + 7) >> 3));
FindLCD(Xpow8N(this->crc_bytes_), &this->x_pow_minus_W_);
}
// Returns generating polynomial.
Crc GeneratingPolynomial() const {
return this->generating_polynomial_;
}
// Returns number of bits in CRC (degree of generating polynomial).
size_t Degree() const {
return this->degree_;
}
// Returns start/finish adjustment constant.
Crc Canonize() const {
return this->canonize_;
}
// Returns normalized value of 1.
Crc One() const {
return this->one_;
}
// Returns the value of CRC(A, |A|, start_new) given the known
// crc = CRC(A, |A|, start_old) -- without touching the data.
Crc ChangeStartValue(const Crc &crc, uint64 bytes,
const Crc &start_old,
const Crc &start_new) const {
return (crc ^ Multiply(start_new ^ start_old, Xpow8N(bytes)));
}
// Returns CRC of concatenation of blocks A and B when CRCs
// of blocks A and B are known -- without touching the data.
//
// To be precise, given CRC(A, |A|, startA) and CRC(B, |B|, 0),
// returns CRC(AB, |AB|, startA).
Crc Concatenate(const Crc &crc_A, const Crc &crc_B, uint64 bytes_B) const {
return ChangeStartValue(crc_B, bytes_B, 0 /* start_B */, crc_A);
}
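// Usage sketch (illustrative): given crc_A = CRC(A, |A|, start) and
// crc_B = CRC(B, |B|, 0),
//   Crc crc_AB = gf.Concatenate(crc_A, crc_B, bytes_B);
// yields CRC(AB, |AB|, start) without reading A or B again.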
// Returns CRC of sequence of zeroes -- without touching the data.
Crc CrcOfZeroes(uint64 bytes, const Crc &start) const {
Crc tmp = Multiply(start ^ this->canonize_, Xpow8N(bytes));
return (tmp ^ this->canonize_);
}
// Given CRC of a message, stores extra (degree + 7)/8 bytes after
// the message so that CRC(message+extra, start) = result.
// Does not change CRC start value (use ChangeStartValue for that).
// Returns number of stored bytes.
size_t StoreComplementaryCrc(void *dst,
const Crc &message_crc,
const Crc &result) const {
Crc crc0 = Multiply(result ^ this->canonize_, this->x_pow_minus_W_);
crc0 ^= message_crc ^ this->canonize_;
uint8 *d = reinterpret_cast<uint8 *>(dst);
for (size_t i = 0; i < this->crc_bytes_; ++i) {
d[i] = TO_BYTE(crc0);
crc0 >>= 8;
}
return this->crc_bytes_;
}
// Stores given CRC of a message as (degree + 7)/8 bytes filled
// with 0s to the right. Returns number of stored bytes.
// CRC of the message and stored CRC is a constant value returned
// by CrcOfCrc() -- it does not depend on contents of the message.
size_t StoreCrc(void *dst, const Crc &crc) const {
uint8 *d = reinterpret_cast<uint8 *>(dst);
Crc crc0 = crc;
for (size_t i = 0; i < this->crc_bytes_; ++i) {
d[i] = TO_BYTE(crc0);
crc0 >>= 8;
}
return this->crc_bytes_;
}
// Returns expected CRC value of CRC(Message,CRC(Message))
// when CRC is stored after the message. This value is fixed
// and does not depend on the message or CRC start value.
Crc CrcOfCrc() const {
return this->crc_of_crc_;
}
// Returns ((a * b) mod P) where "a" and "b" are of degree <= (D-1).
Crc Multiply(const Crc &aa, const Crc &bb) const {
Crc a = aa;
Crc b = bb;
if ((a ^ (a - 1)) < (b ^ (b - 1))) {
Crc temp = a;
a = b;
b = temp;
}
if (a == 0) {
return a;
}
Crc product = 0;
Crc one = this->one_;
for (; a != 0; a <<= 1) {
if ((a & one) != 0) {
product ^= b;
a ^= one;
}
b = (b >> 1) ^ this->normalize_[Downcast<Crc, size_t>(b & 1)];
}
return product;
}
// Returns ((unnorm * m) mod P) where degree of m is <= (D-1)
// and degree of value "unnorm" is provided explicitly.
Crc MultiplyUnnormalized(const Crc &unnorm, size_t degree,
const Crc &m) const {
Crc v = unnorm;
Crc result = 0;
while (degree > this->degree_) {
degree -= this->degree_;
Crc value = v & (this->one_ | (this->one_ - 1));
result ^= Multiply(value, Multiply(m, XpowN(degree)));
v >>= this->degree_;
}
result ^= Multiply(v << (this->degree_ - degree), m);
return result;
}
// Returns ((x ** n) mod P).
Crc XpowN(uint64 n) const {
Crc one = this->one_;
Crc result = one;
for (size_t i = 0; n != 0; ++i, n >>= 1) {
if (n & 1) {
result = Multiply(result, this->x_pow_2n_[i]);
}
}
return result;
}
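// Worked example (illustrative): n = 13 = binary 1101, so
// XpowN(13) = x_pow_2n_[0] * x_pow_2n_[2] * x_pow_2n_[3]
//           = x**1 * x**4 * x**8 = x**13 (mod P).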
// Returns (x ** (8 * n) mod P).
Crc Xpow8N(uint64 n) const {
return XpowN(n << 3);
}
// Returns remainder (A mod B) and sets *q = (A/B) of division
// of two polynomials:
// A = dividend + dividend_x_pow_D_coef * x**degree,
// B = divisor.
Crc Divide(const Crc &dividend0, int dividend_x_pow_D_coef,
const Crc &divisor0, Crc *q) const {
Crc divisor = divisor0;
Crc dividend = dividend0;
Crc quotient = 0;
Crc coef = this->one_;
while ((divisor & 1) == 0) {
divisor >>= 1;
coef >>= 1;
}
if (dividend_x_pow_D_coef) {
quotient = coef >> 1;
dividend ^= divisor >> 1;
}
Crc x_pow_degree_b = 1;
for (;;) {
if ((dividend & x_pow_degree_b) != 0) {
dividend ^= divisor;
quotient ^= coef;
}
if (coef == this->one_) {
break;
}
coef <<= 1;
x_pow_degree_b <<= 1;
divisor <<= 1;
}
*q = quotient;
return dividend;
}
// Extended Euclid's algorithm -- for a given A, finds LCD(A, P) and
// a value B such that (A * B) mod P = LCD(A, P).
Crc FindLCD(const Crc &A, Crc *B) const {
if (A == 0 || A == this->one_) {
*B = A;
return A;
}
// Actually, the generating polynomial is
// (generating_polynomial_ + x**degree).
int r0_x_pow_D_coef = 1;
Crc r0 = this->generating_polynomial_;
Crc b0 = 0;
Crc r1 = A;
Crc b1 = this->one_;
for (;;) {
Crc q;
Crc r = Divide(r0, r0_x_pow_D_coef, r1, &q);
if (r == 0) {
break;
}
r0_x_pow_D_coef = 0;
r0 = r1;
r1 = r;
Crc b = b0 ^ Multiply(q, b1);
b0 = b1;
b1 = b;
}
*B = b1;
return r1;
}
protected:
Crc canonize_;
Crc x_pow_2n_[sizeof(uint64) * 8];
Crc generating_polynomial_;
Crc one_;
Crc x_pow_minus_W_;
Crc crc_of_crc_;
Crc normalize_[2];
size_t crc_bytes_;
size_t degree_;
} GCC_ALIGN_ATTRIBUTE(16);
#pragma pack(pop)
} // namespace crcutil
#endif // CRCUTIL_GF_UTIL_H_

View File

@@ -0,0 +1,291 @@
// Copyright 2010 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Implements multiword CRC for GCC on AMD64 using SSE2.
//
// Small comment: the trick described in
// http://software.intel.com/en-us/articles/fast-simd-integer-move-for-the-intel-pentiumr-4-processor
// (replace "movdqa dst, src" with "pshufd $0xE4, src, dst")
// did not work: execution time increased from
// 1.8 CPU cycles/byte to 2.1 CPU cycles/byte.
// So it may be a good idea on P4, but it is not on newer CPUs.
//
// movaps/xorps vs. movdqa/pxor did not make any difference.
#include "generic_crc.h"
#include "uint128_sse2.h"
#if defined(__GNUC__) && CRCUTIL_USE_ASM && HAVE_AMD64 && HAVE_SSE2
namespace crcutil {
template<> uint128_sse2
GenericCrc<uint128_sse2, uint128_sse2, uint64, 4>::CrcMultiwordGccAmd64Sse2(
const uint8 *src, const uint8 *end, const uint128_sse2 &start) const;
template<>
uint128_sse2 GenericCrc<uint128_sse2, uint128_sse2, uint64, 4>::CrcMultiword(
const void *data, size_t bytes, const uint128_sse2 &start) const {
const uint8 *src = static_cast<const uint8 *>(data);
uint128_sse2 crc = start ^ this->Base().Canonize();
const uint8 *end = src + bytes;
if (bytes <= 7) {
for (; src < end; ++src) {
CRC_BYTE(this, crc, *src);
}
return (crc ^ this->Base().Canonize());
}
ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc, uint64);
if (src >= end) {
return (crc ^ this->Base().Canonize());
}
return CrcMultiwordGccAmd64Sse2(src, end, crc);
}
#define CRC_WORD_ASM() \
SSE2_MOVQ " %[crc0], %[tmp0]\n" \
"xorq %[tmp0], %[buf0]\n" \
"psrldq $8, %[crc0]\n" \
"movzbq %b[buf0], %[tmp0]\n" \
"shrq $8, %[buf0]\n" \
"addq %[tmp0], %[tmp0]\n" \
"pxor (%[table_word], %[tmp0], 8), %[crc0]\n" \
"movzbq %b[buf0], %[tmp1]\n" \
"shrq $8, %[buf0]\n" \
"addq %[tmp1], %[tmp1]\n" \
"pxor 1*256*16(%[table_word], %[tmp1], 8), %[crc0]\n" \
"movzbq %b[buf0], %[tmp0]\n" \
"shrq $8, %[buf0]\n" \
"addq %[tmp0], %[tmp0]\n" \
"pxor 2*256*16(%[table_word], %[tmp0], 8), %[crc0]\n" \
"movzbq %b[buf0], %[tmp1]\n" \
"shrq $8, %[buf0]\n" \
"addq %[tmp1], %[tmp1]\n" \
"pxor 3*256*16(%[table_word], %[tmp1], 8), %[crc0]\n" \
"movzbq %b[buf0], %[tmp0]\n" \
"shrq $8, %[buf0]\n" \
"addq %[tmp0], %[tmp0]\n" \
"pxor 4*256*16(%[table_word], %[tmp0], 8), %[crc0]\n" \
"movzbq %b[buf0], %[tmp1]\n" \
"shrq $8, %[buf0]\n" \
"addq %[tmp1], %[tmp1]\n" \
"pxor 5*256*16(%[table_word], %[tmp1], 8), %[crc0]\n" \
"movzbq %b[buf0], %[tmp0]\n" \
"shrq $8, %[buf0]\n" \
"addq %[tmp0], %[tmp0]\n" \
"pxor 6*256*16(%[table_word], %[tmp0], 8), %[crc0]\n" \
"addq %[buf0], %[buf0]\n" \
"pxor 7*256*16(%[table_word], %[buf0], 8), %[crc0]\n"
template<> uint128_sse2
GenericCrc<uint128_sse2, uint128_sse2, uint64, 4>::CrcMultiwordGccAmd64Sse2(
const uint8 *src, const uint8 *end, const uint128_sse2 &start) const {
__m128i crc0 = start;
__m128i crc1;
__m128i crc2;
__m128i crc3;
__m128i crc_carryover;
uint64 buf0;
uint64 buf1;
uint64 buf2;
uint64 buf3;
uint64 tmp0;
uint64 tmp1;
asm(
"sub $2*4*8 - 1, %[end]\n"
"cmpq %[src], %[end]\n"
"jbe 2f\n"
"pxor %[crc1], %[crc1]\n"
"pxor %[crc2], %[crc2]\n"
"pxor %[crc3], %[crc3]\n"
"pxor %[crc_carryover], %[crc_carryover]\n"
"movq (%[src]), %[buf0]\n"
"movq 1*8(%[src]), %[buf1]\n"
"movq 2*8(%[src]), %[buf2]\n"
"movq 3*8(%[src]), %[buf3]\n"
"1:\n"
#if HAVE_SSE && CRCUTIL_PREFETCH_WIDTH > 0
"prefetcht0 " TO_STRING(CRCUTIL_PREFETCH_WIDTH) "(%[src])\n"
#endif
#if GCC_VERSION_AVAILABLE(4, 5)
// Bug in GCC 4.2.4?
"add $4*8, %[src]\n"
#else
"lea 4*8(%[src]), %[src]\n"
#endif
"pxor %[crc_carryover], %[crc0]\n"
SSE2_MOVQ " %[crc0], %[tmp0]\n"
"psrldq $8, %[crc0]\n"
"xorq %[tmp0], %[buf0]\n"
"movzbq %b[buf0], %[tmp0]\n"
"pxor %[crc0], %[crc1]\n"
"addq %[tmp0], %[tmp0]\n"
"shrq $8, %[buf0]\n"
"movdqa (%[table], %[tmp0], 8), %[crc0]\n"
SSE2_MOVQ " %[crc1], %[tmp1]\n"
"psrldq $8, %[crc1]\n"
"xorq %[tmp1], %[buf1]\n"
"movzbq %b[buf1], %[tmp1]\n"
"pxor %[crc1], %[crc2]\n"
"addq %[tmp1], %[tmp1]\n"
"shrq $8, %[buf1]\n"
"movdqa (%[table], %[tmp1], 8), %[crc1]\n"
SSE2_MOVQ " %[crc2], %[tmp0]\n"
"psrldq $8, %[crc2]\n"
"xorq %[tmp0], %[buf2]\n"
"movzbq %b[buf2], %[tmp0]\n"
"pxor %[crc2], %[crc3]\n"
"addq %[tmp0], %[tmp0]\n"
"shrq $8, %[buf2]\n"
"movdqa (%[table], %[tmp0], 8), %[crc2]\n"
SSE2_MOVQ " %[crc3], %[tmp1]\n"
"psrldq $8, %[crc3]\n"
"xorq %[tmp1], %[buf3]\n"
"movzbq %b[buf3], %[tmp1]\n"
"movdqa %[crc3], %[crc_carryover]\n"
"addq %[tmp1], %[tmp1]\n"
"shrq $8, %[buf3]\n"
"movdqa (%[table], %[tmp1], 8), %[crc3]\n"
#define XOR(byte) \
"movzbq %b[buf0], %[tmp0]\n" \
"shrq $8, %[buf0]\n" \
"addq %[tmp0], %[tmp0]\n" \
"pxor " #byte "*256*16(%[table], %[tmp0], 8), %[crc0]\n" \
"movzbq %b[buf1], %[tmp1]\n" \
"shrq $8, %[buf1]\n" \
"addq %[tmp1], %[tmp1]\n" \
"pxor " #byte "*256*16(%[table], %[tmp1], 8), %[crc1]\n" \
"movzbq %b[buf2], %[tmp0]\n" \
"shrq $8, %[buf2]\n" \
"addq %[tmp0], %[tmp0]\n" \
"pxor " #byte "*256*16(%[table], %[tmp0], 8), %[crc2]\n" \
"movzbq %b[buf3], %[tmp1]\n" \
"shrq $8, %[buf3]\n" \
"addq %[tmp1], %[tmp1]\n" \
"pxor " #byte "*256*16(%[table], %[tmp1], 8), %[crc3]\n"
XOR(1)
XOR(2)
XOR(3)
XOR(4)
XOR(5)
XOR(6)
#undef XOR
"addq %[buf0], %[buf0]\n"
"pxor 7*256*16(%[table], %[buf0], 8), %[crc0]\n"
"movq (%[src]), %[buf0]\n"
"addq %[buf1], %[buf1]\n"
"pxor 7*256*16(%[table], %[buf1], 8), %[crc1]\n"
"movq 1*8(%[src]), %[buf1]\n"
"addq %[buf2], %[buf2]\n"
"pxor 7*256*16(%[table], %[buf2], 8), %[crc2]\n"
"movq 2*8(%[src]), %[buf2]\n"
"addq %[buf3], %[buf3]\n"
"pxor 7*256*16(%[table], %[buf3], 8), %[crc3]\n"
"movq 3*8(%[src]), %[buf3]\n"
"cmpq %[src], %[end]\n"
"ja 1b\n"
"pxor %[crc_carryover], %[crc0]\n"
CRC_WORD_ASM()
"pxor %[crc1], %[crc0]\n"
"movq %[buf1], %[buf0]\n"
CRC_WORD_ASM()
"pxor %[crc2], %[crc0]\n"
"movq %[buf2], %[buf0]\n"
CRC_WORD_ASM()
"pxor %[crc3], %[crc0]\n"
"movq %[buf3], %[buf0]\n"
CRC_WORD_ASM()
"add $4*8, %[src]\n"
"2:\n"
"add $2*4*8 - 8, %[end]\n"
"cmpq %[src], %[end]\n"
"jbe 4f\n"
"3:\n"
"movq (%[src]), %[buf0]\n"
"addq $8, %[src]\n"
CRC_WORD_ASM()
"cmpq %[src], %[end]\n"
"ja 3b\n"
"4:\n"
"add $7, %[end]\n"
"cmpq %[src], %[end]\n"
"jbe 6f\n"
"5:\n"
"movzbq (%[src]), %[buf0]\n"
"add $1, %[src]\n"
SSE2_MOVQ " %[crc0], %[tmp0]\n"
"movzx %b[tmp0], %[tmp0]\n"
"psrldq $1, %[crc0]\n"
"xor %[buf0], %[tmp0]\n"
"addq %[tmp0], %[tmp0]\n"
"pxor 7*256*16(%[table_word], %[tmp0], 8), %[crc0]\n"
"cmpq %[src], %[end]\n"
"ja 5b\n"
"6:\n"
: // outputs
[src] "+r" (src),
[end] "+r" (end),
[crc0] "+x" (crc0),
[crc1] "=&x" (crc1),
[crc2] "=&x" (crc2),
[crc3] "=&x" (crc3),
[crc_carryover] "=&x" (crc_carryover),
[buf0] "=&r" (buf0),
[buf1] "=&r" (buf1),
[buf2] "=&r" (buf2),
[buf3] "=&r" (buf3),
[tmp0] "=&r" (tmp0),
[tmp1] "=&r" (tmp1)
: // inputs
[table_word] "r" (this->crc_word_),
[table] "r" (this->crc_word_interleaved_));
return (this->Base().Canonize() ^ crc0);
}
} // namespace crcutil
#endif // defined(__GNUC__) && CRCUTIL_USE_ASM && HAVE_AMD64 && HAVE_SSE2

View File

@@ -0,0 +1,304 @@
// Copyright 2010 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Implements 64-bit multiword CRC for Microsoft and Intel compilers
// using MMX instructions (i386).
#include "generic_crc.h"
#if CRCUTIL_USE_ASM && HAVE_I386 && HAVE_MMX && defined(_MSC_VER)
namespace crcutil {
#define CRC_WORD_MMX() \
__asm pxor BUF0, CRC0 \
__asm movd TMP0, BUF0 \
__asm psrlq BUF0, 32 \
__asm movzx TEMP, TMP0L \
__asm shr TMP0, 8 \
__asm movq CRC0, [TABLE + TEMP * 8] \
__asm movzx TEMP, TMP0L \
__asm shr TMP0, 8 \
__asm pxor CRC0, [TABLE + TEMP * 8 + 1 * 256 * 8] \
__asm movzx TEMP, TMP0L \
__asm shr TMP0, 8 \
__asm pxor CRC0, [TABLE + TEMP * 8 + 2 * 256 * 8] \
__asm pxor CRC0, [TABLE + TMP0 * 8 + 3 * 256 * 8] \
__asm movd TMP0, BUF0 \
__asm movzx TEMP, TMP0L \
__asm shr TMP0, 8 \
__asm pxor CRC0, [TABLE + TEMP * 8 + 4 * 256 * 8] \
__asm movzx TEMP, TMP0L \
__asm shr TMP0, 8 \
__asm pxor CRC0, [TABLE + TEMP * 8 + 5 * 256 * 8] \
__asm movzx TEMP, TMP0L \
__asm shr TMP0, 8 \
__asm pxor CRC0, [TABLE + TEMP * 8 + 6 * 256 * 8] \
__asm pxor CRC0, [TABLE + TMP0 * 8 + 7 * 256 * 8]
// frame pointer register 'ebp' modified by inline assembly code
#pragma warning(disable: 4731)
template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordI386Mmx(
const void *data,
size_t bytes,
const uint64 &start) const {
const uint8 *src = static_cast<const uint8 *>(data);
const uint8 *end = src + bytes;
uint64 crc0 = start ^ this->Base().Canonize();
ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, uint64);
if (src >= end) {
return (crc0 ^ this->Base().Canonize());
}
#define CRC0 mm0
#define CRC1 mm1
#define CRC2 mm2
#define CRC3 mm3
#define BUF0 mm4
#define BUF1 mm5
#define BUF2 mm6
#define BUF3 mm7
#define TMP0 eax
#define TMP0L al
#define TMP0H ah
#define TMP1 ebx
#define TMP1L bl
#define TMP1H bh
#define TMP2 ecx
#define TMP2L cl
#define TMP2H ch
#define TMP3 edx
#define TMP3L dl
#define TMP3H dh
#define TEMP edi
#define SRC esi
#define END [esp]
#define TABLE ebp
const uint64 *interleaved_table_address =
&this->crc_word_interleaved_[0][0];
const uint64 *word_table_address = &this->crc_word_[0][0];
__asm {
push ebp
mov TMP0, interleaved_table_address
movq CRC0, crc0
mov SRC, src
mov TMP1, end
sub TMP1, 2*4*8 - 1
cmp SRC, TMP1
mov TABLE, word_table_address
jae end_main_loop
push TABLE
mov TABLE, TMP0
push TMP1
pxor CRC1, CRC1
pxor CRC2, CRC2
pxor CRC3, CRC3
movq BUF0, [SRC]
movq BUF1, [SRC + 1 * 8]
movq BUF2, [SRC + 2 * 8]
movq BUF3, [SRC + 3 * 8]
main_loop:
#if HAVE_SSE && CRCUTIL_PREFETCH_WIDTH > 0
prefetcht0 [SRC + CRCUTIL_PREFETCH_WIDTH]
#endif
add SRC, 32
pxor BUF0, CRC0
pxor BUF1, CRC1
pxor BUF2, CRC2
pxor BUF3, CRC3
movd TMP0, BUF0
psrlq BUF0, 32
movd TMP1, BUF1
psrlq BUF1, 32
movd TMP2, BUF2
psrlq BUF2, 32
movd TMP3, BUF3
psrlq BUF3, 32
movzx TEMP, TMP0L
movq CRC0, [TABLE + TEMP * 8]
movzx TEMP, TMP1L
movq CRC1, [TABLE + TEMP * 8]
movzx TEMP, TMP2L
movq CRC2, [TABLE + TEMP * 8]
movzx TEMP, TMP3L
movq CRC3, [TABLE + TEMP * 8]
movzx TEMP, TMP0H
shr TMP0, 16
pxor CRC0, [TABLE + TEMP * 8 + 1 * 256 * 8]
movzx TEMP, TMP1H
shr TMP1, 16
pxor CRC1, [TABLE + TEMP * 8 + 1 * 256 * 8]
movzx TEMP, TMP2H
shr TMP2, 16
pxor CRC2, [TABLE + TEMP * 8 + 1 * 256 * 8]
movzx TEMP, TMP3H
shr TMP3, 16
pxor CRC3, [TABLE + TEMP * 8 + 1 * 256 * 8]
movzx TEMP, TMP0L
shr TMP0, 8
pxor CRC0, [TABLE + TEMP * 8 + 2 * 256 * 8]
movzx TEMP, TMP1L
shr TMP1, 8
pxor CRC1, [TABLE + TEMP * 8 + 2 * 256 * 8]
movzx TEMP, TMP2L
shr TMP2, 8
pxor CRC2, [TABLE + TEMP * 8 + 2 * 256 * 8]
movzx TEMP, TMP3L
shr TMP3, 8
pxor CRC3, [TABLE + TEMP * 8 + 2 * 256 * 8]
pxor CRC0, [TABLE + TMP0 * 8 + 3 * 256 * 8]
movd TMP0, BUF0
pxor CRC1, [TABLE + TMP1 * 8 + 3 * 256 * 8]
movd TMP1, BUF1
pxor CRC2, [TABLE + TMP2 * 8 + 3 * 256 * 8]
movd TMP2, BUF2
pxor CRC3, [TABLE + TMP3 * 8 + 3 * 256 * 8]
movd TMP3, BUF3
movzx TEMP, TMP0L
pxor CRC0, [TABLE + TEMP * 8 + 4 * 256 * 8]
movzx TEMP, TMP1L
pxor CRC1, [TABLE + TEMP * 8 + 4 * 256 * 8]
movzx TEMP, TMP2L
pxor CRC2, [TABLE + TEMP * 8 + 4 * 256 * 8]
movzx TEMP, TMP3L
pxor CRC3, [TABLE + TEMP * 8 + 4 * 256 * 8]
movzx TEMP, TMP0H
shr TMP0, 16
pxor CRC0, [TABLE + TEMP * 8 + 5 * 256 * 8]
movzx TEMP, TMP1H
shr TMP1, 16
pxor CRC1, [TABLE + TEMP * 8 + 5 * 256 * 8]
movzx TEMP, TMP2H
shr TMP2, 16
pxor CRC2, [TABLE + TEMP * 8 + 5 * 256 * 8]
movzx TEMP, TMP3H
shr TMP3, 16
pxor CRC3, [TABLE + TEMP * 8 + 5 * 256 * 8]
movzx TEMP, TMP0L
shr TMP0, 8
pxor CRC0, [TABLE + TEMP * 8 + 6 * 256 * 8]
movzx TEMP, TMP1L
shr TMP1, 8
pxor CRC1, [TABLE + TEMP * 8 + 6 * 256 * 8]
movzx TEMP, TMP2L
shr TMP2, 8
pxor CRC2, [TABLE + TEMP * 8 + 6 * 256 * 8]
movzx TEMP, TMP3L
shr TMP3, 8
pxor CRC3, [TABLE + TEMP * 8 + 6 * 256 * 8]
pxor CRC0, [TABLE + TMP0 * 8 + 7 * 256 * 8]
movq BUF0, [SRC]
pxor CRC1, [TABLE + TMP1 * 8 + 7 * 256 * 8]
movq BUF1, [SRC + 1 * 8]
pxor CRC2, [TABLE + TMP2 * 8 + 7 * 256 * 8]
movq BUF2, [SRC + 2 * 8]
pxor CRC3, [TABLE + TMP3 * 8 + 7 * 256 * 8]
movq BUF3, [SRC + 3 * 8]
cmp END, SRC
ja main_loop
#undef END
#define END TMP1
pop END
pop TABLE
add SRC, 32
CRC_WORD_MMX()
pxor BUF1, CRC1
movq BUF0, BUF1
CRC_WORD_MMX()
pxor BUF2, CRC2
movq BUF0, BUF2
CRC_WORD_MMX()
pxor BUF3, CRC3
movq BUF0, BUF3
CRC_WORD_MMX()
end_main_loop:
add END, 2*4*8 - 8
cmp SRC, END
jae end_word_loop
word_loop:
movq BUF0, [SRC]
add SRC, 8
CRC_WORD_MMX()
cmp END, SRC
ja word_loop
end_word_loop:
#if 0 // Plain C version is faster?
add END, 7
cmp SRC, END
jae end_byte_loop
byte_loop:
movd TMP0, CRC0
movzx TEMP, byte ptr [SRC]
movzx TMP0, TMP0L
psrlq CRC0, 8
xor TEMP, TMP0
add SRC, 1
pxor CRC0, [TABLE + TEMP*8 + 7*256*8]
cmp END, SRC
ja byte_loop
end_byte_loop:
#endif
pop ebp
mov src, SRC
movq crc0, CRC0
emms
}
#if 1
// Compute CRC of remaining bytes.
for (; src < end; ++src) {
CRC_BYTE(this, crc0, *src);
}
#endif
return (crc0 ^ this->Base().Canonize());
}
} // namespace crcutil
#endif // CRCUTIL_USE_ASM && HAVE_I386 && HAVE_MMX && defined(_MSC_VER)

View File

@@ -0,0 +1,298 @@
// Copyright 2010 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Implements multiword CRC for GCC on AMD64.
//
// According to "Software Optimization Guide for AMD Family 10h Processors"
// http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/40546.pdf
// instead of
// movzbq %al, %rsi
// shrq $8, %rax
// [use %rsi]
// movzbq %al, %rsi
// shrq $8, %rax
// [use %rsi]
// it is better to use 32-bit registers
// (high 32 bits will be cleared on assignment), i.e.
// movzbl %al, %esi
// [use %rsi]
// movzbl %ah, %esi
// shrq $16, %rax
// [use %rsi]
// This makes the instructions shorter and removes one shift
// (the latter is not a big deal, as its execution time
// is nicely masked by the [use %rsi] instruction).
//
// Performance difference:
// About 10% degradation on bytes = 8 .. 16
// (clobbering registers that should be saved)
// Break even at 32 bytes.
// 3% improvement starting from 64 bytes.
#include "generic_crc.h"
#if defined(__GNUC__) && CRCUTIL_USE_ASM && HAVE_AMD64
namespace crcutil {
template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordGccAmd64(
const void *data, size_t bytes, const uint64 &start) const;
template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiword(
const void *data,
size_t bytes,
const uint64 &start) const {
if (bytes <= 6 * sizeof(Word) - 1) {
const uint8 *src = static_cast<const uint8 *>(data);
uint64 crc = start ^ this->Base().Canonize();
const uint8 *end = src + bytes;
#define PROCESS_ONE_WORD() do { \
Word buf = reinterpret_cast<const Word *>(src)[0]; \
CRC_WORD(this, crc, buf); \
src += sizeof(Word); \
} while (0)
if (bytes >= 1 * sizeof(Word)) {
PROCESS_ONE_WORD();
if (bytes >= 2 * sizeof(Word)) {
PROCESS_ONE_WORD();
if (bytes >= 3 * sizeof(Word)) {
PROCESS_ONE_WORD();
if (bytes >= 4 * sizeof(Word)) {
PROCESS_ONE_WORD();
if (bytes >= 5 * sizeof(Word)) {
PROCESS_ONE_WORD();
}
}
}
}
}
for (; src < end; ++src) {
CRC_BYTE(this, crc, *src);
}
return (crc ^ this->Base().Canonize());
}
return this->CrcMultiwordGccAmd64(data, bytes, start);
}
#define TMP0 "%%rsi"
#define TMP0W "%%esi"
#define BUF0 "%%rax"
#define BUF0L "%%al"
#define BUF0H "%%ah"
#define BUF1 "%%rbx"
#define BUF1L "%%bl"
#define BUF1H "%%bh"
#define BUF2 "%%rcx"
#define BUF2L "%%cl"
#define BUF2H "%%ch"
#define BUF3 "%%rdx"
#define BUF3L "%%dl"
#define BUF3H "%%dh"
#define CRC_WORD_ASM() \
"xorq %[crc0], " BUF0 "\n" \
"movzbq " BUF0L ", " TMP0 "\n" \
"movq (%[table_word], " TMP0 ", 8), %[crc0]\n" \
"movzbl " BUF0H ", " TMP0W "\n" \
"shrq $16, " BUF0 "\n" \
"xorq 1*256*8(%[table_word], " TMP0 ", 8), %[crc0]\n" \
"movzbq " BUF0L ", " TMP0 "\n" \
"xorq 2*256*8(%[table_word], " TMP0 ", 8), %[crc0]\n" \
"movzbl " BUF0H ", " TMP0W "\n" \
"shrq $16, " BUF0 "\n" \
"xorq 3*256*8(%[table_word], " TMP0 ", 8), %[crc0]\n" \
"movzbq " BUF0L ", " TMP0 "\n" \
"xorq 4*256*8(%[table_word], " TMP0 ", 8), %[crc0]\n" \
"movzbl " BUF0H ", " TMP0W "\n" \
"shrq $16, " BUF0 "\n" \
"xorq 5*256*8(%[table_word], " TMP0 ", 8), %[crc0]\n" \
"movzbq " BUF0L ", " TMP0 "\n" \
"xorq 6*256*8(%[table_word], " TMP0 ", 8), %[crc0]\n" \
"movzbl " BUF0H ", " TMP0W "\n" \
"xorq 7*256*8(%[table_word], " TMP0 ", 8), %[crc0]\n"
template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordGccAmd64(
const void *data, size_t bytes, const uint64 &start) const {
const uint8 *src = static_cast<const uint8 *>(data);
const uint8 *end = src + bytes;
uint64 crc0 = start ^ this->Base().Canonize();
ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, uint64);
if (src >= end) {
return (crc0 ^ this->Base().Canonize());
}
uint64 crc1;
uint64 crc2;
uint64 crc3;
asm(
"sub $2*4*8 - 1, %[end]\n"
"cmpq %[src], %[end]\n"
"jbe 2f\n"
"xorq %[crc1], %[crc1]\n"
"movq (%[src]), " BUF0 "\n"
"movq 1*8(%[src]), " BUF1 "\n"
"movq 2*8(%[src]), " BUF2 "\n"
"movq 3*8(%[src]), " BUF3 "\n"
"movq %[crc1], %[crc2]\n"
"movq %[crc1], %[crc3]\n"
"1:\n"
#if HAVE_SSE && CRCUTIL_PREFETCH_WIDTH > 0
"prefetcht0 " TO_STRING(CRCUTIL_PREFETCH_WIDTH) "(%[src])\n"
#endif // HAVE_SSE
"add $4*8, %[src]\n"
// Set buffer data.
"xorq %[crc0], " BUF0 "\n"
"xorq %[crc1], " BUF1 "\n"
"xorq %[crc2], " BUF2 "\n"
"xorq %[crc3], " BUF3 "\n"
// LOAD crc of byte 0 and shift buffers.
"movzbl " BUF0L ", " TMP0W "\n"
"movq (%[table], " TMP0 ", 8), %[crc0]\n"
"movzbl " BUF1L ", " TMP0W "\n"
"movq (%[table], " TMP0 ", 8), %[crc1]\n"
"movzbl " BUF2L ", " TMP0W "\n"
"movq (%[table], " TMP0 ", 8), %[crc2]\n"
"movzbl " BUF3L ", " TMP0W "\n"
"movq (%[table], " TMP0 ", 8), %[crc3]\n"
#define XOR1(byte1) \
"movzbl " BUF0L ", " TMP0W "\n" \
"xorq " #byte1 "*256*8(%[table], " TMP0 ", 8), %[crc0]\n" \
"movzbl " BUF1L ", " TMP0W "\n" \
"xorq " #byte1 "*256*8(%[table], " TMP0 ", 8), %[crc1]\n" \
"movzbl " BUF2L ", " TMP0W "\n" \
"xorq " #byte1 "*256*8(%[table], " TMP0 ", 8), %[crc2]\n" \
"movzbl " BUF3L ", " TMP0W "\n" \
"xorq " #byte1 "*256*8(%[table], " TMP0 ", 8), %[crc3]\n"
#define XOR2(byte2) \
"movzbl " BUF0H ", " TMP0W "\n" \
"shrq $16, " BUF0 "\n" \
"xorq " #byte2 "*256*8(%[table], " TMP0 ", 8), %[crc0]\n" \
"movzbl " BUF1H ", " TMP0W "\n" \
"shrq $16, " BUF1 "\n" \
"xorq " #byte2 "*256*8(%[table], " TMP0 ", 8), %[crc1]\n" \
"movzbl " BUF2H ", " TMP0W "\n" \
"shrq $16, " BUF2 "\n" \
"xorq " #byte2 "*256*8(%[table], " TMP0 ", 8), %[crc2]\n" \
"movzbl " BUF3H ", " TMP0W "\n" \
"shrq $16, " BUF3 "\n" \
"xorq " #byte2 "*256*8(%[table], " TMP0 ", 8), %[crc3]\n"
XOR2(1)
XOR1(2)
XOR2(3)
XOR1(4)
XOR2(5)
XOR1(6)
// Update CRC registers and load buffers.
"movzbl " BUF0H ", " TMP0W "\n"
"xorq 7*256*8(%[table], " TMP0 ", 8), %[crc0]\n"
"movq (%[src]), " BUF0 "\n"
"movzbl " BUF1H ", " TMP0W "\n"
"xorq 7*256*8(%[table], " TMP0 ", 8), %[crc1]\n"
"movq 1*8(%[src]), " BUF1 "\n"
"movzbl " BUF2H ", " TMP0W "\n"
"xorq 7*256*8(%[table], " TMP0 ", 8), %[crc2]\n"
"movq 2*8(%[src]), " BUF2 "\n"
"movzbl " BUF3H ", " TMP0W "\n"
"xorq 7*256*8(%[table], " TMP0 ", 8), %[crc3]\n"
"movq 3*8(%[src]), " BUF3 "\n"
"cmpq %[src], %[end]\n"
"ja 1b\n"
CRC_WORD_ASM()
"xorq %[crc1], " BUF1 "\n"
"movq " BUF1 ", " BUF0 "\n"
CRC_WORD_ASM()
"xorq %[crc2], " BUF2 "\n"
"movq " BUF2 ", " BUF0 "\n"
CRC_WORD_ASM()
"xorq %[crc3], " BUF3 "\n"
"movq " BUF3 ", " BUF0 "\n"
CRC_WORD_ASM()
"add $4*8, %[src]\n"
"2:\n"
"add $2*4*8 - 8, %[end]\n"
"cmpq %[src], %[end]\n"
"jbe 4f\n"
"3:\n"
"movq (%[src]), " BUF0 "\n"
"add $8, %[src]\n"
CRC_WORD_ASM()
"cmpq %[src], %[end]\n"
"ja 3b\n"
"4:\n"
"add $7, %[end]\n"
"cmpq %[src], %[end]\n"
"jbe 6f\n"
"5:\n"
"movzbq (%[src]), " BUF0 "\n"
"movzbq %b[crc0], " TMP0 "\n"
"shrq $8, %[crc0]\n"
"xorq " BUF0 ", " TMP0 "\n"
"add $1, %[src]\n"
"xorq 7*256*8(%[table_word], " TMP0 ", 8), %[crc0]\n"
"cmpq %[src], %[end]\n"
"ja 5b\n"
"6:\n"
: // outputs
[src] "+r" (src),
[end] "+r" (end),
[crc0] "+r" (crc0),
[crc1] "=&r" (crc1),
[crc2] "=&r" (crc2),
[crc3] "=&r" (crc3)
: // inputs
[table] "r" (&this->crc_word_interleaved_[0][0]),
[table_word] "r" (&this->crc_word_[0][0])
: // clobbers
"%rax", // BUF0
"%rbx", // BUF1
"%rcx", // BUF2
"%rdx", // BUF3
"%rsi" // TMP0
);
return (crc0 ^ this->Base().Canonize());
}
} // namespace crcutil
#endif  // defined(__GNUC__) && CRCUTIL_USE_ASM && HAVE_AMD64

View File

@@ -0,0 +1,261 @@
// Copyright 2010 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Implements multiword CRC for GCC on i386.
#include "generic_crc.h"
#if defined(__GNUC__) && CRCUTIL_USE_ASM && HAVE_I386 && HAVE_MMX
namespace crcutil {
template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordI386Mmx(
const void *data, size_t bytes, const uint64 &start)
const GCC_OMIT_FRAME_POINTER;
template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiword(
const void *data, size_t bytes, const uint64 &start) const {
if (bytes <= 7) {
const uint8 *src = static_cast<const uint8 *>(data);
uint64 crc = start ^ this->Base().Canonize();
for (const uint8 *end = src + bytes; src < end; ++src) {
CRC_BYTE(this, crc, *src);
}
return (crc ^ this->Base().Canonize());
}
return CrcMultiwordI386Mmx(data, bytes, start);
}
#define CRC_WORD_MMX() \
"pxor %[crc0], %[buf0]\n" \
"movd %[buf0], %[tmp0]\n" \
"psrlq $32, %[buf0]\n" \
"movzbl %b[tmp0], %[temp]\n" \
"shrl $8, %[tmp0]\n" \
"movq (%[table], %[temp], 8), %[crc0]\n" \
"movzbl %b[tmp0], %[temp]\n" \
"shrl $8, %[tmp0]\n" \
"pxor 1*256*8(%[table], %[temp], 8), %[crc0]\n" \
"movzbl %b[tmp0], %[temp]\n" \
"shrl $8, %[tmp0]\n" \
"pxor 2*256*8(%[table], %[temp], 8), %[crc0]\n" \
"pxor 3*256*8(%[table], %[tmp0], 8), %[crc0]\n" \
"movd %[buf0], %[tmp0]\n" \
"movzbl %b[tmp0], %[temp]\n" \
"shrl $8, %[tmp0]\n" \
"pxor 4*256*8(%[table], %[temp], 8), %[crc0]\n" \
"movzbl %b[tmp0], %[temp]\n" \
"shrl $8, %[tmp0]\n" \
"pxor 5*256*8(%[table], %[temp], 8), %[crc0]\n" \
"movzbl %b[tmp0], %[temp]\n" \
"shrl $8, %[tmp0]\n" \
"pxor 6*256*8(%[table], %[temp], 8), %[crc0]\n" \
"pxor 7*256*8(%[table], %[tmp0], 8), %[crc0]\n"
template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordI386Mmx(
const void *data, size_t bytes, const uint64 &start) const {
const uint8 *src = static_cast<const uint8 *>(data);
const uint8 *end = src + bytes;
uint64 crc0 = start ^ this->Base().Canonize();
ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, uint64);
if (src >= end) {
return (crc0 ^ this->Base().Canonize());
}
uint64 crc1;
uint64 crc2;
uint64 crc3;
uint64 buf0;
uint64 buf1;
uint64 buf2;
uint64 buf3;
uint32 tmp0;
uint32 tmp1;
uint32 tmp2;
uint32 tmp3;
uint32 temp;
void *table_ptr;
const uint64 *table_interleaved = &this->crc_word_interleaved_[0][0];
const uint64 *table_word = &this->crc_word_[0][0];
asm(
"sub $2*4*8 - 1, %[end]\n"
"cmpl %[src], %[end]\n"
"jbe 2f\n"
"pxor %[crc1], %[crc1]\n"
"pxor %[crc2], %[crc2]\n"
"pxor %[crc3], %[crc3]\n"
"movq (%[src]), %[buf0]\n"
"movq 1*8(%[src]), %[buf1]\n"
"movq 2*8(%[src]), %[buf2]\n"
"movq 3*8(%[src]), %[buf3]\n"
"movl %[table_interleaved], %[table]\n"
"1:\n"
#if HAVE_SSE && CRCUTIL_PREFETCH_WIDTH > 0
"prefetcht0 " TO_STRING(CRCUTIL_PREFETCH_WIDTH) "(%[src])\n"
#endif
"addl $0x20, %[src]\n"
"pxor %[crc0], %[buf0]\n"
"pxor %[crc1], %[buf1]\n"
"pxor %[crc2], %[buf2]\n"
"pxor %[crc3], %[buf3]\n"
"movd %[buf0], %[tmp0]\n"
"psrlq $32, %[buf0]\n"
"movd %[buf1], %[tmp1]\n"
"psrlq $32, %[buf1]\n"
"movd %[buf2], %[tmp2]\n"
"psrlq $32, %[buf2]\n"
"movd %[buf3], %[tmp3]\n"
"psrlq $32, %[buf3]\n"
"movzbl %b[tmp0], %[temp]\n"
"shrl $8, %[tmp0]\n"
"movq (%[table], %[temp], 8), %[crc0]\n"
"movzbl %b[tmp1], %[temp]\n"
"shrl $8, %[tmp1]\n"
"movq (%[table], %[temp], 8), %[crc1]\n"
"movzbl %b[tmp2], %[temp]\n"
"shrl $8, %[tmp2]\n"
"movq (%[table], %[temp], 8), %[crc2]\n"
"movzbl %b[tmp3], %[temp]\n"
"shrl $8, %[tmp3]\n"
"movq (%[table], %[temp], 8), %[crc3]\n"
#define XOR(byte) \
"movzbl %b[tmp0], %[temp]\n" \
"shrl $8, %[tmp0]\n" \
"pxor " #byte "*256*8(%[table], %[temp], 8), %[crc0]\n" \
"movzbl %b[tmp1], %[temp]\n" \
"shrl $8, %[tmp1]\n" \
"pxor " #byte "*256*8(%[table], %[temp], 8), %[crc1]\n" \
"movzbl %b[tmp2], %[temp]\n" \
"shrl $8, %[tmp2]\n" \
"pxor " #byte "*256*8(%[table], %[temp], 8), %[crc2]\n" \
"movzbl %b[tmp3], %[temp]\n" \
"shrl $8, %[tmp3]\n" \
"pxor " #byte "*256*8(%[table], %[temp], 8), %[crc3]\n"
XOR(1)
XOR(2)
"pxor 3*256*8(%[table], %[tmp0], 8), %[crc0]\n"
"movd %[buf0], %[tmp0]\n"
"pxor 3*256*8(%[table], %[tmp1], 8), %[crc1]\n"
"movd %[buf1], %[tmp1]\n"
"pxor 3*256*8(%[table], %[tmp2], 8), %[crc2]\n"
"movd %[buf2], %[tmp2]\n"
"pxor 3*256*8(%[table], %[tmp3], 8), %[crc3]\n"
"movd %[buf3], %[tmp3]\n"
XOR(4)
XOR(5)
XOR(6)
"pxor 7*256*8(%[table], %[tmp0], 8), %[crc0]\n"
"movq (%[src]), %[buf0]\n"
"pxor 7*256*8(%[table], %[tmp1], 8), %[crc1]\n"
"movq 1*8(%[src]), %[buf1]\n"
"pxor 7*256*8(%[table], %[tmp2], 8), %[crc2]\n"
"movq 2*8(%[src]), %[buf2]\n"
"pxor 7*256*8(%[table], %[tmp3], 8), %[crc3]\n"
"movq 3*8(%[src]), %[buf3]\n"
"cmpl %[src], %[end]\n"
"ja 1b\n"
#undef XOR
"movl %[table_word], %[table]\n"
CRC_WORD_MMX()
"pxor %[crc1], %[buf1]\n"
"movq %[buf1], %[buf0]\n"
CRC_WORD_MMX()
"pxor %[crc2], %[buf2]\n"
"movq %[buf2], %[buf0]\n"
CRC_WORD_MMX()
"pxor %[crc3], %[buf3]\n"
"movq %[buf3], %[buf0]\n"
CRC_WORD_MMX()
"add $4*8, %[src]\n"
"2:\n"
"movl %[table_word], %[table]\n"
"add $2*4*8 - 8, %[end]\n"
"cmpl %[src], %[end]\n"
"jbe 4f\n"
"3:\n"
"movq (%[src]), %[buf0]\n"
"addl $0x8, %[src]\n"
CRC_WORD_MMX()
"cmpl %[src], %[end]\n"
"ja 3b\n"
"4:\n"
"add $7, %[end]\n"
"cmpl %[src], %[end]\n"
"jbe 6f\n"
"5:\n"
"movd %[crc0], %[tmp0]\n"
"movzbl (%[src]), %[temp]\n"
"movzbl %b[tmp0], %[tmp0]\n"
"psrlq $8, %[crc0]\n"
"xorl %[tmp0], %[temp]\n"
"add $1, %[src]\n"
"pxor 7*256*8(%[table], %[temp], 8), %[crc0]\n"
"cmpl %[src], %[end]\n"
"ja 5b\n"
"6:\n"
: // outputs
[src] "+r" (src),
[end] "+m" (end),
[crc0] "+y" (crc0),
[crc1] "=&y" (crc1),
[crc2] "=&y" (crc2),
[crc3] "=&y" (crc3),
[buf0] "=&y" (buf0),
[buf1] "=&y" (buf1),
[buf2] "=&y" (buf2),
[buf3] "=&y" (buf3),
[tmp0] "=&q" (tmp0),
[tmp1] "=&q" (tmp1),
[tmp2] "=&q" (tmp2),
[tmp3] "=&q" (tmp3),
[temp] "=&r" (temp),
[table] "=&r" (table_ptr)
: // inputs
[table_interleaved] "m" (table_interleaved),
[table_word] "m" (table_word));
asm volatile("emms");
return (crc0 ^ this->Base().Canonize());
}
} // namespace crcutil
#endif  // defined(__GNUC__) && CRCUTIL_USE_ASM && HAVE_I386 && HAVE_MMX

View File

@@ -0,0 +1,243 @@
// Copyright 2010 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Implements 64-bit multiword CRC using MMX built-in functions.
#include "generic_crc.h"
#if CRCUTIL_USE_ASM && HAVE_I386 && HAVE_MMX
namespace crcutil {
template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordI386Mmx(
const void *data, size_t bytes, const uint64 &start)
const GCC_OMIT_FRAME_POINTER;
#if !defined(_MSC_VER)
template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiword(
const void *data,
size_t bytes,
const uint64 &start) const {
if (bytes <= 7) {
const uint8 *src = static_cast<const uint8 *>(data);
uint64 crc = start ^ Base().Canonize();
for (const uint8 *end = src + bytes; src < end; ++src) {
CRC_BYTE(this, crc, *src);
}
return (crc ^ Base().Canonize());
}
return CrcMultiwordI386Mmx(data, bytes, start);
}
#else
#pragma warning(push)
// CL: uninitialized local variable 'crc1' used
// Wrong: crc1 = XOR(crc1, crc1) sets it to 0.
#pragma warning(disable: 4700)
#pragma warning(disable: 4619) // there is no warning number '592'
// ICL: variable "crc1" is used before its value is set
// Wrong: crc1 = XOR(crc1, crc1) sets it to 0.
#pragma warning(disable: 592)
#endif // !defined(_MSC_VER)
#define MM64(adr) reinterpret_cast<const __m64 *>(adr)
#define MM64_TABLE(byte) MM64(crc_word_interleaved_[byte])
#define CRC_WORD_MMX(this, crc, buf) do { \
buf = _mm_xor_si64(buf, crc); \
uint32 tmp = static_cast<uint32>(_mm_cvtsi64_si32(buf)); \
buf = _mm_srli_si64(buf, 32); \
crc = MM64(crc_word_[0])[TO_BYTE(tmp)]; \
tmp >>= 8; \
crc = _mm_xor_si64(crc, MM64(crc_word_[1])[TO_BYTE(tmp)]); \
tmp >>= 8; \
crc = _mm_xor_si64(crc, MM64(crc_word_[2])[TO_BYTE(tmp)]); \
tmp >>= 8; \
crc = _mm_xor_si64(crc, MM64(crc_word_[3])[tmp]); \
tmp = static_cast<uint32>(_mm_cvtsi64_si32(buf)); \
crc = _mm_xor_si64(crc, MM64(crc_word_[4])[TO_BYTE(tmp)]); \
tmp >>= 8; \
crc = _mm_xor_si64(crc, MM64(crc_word_[5])[TO_BYTE(tmp)]); \
tmp >>= 8; \
crc = _mm_xor_si64(crc, MM64(crc_word_[6])[TO_BYTE(tmp)]); \
tmp >>= 8; \
crc = _mm_xor_si64(crc, MM64(crc_word_[7])[tmp]); \
} while (0)
template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordI386Mmx(
const void *data, size_t bytes, const uint64 &start) const {
const uint8 *src = static_cast<const uint8 *>(data);
const uint8 *end = src + bytes;
uint64 crc = start ^ Base().Canonize();
ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc, uint64);
if (src >= end) {
return (crc ^ Base().Canonize());
}
// Process 4 registers of sizeof(uint64) bytes at once.
bytes = static_cast<size_t>(end - src) & ~(4*8 - 1);
if (bytes > 4*8) {
const uint8 *stop = src + bytes - 4*8;
union {
__m64 m64;
uint64 u64;
} temp;
__m64 crc0;
__m64 crc1;
__m64 crc2;
__m64 crc3;
__m64 buf0 = MM64(src)[0];
__m64 buf1 = MM64(src)[1];
__m64 buf2 = MM64(src)[2];
__m64 buf3 = MM64(src)[3];
temp.u64 = crc;
crc0 = temp.m64;
#if defined(__GNUC__) && !GCC_VERSION_AVAILABLE(4, 4)
// There is no way to suppress a warning in GCC;
// generate extra assignments.
temp.u64 = 0;
crc1 = temp.m64;
crc2 = temp.m64;
crc3 = temp.m64;
#else
crc1 = _mm_xor_si64(crc1, crc1);
crc2 = _mm_xor_si64(crc2, crc2);
crc3 = _mm_xor_si64(crc3, crc3);
#endif // defined(__GNUC__) && !GCC_VERSION_AVAILABLE(4, 4)
do {
PREFETCH(src);
src += 4*8;
buf0 = _mm_xor_si64(buf0, crc0);
buf1 = _mm_xor_si64(buf1, crc1);
buf2 = _mm_xor_si64(buf2, crc2);
buf3 = _mm_xor_si64(buf3, crc3);
uint32 tmp0 = static_cast<uint32>(_mm_cvtsi64_si32(buf0));
uint32 tmp1 = static_cast<uint32>(_mm_cvtsi64_si32(buf1));
uint32 tmp2 = static_cast<uint32>(_mm_cvtsi64_si32(buf2));
uint32 tmp3 = static_cast<uint32>(_mm_cvtsi64_si32(buf3));
buf0 = _mm_srli_si64(buf0, 32);
buf1 = _mm_srli_si64(buf1, 32);
buf2 = _mm_srli_si64(buf2, 32);
buf3 = _mm_srli_si64(buf3, 32);
crc0 = MM64_TABLE(0)[TO_BYTE(tmp0)];
tmp0 >>= 8;
crc1 = MM64_TABLE(0)[TO_BYTE(tmp1)];
tmp1 >>= 8;
crc2 = MM64_TABLE(0)[TO_BYTE(tmp2)];
tmp2 >>= 8;
crc3 = MM64_TABLE(0)[TO_BYTE(tmp3)];
tmp3 >>= 8;
#define XOR(byte) do { \
crc0 = _mm_xor_si64(crc0, MM64_TABLE(byte)[TO_BYTE(tmp0)]); \
tmp0 >>= 8; \
crc1 = _mm_xor_si64(crc1, MM64_TABLE(byte)[TO_BYTE(tmp1)]); \
tmp1 >>= 8; \
crc2 = _mm_xor_si64(crc2, MM64_TABLE(byte)[TO_BYTE(tmp2)]); \
tmp2 >>= 8; \
crc3 = _mm_xor_si64(crc3, MM64_TABLE(byte)[TO_BYTE(tmp3)]); \
tmp3 >>= 8; \
} while (0)
XOR(1);
XOR(2);
crc0 = _mm_xor_si64(crc0, MM64_TABLE(3)[tmp0]);
tmp0 = static_cast<uint32>(_mm_cvtsi64_si32(buf0));
crc1 = _mm_xor_si64(crc1, MM64_TABLE(3)[tmp1]);
tmp1 = static_cast<uint32>(_mm_cvtsi64_si32(buf1));
crc2 = _mm_xor_si64(crc2, MM64_TABLE(3)[tmp2]);
tmp2 = static_cast<uint32>(_mm_cvtsi64_si32(buf2));
crc3 = _mm_xor_si64(crc3, MM64_TABLE(3)[tmp3]);
tmp3 = static_cast<uint32>(_mm_cvtsi64_si32(buf3));
XOR(4);
XOR(5);
XOR(6);
#undef XOR
crc0 = _mm_xor_si64(crc0, MM64_TABLE(sizeof(uint64) - 1)[tmp0]);
buf0 = MM64(src)[0];
crc1 = _mm_xor_si64(crc1, MM64_TABLE(sizeof(uint64) - 1)[tmp1]);
buf1 = MM64(src)[1];
crc2 = _mm_xor_si64(crc2, MM64_TABLE(sizeof(uint64) - 1)[tmp2]);
buf2 = MM64(src)[2];
crc3 = _mm_xor_si64(crc3, MM64_TABLE(sizeof(uint64) - 1)[tmp3]);
buf3 = MM64(src)[3];
}
while (src < stop);
CRC_WORD_MMX(this, crc0, buf0);
buf1 = _mm_xor_si64(buf1, crc1);
CRC_WORD_MMX(this, crc0, buf1);
buf2 = _mm_xor_si64(buf2, crc2);
CRC_WORD_MMX(this, crc0, buf2);
buf3 = _mm_xor_si64(buf3, crc3);
CRC_WORD_MMX(this, crc0, buf3);
temp.m64 = crc0;
crc = temp.u64;
_mm_empty();
src += 4*8;
}
// Process sizeof(uint64) bytes at once.
bytes = static_cast<size_t>(end - src) & ~(sizeof(uint64) - 1);
if (bytes > 0) {
union {
__m64 m64;
uint64 u64;
} temp;
__m64 crc0;
temp.u64 = crc;
crc0 = temp.m64;
for (const uint8 *stop = src + bytes; src < stop; src += sizeof(uint64)) {
__m64 buf0 = MM64(src)[0];
CRC_WORD_MMX(this, crc0, buf0);
}
temp.m64 = crc0;
crc = temp.u64;
_mm_empty();
}
// Compute CRC of remaining bytes.
for (; src < end; ++src) {
CRC_BYTE(this, crc, *src);
}
return (crc ^ Base().Canonize());
}
#if defined(_MSC_VER)
#pragma warning(pop)
#endif // defined(_MSC_VER)
} // namespace crcutil
#endif // CRCUTIL_USE_ASM && HAVE_I386 && HAVE_MMX

crcutil-1.0/code/platform.h Normal file
View File

@@ -0,0 +1,245 @@
// Copyright 2010 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Detects configuration and defines compiler-specific macros.
// Also, sets user-defined CRCUTIL_USE_* macros to default values.
#ifndef CRCUTIL_PLATFORM_H_
#define CRCUTIL_PLATFORM_H_
// Permanently disable some annoying warnings generated
// by Microsoft CL when compiling Microsoft's headers.
#include "std_headers.h"
// Use inline asm version of the code?
#if !defined(CRCUTIL_USE_ASM)
#define CRCUTIL_USE_ASM 1
#endif // !defined(CRCUTIL_USE_ASM)
#if !defined(HAVE_I386)
#if defined(__i386__) || defined(_M_IX86)
#define HAVE_I386 1
#else
#define HAVE_I386 0
#endif // defined(__i386__) || defined(_M_IX86)
#endif // defined(HAVE_I386)
#if !defined(HAVE_AMD64)
#if defined(__amd64__) || defined(_M_AMD64)
#define HAVE_AMD64 1
#else
#define HAVE_AMD64 0
#endif // defined(__amd64__) || defined(_M_AMD64)
#endif // defined(HAVE_AMD64)
#if HAVE_AMD64 || HAVE_I386
#if defined(_MSC_VER)
#pragma warning(push)
// '_M_IX86' is not defined as a preprocessor macro
#pragma warning(disable: 4668)
#include <intrin.h>
#pragma warning(pop)
#endif // defined(_MSC_VER)
#if !defined(HAVE_MMX)
#if defined(_MSC_VER) || (defined(__GNUC__) && defined(__MMX__))
#define HAVE_MMX 1
#else
#define HAVE_MMX 0
#endif // defined(_MSC_VER) || (defined(__GNUC__) && defined(__MMX__))
#endif // !defined(HAVE_MMX)
#if !defined(HAVE_SSE)
#if defined(_MSC_VER) || (defined(__GNUC__) && defined(__SSE__))
#include <xmmintrin.h>
#define HAVE_SSE 1
#else
#define HAVE_SSE 0
#endif // defined(_MSC_VER) || (defined(__GNUC__) && defined(__SSE__))
#endif // !defined(HAVE_SSE)
#if !defined(HAVE_SSE2)
#if defined(_MSC_VER) || (defined(__GNUC__) && defined(__SSE2__))
#include <emmintrin.h>
#define HAVE_SSE2 1
#else
#define HAVE_SSE2 0
#endif // defined(_MSC_VER) || (defined(__GNUC__) && defined(__SSE2__))
#endif // !defined(HAVE_SSE2)
#else
#if !defined(HAVE_MMX)
#define HAVE_MMX 0
#endif // !defined(HAVE_MMX)
#if !defined(HAVE_SSE)
#define HAVE_SSE 0
#endif // !defined(HAVE_SSE)
#if !defined(HAVE_SSE2)
#define HAVE_SSE2 0
#endif // !defined(HAVE_SSE2)
#endif // HAVE_AMD64 || HAVE_I386
// Error checking
#if HAVE_SSE && !HAVE_MMX
#error SSE is available but not MMX?
#endif // HAVE_SSE && !HAVE_MMX
#if HAVE_SSE2 && (!HAVE_SSE || !HAVE_MMX)
#error SSE2 is available but not SSE or MMX?
#endif // HAVE_SSE2 && (!HAVE_SSE || !HAVE_MMX)
#if !defined(CRCUTIL_PREFETCH_WIDTH)
// On a newer X5550 CPU, the heavily optimized CrcMultiword is 3% faster
// without prefetch for inputs smaller than 8MB and less than 1% slower for
// 8MB and larger blocks. On an older Q9650 CPU, the code is 2-3% faster for
// inputs smaller than 8MB and 4-5% slower when length >= 8MB.
// Tested with prefetch length 256, 512, and 4096.
//
// At this moment there is no compelling reason to use prefetching.
//
#define CRCUTIL_PREFETCH_WIDTH 0
#endif // !defined(CRCUTIL_PREFETCH_WIDTH)
#if HAVE_SSE && CRCUTIL_PREFETCH_WIDTH > 0
#define PREFETCH(src) \
_mm_prefetch(reinterpret_cast<const char *>(src) + CRCUTIL_PREFETCH_WIDTH, \
_MM_HINT_T0)
#else
#define PREFETCH(src)
#endif // HAVE_SSE && CRCUTIL_PREFETCH_WIDTH > 0
// If block size exceeds CRCUTIL_MIN_ALIGN_SIZE, align the data
// before accessing it at a word boundary. See generic_crc.cc,
// ALIGN_ON_WORD_BOUNDARY_IF_NEEDED() macro.
#if !defined(CRCUTIL_MIN_ALIGN_SIZE)
#if HAVE_AMD64 || HAVE_I386
#define CRCUTIL_MIN_ALIGN_SIZE (1024)
#else
#define CRCUTIL_MIN_ALIGN_SIZE 0
#endif // HAVE_AMD64 || HAVE_I386
#endif // !defined(CRCUTIL_MIN_ALIGN_SIZE)
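// For example, with the default threshold of 1024 on x86, a long unaligned
// input is presumably advanced byte-by-byte to the next word boundary
// before the word-at-a-time loops run; for short inputs such a prologue
// would cost more than it saves, hence the size threshold.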
// Use _mm_crc32_u64/32/8 intrinsics?
// If not, they will be implemented in software.
#if !HAVE_I386 && !HAVE_AMD64
#undef CRCUTIL_USE_MM_CRC32
#define CRCUTIL_USE_MM_CRC32 0
#else
#if !defined(CRCUTIL_USE_MM_CRC32)
#if defined(_MSC_VER) || defined(__GNUC__)
#define CRCUTIL_USE_MM_CRC32 1
#else
#define CRCUTIL_USE_MM_CRC32 0
#endif // defined(_MSC_VER) || defined(__GNUC__)
#endif // !defined(CRCUTIL_USE_MM_CRC32)
#endif // !HAVE_I386 && !HAVE_AMD64
// Stringize -- always handy.
#define TO_STRING_VALUE(arg) #arg
#define TO_STRING(arg) TO_STRING_VALUE(arg)
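// For example, TO_STRING(CRCUTIL_PREFETCH_WIDTH) expands to the string
// literal "0" under the default above; the two-level definition ensures
// that macro arguments are expanded before being stringized.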
// Compilers give a "right shift count >= width of type" warning even
// though the shift happens only under an appropriate "if".
#define SHIFT_RIGHT_NO_WARNING(value, bits) \
((value) >> (((bits) < (8 * sizeof(value))) ? (bits) : 0))
#define SHIFT_RIGHT_SAFE(value, bits) \
((bits) < (8 * sizeof(value)) ? SHIFT_RIGHT_NO_WARNING(value, bits) : 0)
// The same for left shifts.
#define SHIFT_LEFT_NO_WARNING(value, bits) \
((value) << (((bits) < (8 * sizeof(value))) ? (bits) : 0))
#define SHIFT_LEFT_SAFE(value, bits) \
((bits) < (8 * sizeof(value)) ? SHIFT_LEFT_NO_WARNING(value, bits) : 0)
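// For example, SHIFT_RIGHT_SAFE(value, 40) on a 32-bit "value" evaluates
// to 0, whereas a plain (value >> 40) would be undefined behavior and
// would draw the warning above even in a branch that is never taken.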
// GCC-specific macros.
//
#define GCC_VERSION_AVAILABLE(major, minor) \
(defined(__GNUC__) && \
(__GNUC__ > (major) || \
(__GNUC__ == (major) && __GNUC_MINOR__ >= (minor))))
#if defined(__GNUC__)
// The GenericCrc tables must be properly aligned.
// Penalty for misalignment? 50% performance degradation.
// For 128-bit SSE2, the penalty is access violation.
#define GCC_ALIGN_ATTRIBUTE(n) __attribute__((aligned(n)))
#if GCC_VERSION_AVAILABLE(4, 4)
// If not marked as "omit frame pointer",
// GCC won't be able to find enough registers.
#define GCC_OMIT_FRAME_POINTER \
__attribute__((__optimize__(2, "omit-frame-pointer")))
#endif // GCC_VERSION_AVAILABLE(4, 4)
#if !defined(__forceinline)
#define __forceinline __attribute__((__always_inline__)) inline
#endif // !defined(__forceinline)
#if defined(__APPLE_CC__)
// The version of GCC used by Mac OS X Xcode v 5664 does not understand
// the "movq xmm, r64" instruction and requires the use of "movd" (probably
// because of a bug in GCC which treats "movq/movd xmm,r64 or r64,xmm"
// the same).
//
// Leaving common sense aside, let's peek into Intel's instruction
// reference manual. This is what the description of the MOVD/MOVQ
// instructions says:
// MOVD xmm, r/m32 (opcode 66 0F 6E /r)
// MOVD r/m32, xmm (opcode 66 0F 7E /r)
// MOVQ xmm, r/m64 (opcode 66 REX.W 0F 6E /r)
// MOVQ r/m64, xmm (opcode 66 REX.W 0F 7E /r)
#define SSE2_MOVQ "movd"
#else
#define SSE2_MOVQ "movq"
#endif // defined(__APPLE_CC__)
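// SSE2_MOVQ is spliced into asm templates via string concatenation, e.g.
//   SSE2_MOVQ " %[crc0], %[tmp0]\n"
// becomes "movq %[crc0], %[tmp0]\n" (or "movd ..." under Apple's GCC).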
#endif // defined(__GNUC__)
// Define compiler-specific macros that were not set yet.
#if !defined(_MSC_VER) && !defined(__forceinline)
#define __forceinline inline
#endif // !defined(_MSC_VER) && !defined(__forceinline)
#if !defined(GCC_OMIT_FRAME_POINTER)
#define GCC_OMIT_FRAME_POINTER
#endif // !defined(GCC_OMIT_FRAME_POINTER)
#if !defined(GCC_ALIGN_ATTRIBUTE)
#define GCC_ALIGN_ATTRIBUTE(n)
#endif // !defined(GCC_ALIGN_ATTRIBUTE)
#endif // CRCUTIL_PLATFORM_H_

View File

@@ -0,0 +1,61 @@
// Copyright 2010 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Protects the CRC tables with their own CRC.
// CRC tables can get corrupted too, and if the corruption is
// not caught, data poisoning becomes a reality.
#ifndef CRCUTIL_PROTECTED_CRC_H_
#define CRCUTIL_PROTECTED_CRC_H_
namespace crcutil {
#pragma pack(push, 16)
// Class CrcImplementation should not have virtual functions:
// the vptr is stored as the very first field and its value is assigned
// at runtime, so it is impossible to compute CRC(*this) once and
// guarantee that the value will not change from run to run.
//
template<typename CrcImplementation> class ProtectedCrc
: public CrcImplementation {
public:
typedef typename CrcImplementation::Crc Crc;
// Returns a check value that the caller should compare
// against a pre-computed, trusted constant.
//
// Computing SelfCheckValue() after CRC initialization,
// storing it in memory, and periodically checking against the
// stored value may not work: if the CRC tables were initialized
// incorrectly and/or had been corrupted during initialization,
// SelfCheckValue() will return garbage. Garbage in, garbage out.
// Consecutive checks will not detect the problem, and the application
// will happily produce and save data with corrupt CRCs.
//
// The application should call SelfCheckValue() regularly:
// 1. First and foremost, on every CRC mismatch.
// 2. After CRC'ing the data but before sending it out or writing it.
// 3. Worst case, every Nth CRC'ed byte or every Nth call to CRC.
//
Crc SelfCheckValue() const {
return CrcDefault(this, sizeof(*this), 0);
}
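// Usage sketch ("kTrustedCheckValue" is an assumption -- a constant
// computed once on a known-good build and shipped with the application):
//
//   if (protected_crc.SelfCheckValue() != kTrustedCheckValue) {
//     abort();  // Tables are corrupt; no CRC result can be trusted.
//   }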
} GCC_ALIGN_ATTRIBUTE(16);
#pragma pack(pop)
} // namespace crcutil
#endif // CRCUTIL_PROTECTED_CRC_H_

View File

@@ -0,0 +1,106 @@
// Copyright 2010 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Implements rolling CRC (e.g. for Rabin fingerprinting).
#ifndef CRCUTIL_ROLLING_CRC_H_
#define CRCUTIL_ROLLING_CRC_H_
#include "base_types.h" // size_t, uint8
#include "crc_casts.h" // TO_BYTE
namespace crcutil {
#pragma pack(push, 16)
// CrcImplementation should provide:
// - typename Crc
// - typename TableEntry
// - typename Word
// - Crc CrcDefault(const void *data, size_t bytes, const Crc &start)
// - const GfUtil<Crc> &Base() const
template<typename CrcImplementation> class RollingCrc {
public:
typedef typename CrcImplementation::Crc Crc;
typedef typename CrcImplementation::TableEntry TableEntry;
typedef typename CrcImplementation::Word Word;
RollingCrc() {}
// Initializes internal data structures.
// Retains reference to "crc" instance -- it is used by Start().
RollingCrc(const CrcImplementation &crc,
size_t roll_window_bytes,
const Crc &start_value) {
Init(crc, roll_window_bytes, start_value);
}
// Computes the CRC of the first "roll_window_bytes" bytes of "data"
// using the "start_value" of "crc" (see Init()).
Crc Start(const void *data) const {
return crc_->CrcDefault(data, roll_window_bytes_, start_value_);
}
// Computes CRC of "roll_window_bytes" starting in next position.
Crc Roll(const Crc &old_crc, size_t byte_out, size_t byte_in) const {
return (old_crc >> 8) ^ in_[TO_BYTE(old_crc) ^ byte_in] ^ out_[byte_out];
}
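// Usage sketch (names are illustrative): CRC of every window of
// WindowBytes() bytes over a buffer "buf" of "n" bytes:
//
//   Crc c = roller.Start(buf);
//   for (size_t i = 0; i + roller.WindowBytes() < n; ++i) {
//     // Here "c" is the CRC of buf[i] .. buf[i + WindowBytes() - 1].
//     c = roller.Roll(c, buf[i], buf[i + roller.WindowBytes()]);
//   }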
// Initializes internal data structures.
// Retains reference to "crc" instance -- it is used by Start().
void Init(const CrcImplementation &crc,
size_t roll_window_bytes,
const Crc &start_value) {
crc_ = &crc;
roll_window_bytes_ = roll_window_bytes;
start_value_ = start_value;
Crc add = crc.Base().Canonize() ^ start_value;
add = crc.Base().Multiply(add, crc.Base().Xpow8N(roll_window_bytes));
add ^= crc.Base().Canonize();
Crc mul = crc.Base().One() ^ crc.Base().Xpow8N(1);
add = crc.Base().Multiply(add, mul);
mul = crc.Base().XpowN(8 * roll_window_bytes + crc.Base().Degree());
for (size_t i = 0; i < 256; ++i) {
out_[i] = static_cast<TableEntry>(
crc.Base().MultiplyUnnormalized(
static_cast<Crc>(i), 8, mul) ^ add);
}
for (size_t i = 0; i < 256; ++i) {
in_[i] = crc.crc_word_[sizeof(Word) - 1][i];
}
}
// Returns start value.
Crc StartValue() const { return start_value_; }
// Returns length of roll window.
size_t WindowBytes() const { return roll_window_bytes_; }
protected:
TableEntry in_[256];
TableEntry out_[256];
// Used only by Start().
Crc start_value_;
const CrcImplementation *crc_;
size_t roll_window_bytes_;
} GCC_ALIGN_ATTRIBUTE(16);
#pragma pack(pop)
} // namespace crcutil
#endif // CRCUTIL_ROLLING_CRC_H_

View File

@@ -0,0 +1,51 @@
// Copyright 2010 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Includes some standard C headers for size_t, memset, etc.
//
// Also, permanently disables a number of warnings produced
// by Microsoft's compiler when it includes standard headers
// (surprisingly, also by Microsoft).
#ifndef CRCUTIL_STD_HEADERS_H_
#define CRCUTIL_STD_HEADERS_H_
#if defined(_MSC_VER)
// '4' bytes padding added after data member ...
#pragma warning(disable:4820)
// unreferenced inline function has been removed ...
#pragma warning(disable:4514)
// conditional expression is constant
#pragma warning(disable: 4127)
// function ... not inlined
#pragma warning(disable: 4710)
// function ... selected for automatic inline expansion
#pragma warning(disable: 4711)
#define _CRT_SECURE_NO_WARNINGS
#endif // defined(_MSC_VER)
// #define _CSTDLIB_
#include <stdio.h> // always handy
#include <string.h> // memset
#include <stdlib.h> // size_t, _rotl/_rotl64(MSC)
#include <stddef.h> // ptrdiff_t (GNUC)
#include <stdarg.h> // va_list
#endif // CRCUTIL_STD_HEADERS_H_

View File

@@ -0,0 +1,310 @@
// Copyright 2010 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Implements a limited set of 128-bit arithmetic operations
// (the ones that are used by CRC) using SSE2 intrinsics.
#ifndef CRCUTIL_UINT128_SSE2_H_
#define CRCUTIL_UINT128_SSE2_H_
#include "base_types.h"
#include "crc_casts.h" // Downcast, CrcFromUint64, Uint64FromCrc
#include "platform.h"
#if HAVE_SSE2
namespace crcutil {
// Specialized functions handling __m128i.
template<> __forceinline uint64 Downcast(const __m128i &value) {
#if HAVE_AMD64 && defined(__GNUC__)
// GCC 4.4.x is too smart and, instead of MOVQ, generates the SSE4 PEXTRQ
// instruction when the code is compiled with -msse4.
// Fixed in 4.5, which generates a conversion through memory (why?).
// And -- yes, it makes a quite measurable difference.
uint64 temp;
asm(SSE2_MOVQ " %[i128], %[u64]\n" : [u64] "=r" (temp) : [i128] "x" (value));
return temp;
#elif HAVE_AMD64 && (!defined(_MSC_FULL_VER) || _MSC_FULL_VER > 150030729)
return static_cast<uint64>(_mm_cvtsi128_si64(value));
#else
// 64-bit CL 15.00.30729.1 -O2 generates incorrect code (tests fail).
// _mm_cvtsi128_si64() is not available on i386.
uint64 temp;
_mm_storel_epi64(reinterpret_cast<__m128i *>(&temp), value);
return temp;
#endif
}
class uint128_sse2 {
public:
uint128_sse2() {}
~uint128_sse2() {}
// Default casts to uint128_sse2 and assignment operator.
__forceinline void operator =(uint64 value) {
#if HAVE_AMD64 && defined(__GNUC__) && !GCC_VERSION_AVAILABLE(4, 5)
// Prevent generation of the SSE4 PINSRQ instruction when
// compiling with GCC 4.4.x with the -msse4 flag.
asm(SSE2_MOVQ " %[u64], %[i128]\n" : [i128] "=x" (x_) : [u64] "r" (value));
#elif HAVE_AMD64
x_ = _mm_cvtsi64_si128(static_cast<int64>(value));
#else
x_ = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&value));
#endif
}
__forceinline uint128_sse2(uint64 x) {
*this = x;
}
__forceinline uint128_sse2(const __m128i x) : x_(x) {
}
__forceinline operator __m128i() const {
return x_;
}
__forceinline void operator =(const uint128_sse2 &x) {
x_ = x.x_;
}
// Extracts 64 less significant bits.
__forceinline uint64 to_uint64() const {
return Downcast<__m128i, uint64>(x_);
}
// Comparisons.
__forceinline bool operator ==(const uint128_sse2 &y) const {
union {
__m128i i128;
uint64 u64[2];
} t;
t.i128 = _mm_xor_si128(x_, y.x_);
return (t.u64[0] | t.u64[1]) == 0;
}
__forceinline bool operator ==(uint64 value) const {
union {
__m128i i128;
uint64 u64[2];
} t;
t.i128 = x_;
return (t.u64[0] == value && t.u64[1] == 0);
}
__forceinline bool operator !=(const uint128_sse2 &y) const {
union {
__m128i i128;
uint64 u64[2];
} t;
t.i128 = _mm_xor_si128(x_, y.x_);
return (t.u64[0] | t.u64[1]) != 0;
}
__forceinline bool operator !=(uint64 value) const {
union {
__m128i i128;
uint64 u64[2];
} t;
t.i128 = x_;
return (t.u64[0] != value || t.u64[1] != 0);
}
__forceinline bool operator <(const uint128_sse2 &y) const {
union {
__m128i i128;
uint64 u64[2];
} xx, yy;
xx.i128 = x_;
yy.i128 = y.x_;
return (xx.u64[0] < yy.u64[0] ||
(xx.u64[0] == yy.u64[0] && xx.u64[1] < yy.u64[1]));
}
// Bitwise logic operators.
__forceinline uint128_sse2 operator ^(const uint128_sse2 &y) const {
return _mm_xor_si128(x_, y.x_);
}
__forceinline uint128_sse2 operator &(const uint128_sse2 &y) const {
return _mm_and_si128(x_, y.x_);
}
__forceinline uint128_sse2 operator |(const uint128_sse2 &y) const {
return _mm_or_si128(x_, y.x_);
}
__forceinline void operator ^=(const uint128_sse2 &y) {
*this = *this ^ y.x_;
}
__forceinline void operator &=(const uint128_sse2 &y) {
*this = *this & y.x_;
}
__forceinline void operator |=(const uint128_sse2 &y) {
*this = *this | y.x_;
}
// Arithmetic operators.
__forceinline uint128_sse2 operator +(uint64 y) const {
union {
__m128i i128;
uint64 u64[2];
} temp;
temp.i128 = x_;
// a + b >= 2**64 iff
// a + b > (2**64 - 1) iff
// a > (2**64 - 1) - b iff
// a > ~b
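// For example, with temp.u64[0] == ~0ull and y == 1: ~y == ~0ull - 1,
// so temp.u64[0] > ~y holds, the high half is incremented, and the
// low half wraps to 0 -- a 128-bit carry.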
if (temp.u64[0] > ~y) {
temp.u64[1] += 1;
}
temp.u64[0] += y;
return temp.i128;
}
__forceinline void operator +=(uint64 x) {
*this = *this + x;
}
__forceinline uint128_sse2 operator -(uint64 y) const {
union {
__m128i i128;
uint64 u64[2];
} temp;
temp.i128 = x_;
if (temp.u64[0] < y) {
temp.u64[1] -= 1;
}
temp.u64[0] -= y;
return temp.i128;
}
__forceinline void operator -=(uint64 x) {
*this = *this - x;
}
// Bitwise logical shifts.
__forceinline uint128_sse2 operator >>(const int bits) const {
if (bits == 8) {
return _mm_srli_si128(x_, 1);
} else if (bits == 16) {
return _mm_srli_si128(x_, 2);
} else if (bits == 32) {
return _mm_srli_si128(x_, 4);
} else if (bits == 64) {
return _mm_srli_si128(x_, 8);
} else {
return long_shift_right(bits);
}
}
__forceinline uint128_sse2 operator >>(const size_t bits) const {
return *this >> static_cast<int>(bits);
}
__forceinline void operator >>=(const int bits) {
*this = *this >> bits;
}
__forceinline void operator >>=(const size_t bits) {
*this = *this >> static_cast<int>(bits);
}
__forceinline uint128_sse2 operator <<(int bits) const {
if (bits == 8) {
return _mm_slli_si128(x_, 1);
} else if (bits == 16) {
return _mm_slli_si128(x_, 2);
} else if (bits == 32) {
return _mm_slli_si128(x_, 4);
} else if (bits == 64) {
return _mm_slli_si128(x_, 8);
} else {
return long_shift_left(bits);
}
}
__forceinline uint128_sse2 operator <<(size_t bits) const {
return *this << static_cast<int>(bits);
}
__forceinline void operator <<=(int bits) {
*this = *this << bits;
}
__forceinline void operator <<=(size_t bits) {
*this = *this << static_cast<int>(bits);
}
protected:
__forceinline uint128_sse2 long_shift_right(int bits) const {
union {
__m128i i128;
uint64 u64[2];
} x;
x.i128 = x_;
for (; bits > 0; --bits) {
x.u64[0] >>= 1;
if (x.u64[1] & 1) {
x.u64[0] |= static_cast<uint64>(1) << 63;
}
x.u64[1] >>= 1;
}
return x.i128;
}
__forceinline uint128_sse2 long_shift_left(int bits) const {
union {
__m128i i128;
int64 i64[2];
} x;
x.i128 = x_;
for (; bits > 0; --bits) {
x.i64[1] <<= 1;
if (x.i64[0] < 0) {
x.i64[1] |= 1;
}
x.i64[0] <<= 1;
}
return x.i128;
}
__m128i x_;
} GCC_ALIGN_ATTRIBUTE(16);
// Specialized versions.
template<> __forceinline uint64 Downcast(const uint128_sse2 &x) {
return x.to_uint64();
}
template<> __forceinline uint32 Downcast(const uint128_sse2 &x) {
return static_cast<uint32>(x.to_uint64());
}
template<> __forceinline uint16 Downcast(const uint128_sse2 &x) {
return static_cast<uint16>(x.to_uint64());
}
template<> __forceinline uint8 Downcast(const uint128_sse2 &x) {
return static_cast<uint8>(x.to_uint64());
}
template<> __forceinline uint128_sse2 CrcFromUint64(uint64 lo, uint64 hi) {
union {
__m128i i128;
uint64 u64[2];
} temp;
temp.u64[0] = lo;
temp.u64[1] = hi;
return temp.i128;
}
template<> __forceinline void Uint64FromCrc(const uint128_sse2 &crc,
uint64 *lo, uint64 *hi) {
union {
__m128i i128;
uint64 u64[2];
} temp;
temp.i128 = crc;
*lo = temp.u64[0];
*hi = temp.u64[1];
}
} // namespace crcutil
#endif // HAVE_SSE2
#endif // CRCUTIL_UINT128_SSE2_H_