mirror of
https://github.com/claunia/findcrcs.git
synced 2025-12-16 10:44:25 +00:00
305 lines
7.3 KiB
C++
305 lines
7.3 KiB
C++
// Copyright 2010 Google Inc. All rights reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
// Implements 64-bit multiword CRC for Microsoft and Intel compilers
|
|
// using MMX instructions (i386).
|
|
|
|
#include "generic_crc.h"
|
|
|
|
#if CRCUTIL_USE_ASM && HAVE_I386 && HAVE_MMX && defined(_MSC_VER)
|
|
|
|
namespace crcutil {
|
|
|
|
#define CRC_WORD_MMX() \
|
|
__asm pxor BUF0, CRC0 \
|
|
__asm movd TMP0, BUF0 \
|
|
__asm psrlq BUF0, 32 \
|
|
__asm movzx TEMP, TMP0L \
|
|
__asm shr TMP0, 8 \
|
|
__asm movq CRC0, [TABLE + TEMP * 8] \
|
|
__asm movzx TEMP, TMP0L \
|
|
__asm shr TMP0, 8 \
|
|
__asm pxor CRC0, [TABLE + TEMP * 8 + 1 * 256 * 8] \
|
|
__asm movzx TEMP, TMP0L \
|
|
__asm shr TMP0, 8 \
|
|
__asm pxor CRC0, [TABLE + TEMP * 8 + 2 * 256 * 8] \
|
|
__asm pxor CRC0, [TABLE + TMP0 * 8 + 3 * 256 * 8] \
|
|
__asm movd TMP0, BUF0 \
|
|
__asm movzx TEMP, TMP0L \
|
|
__asm shr TMP0, 8 \
|
|
__asm pxor CRC0, [TABLE + TEMP * 8 + 4 * 256 * 8] \
|
|
__asm movzx TEMP, TMP0L \
|
|
__asm shr TMP0, 8 \
|
|
__asm pxor CRC0, [TABLE + TEMP * 8 + 5 * 256 * 8] \
|
|
__asm movzx TEMP, TMP0L \
|
|
__asm shr TMP0, 8 \
|
|
__asm pxor CRC0, [TABLE + TEMP * 8 + 6 * 256 * 8] \
|
|
__asm pxor CRC0, [TABLE + TMP0 * 8 + 7 * 256 * 8]
|
|
|
|
// frame pointer register 'ebp' modified by inline assembly code
|
|
#pragma warning(disable: 4731)
|
|
|
|
template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordI386Mmx(
|
|
const void *data,
|
|
size_t bytes,
|
|
const uint64 &start) const {
|
|
const uint8 *src = static_cast<const uint8 *>(data);
|
|
const uint8 *end = src + bytes;
|
|
uint64 crc0 = start ^ this->Base().Canonize();
|
|
|
|
ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, uint64);
|
|
if (src >= end) {
|
|
return (crc0 ^ this->Base().Canonize());
|
|
}
|
|
|
|
#define CRC0 mm0
|
|
#define CRC1 mm1
|
|
#define CRC2 mm2
|
|
#define CRC3 mm3
|
|
#define BUF0 mm4
|
|
#define BUF1 mm5
|
|
#define BUF2 mm6
|
|
#define BUF3 mm7
|
|
#define TMP0 eax
|
|
#define TMP0L al
|
|
#define TMP0H ah
|
|
#define TMP1 ebx
|
|
#define TMP1L bl
|
|
#define TMP1H bh
|
|
#define TMP2 ecx
|
|
#define TMP2L cl
|
|
#define TMP2H ch
|
|
#define TMP3 edx
|
|
#define TMP3L dl
|
|
#define TMP3H dh
|
|
#define TEMP edi
|
|
#define SRC esi
|
|
#define END [esp]
|
|
#define TABLE ebp
|
|
|
|
|
|
const uint64 *interleaved_table_address =
|
|
&this->crc_word_interleaved_[0][0];
|
|
const uint64 *word_table_address = &this->crc_word_[0][0];
|
|
|
|
__asm {
|
|
push ebp
|
|
|
|
mov TMP0, interleaved_table_address
|
|
|
|
movq CRC0, crc0
|
|
mov SRC, src
|
|
mov TMP1, end
|
|
sub TMP1, 2*4*8 - 1
|
|
cmp SRC, TMP1
|
|
mov TABLE, word_table_address
|
|
jae end_main_loop
|
|
|
|
push TABLE
|
|
mov TABLE, TMP0
|
|
push TMP1
|
|
|
|
pxor CRC1, CRC1
|
|
pxor CRC2, CRC2
|
|
pxor CRC3, CRC3
|
|
|
|
movq BUF0, [SRC]
|
|
movq BUF1, [SRC + 1 * 8]
|
|
movq BUF2, [SRC + 2 * 8]
|
|
movq BUF3, [SRC + 3 * 8]
|
|
|
|
main_loop:
|
|
#if HAVE_SSE && CRCUTIL_PREFETCH_WIDTH > 0
|
|
prefetcht0 [SRC + CRCUTIL_PREFETCH_WIDTH]
|
|
#endif
|
|
add SRC, 32
|
|
pxor BUF0, CRC0
|
|
pxor BUF1, CRC1
|
|
pxor BUF2, CRC2
|
|
pxor BUF3, CRC3
|
|
|
|
movd TMP0, BUF0
|
|
psrlq BUF0, 32
|
|
movd TMP1, BUF1
|
|
psrlq BUF1, 32
|
|
movd TMP2, BUF2
|
|
psrlq BUF2, 32
|
|
movd TMP3, BUF3
|
|
psrlq BUF3, 32
|
|
|
|
movzx TEMP, TMP0L
|
|
movq CRC0, [TABLE + TEMP * 8]
|
|
movzx TEMP, TMP1L
|
|
movq CRC1, [TABLE + TEMP * 8]
|
|
movzx TEMP, TMP2L
|
|
movq CRC2, [TABLE + TEMP * 8]
|
|
movzx TEMP, TMP3L
|
|
movq CRC3, [TABLE + TEMP * 8]
|
|
|
|
movzx TEMP, TMP0H
|
|
shr TMP0, 16
|
|
pxor CRC0, [TABLE + TEMP * 8 + 1 * 256 * 8]
|
|
movzx TEMP, TMP1H
|
|
shr TMP1, 16
|
|
pxor CRC1, [TABLE + TEMP * 8 + 1 * 256 * 8]
|
|
movzx TEMP, TMP2H
|
|
shr TMP2, 16
|
|
pxor CRC2, [TABLE + TEMP * 8 + 1 * 256 * 8]
|
|
movzx TEMP, TMP3H
|
|
shr TMP3, 16
|
|
pxor CRC3, [TABLE + TEMP * 8 + 1 * 256 * 8]
|
|
|
|
movzx TEMP, TMP0L
|
|
shr TMP0, 8
|
|
pxor CRC0, [TABLE + TEMP * 8 + 2 * 256 * 8]
|
|
movzx TEMP, TMP1L
|
|
shr TMP1, 8
|
|
pxor CRC1, [TABLE + TEMP * 8 + 2 * 256 * 8]
|
|
movzx TEMP, TMP2L
|
|
shr TMP2, 8
|
|
pxor CRC2, [TABLE + TEMP * 8 + 2 * 256 * 8]
|
|
movzx TEMP, TMP3L
|
|
shr TMP3, 8
|
|
pxor CRC3, [TABLE + TEMP * 8 + 2 * 256 * 8]
|
|
|
|
pxor CRC0, [TABLE + TMP0 * 8 + 3 * 256 * 8]
|
|
movd TMP0, BUF0
|
|
pxor CRC1, [TABLE + TMP1 * 8 + 3 * 256 * 8]
|
|
movd TMP1, BUF1
|
|
pxor CRC2, [TABLE + TMP2 * 8 + 3 * 256 * 8]
|
|
movd TMP2, BUF2
|
|
pxor CRC3, [TABLE + TMP3 * 8 + 3 * 256 * 8]
|
|
movd TMP3, BUF3
|
|
|
|
movzx TEMP, TMP0L
|
|
pxor CRC0, [TABLE + TEMP * 8 + 4 * 256 * 8]
|
|
movzx TEMP, TMP1L
|
|
pxor CRC1, [TABLE + TEMP * 8 + 4 * 256 * 8]
|
|
movzx TEMP, TMP2L
|
|
pxor CRC2, [TABLE + TEMP * 8 + 4 * 256 * 8]
|
|
movzx TEMP, TMP3L
|
|
pxor CRC3, [TABLE + TEMP * 8 + 4 * 256 * 8]
|
|
|
|
movzx TEMP, TMP0H
|
|
shr TMP0, 16
|
|
pxor CRC0, [TABLE + TEMP * 8 + 5 * 256 * 8]
|
|
movzx TEMP, TMP1H
|
|
shr TMP1, 16
|
|
pxor CRC1, [TABLE + TEMP * 8 + 5 * 256 * 8]
|
|
movzx TEMP, TMP2H
|
|
shr TMP2, 16
|
|
pxor CRC2, [TABLE + TEMP * 8 + 5 * 256 * 8]
|
|
movzx TEMP, TMP3H
|
|
shr TMP3, 16
|
|
pxor CRC3, [TABLE + TEMP * 8 + 5 * 256 * 8]
|
|
|
|
movzx TEMP, TMP0L
|
|
shr TMP0, 8
|
|
pxor CRC0, [TABLE + TEMP * 8 + 6 * 256 * 8]
|
|
movzx TEMP, TMP1L
|
|
shr TMP1, 8
|
|
pxor CRC1, [TABLE + TEMP * 8 + 6 * 256 * 8]
|
|
movzx TEMP, TMP2L
|
|
shr TMP2, 8
|
|
pxor CRC2, [TABLE + TEMP * 8 + 6 * 256 * 8]
|
|
movzx TEMP, TMP3L
|
|
shr TMP3, 8
|
|
pxor CRC3, [TABLE + TEMP * 8 + 6 * 256 * 8]
|
|
|
|
pxor CRC0, [TABLE + TMP0 * 8 + 7 * 256 * 8]
|
|
movq BUF0, [SRC]
|
|
pxor CRC1, [TABLE + TMP1 * 8 + 7 * 256 * 8]
|
|
movq BUF1, [SRC + 1 * 8]
|
|
pxor CRC2, [TABLE + TMP2 * 8 + 7 * 256 * 8]
|
|
movq BUF2, [SRC + 2 * 8]
|
|
pxor CRC3, [TABLE + TMP3 * 8 + 7 * 256 * 8]
|
|
movq BUF3, [SRC + 3 * 8]
|
|
|
|
cmp END, SRC
|
|
ja main_loop
|
|
|
|
#undef END
|
|
#define END TMP1
|
|
pop END
|
|
pop TABLE
|
|
add SRC, 32
|
|
|
|
CRC_WORD_MMX()
|
|
|
|
pxor BUF1, CRC1
|
|
movq BUF0, BUF1
|
|
CRC_WORD_MMX()
|
|
|
|
pxor BUF2, CRC2
|
|
movq BUF0, BUF2
|
|
CRC_WORD_MMX()
|
|
|
|
pxor BUF3, CRC3
|
|
movq BUF0, BUF3
|
|
CRC_WORD_MMX()
|
|
|
|
end_main_loop:
|
|
add END, 2*4*8 - 8
|
|
cmp SRC, END
|
|
jae end_word_loop
|
|
|
|
word_loop:
|
|
movq BUF0, [SRC]
|
|
add SRC, 8
|
|
CRC_WORD_MMX()
|
|
cmp END, SRC
|
|
ja word_loop
|
|
end_word_loop:
|
|
|
|
#if 0 // Plain C version is faster?
|
|
add END, 7
|
|
cmp SRC, END
|
|
jae end_byte_loop
|
|
|
|
byte_loop:
|
|
movd TMP0, CRC0
|
|
movzx TEMP, byte ptr [SRC]
|
|
movzx TMP0, TMP0L
|
|
psrlq CRC0, 8
|
|
xor TEMP, TMP0
|
|
add SRC, 1
|
|
pxor CRC0, [TABLE + TEMP*8 + 7*256*8]
|
|
cmp END, SRC
|
|
ja byte_loop
|
|
end_byte_loop:
|
|
#endif
|
|
|
|
pop ebp
|
|
|
|
mov src, SRC
|
|
movq crc0, CRC0
|
|
|
|
emms
|
|
}
|
|
|
|
#if 1
|
|
// Compute CRC of remaining bytes.
|
|
for (;src < end; ++src) {
|
|
CRC_BYTE(this, crc0, *src);
|
|
}
|
|
#endif
|
|
|
|
return (crc0 ^ this->Base().Canonize());
|
|
}
|
|
|
|
|
|
} // namespace crcutil
|
|
|
|
#endif // CRCUTIL_USE_ASM && HAVE_I386 && HAVE_MMX && defined(_MSC_VER)
|