/* GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see http://www.gnu.org/licenses
 *
 * Please visit http://www.xyratex.com/contact if you need additional
 * information or have any questions.
 *
 * GPL HEADER END
 */

/*
 * Copyright 2012 Xyratex Technology Limited
 *
 * Using the hardware-provided PCLMULQDQ instruction to accelerate the CRC32
 * calculation.
 * CRC32 polynomial: 0x04c11db7 (BE) / 0xEDB88320 (LE)
 * PCLMULQDQ is a carry-less multiplication instruction introduced alongside
 * Intel SSE4.2 (it has its own CPUID feature flag); the reference can be
 * found at:
 * http://www.intel.com/products/processor/manuals/
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
 * Volume 2B: Instruction Set Reference, N-Z
 *
 * Authors:	Gregory Prestas <Gregory_Prestas@us.xyratex.com>
 *		Alexander Boyko <Alexander_Boyko@xyratex.com>
 */

#include <linux/linkage.h>


.section .rodata
.align 16
/*
 * [(x4*128+32 mod P(x) << 32)]' << 1	= 0x154442bd4
 * #define CONSTANT_R1	0x154442bd4LL
 *
 * [(x4*128-32 mod P(x) << 32)]' << 1	= 0x1c6e41596
 * #define CONSTANT_R2	0x1c6e41596LL
 */
.Lconstant_R2R1:
	.octa 0x00000001c6e415960000000154442bd4
/*
 * [(x128+32 mod P(x) << 32)]' << 1	= 0x1751997d0
 * #define CONSTANT_R3	0x1751997d0LL
 *
 * [(x128-32 mod P(x) << 32)]' << 1	= 0x0ccaa009e
 * #define CONSTANT_R4	0x0ccaa009eLL
 */
.Lconstant_R4R3:
	.octa 0x00000000ccaa009e00000001751997d0
/*
 * [(x64 mod P(x) << 32)]' << 1		= 0x163cd6124
 * #define CONSTANT_R5	0x163cd6124LL
 */
.Lconstant_R5:
	.octa 0x00000000000000000000000163cd6124
.Lconstant_mask32:
	.octa 0x000000000000000000000000FFFFFFFF
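/*
 * How the constants above are used (a sketch, following Intel's "Fast CRC
 * Computation for Generic Polynomials Using PCLMULQDQ Instruction" white
 * paper): advancing a CRC remainder by k bits is a carry-less multiply by
 * (x^k mod P(x)).  R1/R2 fold the state one 64-byte cache line (512 bits)
 * ahead, R3/R4 fold it 128 bits ahead, and R5 plus the Barrett constants
 * below reduce the final value to the 32-bit CRC.  The trailing ' marks bit
 * reflection for the little-endian polynomial, and the "<< 1" aligns the
 * reflected 33-bit product with PCLMULQDQ's output.
 */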
/*
 * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
 *
 * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` = 0x1F7011641LL
 * #define CONSTANT_RU	0x1F7011641LL
 */
.Lconstant_RUpoly:
	.octa 0x00000001F701164100000001DB710641

#define CONSTANT %xmm0

#ifdef __x86_64__
#define BUF	%rdi
#define LEN	%rsi
#define CRC	%edx
#else
#define BUF	%eax
#define LEN	%edx
#define CRC	%ecx
#endif



.text
/**
 * Calculate crc32
 * BUF - buffer pointer (16-byte aligned)
 * LEN - buffer size in bytes (multiple of 16), must be greater than 63
 * CRC - initial crc32 value
 * return: crc32 in %eax
 * uint crc32_pclmul_le_16(unsigned char const *buffer,
 *			   size_t len, uint crc32)
 */

SYM_FUNC_START(crc32_pclmul_le_16) /* buffer and buffer size are 16-byte aligned */
	movdqa	(BUF), %xmm1
	movdqa	0x10(BUF), %xmm2
	movdqa	0x20(BUF), %xmm3
	movdqa	0x30(BUF), %xmm4
	movd	CRC, CONSTANT
	pxor	CONSTANT, %xmm1
	sub	$0x40, LEN
	add	$0x40, BUF
	cmp	$0x40, LEN
	jb	less_64

#ifdef __x86_64__
	movdqa	.Lconstant_R2R1(%rip), CONSTANT
#else
	movdqa	.Lconstant_R2R1, CONSTANT
#endif

loop_64:	/* fold one full 64-byte cache line per iteration */
	prefetchnta	0x40(BUF)
	movdqa	%xmm1, %xmm5
	movdqa	%xmm2, %xmm6
	movdqa	%xmm3, %xmm7
#ifdef __x86_64__
	movdqa	%xmm4, %xmm8
#endif
	pclmulqdq $0x00, CONSTANT, %xmm1
	pclmulqdq $0x00, CONSTANT, %xmm2
	pclmulqdq $0x00, CONSTANT, %xmm3
#ifdef __x86_64__
	pclmulqdq $0x00, CONSTANT, %xmm4
#endif
	pclmulqdq $0x11, CONSTANT, %xmm5
	pclmulqdq $0x11, CONSTANT, %xmm6
	pclmulqdq $0x11, CONSTANT, %xmm7
#ifdef __x86_64__
	pclmulqdq $0x11, CONSTANT, %xmm8
#endif
	pxor	%xmm5, %xmm1
	pxor	%xmm6, %xmm2
	pxor	%xmm7, %xmm3
#ifdef __x86_64__
	pxor	%xmm8, %xmm4
#else
	/* %xmm8 is not available in 32-bit mode */
	movdqa	%xmm4, %xmm5
	pclmulqdq $0x00, CONSTANT, %xmm4
	pclmulqdq $0x11, CONSTANT, %xmm5
	pxor	%xmm5, %xmm4
#endif
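
	/* xor the folded state with the next 64 bytes of input */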
	pxor	(BUF), %xmm1
	pxor	0x10(BUF), %xmm2
	pxor	0x20(BUF), %xmm3
	pxor	0x30(BUF), %xmm4

	sub	$0x40, LEN
	add	$0x40, BUF
	cmp	$0x40, LEN
	jge	loop_64
less_64:	/* fold the four 128-bit lanes into a single 128-bit value */
#ifdef __x86_64__
	movdqa	.Lconstant_R4R3(%rip), CONSTANT
#else
	movdqa	.Lconstant_R4R3, CONSTANT
#endif
	prefetchnta	(BUF)

	movdqa	%xmm1, %xmm5
	pclmulqdq $0x00, CONSTANT, %xmm1
	pclmulqdq $0x11, CONSTANT, %xmm5
	pxor	%xmm5, %xmm1
	pxor	%xmm2, %xmm1

	movdqa	%xmm1, %xmm5
	pclmulqdq $0x00, CONSTANT, %xmm1
	pclmulqdq $0x11, CONSTANT, %xmm5
	pxor	%xmm5, %xmm1
	pxor	%xmm3, %xmm1

	movdqa	%xmm1, %xmm5
	pclmulqdq $0x00, CONSTANT, %xmm1
	pclmulqdq $0x11, CONSTANT, %xmm5
	pxor	%xmm5, %xmm1
	pxor	%xmm4, %xmm1

	cmp	$0x10, LEN
	jb	fold_64
loop_16:	/* fold the remaining 16-byte blocks into the 128-bit state */
	movdqa	%xmm1, %xmm5
	pclmulqdq $0x00, CONSTANT, %xmm1
	pclmulqdq $0x11, CONSTANT, %xmm5
	pxor	%xmm5, %xmm1
	pxor	(BUF), %xmm1
	sub	$0x10, LEN
	add	$0x10, BUF
	cmp	$0x10, LEN
	jge	loop_16

fold_64:
	/* perform the last 64-bit fold, which also appends 32 zero bits
	 * to the input stream */
	pclmulqdq $0x01, %xmm1, CONSTANT /* R4 * xmm1.low */
	psrldq	$0x08, %xmm1
	pxor	CONSTANT, %xmm1

	/* final 32-bit fold */
	movdqa	%xmm1, %xmm2
#ifdef __x86_64__
	movdqa	.Lconstant_R5(%rip), CONSTANT
	movdqa	.Lconstant_mask32(%rip), %xmm3
#else
	movdqa	.Lconstant_R5, CONSTANT
	movdqa	.Lconstant_mask32, %xmm3
#endif
	psrldq	$0x04, %xmm2
	pand	%xmm3, %xmm1
	pclmulqdq $0x00, CONSTANT, %xmm1
	pxor	%xmm2, %xmm1

	/* finish up with the bit-reversed Barrett reduction, 64 -> 32 bits */
#ifdef __x86_64__
	movdqa	.Lconstant_RUpoly(%rip), CONSTANT
#else
	movdqa	.Lconstant_RUpoly, CONSTANT
#endif
	movdqa	%xmm1, %xmm2
	pand	%xmm3, %xmm1
	pclmulqdq $0x10, CONSTANT, %xmm1
	pand	%xmm3, %xmm1
	pclmulqdq $0x00, CONSTANT, %xmm1
	pxor	%xmm2, %xmm1
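
	/* the Barrett reduction leaves the 32-bit CRC in the second dword */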
	pextrd	$0x01, %xmm1, %eax

	RET
SYM_FUNC_END(crc32_pclmul_le_16)