########################################################################
# Implement fast SHA-256 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 2 blocks at a time, with 4 lanes per block
########################################################################
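#
# In outline: the message schedule for two consecutive 64-byte blocks is
# interleaved in the ymm registers X0..X3.  The low 128-bit lane of each
# register holds four schedule dwords of the first block and the high
# lane holds the matching dwords of the second block, so one AVX2
# instruction advances both schedules at once.  The rounds themselves
# are scalar; the second block simply replays the already-scheduled
# words saved on the stack (see loop3 below).
########################################################################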

#include <linux/linkage.h>

## assume buffers not aligned
#define	VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm
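
# Roughly equivalent C, for illustration only:
#	reg += *mem;
#	*mem = reg;
# i.e. fold the running digest word into the saved state and keep the
# updated value in the register for the next block.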

################################

X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7

# XMM versions of above
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER  = %ymm9
XTMP5 = %ymm11

SHUF_00BA =	%ymm10 # shuffle xBxA -> 00BA
SHUF_DC00 =	%ymm12 # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13

X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK

NUM_BLKS = %rdx	# 3rd arg
INP	= %rsi  # 2nd arg
CTX	= %rdi	# 1st arg
c	= %ecx
d	= %r8d
e       = %edx	# clobbers NUM_BLKS
y3	= %esi	# clobbers INP

SRND	= CTX	# SRND is same register as CTX

a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d

T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d


_XFER_SIZE	= 2*64*4	# 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE	= 0
_INP_END_SIZE	= 8
_INP_SIZE	= 8
_CTX_SIZE	= 8
_RSP_SIZE	= 8

_XFER		= 0
_XMM_SAVE	= _XFER     + _XFER_SIZE
_INP_END	= _XMM_SAVE + _XMM_SAVE_SIZE
_INP		= _INP_END  + _INP_END_SIZE
_CTX		= _INP      + _INP_SIZE
_RSP		= _CTX      + _CTX_SIZE
STACK_SIZE	= _RSP      + _RSP_SIZE
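
# Resulting frame layout relative to the 32-byte aligned %rsp (informal
# sketch; the offsets follow from the sizes above, _XMM_SAVE is
# zero-sized here):
#
#	[  0 .. 511]	_XFER      K[t]+W[t] values for both blocks
#	[512 .. 519]	_INP_END   pointer to the last input block
#	[520 .. 527]	_INP       current input pointer
#	[528 .. 535]	_CTX       saved state pointer
#	[536 .. 543]	_RSP       caller's %rsp, restored in done_hash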

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
	X_ = X0
	X0 = X1
	X1 = X2
	X2 = X3
	X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
	old_h = h
	TMP_ = h
	h = g
	g = f
	f = e
	e = d
	d = c
	c = b
	b = a
	a = TMP_
.endm
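
# Both macros above only rebind assembler symbols; they emit no
# instructions.  The "rotation" of the working variables and of the
# schedule registers therefore happens at assembly time, which is what
# lets the same round body below be reused for four consecutive rounds.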

.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B

	addl	\disp(%rsp, SRND), h		# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA
	vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
	mov	f, y2		# y2 = f                                # CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g                              # CH
	vpaddd	X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16]
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1

	and	e, y2		# y2 = (f^g)&e                          # CH
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	h, d		# d = k + w + h + d                     # --

	and	b, y3		# y3 = (a|c)&b                          # MAJA
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0

	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	vpsrld	$7, XTMP1, XTMP2
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB

	add	y0, y2		# y2 = S1 + CH                          # --
	vpslld	$(32-7), XTMP1, XTMP3
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7

	vpsrld	$18, XTMP1, XTMP2
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --


	ROTATE_ARGS

################################### RND N + 1 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	offset = \disp + 1*4
	addl	offset(%rsp, SRND), h	# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA


	vpsrld	$3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
	mov	f, y2		# y2 = f                                # CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g                              # CH


	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	h, d		# d = k + w + h + d                     # --

	vpslld	$(32-18), XTMP1, XTMP1
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0

	vpxor	XTMP1, XTMP3, XTMP3
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --

	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --

	vpsrld	$10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}


	ROTATE_ARGS

################################### RND N + 2 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	offset = \disp + 2*4
	addl	offset(%rsp, SRND), h	# h = k + w + h         # --

	vpsrlq	$19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	or	c, y3		# y3 = a|c                              # MAJA
	mov	f, y2		# y2 = f                                # CH
	xor	g, y2		# y2 = f^g                              # CH

	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
	and	e, y2		# y2 = (f^g)&e                          # CH

	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	vpxor	XTMP3, XTMP2, XTMP2
	add	h, d		# d = k + w + h + d                     # --
	and	b, y3		# y3 = (a|c)&b                          # MAJA

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --
	vpshufd	$0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}

	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1,h		# h = k + w + h + S0                    # --
	add	y2,d		# d = k + w + h + d + S1 + CH = d + t1  # --
	add	y2,h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3,h		# h = t1 + S0 + MAJ                     # --


	ROTATE_ARGS

################################### RND N + 3 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	offset = \disp + 3*4
	addl	offset(%rsp, SRND), h	# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA


	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	mov	f, y2		# y2 = f                                # CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g                              # CH


	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	h, d		# d = k + w + h + d                     # --
	and	b, y3		# y3 = (a|c)&b                          # MAJA

	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	vpxor	XTMP3, XTMP2, XTMP2
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	y0, y2		# y2 = S1 + CH                          # --

	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}

	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ

	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --

	ROTATE_ARGS
	rotate_Xs
.endm
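
# For reference, the schedule produced above is the standard FIPS 180-4
# recurrence, applied per 32-bit word in both lanes at once:
#
#	s0   = (W[-15] ror  7) ^ (W[-15] ror 18) ^ (W[-15] >>  3)
#	s1   = (W[-2]  ror 17) ^ (W[-2]  ror 19) ^ (W[-2]  >> 10)
#	W[0] = W[-16] + s0 + W[-7] + s1
#
# s1 is computed separately for the {BA} and {DC} word pairs because the
# 17/19-bit rotates are emulated with 64-bit vpsrlq shifts.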

.macro DO_4ROUNDS disp
################################### RND N + 0 ###########################

	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	addl	\disp(%rsp, SRND), h		# h = k + w + h # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	ROTATE_ARGS

################################### RND N + 1 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*1 + \disp
	addl	offset(%rsp, SRND), h		# h = k + w + h # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	ROTATE_ARGS

################################### RND N + 2 ##############################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*2 + \disp
	addl	offset(%rsp, SRND), h		# h = k + w + h # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	ROTATE_ARGS

################################### RND N + 3 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*3 + \disp
	addl	offset(%rsp, SRND), h		# h = k + w + h # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --


	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ                     # --

	ROTATE_ARGS

.endm
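
# Both round macros implement the usual SHA-256 compression step, with
# K[t] + W[t] already summed into the _XFER area on the stack:
#
#	S1  = (e ror 6) ^ (e ror 11) ^ (e ror 25)
#	ch  = (e & f) ^ (~e & g)		# computed as ((f^g) & e) ^ g
#	t1  = h + S1 + ch + K[t] + W[t]
#	S0  = (a ror 2) ^ (a ror 13) ^ (a ror 22)
#	maj = (a & b) ^ (a & c) ^ (b & c)	# computed as ((a|c) & b) | (a & c)
#	d  += t1
#	h   = t1 + S0 + maj
#
# followed by the a..h renaming done by ROTATE_ARGS.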

########################################################################
## void sha256_transform_rorx(struct sha256_state *state, const u8 *data, int blocks)
## arg 1 : pointer to state
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
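## The routine consumes whole 64-byte blocks only; padding and length
## handling are left to the C glue code.  Because it uses ymm state,
## kernel callers are expected to bracket the call with
## kernel_fpu_begin()/kernel_fpu_end().
########################################################################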
.text
SYM_FUNC_START(sha256_transform_rorx)
.align 32
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	mov	%rsp, %rax
	subq	$STACK_SIZE, %rsp
	and	$-32, %rsp	# align rsp to 32 byte boundary
	mov	%rax, _RSP(%rsp)


	shl	$6, NUM_BLKS	# convert to bytes
	jz	done_hash
	lea	-64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
	mov	NUM_BLKS, _INP_END(%rsp)

	cmp	NUM_BLKS, INP
	je	only_one_block

	## load initial digest
	mov	(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa  _SHUF_00BA(%rip), SHUF_00BA
	vmovdqa  _SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)

loop0:
	## Load first 16 dwords from two blocks
	VMOVDQ	0*32(INP),XTMP0
	VMOVDQ	1*32(INP),XTMP1
	VMOVDQ	2*32(INP),XTMP2
	VMOVDQ	3*32(INP),XTMP3

	## byte swap data
	vpshufb	BYTE_FLIP_MASK, XTMP0, XTMP0
	vpshufb	BYTE_FLIP_MASK, XTMP1, XTMP1
	vpshufb	BYTE_FLIP_MASK, XTMP2, XTMP2
	vpshufb	BYTE_FLIP_MASK, XTMP3, XTMP3

	## transpose data into high/low halves
	vperm2i128	$0x20, XTMP2, XTMP0, X0
	vperm2i128	$0x31, XTMP2, XTMP0, X1
	vperm2i128	$0x20, XTMP3, XTMP1, X2
	vperm2i128	$0x31, XTMP3, XTMP1, X3

last_block_enter:
	add	$64, INP
	mov	INP, _INP(%rsp)

	## schedule 48 input dwords, by doing 3 iterations of 16 rounds each
	xor	SRND, SRND

.align 16
loop1:
	vpaddd	K256+0*32(SRND), X0, XFER
	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 0*32

	vpaddd	K256+1*32(SRND), X0, XFER
	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 1*32

	vpaddd	K256+2*32(SRND), X0, XFER
	vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 2*32

	vpaddd	K256+3*32(SRND), X0, XFER
	vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 3*32

	add	$4*32, SRND
	cmp	$3*4*32, SRND
	jb	loop1
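
	## After loop1, SRND = 3*4*32: rounds 0..47 of the first block are
	## done, the K[t]+W[t] values for both blocks are parked in _XFER,
	## and X0..X3 hold the final schedule words W[48..63].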

loop2:
	## Do last 16 rounds with no scheduling
	vpaddd	K256+0*32(SRND), X0, XFER
	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 0*32

	vpaddd	K256+1*32(SRND), X1, XFER
	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 1*32
	add	$2*32, SRND

	vmovdqa	X2, X0
	vmovdqa	X3, X1

	cmp	$4*4*32, SRND
	jb	loop2

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP

	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	ja	done_hash

	#### Do second block using previously scheduled results
	xor	SRND, SRND
.align 16
loop3:
	DO_4ROUNDS	 _XFER + 0*32 + 16
	DO_4ROUNDS	 _XFER + 1*32 + 16
	add	$2*32, SRND
	cmp	$4*4*32, SRND
	jb	loop3

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP
	add	$64, INP

	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	jb	loop0
	ja	done_hash

do_last_block:
	VMOVDQ	0*16(INP),XWORD0
	VMOVDQ	1*16(INP),XWORD1
	VMOVDQ	2*16(INP),XWORD2
	VMOVDQ	3*16(INP),XWORD3

	vpshufb	X_BYTE_FLIP_MASK, XWORD0, XWORD0
	vpshufb	X_BYTE_FLIP_MASK, XWORD1, XWORD1
	vpshufb	X_BYTE_FLIP_MASK, XWORD2, XWORD2
	vpshufb	X_BYTE_FLIP_MASK, XWORD3, XWORD3

	jmp	last_block_enter

only_one_block:

	## load initial digest
	mov	(4*0)(CTX),a
	mov	(4*1)(CTX),b
	mov	(4*2)(CTX),c
	mov	(4*3)(CTX),d
	mov	(4*4)(CTX),e
	mov	(4*5)(CTX),f
	mov	(4*6)(CTX),g
	mov	(4*7)(CTX),h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)
	jmp	do_last_block

done_hash:

	mov	_RSP(%rsp), %rsp

	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	RET
SYM_FUNC_END(sha256_transform_rorx)

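# The 64 round constants below are the standard SHA-256 K values.  Each
# row of four is stored twice so that a single 256-bit vpaddd lines the
# same constants up with both the low-lane (block 1) and high-lane
# (block 2) schedule words.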
.section	.rodata.cst512.K256, "aM", @progbits, 512
.align 64
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203

# shuffle xBxA -> 00BA
.section	.rodata.cst32._SHUF_00BA, "aM", @progbits, 32
.align 32
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
.section	.rodata.cst32._SHUF_DC00, "aM", @progbits, 32
.align 32
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
769