xref: /OK3568_Linux_fs/kernel/drivers/crypto/vmx/aesp8-ppc.pl (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun#! /usr/bin/env perl
2*4882a593Smuzhiyun# SPDX-License-Identifier: GPL-2.0
3*4882a593Smuzhiyun
4*4882a593Smuzhiyun# This code is taken from CRYPTOGAMs[1] and is included here using the option
5*4882a593Smuzhiyun# in the license to distribute the code under the GPL. Therefore this program
6*4882a593Smuzhiyun# is free software; you can redistribute it and/or modify it under the terms of
7*4882a593Smuzhiyun# the GNU General Public License version 2 as published by the Free Software
8*4882a593Smuzhiyun# Foundation.
9*4882a593Smuzhiyun#
10*4882a593Smuzhiyun# [1] https://www.openssl.org/~appro/cryptogams/
11*4882a593Smuzhiyun
12*4882a593Smuzhiyun# Copyright (c) 2006-2017, CRYPTOGAMS by <appro@openssl.org>
13*4882a593Smuzhiyun# All rights reserved.
14*4882a593Smuzhiyun#
15*4882a593Smuzhiyun# Redistribution and use in source and binary forms, with or without
16*4882a593Smuzhiyun# modification, are permitted provided that the following conditions
17*4882a593Smuzhiyun# are met:
18*4882a593Smuzhiyun#
19*4882a593Smuzhiyun#       * Redistributions of source code must retain copyright notices,
20*4882a593Smuzhiyun#         this list of conditions and the following disclaimer.
21*4882a593Smuzhiyun#
22*4882a593Smuzhiyun#       * Redistributions in binary form must reproduce the above
23*4882a593Smuzhiyun#         copyright notice, this list of conditions and the following
24*4882a593Smuzhiyun#         disclaimer in the documentation and/or other materials
25*4882a593Smuzhiyun#         provided with the distribution.
26*4882a593Smuzhiyun#
27*4882a593Smuzhiyun#       * Neither the name of the CRYPTOGAMS nor the names of its
28*4882a593Smuzhiyun#         copyright holder and contributors may be used to endorse or
29*4882a593Smuzhiyun#         promote products derived from this software without specific
30*4882a593Smuzhiyun#         prior written permission.
31*4882a593Smuzhiyun#
32*4882a593Smuzhiyun# ALTERNATIVELY, provided that this notice is retained in full, this
33*4882a593Smuzhiyun# product may be distributed under the terms of the GNU General Public
34*4882a593Smuzhiyun# License (GPL), in which case the provisions of the GPL apply INSTEAD OF
35*4882a593Smuzhiyun# those given above.
36*4882a593Smuzhiyun#
37*4882a593Smuzhiyun# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
38*4882a593Smuzhiyun# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39*4882a593Smuzhiyun# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40*4882a593Smuzhiyun# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41*4882a593Smuzhiyun# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42*4882a593Smuzhiyun# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43*4882a593Smuzhiyun# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44*4882a593Smuzhiyun# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45*4882a593Smuzhiyun# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46*4882a593Smuzhiyun# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47*4882a593Smuzhiyun# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48*4882a593Smuzhiyun
49*4882a593Smuzhiyun# ====================================================================
50*4882a593Smuzhiyun# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
51*4882a593Smuzhiyun# project. The module is, however, dual licensed under OpenSSL and
52*4882a593Smuzhiyun# CRYPTOGAMS licenses depending on where you obtain it. For further
53*4882a593Smuzhiyun# details see https://www.openssl.org/~appro/cryptogams/.
54*4882a593Smuzhiyun# ====================================================================
55*4882a593Smuzhiyun#
56*4882a593Smuzhiyun# This module implements support for AES instructions as per PowerISA
57*4882a593Smuzhiyun# specification version 2.07, first implemented by POWER8 processor.
58*4882a593Smuzhiyun# The module is endian-agnostic in sense that it supports both big-
59*4882a593Smuzhiyun# and little-endian cases. Data alignment in parallelizable modes is
60*4882a593Smuzhiyun# handled with VSX loads and stores, which implies MSR.VSX flag being
61*4882a593Smuzhiyun# set. It should also be noted that ISA specification doesn't prohibit
62*4882a593Smuzhiyun# alignment exceptions for these instructions on page boundaries.
63*4882a593Smuzhiyun# Initially alignment was handled in a pure AltiVec/VMX way [when data
64*4882a593Smuzhiyun# is aligned programmatically, which in turn guarantees exception-
65*4882a593Smuzhiyun# free execution], but it turned out to hamper performance when vcipher
66*4882a593Smuzhiyun# instructions are interleaved. It's reckoned that eventual
67*4882a593Smuzhiyun# misalignment penalties at page boundaries are on average lower
68*4882a593Smuzhiyun# than the additional overhead of the pure AltiVec approach.
69*4882a593Smuzhiyun#
70*4882a593Smuzhiyun# May 2016
71*4882a593Smuzhiyun#
72*4882a593Smuzhiyun# Add XTS subroutine, 9x on little- and 12x improvement on big-endian
73*4882a593Smuzhiyun# systems were measured.
74*4882a593Smuzhiyun#
75*4882a593Smuzhiyun######################################################################
76*4882a593Smuzhiyun# Current large-block performance in cycles per byte processed with
77*4882a593Smuzhiyun# 128-bit key (less is better).
78*4882a593Smuzhiyun#
79*4882a593Smuzhiyun#		CBC en-/decrypt	CTR	XTS
80*4882a593Smuzhiyun# POWER8[le]	3.96/0.72	0.74	1.1
81*4882a593Smuzhiyun# POWER8[be]	3.75/0.65	0.66	1.0
82*4882a593Smuzhiyun
# The first command-line argument is the target "flavour".  It must contain
# "64" or "32" (word size); a trailing "le" selects little-endian output.
$flavour = shift;

# Word-size dependent constants and mnemonics interpolated into the assembly.
if ($flavour =~ /64/) {
	$SIZE_T	=8;
	$LRSAVE	=2*$SIZE_T;
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
	$UCMP	="cmpld";
	$SHL	="sldi";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
	$UCMP	="cmplw";
	$SHL	="slwi";
} else { die "nonsense $flavour"; }

# $SIZE_T for little-endian flavours, 0 otherwise.
$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;

# Locate ppc-xlate.pl: first next to this script, then in ../../perlasm/.
# When $0 has no directory component $dir is empty and the lookup is done
# relative to the current directory (same result as before, but without
# relying on $1 after an unchecked match).
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed to STDOUT is piped through the translator; the next
# command-line argument, if any, is appended to its command line.
# Low-precedence "or" is required: with "||" the die was attached to the
# command string (always true), so a failed open went unnoticed.
open(STDOUT,"| $^X $xlate $flavour ".shift) or die "can't call $xlate: $!";

$FRAME=8*$SIZE_T;	# base stack frame size used by the generated code
$prefix="aes_p8";	# prefix of every exported symbol

$sp="r1";		# stack pointer
$vrsave="r12";		# GPR that holds the caller's SPR 256 value
117*4882a593Smuzhiyun
118*4882a593Smuzhiyun#########################################################################
119*4882a593Smuzhiyun{{{	# Key setup procedures						#
# This scope appends three pieces of assembly to $code:
#
#   rcon / Lconsts
#	A 128-byte aligned constant table (two round-constant rows, the
#	rotate-n-splat permutation mask and a zero row), followed by a
#	position-independent helper that leaves the table address in $ptr.
#
#   <prefix>_set_encrypt_key(inp=r3, bits=r4, out=r5)
#	Returns in r3: -1 if inp or out is NULL, -2 if bits is below 128,
#	above 256 or not a multiple of 64, 0 on success.  On success the
#	round keys are stored starting at out and the round count (10, 12
#	or 14) is stored as a 32-bit word at byte offset 240 of out.
#	Unaligned inp/out are handled with permute/select merges.
#
#   <prefix>_set_decrypt_key(inp=r3, bits=r4, out=r5)
#	Calls Lset_encrypt_key and passes its non-zero return code through
#	unchanged.  On success it reverses the order of the 16-byte round
#	keys in place (swapping entries from both ends) and returns 0.
#
# General-purpose registers: arguments r3-r5 plus scratch r6-r8.
120*4882a593Smuzhiyunmy ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8));
# Vector registers used by the key expansion itself.
121*4882a593Smuzhiyunmy ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6));
# Vector registers used to merge round keys into a possibly unaligned $out.
122*4882a593Smuzhiyunmy ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11));
123*4882a593Smuzhiyun
124*4882a593Smuzhiyun$code.=<<___;
125*4882a593Smuzhiyun.machine	"any"
126*4882a593Smuzhiyun
127*4882a593Smuzhiyun.text
128*4882a593Smuzhiyun
129*4882a593Smuzhiyun.align	7
130*4882a593Smuzhiyunrcon:
131*4882a593Smuzhiyun.long	0x01000000, 0x01000000, 0x01000000, 0x01000000	?rev
132*4882a593Smuzhiyun.long	0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000	?rev
133*4882a593Smuzhiyun.long	0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c	?rev
134*4882a593Smuzhiyun.long	0,0,0,0						?asis
135*4882a593SmuzhiyunLconsts:
136*4882a593Smuzhiyun	mflr	r0
137*4882a593Smuzhiyun	bcl	20,31,\$+4
138*4882a593Smuzhiyun	mflr	$ptr	 #vvvvv "distance between . and rcon
139*4882a593Smuzhiyun	addi	$ptr,$ptr,-0x48
140*4882a593Smuzhiyun	mtlr	r0
141*4882a593Smuzhiyun	blr
142*4882a593Smuzhiyun	.long	0
143*4882a593Smuzhiyun	.byte	0,12,0x14,0,0,0,0,0
144*4882a593Smuzhiyun.asciz	"AES for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
145*4882a593Smuzhiyun
146*4882a593Smuzhiyun.globl	.${prefix}_set_encrypt_key
147*4882a593SmuzhiyunLset_encrypt_key:
148*4882a593Smuzhiyun	mflr		r11
149*4882a593Smuzhiyun	$PUSH		r11,$LRSAVE($sp)
150*4882a593Smuzhiyun
151*4882a593Smuzhiyun	li		$ptr,-1
152*4882a593Smuzhiyun	${UCMP}i	$inp,0
153*4882a593Smuzhiyun	beq-		Lenc_key_abort		# if ($inp==0) return -1;
154*4882a593Smuzhiyun	${UCMP}i	$out,0
155*4882a593Smuzhiyun	beq-		Lenc_key_abort		# if ($out==0) return -1;
156*4882a593Smuzhiyun	li		$ptr,-2
157*4882a593Smuzhiyun	cmpwi		$bits,128
158*4882a593Smuzhiyun	blt-		Lenc_key_abort
159*4882a593Smuzhiyun	cmpwi		$bits,256
160*4882a593Smuzhiyun	bgt-		Lenc_key_abort
161*4882a593Smuzhiyun	andi.		r0,$bits,0x3f
162*4882a593Smuzhiyun	bne-		Lenc_key_abort
163*4882a593Smuzhiyun
164*4882a593Smuzhiyun	lis		r0,0xfff0
165*4882a593Smuzhiyun	mfspr		$vrsave,256
166*4882a593Smuzhiyun	mtspr		256,r0
167*4882a593Smuzhiyun
168*4882a593Smuzhiyun	bl		Lconsts
169*4882a593Smuzhiyun	mtlr		r11
170*4882a593Smuzhiyun
171*4882a593Smuzhiyun	neg		r9,$inp
172*4882a593Smuzhiyun	lvx		$in0,0,$inp
173*4882a593Smuzhiyun	addi		$inp,$inp,15		# 15 is not typo
174*4882a593Smuzhiyun	lvsr		$key,0,r9		# borrow $key
175*4882a593Smuzhiyun	li		r8,0x20
176*4882a593Smuzhiyun	cmpwi		$bits,192
177*4882a593Smuzhiyun	lvx		$in1,0,$inp
178*4882a593Smuzhiyun	le?vspltisb	$mask,0x0f		# borrow $mask
179*4882a593Smuzhiyun	lvx		$rcon,0,$ptr
180*4882a593Smuzhiyun	le?vxor		$key,$key,$mask		# adjust for byte swap
181*4882a593Smuzhiyun	lvx		$mask,r8,$ptr
182*4882a593Smuzhiyun	addi		$ptr,$ptr,0x10
183*4882a593Smuzhiyun	vperm		$in0,$in0,$in1,$key	# align [and byte swap in LE]
184*4882a593Smuzhiyun	li		$cnt,8
185*4882a593Smuzhiyun	vxor		$zero,$zero,$zero
186*4882a593Smuzhiyun	mtctr		$cnt
187*4882a593Smuzhiyun
188*4882a593Smuzhiyun	?lvsr		$outperm,0,$out
189*4882a593Smuzhiyun	vspltisb	$outmask,-1
190*4882a593Smuzhiyun	lvx		$outhead,0,$out
191*4882a593Smuzhiyun	?vperm		$outmask,$zero,$outmask,$outperm
192*4882a593Smuzhiyun
193*4882a593Smuzhiyun	blt		Loop128
194*4882a593Smuzhiyun	addi		$inp,$inp,8
195*4882a593Smuzhiyun	beq		L192
196*4882a593Smuzhiyun	addi		$inp,$inp,8
197*4882a593Smuzhiyun	b		L256
198*4882a593Smuzhiyun
199*4882a593Smuzhiyun.align	4
200*4882a593SmuzhiyunLoop128:
201*4882a593Smuzhiyun	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
202*4882a593Smuzhiyun	vsldoi		$tmp,$zero,$in0,12	# >>32
203*4882a593Smuzhiyun	 vperm		$outtail,$in0,$in0,$outperm	# rotate
204*4882a593Smuzhiyun	 vsel		$stage,$outhead,$outtail,$outmask
205*4882a593Smuzhiyun	 vmr		$outhead,$outtail
206*4882a593Smuzhiyun	vcipherlast	$key,$key,$rcon
207*4882a593Smuzhiyun	 stvx		$stage,0,$out
208*4882a593Smuzhiyun	 addi		$out,$out,16
209*4882a593Smuzhiyun
210*4882a593Smuzhiyun	vxor		$in0,$in0,$tmp
211*4882a593Smuzhiyun	vsldoi		$tmp,$zero,$tmp,12	# >>32
212*4882a593Smuzhiyun	vxor		$in0,$in0,$tmp
213*4882a593Smuzhiyun	vsldoi		$tmp,$zero,$tmp,12	# >>32
214*4882a593Smuzhiyun	vxor		$in0,$in0,$tmp
215*4882a593Smuzhiyun	 vadduwm	$rcon,$rcon,$rcon
216*4882a593Smuzhiyun	vxor		$in0,$in0,$key
217*4882a593Smuzhiyun	bdnz		Loop128
218*4882a593Smuzhiyun
219*4882a593Smuzhiyun	lvx		$rcon,0,$ptr		# last two round keys
220*4882a593Smuzhiyun
221*4882a593Smuzhiyun	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
222*4882a593Smuzhiyun	vsldoi		$tmp,$zero,$in0,12	# >>32
223*4882a593Smuzhiyun	 vperm		$outtail,$in0,$in0,$outperm	# rotate
224*4882a593Smuzhiyun	 vsel		$stage,$outhead,$outtail,$outmask
225*4882a593Smuzhiyun	 vmr		$outhead,$outtail
226*4882a593Smuzhiyun	vcipherlast	$key,$key,$rcon
227*4882a593Smuzhiyun	 stvx		$stage,0,$out
228*4882a593Smuzhiyun	 addi		$out,$out,16
229*4882a593Smuzhiyun
230*4882a593Smuzhiyun	vxor		$in0,$in0,$tmp
231*4882a593Smuzhiyun	vsldoi		$tmp,$zero,$tmp,12	# >>32
232*4882a593Smuzhiyun	vxor		$in0,$in0,$tmp
233*4882a593Smuzhiyun	vsldoi		$tmp,$zero,$tmp,12	# >>32
234*4882a593Smuzhiyun	vxor		$in0,$in0,$tmp
235*4882a593Smuzhiyun	 vadduwm	$rcon,$rcon,$rcon
236*4882a593Smuzhiyun	vxor		$in0,$in0,$key
237*4882a593Smuzhiyun
238*4882a593Smuzhiyun	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
239*4882a593Smuzhiyun	vsldoi		$tmp,$zero,$in0,12	# >>32
240*4882a593Smuzhiyun	 vperm		$outtail,$in0,$in0,$outperm	# rotate
241*4882a593Smuzhiyun	 vsel		$stage,$outhead,$outtail,$outmask
242*4882a593Smuzhiyun	 vmr		$outhead,$outtail
243*4882a593Smuzhiyun	vcipherlast	$key,$key,$rcon
244*4882a593Smuzhiyun	 stvx		$stage,0,$out
245*4882a593Smuzhiyun	 addi		$out,$out,16
246*4882a593Smuzhiyun
247*4882a593Smuzhiyun	vxor		$in0,$in0,$tmp
248*4882a593Smuzhiyun	vsldoi		$tmp,$zero,$tmp,12	# >>32
249*4882a593Smuzhiyun	vxor		$in0,$in0,$tmp
250*4882a593Smuzhiyun	vsldoi		$tmp,$zero,$tmp,12	# >>32
251*4882a593Smuzhiyun	vxor		$in0,$in0,$tmp
252*4882a593Smuzhiyun	vxor		$in0,$in0,$key
253*4882a593Smuzhiyun	 vperm		$outtail,$in0,$in0,$outperm	# rotate
254*4882a593Smuzhiyun	 vsel		$stage,$outhead,$outtail,$outmask
255*4882a593Smuzhiyun	 vmr		$outhead,$outtail
256*4882a593Smuzhiyun	 stvx		$stage,0,$out
257*4882a593Smuzhiyun
258*4882a593Smuzhiyun	addi		$inp,$out,15		# 15 is not typo
259*4882a593Smuzhiyun	addi		$out,$out,0x50
260*4882a593Smuzhiyun
261*4882a593Smuzhiyun	li		$rounds,10
262*4882a593Smuzhiyun	b		Ldone
263*4882a593Smuzhiyun
264*4882a593Smuzhiyun.align	4
265*4882a593SmuzhiyunL192:
266*4882a593Smuzhiyun	lvx		$tmp,0,$inp
267*4882a593Smuzhiyun	li		$cnt,4
268*4882a593Smuzhiyun	 vperm		$outtail,$in0,$in0,$outperm	# rotate
269*4882a593Smuzhiyun	 vsel		$stage,$outhead,$outtail,$outmask
270*4882a593Smuzhiyun	 vmr		$outhead,$outtail
271*4882a593Smuzhiyun	 stvx		$stage,0,$out
272*4882a593Smuzhiyun	 addi		$out,$out,16
273*4882a593Smuzhiyun	vperm		$in1,$in1,$tmp,$key	# align [and byte swap in LE]
274*4882a593Smuzhiyun	vspltisb	$key,8			# borrow $key
275*4882a593Smuzhiyun	mtctr		$cnt
276*4882a593Smuzhiyun	vsububm		$mask,$mask,$key	# adjust the mask
277*4882a593Smuzhiyun
278*4882a593SmuzhiyunLoop192:
279*4882a593Smuzhiyun	vperm		$key,$in1,$in1,$mask	# roate-n-splat
280*4882a593Smuzhiyun	vsldoi		$tmp,$zero,$in0,12	# >>32
281*4882a593Smuzhiyun	vcipherlast	$key,$key,$rcon
282*4882a593Smuzhiyun
283*4882a593Smuzhiyun	vxor		$in0,$in0,$tmp
284*4882a593Smuzhiyun	vsldoi		$tmp,$zero,$tmp,12	# >>32
285*4882a593Smuzhiyun	vxor		$in0,$in0,$tmp
286*4882a593Smuzhiyun	vsldoi		$tmp,$zero,$tmp,12	# >>32
287*4882a593Smuzhiyun	vxor		$in0,$in0,$tmp
288*4882a593Smuzhiyun
289*4882a593Smuzhiyun	 vsldoi		$stage,$zero,$in1,8
290*4882a593Smuzhiyun	vspltw		$tmp,$in0,3
291*4882a593Smuzhiyun	vxor		$tmp,$tmp,$in1
292*4882a593Smuzhiyun	vsldoi		$in1,$zero,$in1,12	# >>32
293*4882a593Smuzhiyun	 vadduwm	$rcon,$rcon,$rcon
294*4882a593Smuzhiyun	vxor		$in1,$in1,$tmp
295*4882a593Smuzhiyun	vxor		$in0,$in0,$key
296*4882a593Smuzhiyun	vxor		$in1,$in1,$key
297*4882a593Smuzhiyun	 vsldoi		$stage,$stage,$in0,8
298*4882a593Smuzhiyun
299*4882a593Smuzhiyun	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
300*4882a593Smuzhiyun	vsldoi		$tmp,$zero,$in0,12	# >>32
301*4882a593Smuzhiyun	 vperm		$outtail,$stage,$stage,$outperm	# rotate
302*4882a593Smuzhiyun	 vsel		$stage,$outhead,$outtail,$outmask
303*4882a593Smuzhiyun	 vmr		$outhead,$outtail
304*4882a593Smuzhiyun	vcipherlast	$key,$key,$rcon
305*4882a593Smuzhiyun	 stvx		$stage,0,$out
306*4882a593Smuzhiyun	 addi		$out,$out,16
307*4882a593Smuzhiyun
308*4882a593Smuzhiyun	 vsldoi		$stage,$in0,$in1,8
309*4882a593Smuzhiyun	vxor		$in0,$in0,$tmp
310*4882a593Smuzhiyun	vsldoi		$tmp,$zero,$tmp,12	# >>32
311*4882a593Smuzhiyun	 vperm		$outtail,$stage,$stage,$outperm	# rotate
312*4882a593Smuzhiyun	 vsel		$stage,$outhead,$outtail,$outmask
313*4882a593Smuzhiyun	 vmr		$outhead,$outtail
314*4882a593Smuzhiyun	vxor		$in0,$in0,$tmp
315*4882a593Smuzhiyun	vsldoi		$tmp,$zero,$tmp,12	# >>32
316*4882a593Smuzhiyun	vxor		$in0,$in0,$tmp
317*4882a593Smuzhiyun	 stvx		$stage,0,$out
318*4882a593Smuzhiyun	 addi		$out,$out,16
319*4882a593Smuzhiyun
320*4882a593Smuzhiyun	vspltw		$tmp,$in0,3
321*4882a593Smuzhiyun	vxor		$tmp,$tmp,$in1
322*4882a593Smuzhiyun	vsldoi		$in1,$zero,$in1,12	# >>32
323*4882a593Smuzhiyun	 vadduwm	$rcon,$rcon,$rcon
324*4882a593Smuzhiyun	vxor		$in1,$in1,$tmp
325*4882a593Smuzhiyun	vxor		$in0,$in0,$key
326*4882a593Smuzhiyun	vxor		$in1,$in1,$key
327*4882a593Smuzhiyun	 vperm		$outtail,$in0,$in0,$outperm	# rotate
328*4882a593Smuzhiyun	 vsel		$stage,$outhead,$outtail,$outmask
329*4882a593Smuzhiyun	 vmr		$outhead,$outtail
330*4882a593Smuzhiyun	 stvx		$stage,0,$out
331*4882a593Smuzhiyun	 addi		$inp,$out,15		# 15 is not typo
332*4882a593Smuzhiyun	 addi		$out,$out,16
333*4882a593Smuzhiyun	bdnz		Loop192
334*4882a593Smuzhiyun
335*4882a593Smuzhiyun	li		$rounds,12
336*4882a593Smuzhiyun	addi		$out,$out,0x20
337*4882a593Smuzhiyun	b		Ldone
338*4882a593Smuzhiyun
339*4882a593Smuzhiyun.align	4
340*4882a593SmuzhiyunL256:
341*4882a593Smuzhiyun	lvx		$tmp,0,$inp
342*4882a593Smuzhiyun	li		$cnt,7
343*4882a593Smuzhiyun	li		$rounds,14
344*4882a593Smuzhiyun	 vperm		$outtail,$in0,$in0,$outperm	# rotate
345*4882a593Smuzhiyun	 vsel		$stage,$outhead,$outtail,$outmask
346*4882a593Smuzhiyun	 vmr		$outhead,$outtail
347*4882a593Smuzhiyun	 stvx		$stage,0,$out
348*4882a593Smuzhiyun	 addi		$out,$out,16
349*4882a593Smuzhiyun	vperm		$in1,$in1,$tmp,$key	# align [and byte swap in LE]
350*4882a593Smuzhiyun	mtctr		$cnt
351*4882a593Smuzhiyun
352*4882a593SmuzhiyunLoop256:
353*4882a593Smuzhiyun	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
354*4882a593Smuzhiyun	vsldoi		$tmp,$zero,$in0,12	# >>32
355*4882a593Smuzhiyun	 vperm		$outtail,$in1,$in1,$outperm	# rotate
356*4882a593Smuzhiyun	 vsel		$stage,$outhead,$outtail,$outmask
357*4882a593Smuzhiyun	 vmr		$outhead,$outtail
358*4882a593Smuzhiyun	vcipherlast	$key,$key,$rcon
359*4882a593Smuzhiyun	 stvx		$stage,0,$out
360*4882a593Smuzhiyun	 addi		$out,$out,16
361*4882a593Smuzhiyun
362*4882a593Smuzhiyun	vxor		$in0,$in0,$tmp
363*4882a593Smuzhiyun	vsldoi		$tmp,$zero,$tmp,12	# >>32
364*4882a593Smuzhiyun	vxor		$in0,$in0,$tmp
365*4882a593Smuzhiyun	vsldoi		$tmp,$zero,$tmp,12	# >>32
366*4882a593Smuzhiyun	vxor		$in0,$in0,$tmp
367*4882a593Smuzhiyun	 vadduwm	$rcon,$rcon,$rcon
368*4882a593Smuzhiyun	vxor		$in0,$in0,$key
369*4882a593Smuzhiyun	 vperm		$outtail,$in0,$in0,$outperm	# rotate
370*4882a593Smuzhiyun	 vsel		$stage,$outhead,$outtail,$outmask
371*4882a593Smuzhiyun	 vmr		$outhead,$outtail
372*4882a593Smuzhiyun	 stvx		$stage,0,$out
373*4882a593Smuzhiyun	 addi		$inp,$out,15		# 15 is not typo
374*4882a593Smuzhiyun	 addi		$out,$out,16
375*4882a593Smuzhiyun	bdz		Ldone
376*4882a593Smuzhiyun
377*4882a593Smuzhiyun	vspltw		$key,$in0,3		# just splat
378*4882a593Smuzhiyun	vsldoi		$tmp,$zero,$in1,12	# >>32
379*4882a593Smuzhiyun	vsbox		$key,$key
380*4882a593Smuzhiyun
381*4882a593Smuzhiyun	vxor		$in1,$in1,$tmp
382*4882a593Smuzhiyun	vsldoi		$tmp,$zero,$tmp,12	# >>32
383*4882a593Smuzhiyun	vxor		$in1,$in1,$tmp
384*4882a593Smuzhiyun	vsldoi		$tmp,$zero,$tmp,12	# >>32
385*4882a593Smuzhiyun	vxor		$in1,$in1,$tmp
386*4882a593Smuzhiyun
387*4882a593Smuzhiyun	vxor		$in1,$in1,$key
388*4882a593Smuzhiyun	b		Loop256
389*4882a593Smuzhiyun
390*4882a593Smuzhiyun.align	4
391*4882a593SmuzhiyunLdone:
392*4882a593Smuzhiyun	lvx		$in1,0,$inp		# redundant in aligned case
393*4882a593Smuzhiyun	vsel		$in1,$outhead,$in1,$outmask
394*4882a593Smuzhiyun	stvx		$in1,0,$inp
395*4882a593Smuzhiyun	li		$ptr,0
396*4882a593Smuzhiyun	mtspr		256,$vrsave
397*4882a593Smuzhiyun	stw		$rounds,0($out)
398*4882a593Smuzhiyun
399*4882a593SmuzhiyunLenc_key_abort:
400*4882a593Smuzhiyun	mr		r3,$ptr
401*4882a593Smuzhiyun	blr
402*4882a593Smuzhiyun	.long		0
403*4882a593Smuzhiyun	.byte		0,12,0x14,1,0,0,3,0
404*4882a593Smuzhiyun	.long		0
405*4882a593Smuzhiyun.size	.${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key
406*4882a593Smuzhiyun
407*4882a593Smuzhiyun.globl	.${prefix}_set_decrypt_key
408*4882a593Smuzhiyun	$STU		$sp,-$FRAME($sp)
409*4882a593Smuzhiyun	mflr		r10
410*4882a593Smuzhiyun	$PUSH		r10,$FRAME+$LRSAVE($sp)
411*4882a593Smuzhiyun	bl		Lset_encrypt_key
412*4882a593Smuzhiyun	mtlr		r10
413*4882a593Smuzhiyun
414*4882a593Smuzhiyun	cmpwi		r3,0
415*4882a593Smuzhiyun	bne-		Ldec_key_abort
416*4882a593Smuzhiyun
417*4882a593Smuzhiyun	slwi		$cnt,$rounds,4
418*4882a593Smuzhiyun	subi		$inp,$out,240		# first round key
419*4882a593Smuzhiyun	srwi		$rounds,$rounds,1
420*4882a593Smuzhiyun	add		$out,$inp,$cnt		# last round key
421*4882a593Smuzhiyun	mtctr		$rounds
422*4882a593Smuzhiyun
423*4882a593SmuzhiyunLdeckey:
424*4882a593Smuzhiyun	lwz		r0, 0($inp)
425*4882a593Smuzhiyun	lwz		r6, 4($inp)
426*4882a593Smuzhiyun	lwz		r7, 8($inp)
427*4882a593Smuzhiyun	lwz		r8, 12($inp)
428*4882a593Smuzhiyun	addi		$inp,$inp,16
429*4882a593Smuzhiyun	lwz		r9, 0($out)
430*4882a593Smuzhiyun	lwz		r10,4($out)
431*4882a593Smuzhiyun	lwz		r11,8($out)
432*4882a593Smuzhiyun	lwz		r12,12($out)
433*4882a593Smuzhiyun	stw		r0, 0($out)
434*4882a593Smuzhiyun	stw		r6, 4($out)
435*4882a593Smuzhiyun	stw		r7, 8($out)
436*4882a593Smuzhiyun	stw		r8, 12($out)
437*4882a593Smuzhiyun	subi		$out,$out,16
438*4882a593Smuzhiyun	stw		r9, -16($inp)
439*4882a593Smuzhiyun	stw		r10,-12($inp)
440*4882a593Smuzhiyun	stw		r11,-8($inp)
441*4882a593Smuzhiyun	stw		r12,-4($inp)
442*4882a593Smuzhiyun	bdnz		Ldeckey
443*4882a593Smuzhiyun
444*4882a593Smuzhiyun	xor		r3,r3,r3		# return value
445*4882a593SmuzhiyunLdec_key_abort:
446*4882a593Smuzhiyun	addi		$sp,$sp,$FRAME
447*4882a593Smuzhiyun	blr
448*4882a593Smuzhiyun	.long		0
449*4882a593Smuzhiyun	.byte		0,12,4,1,0x80,0,3,0
450*4882a593Smuzhiyun	.long		0
451*4882a593Smuzhiyun.size	.${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key
452*4882a593Smuzhiyun___
453*4882a593Smuzhiyun}}}
454*4882a593Smuzhiyun#########################################################################
{{{	# Single block en- and decrypt procedures			#
# gen_block($dir) appends one single-block routine to $code.
#
#   $dir	"en" emits .${prefix}_encrypt built on vcipher/vcipherlast,
#		"de" emits .${prefix}_decrypt built on vncipher/vncipherlast.
#
# The emitted routine takes inp=r3, out=r4 and key=r5.  It reads the round
# count from byte offset 240 of key, realigns the input block and the round
# keys with permutes, runs the rounds two per loop iteration and merges the
# result into a possibly unaligned out.  SPR 256 is saved in $vrsave on entry
# and restored before returning.
#
# The sub is declared without a prototype: it takes an argument, so the
# former empty "()" prototype was wrong and forced callers to use "&".
sub gen_block {
my ($dir) = @_;
my $n   = $dir eq "de" ? "n" : "";	# selects the v[n]cipher mnemonics
my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7));

$code.=<<___;
.globl	.${prefix}_${dir}crypt
	lwz		$rounds,240($key)
	lis		r0,0xfc00
	mfspr		$vrsave,256
	li		$idx,15			# 15 is not typo
	mtspr		256,r0

	lvx		v0,0,$inp
	neg		r11,$out
	lvx		v1,$idx,$inp
	lvsl		v2,0,$inp		# inpperm
	le?vspltisb	v4,0x0f
	?lvsl		v3,0,r11		# outperm
	le?vxor		v2,v2,v4
	li		$idx,16
	vperm		v0,v0,v1,v2		# align [and byte swap in LE]
	lvx		v1,0,$key
	?lvsl		v5,0,$key		# keyperm
	srwi		$rounds,$rounds,1
	lvx		v2,$idx,$key
	addi		$idx,$idx,16
	subi		$rounds,$rounds,1
	?vperm		v1,v1,v2,v5		# align round key

	vxor		v0,v0,v1
	lvx		v1,$idx,$key
	addi		$idx,$idx,16
	mtctr		$rounds

Loop_${dir}c:
	?vperm		v2,v2,v1,v5
	v${n}cipher	v0,v0,v2
	lvx		v2,$idx,$key
	addi		$idx,$idx,16
	?vperm		v1,v1,v2,v5
	v${n}cipher	v0,v0,v1
	lvx		v1,$idx,$key
	addi		$idx,$idx,16
	bdnz		Loop_${dir}c

	?vperm		v2,v2,v1,v5
	v${n}cipher	v0,v0,v2
	lvx		v2,$idx,$key
	?vperm		v1,v1,v2,v5
	v${n}cipherlast	v0,v0,v1

	vspltisb	v2,-1
	vxor		v1,v1,v1
	li		$idx,15			# 15 is not typo
	?vperm		v2,v1,v2,v3		# outmask
	le?vxor		v3,v3,v4
	lvx		v1,0,$out		# outhead
	vperm		v0,v0,v0,v3		# rotate [and byte swap in LE]
	vsel		v1,v1,v0,v2
	lvx		v4,$idx,$out
	stvx		v1,0,$out
	vsel		v0,v0,v4,v2
	stvx		v0,$idx,$out

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,3,0
	.long		0
.size	.${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt
___
}
gen_block("en");
gen_block("de");
}}}
532*4882a593Smuzhiyun#########################################################################
533*4882a593Smuzhiyun{{{	# CBC en- and decrypt procedures				#
# Appends <prefix>_cbc_encrypt(inp=r3, out=r4, len=r5, key=r6, ivp=r7, enc=r8)
# to $code.
#
#  * Returns immediately when len is below 16.
#  * enc == 0 (32-bit compare) selects decryption, anything else encryption.
#  * The round count is read from byte offset 240 of key.
#  * inp, out, ivp and key may be unaligned: loads are realigned with
#    lvsl/lvsr-generated permutes and stores are merged with neighbouring
#    bytes through $outmask.
#  * Both directions handle one 16-byte block per pass and loop while at
#    least 16 bytes remain; trailing bytes short of a full block are not
#    processed.  Decryption with 128 or more bytes remaining is handed to
#    _aesp8_cbc_decrypt8x.
#  * Lcbc_done flushes the last partial output vector and writes the
#    updated IV back to ivp.
#
# General-purpose registers: six arguments plus $rounds (r9) and $idx (r10).
534*4882a593Smuzhiyunmy ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10));
# Vector registers: round keys, working block and scratch.
535*4882a593Smuzhiyunmy ($rndkey0,$rndkey1,$inout,$tmp)=		map("v$_",(0..3));
# Vector registers: chaining value and unaligned load/store bookkeeping.
536*4882a593Smuzhiyunmy ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)=
537*4882a593Smuzhiyun						map("v$_",(4..10));
538*4882a593Smuzhiyun$code.=<<___;
539*4882a593Smuzhiyun.globl	.${prefix}_cbc_encrypt
540*4882a593Smuzhiyun	${UCMP}i	$len,16
541*4882a593Smuzhiyun	bltlr-
542*4882a593Smuzhiyun
543*4882a593Smuzhiyun	cmpwi		$enc,0			# test direction
544*4882a593Smuzhiyun	lis		r0,0xffe0
545*4882a593Smuzhiyun	mfspr		$vrsave,256
546*4882a593Smuzhiyun	mtspr		256,r0
547*4882a593Smuzhiyun
548*4882a593Smuzhiyun	li		$idx,15
549*4882a593Smuzhiyun	vxor		$rndkey0,$rndkey0,$rndkey0
550*4882a593Smuzhiyun	le?vspltisb	$tmp,0x0f
551*4882a593Smuzhiyun
552*4882a593Smuzhiyun	lvx		$ivec,0,$ivp		# load [unaligned] iv
553*4882a593Smuzhiyun	lvsl		$inpperm,0,$ivp
554*4882a593Smuzhiyun	lvx		$inptail,$idx,$ivp
555*4882a593Smuzhiyun	le?vxor		$inpperm,$inpperm,$tmp
556*4882a593Smuzhiyun	vperm		$ivec,$ivec,$inptail,$inpperm
557*4882a593Smuzhiyun
558*4882a593Smuzhiyun	neg		r11,$inp
559*4882a593Smuzhiyun	?lvsl		$keyperm,0,$key		# prepare for unaligned key
560*4882a593Smuzhiyun	lwz		$rounds,240($key)
561*4882a593Smuzhiyun
562*4882a593Smuzhiyun	lvsr		$inpperm,0,r11		# prepare for unaligned load
563*4882a593Smuzhiyun	lvx		$inptail,0,$inp
564*4882a593Smuzhiyun	addi		$inp,$inp,15		# 15 is not typo
565*4882a593Smuzhiyun	le?vxor		$inpperm,$inpperm,$tmp
566*4882a593Smuzhiyun
567*4882a593Smuzhiyun	?lvsr		$outperm,0,$out		# prepare for unaligned store
568*4882a593Smuzhiyun	vspltisb	$outmask,-1
569*4882a593Smuzhiyun	lvx		$outhead,0,$out
570*4882a593Smuzhiyun	?vperm		$outmask,$rndkey0,$outmask,$outperm
571*4882a593Smuzhiyun	le?vxor		$outperm,$outperm,$tmp
572*4882a593Smuzhiyun
573*4882a593Smuzhiyun	srwi		$rounds,$rounds,1
574*4882a593Smuzhiyun	li		$idx,16
575*4882a593Smuzhiyun	subi		$rounds,$rounds,1
576*4882a593Smuzhiyun	beq		Lcbc_dec
577*4882a593Smuzhiyun
578*4882a593SmuzhiyunLcbc_enc:
579*4882a593Smuzhiyun	vmr		$inout,$inptail
580*4882a593Smuzhiyun	lvx		$inptail,0,$inp
581*4882a593Smuzhiyun	addi		$inp,$inp,16
582*4882a593Smuzhiyun	mtctr		$rounds
583*4882a593Smuzhiyun	subi		$len,$len,16		# len-=16
584*4882a593Smuzhiyun
585*4882a593Smuzhiyun	lvx		$rndkey0,0,$key
586*4882a593Smuzhiyun	 vperm		$inout,$inout,$inptail,$inpperm
587*4882a593Smuzhiyun	lvx		$rndkey1,$idx,$key
588*4882a593Smuzhiyun	addi		$idx,$idx,16
589*4882a593Smuzhiyun	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
590*4882a593Smuzhiyun	vxor		$inout,$inout,$rndkey0
591*4882a593Smuzhiyun	lvx		$rndkey0,$idx,$key
592*4882a593Smuzhiyun	addi		$idx,$idx,16
593*4882a593Smuzhiyun	vxor		$inout,$inout,$ivec
594*4882a593Smuzhiyun
595*4882a593SmuzhiyunLoop_cbc_enc:
596*4882a593Smuzhiyun	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
597*4882a593Smuzhiyun	vcipher		$inout,$inout,$rndkey1
598*4882a593Smuzhiyun	lvx		$rndkey1,$idx,$key
599*4882a593Smuzhiyun	addi		$idx,$idx,16
600*4882a593Smuzhiyun	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
601*4882a593Smuzhiyun	vcipher		$inout,$inout,$rndkey0
602*4882a593Smuzhiyun	lvx		$rndkey0,$idx,$key
603*4882a593Smuzhiyun	addi		$idx,$idx,16
604*4882a593Smuzhiyun	bdnz		Loop_cbc_enc
605*4882a593Smuzhiyun
606*4882a593Smuzhiyun	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
607*4882a593Smuzhiyun	vcipher		$inout,$inout,$rndkey1
608*4882a593Smuzhiyun	lvx		$rndkey1,$idx,$key
609*4882a593Smuzhiyun	li		$idx,16
610*4882a593Smuzhiyun	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
611*4882a593Smuzhiyun	vcipherlast	$ivec,$inout,$rndkey0
612*4882a593Smuzhiyun	${UCMP}i	$len,16
613*4882a593Smuzhiyun
614*4882a593Smuzhiyun	vperm		$tmp,$ivec,$ivec,$outperm
615*4882a593Smuzhiyun	vsel		$inout,$outhead,$tmp,$outmask
616*4882a593Smuzhiyun	vmr		$outhead,$tmp
617*4882a593Smuzhiyun	stvx		$inout,0,$out
618*4882a593Smuzhiyun	addi		$out,$out,16
619*4882a593Smuzhiyun	bge		Lcbc_enc
620*4882a593Smuzhiyun
621*4882a593Smuzhiyun	b		Lcbc_done
622*4882a593Smuzhiyun
623*4882a593Smuzhiyun.align	4
624*4882a593SmuzhiyunLcbc_dec:
625*4882a593Smuzhiyun	${UCMP}i	$len,128
626*4882a593Smuzhiyun	bge		_aesp8_cbc_decrypt8x
627*4882a593Smuzhiyun	vmr		$tmp,$inptail
628*4882a593Smuzhiyun	lvx		$inptail,0,$inp
629*4882a593Smuzhiyun	addi		$inp,$inp,16
630*4882a593Smuzhiyun	mtctr		$rounds
631*4882a593Smuzhiyun	subi		$len,$len,16		# len-=16
632*4882a593Smuzhiyun
633*4882a593Smuzhiyun	lvx		$rndkey0,0,$key
634*4882a593Smuzhiyun	 vperm		$tmp,$tmp,$inptail,$inpperm
635*4882a593Smuzhiyun	lvx		$rndkey1,$idx,$key
636*4882a593Smuzhiyun	addi		$idx,$idx,16
637*4882a593Smuzhiyun	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
638*4882a593Smuzhiyun	vxor		$inout,$tmp,$rndkey0
639*4882a593Smuzhiyun	lvx		$rndkey0,$idx,$key
640*4882a593Smuzhiyun	addi		$idx,$idx,16
641*4882a593Smuzhiyun
642*4882a593SmuzhiyunLoop_cbc_dec:
643*4882a593Smuzhiyun	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
644*4882a593Smuzhiyun	vncipher	$inout,$inout,$rndkey1
645*4882a593Smuzhiyun	lvx		$rndkey1,$idx,$key
646*4882a593Smuzhiyun	addi		$idx,$idx,16
647*4882a593Smuzhiyun	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
648*4882a593Smuzhiyun	vncipher	$inout,$inout,$rndkey0
649*4882a593Smuzhiyun	lvx		$rndkey0,$idx,$key
650*4882a593Smuzhiyun	addi		$idx,$idx,16
651*4882a593Smuzhiyun	bdnz		Loop_cbc_dec
652*4882a593Smuzhiyun
653*4882a593Smuzhiyun	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
654*4882a593Smuzhiyun	vncipher	$inout,$inout,$rndkey1
655*4882a593Smuzhiyun	lvx		$rndkey1,$idx,$key
656*4882a593Smuzhiyun	li		$idx,16
657*4882a593Smuzhiyun	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
658*4882a593Smuzhiyun	vncipherlast	$inout,$inout,$rndkey0
659*4882a593Smuzhiyun	${UCMP}i	$len,16
660*4882a593Smuzhiyun
661*4882a593Smuzhiyun	vxor		$inout,$inout,$ivec
662*4882a593Smuzhiyun	vmr		$ivec,$tmp
663*4882a593Smuzhiyun	vperm		$tmp,$inout,$inout,$outperm
664*4882a593Smuzhiyun	vsel		$inout,$outhead,$tmp,$outmask
665*4882a593Smuzhiyun	vmr		$outhead,$tmp
666*4882a593Smuzhiyun	stvx		$inout,0,$out
667*4882a593Smuzhiyun	addi		$out,$out,16
668*4882a593Smuzhiyun	bge		Lcbc_dec
669*4882a593Smuzhiyun
670*4882a593SmuzhiyunLcbc_done:
671*4882a593Smuzhiyun	addi		$out,$out,-1
672*4882a593Smuzhiyun	lvx		$inout,0,$out		# redundant in aligned case
673*4882a593Smuzhiyun	vsel		$inout,$outhead,$inout,$outmask
674*4882a593Smuzhiyun	stvx		$inout,0,$out
675*4882a593Smuzhiyun
676*4882a593Smuzhiyun	neg		$enc,$ivp		# write [unaligned] iv
677*4882a593Smuzhiyun	li		$idx,15			# 15 is not typo
678*4882a593Smuzhiyun	vxor		$rndkey0,$rndkey0,$rndkey0
679*4882a593Smuzhiyun	vspltisb	$outmask,-1
680*4882a593Smuzhiyun	le?vspltisb	$tmp,0x0f
681*4882a593Smuzhiyun	?lvsl		$outperm,0,$enc
682*4882a593Smuzhiyun	?vperm		$outmask,$rndkey0,$outmask,$outperm
683*4882a593Smuzhiyun	le?vxor		$outperm,$outperm,$tmp
684*4882a593Smuzhiyun	lvx		$outhead,0,$ivp
685*4882a593Smuzhiyun	vperm		$ivec,$ivec,$ivec,$outperm
686*4882a593Smuzhiyun	vsel		$inout,$outhead,$ivec,$outmask
687*4882a593Smuzhiyun	lvx		$inptail,$idx,$ivp
688*4882a593Smuzhiyun	stvx		$inout,0,$ivp
689*4882a593Smuzhiyun	vsel		$inout,$ivec,$inptail,$outmask
690*4882a593Smuzhiyun	stvx		$inout,$idx,$ivp
691*4882a593Smuzhiyun
692*4882a593Smuzhiyun	mtspr		256,$vrsave
693*4882a593Smuzhiyun	blr
694*4882a593Smuzhiyun	.long		0
695*4882a593Smuzhiyun	.byte		0,12,0x14,0,0,0,6,0
696*4882a593Smuzhiyun	.long		0
697*4882a593Smuzhiyun___
698*4882a593Smuzhiyun#########################################################################
# Register assignments for the 8-block CBC decrypt routine generated below.
699*4882a593Smuzhiyun{{	# Optimized CBC decrypt procedure				#
700*4882a593Smuzhiyunmy $key_="r11";	# pointer into the on-stack copy of the round keys
# $x10..$x70 are loaded with the byte offsets 0x10..0x70 in the prologue.
# $x00 is r0: it only ever appears in the base slot of indexed loads and
# stores, while r0 itself holds other values (-1, then the borrow mask).
# NOTE(review): this relies on r0 in that slot reading as zero - confirm
# against the Power ISA.
701*4882a593Smuzhiyunmy ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
# $in0-$in7 receive the loaded input blocks; $out0-$out7 hold the blocks
# while the cipher rounds are applied to them.
702*4882a593Smuzhiyunmy ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13));
703*4882a593Smuzhiyunmy ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21));
704*4882a593Smuzhiyunmy $rndkey0="v23";	# v24-v25 rotating buffer for first found keys
705*4882a593Smuzhiyun			# v26-v31 last 6 round keys
706*4882a593Smuzhiyunmy ($tmp,$keyperm)=($in3,$in4);	# aliases with "caller", redundant assignment
707*4882a593Smuzhiyun
# Prologue of _aesp8_cbc_decrypt8x.  It opens a stack frame of
# $FRAME+21*16+6*$SIZE_T bytes (the epilogue adds the same amount back to
# $sp), stores v20-v31 starting at offset $FRAME+8*16+15, stores the $vrsave
# value and r26-r31 above them, loads the offset registers $x10..$x70 and
# finally writes -1 to SPR 256.
# NOTE(review): $STU and $PUSH are defined outside this chunk; presumably
# store-with-update and a pointer-sized store - confirm.
708*4882a593Smuzhiyun$code.=<<___;
709*4882a593Smuzhiyun.align	5
710*4882a593Smuzhiyun_aesp8_cbc_decrypt8x:
711*4882a593Smuzhiyun	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
712*4882a593Smuzhiyun	li		r10,`$FRAME+8*16+15`	# r10 and r11 alternate over the v20-v31 slots,
713*4882a593Smuzhiyun	li		r11,`$FRAME+8*16+31`	# each stepping by 32 bytes
714*4882a593Smuzhiyun	stvx		v20,r10,$sp		# ABI says so
715*4882a593Smuzhiyun	addi		r10,r10,32
716*4882a593Smuzhiyun	stvx		v21,r11,$sp
717*4882a593Smuzhiyun	addi		r11,r11,32
718*4882a593Smuzhiyun	stvx		v22,r10,$sp
719*4882a593Smuzhiyun	addi		r10,r10,32
720*4882a593Smuzhiyun	stvx		v23,r11,$sp
721*4882a593Smuzhiyun	addi		r11,r11,32
722*4882a593Smuzhiyun	stvx		v24,r10,$sp
723*4882a593Smuzhiyun	addi		r10,r10,32
724*4882a593Smuzhiyun	stvx		v25,r11,$sp
725*4882a593Smuzhiyun	addi		r11,r11,32
726*4882a593Smuzhiyun	stvx		v26,r10,$sp
727*4882a593Smuzhiyun	addi		r10,r10,32
728*4882a593Smuzhiyun	stvx		v27,r11,$sp
729*4882a593Smuzhiyun	addi		r11,r11,32
730*4882a593Smuzhiyun	stvx		v28,r10,$sp
731*4882a593Smuzhiyun	addi		r10,r10,32
732*4882a593Smuzhiyun	stvx		v29,r11,$sp
733*4882a593Smuzhiyun	addi		r11,r11,32
734*4882a593Smuzhiyun	stvx		v30,r10,$sp
735*4882a593Smuzhiyun	stvx		v31,r11,$sp
736*4882a593Smuzhiyun	li		r0,-1
737*4882a593Smuzhiyun	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
738*4882a593Smuzhiyun	li		$x10,0x10
739*4882a593Smuzhiyun	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
740*4882a593Smuzhiyun	li		$x20,0x20
741*4882a593Smuzhiyun	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
742*4882a593Smuzhiyun	li		$x30,0x30
743*4882a593Smuzhiyun	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
744*4882a593Smuzhiyun	li		$x40,0x40
745*4882a593Smuzhiyun	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
746*4882a593Smuzhiyun	li		$x50,0x50
747*4882a593Smuzhiyun	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
748*4882a593Smuzhiyun	li		$x60,0x60
749*4882a593Smuzhiyun	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
750*4882a593Smuzhiyun	li		$x70,0x70
751*4882a593Smuzhiyun	mtspr		256,r0			# r0 is -1 at this point
752*4882a593Smuzhiyun
# Key schedule setup.  Adjacent quadwords read through the key pointer are
# merged with the key permutation.  The counted loop stores two merged keys
# per pass into the stack key copy; two more are stored after the loop, the
# following six stay in v26-v31, and v24/v25 are reloaded from the start of
# the stack copy.
753*4882a593Smuzhiyun	subi		$rounds,$rounds,3	# -4 in total
754*4882a593Smuzhiyun	subi		$len,$len,128		# bias
755*4882a593Smuzhiyun
756*4882a593Smuzhiyun	lvx		$rndkey0,$x00,$key	# load key schedule
757*4882a593Smuzhiyun	lvx		v30,$x10,$key
758*4882a593Smuzhiyun	addi		$key,$key,0x20
759*4882a593Smuzhiyun	lvx		v31,$x00,$key
760*4882a593Smuzhiyun	?vperm		$rndkey0,$rndkey0,v30,$keyperm
761*4882a593Smuzhiyun	addi		$key_,$sp,$FRAME+15	# start of the stack key copy
762*4882a593Smuzhiyun	mtctr		$rounds
763*4882a593Smuzhiyun
764*4882a593SmuzhiyunLoad_cbc_dec_key:
765*4882a593Smuzhiyun	?vperm		v24,v30,v31,$keyperm
766*4882a593Smuzhiyun	lvx		v30,$x10,$key
767*4882a593Smuzhiyun	addi		$key,$key,0x20
768*4882a593Smuzhiyun	stvx		v24,$x00,$key_		# off-load round[1]
769*4882a593Smuzhiyun	?vperm		v25,v31,v30,$keyperm
770*4882a593Smuzhiyun	lvx		v31,$x00,$key
771*4882a593Smuzhiyun	stvx		v25,$x10,$key_		# off-load round[2]
772*4882a593Smuzhiyun	addi		$key_,$key_,0x20
773*4882a593Smuzhiyun	bdnz		Load_cbc_dec_key
774*4882a593Smuzhiyun
775*4882a593Smuzhiyun	lvx		v26,$x10,$key
776*4882a593Smuzhiyun	?vperm		v24,v30,v31,$keyperm
777*4882a593Smuzhiyun	lvx		v27,$x20,$key
778*4882a593Smuzhiyun	stvx		v24,$x00,$key_		# off-load round[3]
779*4882a593Smuzhiyun	?vperm		v25,v31,v26,$keyperm
780*4882a593Smuzhiyun	lvx		v28,$x30,$key
781*4882a593Smuzhiyun	stvx		v25,$x10,$key_		# off-load round[4]
782*4882a593Smuzhiyun	addi		$key_,$sp,$FRAME+15	# rewind $key_
783*4882a593Smuzhiyun	?vperm		v26,v26,v27,$keyperm
784*4882a593Smuzhiyun	lvx		v29,$x40,$key
785*4882a593Smuzhiyun	?vperm		v27,v27,v28,$keyperm
786*4882a593Smuzhiyun	lvx		v30,$x50,$key
787*4882a593Smuzhiyun	?vperm		v28,v28,v29,$keyperm
788*4882a593Smuzhiyun	lvx		v31,$x60,$key
789*4882a593Smuzhiyun	?vperm		v29,v29,v30,$keyperm
790*4882a593Smuzhiyun	lvx		$out0,$x70,$key		# borrow $out0
791*4882a593Smuzhiyun	?vperm		v30,v30,v31,$keyperm
792*4882a593Smuzhiyun	lvx		v24,$x00,$key_		# pre-load round[1]
793*4882a593Smuzhiyun	?vperm		v31,v31,$out0,$keyperm
794*4882a593Smuzhiyun	lvx		v25,$x10,$key_		# pre-load round[2]
795*4882a593Smuzhiyun
796*4882a593Smuzhiyun	#lvx		$inptail,0,$inp		# "caller" already did this
797*4882a593Smuzhiyun	#addi		$inp,$inp,15		# 15 is not typo
798*4882a593Smuzhiyun	subi		$inp,$inp,15		# undo "caller"
799*4882a593Smuzhiyun
# Load the first eight input blocks and advance the input pointer by 0x80.
# The le-prefixed instructions (presumably emitted only for little-endian
# builds) construct a fix-up permutation and apply it to every loaded block.
# Each block is then xored with the first key of the schedule to start the
# decryption, and the round counter is reloaded before entering the loop.
800*4882a593Smuzhiyun	 le?li		$idx,8
801*4882a593Smuzhiyun	lvx_u		$in0,$x00,$inp		# load first 8 "words"
802*4882a593Smuzhiyun	 le?lvsl	$inpperm,0,$idx
803*4882a593Smuzhiyun	 le?vspltisb	$tmp,0x0f
804*4882a593Smuzhiyun	lvx_u		$in1,$x10,$inp
805*4882a593Smuzhiyun	 le?vxor	$inpperm,$inpperm,$tmp	# transform for lvx_u/stvx_u
806*4882a593Smuzhiyun	lvx_u		$in2,$x20,$inp
807*4882a593Smuzhiyun	 le?vperm	$in0,$in0,$in0,$inpperm
808*4882a593Smuzhiyun	lvx_u		$in3,$x30,$inp
809*4882a593Smuzhiyun	 le?vperm	$in1,$in1,$in1,$inpperm
810*4882a593Smuzhiyun	lvx_u		$in4,$x40,$inp
811*4882a593Smuzhiyun	 le?vperm	$in2,$in2,$in2,$inpperm
812*4882a593Smuzhiyun	vxor		$out0,$in0,$rndkey0
813*4882a593Smuzhiyun	lvx_u		$in5,$x50,$inp
814*4882a593Smuzhiyun	 le?vperm	$in3,$in3,$in3,$inpperm
815*4882a593Smuzhiyun	vxor		$out1,$in1,$rndkey0
816*4882a593Smuzhiyun	lvx_u		$in6,$x60,$inp
817*4882a593Smuzhiyun	 le?vperm	$in4,$in4,$in4,$inpperm
818*4882a593Smuzhiyun	vxor		$out2,$in2,$rndkey0
819*4882a593Smuzhiyun	lvx_u		$in7,$x70,$inp
820*4882a593Smuzhiyun	addi		$inp,$inp,0x80
821*4882a593Smuzhiyun	 le?vperm	$in5,$in5,$in5,$inpperm
822*4882a593Smuzhiyun	vxor		$out3,$in3,$rndkey0
823*4882a593Smuzhiyun	 le?vperm	$in6,$in6,$in6,$inpperm
824*4882a593Smuzhiyun	vxor		$out4,$in4,$rndkey0
825*4882a593Smuzhiyun	 le?vperm	$in7,$in7,$in7,$inpperm
826*4882a593Smuzhiyun	vxor		$out5,$in5,$rndkey0
827*4882a593Smuzhiyun	vxor		$out6,$in6,$rndkey0
828*4882a593Smuzhiyun	vxor		$out7,$in7,$rndkey0
829*4882a593Smuzhiyun
830*4882a593Smuzhiyun	mtctr		$rounds
831*4882a593Smuzhiyun	b		Loop_cbc_dec8x
# Main loop, eight blocks per pass.  The counted part applies two rounds per
# iteration with v24 and v25 and refills both from the stack key copy.  The
# straight-line part that follows applies the remaining rounds with v24-v30,
# interleaved with the byte-count bookkeeping, and then issues the last
# round, stores eight output blocks and starts the next eight.
832*4882a593Smuzhiyun.align	5
833*4882a593SmuzhiyunLoop_cbc_dec8x:
834*4882a593Smuzhiyun	vncipher	$out0,$out0,v24
835*4882a593Smuzhiyun	vncipher	$out1,$out1,v24
836*4882a593Smuzhiyun	vncipher	$out2,$out2,v24
837*4882a593Smuzhiyun	vncipher	$out3,$out3,v24
838*4882a593Smuzhiyun	vncipher	$out4,$out4,v24
839*4882a593Smuzhiyun	vncipher	$out5,$out5,v24
840*4882a593Smuzhiyun	vncipher	$out6,$out6,v24
841*4882a593Smuzhiyun	vncipher	$out7,$out7,v24
842*4882a593Smuzhiyun	lvx		v24,$x20,$key_		# round[3]
843*4882a593Smuzhiyun	addi		$key_,$key_,0x20
844*4882a593Smuzhiyun
845*4882a593Smuzhiyun	vncipher	$out0,$out0,v25
846*4882a593Smuzhiyun	vncipher	$out1,$out1,v25
847*4882a593Smuzhiyun	vncipher	$out2,$out2,v25
848*4882a593Smuzhiyun	vncipher	$out3,$out3,v25
849*4882a593Smuzhiyun	vncipher	$out4,$out4,v25
850*4882a593Smuzhiyun	vncipher	$out5,$out5,v25
851*4882a593Smuzhiyun	vncipher	$out6,$out6,v25
852*4882a593Smuzhiyun	vncipher	$out7,$out7,v25
853*4882a593Smuzhiyun	lvx		v25,$x10,$key_		# round[4]
854*4882a593Smuzhiyun	bdnz		Loop_cbc_dec8x
855*4882a593Smuzhiyun
856*4882a593Smuzhiyun	subic		$len,$len,128		# $len-=128
857*4882a593Smuzhiyun	vncipher	$out0,$out0,v24
858*4882a593Smuzhiyun	vncipher	$out1,$out1,v24
859*4882a593Smuzhiyun	vncipher	$out2,$out2,v24
860*4882a593Smuzhiyun	vncipher	$out3,$out3,v24
861*4882a593Smuzhiyun	vncipher	$out4,$out4,v24
862*4882a593Smuzhiyun	vncipher	$out5,$out5,v24
863*4882a593Smuzhiyun	vncipher	$out6,$out6,v24
864*4882a593Smuzhiyun	vncipher	$out7,$out7,v24
865*4882a593Smuzhiyun
866*4882a593Smuzhiyun	subfe.		r0,r0,r0		# borrow?-1:0
867*4882a593Smuzhiyun	vncipher	$out0,$out0,v25
868*4882a593Smuzhiyun	vncipher	$out1,$out1,v25
869*4882a593Smuzhiyun	vncipher	$out2,$out2,v25
870*4882a593Smuzhiyun	vncipher	$out3,$out3,v25
871*4882a593Smuzhiyun	vncipher	$out4,$out4,v25
872*4882a593Smuzhiyun	vncipher	$out5,$out5,v25
873*4882a593Smuzhiyun	vncipher	$out6,$out6,v25
874*4882a593Smuzhiyun	vncipher	$out7,$out7,v25
875*4882a593Smuzhiyun
876*4882a593Smuzhiyun	and		r0,r0,$len
877*4882a593Smuzhiyun	vncipher	$out0,$out0,v26
878*4882a593Smuzhiyun	vncipher	$out1,$out1,v26
879*4882a593Smuzhiyun	vncipher	$out2,$out2,v26
880*4882a593Smuzhiyun	vncipher	$out3,$out3,v26
881*4882a593Smuzhiyun	vncipher	$out4,$out4,v26
882*4882a593Smuzhiyun	vncipher	$out5,$out5,v26
883*4882a593Smuzhiyun	vncipher	$out6,$out6,v26
884*4882a593Smuzhiyun	vncipher	$out7,$out7,v26
885*4882a593Smuzhiyun
886*4882a593Smuzhiyun	add		$inp,$inp,r0		# $inp is adjusted in such
887*4882a593Smuzhiyun						# way that at exit from the
888*4882a593Smuzhiyun						# loop inX-in7 are loaded
889*4882a593Smuzhiyun						# with last "words"
890*4882a593Smuzhiyun	vncipher	$out0,$out0,v27
891*4882a593Smuzhiyun	vncipher	$out1,$out1,v27
892*4882a593Smuzhiyun	vncipher	$out2,$out2,v27
893*4882a593Smuzhiyun	vncipher	$out3,$out3,v27
894*4882a593Smuzhiyun	vncipher	$out4,$out4,v27
895*4882a593Smuzhiyun	vncipher	$out5,$out5,v27
896*4882a593Smuzhiyun	vncipher	$out6,$out6,v27
897*4882a593Smuzhiyun	vncipher	$out7,$out7,v27
898*4882a593Smuzhiyun
899*4882a593Smuzhiyun	addi		$key_,$sp,$FRAME+15	# rewind $key_
900*4882a593Smuzhiyun	vncipher	$out0,$out0,v28
901*4882a593Smuzhiyun	vncipher	$out1,$out1,v28
902*4882a593Smuzhiyun	vncipher	$out2,$out2,v28
903*4882a593Smuzhiyun	vncipher	$out3,$out3,v28
904*4882a593Smuzhiyun	vncipher	$out4,$out4,v28
905*4882a593Smuzhiyun	vncipher	$out5,$out5,v28
906*4882a593Smuzhiyun	vncipher	$out6,$out6,v28
907*4882a593Smuzhiyun	vncipher	$out7,$out7,v28
908*4882a593Smuzhiyun	lvx		v24,$x00,$key_		# re-pre-load round[1]
909*4882a593Smuzhiyun
910*4882a593Smuzhiyun	vncipher	$out0,$out0,v29
911*4882a593Smuzhiyun	vncipher	$out1,$out1,v29
912*4882a593Smuzhiyun	vncipher	$out2,$out2,v29
913*4882a593Smuzhiyun	vncipher	$out3,$out3,v29
914*4882a593Smuzhiyun	vncipher	$out4,$out4,v29
915*4882a593Smuzhiyun	vncipher	$out5,$out5,v29
916*4882a593Smuzhiyun	vncipher	$out6,$out6,v29
917*4882a593Smuzhiyun	vncipher	$out7,$out7,v29
918*4882a593Smuzhiyun	lvx		v25,$x10,$key_		# re-pre-load round[2]
919*4882a593Smuzhiyun
# The IV and the input blocks in0-in6 are xored with the last round key
# (v31) here, and each result is the key operand of the final-round
# instruction for the following block.  NOTE(review): this presumably folds
# the last key addition and the CBC chaining xor into one instruction -
# confirm against the ISA description of the final-round instruction.
920*4882a593Smuzhiyun	vncipher	$out0,$out0,v30
921*4882a593Smuzhiyun	 vxor		$ivec,$ivec,v31		# xor with last round key
922*4882a593Smuzhiyun	vncipher	$out1,$out1,v30
923*4882a593Smuzhiyun	 vxor		$in0,$in0,v31
924*4882a593Smuzhiyun	vncipher	$out2,$out2,v30
925*4882a593Smuzhiyun	 vxor		$in1,$in1,v31
926*4882a593Smuzhiyun	vncipher	$out3,$out3,v30
927*4882a593Smuzhiyun	 vxor		$in2,$in2,v31
928*4882a593Smuzhiyun	vncipher	$out4,$out4,v30
929*4882a593Smuzhiyun	 vxor		$in3,$in3,v31
930*4882a593Smuzhiyun	vncipher	$out5,$out5,v30
931*4882a593Smuzhiyun	 vxor		$in4,$in4,v31
932*4882a593Smuzhiyun	vncipher	$out6,$out6,v30
933*4882a593Smuzhiyun	 vxor		$in5,$in5,v31
934*4882a593Smuzhiyun	vncipher	$out7,$out7,v30
935*4882a593Smuzhiyun	 vxor		$in6,$in6,v31
936*4882a593Smuzhiyun
937*4882a593Smuzhiyun	vncipherlast	$out0,$out0,$ivec
938*4882a593Smuzhiyun	vncipherlast	$out1,$out1,$in0
939*4882a593Smuzhiyun	 lvx_u		$in0,$x00,$inp		# load next input block
940*4882a593Smuzhiyun	vncipherlast	$out2,$out2,$in1
941*4882a593Smuzhiyun	 lvx_u		$in1,$x10,$inp
942*4882a593Smuzhiyun	vncipherlast	$out3,$out3,$in2
943*4882a593Smuzhiyun	 le?vperm	$in0,$in0,$in0,$inpperm
944*4882a593Smuzhiyun	 lvx_u		$in2,$x20,$inp
945*4882a593Smuzhiyun	vncipherlast	$out4,$out4,$in3
946*4882a593Smuzhiyun	 le?vperm	$in1,$in1,$in1,$inpperm
947*4882a593Smuzhiyun	 lvx_u		$in3,$x30,$inp
948*4882a593Smuzhiyun	vncipherlast	$out5,$out5,$in4
949*4882a593Smuzhiyun	 le?vperm	$in2,$in2,$in2,$inpperm
950*4882a593Smuzhiyun	 lvx_u		$in4,$x40,$inp
951*4882a593Smuzhiyun	vncipherlast	$out6,$out6,$in5
952*4882a593Smuzhiyun	 le?vperm	$in3,$in3,$in3,$inpperm
953*4882a593Smuzhiyun	 lvx_u		$in5,$x50,$inp
954*4882a593Smuzhiyun	vncipherlast	$out7,$out7,$in6
955*4882a593Smuzhiyun	 le?vperm	$in4,$in4,$in4,$inpperm
956*4882a593Smuzhiyun	 lvx_u		$in6,$x60,$inp
957*4882a593Smuzhiyun	vmr		$ivec,$in7		# last input block becomes the next IV
958*4882a593Smuzhiyun	 le?vperm	$in5,$in5,$in5,$inpperm
959*4882a593Smuzhiyun	 lvx_u		$in7,$x70,$inp
960*4882a593Smuzhiyun	 addi		$inp,$inp,0x80
961*4882a593Smuzhiyun
962*4882a593Smuzhiyun	le?vperm	$out0,$out0,$out0,$inpperm
963*4882a593Smuzhiyun	le?vperm	$out1,$out1,$out1,$inpperm
964*4882a593Smuzhiyun	stvx_u		$out0,$x00,$out
965*4882a593Smuzhiyun	 le?vperm	$in6,$in6,$in6,$inpperm
966*4882a593Smuzhiyun	 vxor		$out0,$in0,$rndkey0
967*4882a593Smuzhiyun	le?vperm	$out2,$out2,$out2,$inpperm
968*4882a593Smuzhiyun	stvx_u		$out1,$x10,$out
969*4882a593Smuzhiyun	 le?vperm	$in7,$in7,$in7,$inpperm
970*4882a593Smuzhiyun	 vxor		$out1,$in1,$rndkey0
971*4882a593Smuzhiyun	le?vperm	$out3,$out3,$out3,$inpperm
972*4882a593Smuzhiyun	stvx_u		$out2,$x20,$out
973*4882a593Smuzhiyun	 vxor		$out2,$in2,$rndkey0
974*4882a593Smuzhiyun	le?vperm	$out4,$out4,$out4,$inpperm
975*4882a593Smuzhiyun	stvx_u		$out3,$x30,$out
976*4882a593Smuzhiyun	 vxor		$out3,$in3,$rndkey0
977*4882a593Smuzhiyun	le?vperm	$out5,$out5,$out5,$inpperm
978*4882a593Smuzhiyun	stvx_u		$out4,$x40,$out
979*4882a593Smuzhiyun	 vxor		$out4,$in4,$rndkey0
980*4882a593Smuzhiyun	le?vperm	$out6,$out6,$out6,$inpperm
981*4882a593Smuzhiyun	stvx_u		$out5,$x50,$out
982*4882a593Smuzhiyun	 vxor		$out5,$in5,$rndkey0
983*4882a593Smuzhiyun	le?vperm	$out7,$out7,$out7,$inpperm
984*4882a593Smuzhiyun	stvx_u		$out6,$x60,$out
985*4882a593Smuzhiyun	 vxor		$out6,$in6,$rndkey0
986*4882a593Smuzhiyun	stvx_u		$out7,$x70,$out
987*4882a593Smuzhiyun	addi		$out,$out,0x80
988*4882a593Smuzhiyun	 vxor		$out7,$in7,$rndkey0
989*4882a593Smuzhiyun
# The branch below tests the condition left by the recorded
# subtract-extended earlier in this pass; no record-form instruction is
# issued in between, so r0 equal to zero (no borrow) means another full pass.
990*4882a593Smuzhiyun	mtctr		$rounds
991*4882a593Smuzhiyun	beq		Loop_cbc_dec8x		# did $len-=128 borrow?
992*4882a593Smuzhiyun
# Fewer than eight blocks are left.  Adding 128 back to the byte count
# removes the bias applied before the main loop; a zero result means nothing
# remains.  Otherwise only out1-out7 are run through the rounds, because the
# pending blocks occupy the highest-numbered registers.
993*4882a593Smuzhiyun	addic.		$len,$len,128
994*4882a593Smuzhiyun	beq		Lcbc_dec8x_done
995*4882a593Smuzhiyun	nop
996*4882a593Smuzhiyun	nop
997*4882a593Smuzhiyun
998*4882a593SmuzhiyunLoop_cbc_dec8x_tail:				# up to 7 "words" tail...
999*4882a593Smuzhiyun	vncipher	$out1,$out1,v24
1000*4882a593Smuzhiyun	vncipher	$out2,$out2,v24
1001*4882a593Smuzhiyun	vncipher	$out3,$out3,v24
1002*4882a593Smuzhiyun	vncipher	$out4,$out4,v24
1003*4882a593Smuzhiyun	vncipher	$out5,$out5,v24
1004*4882a593Smuzhiyun	vncipher	$out6,$out6,v24
1005*4882a593Smuzhiyun	vncipher	$out7,$out7,v24
1006*4882a593Smuzhiyun	lvx		v24,$x20,$key_		# round[3]
1007*4882a593Smuzhiyun	addi		$key_,$key_,0x20
1008*4882a593Smuzhiyun
1009*4882a593Smuzhiyun	vncipher	$out1,$out1,v25
1010*4882a593Smuzhiyun	vncipher	$out2,$out2,v25
1011*4882a593Smuzhiyun	vncipher	$out3,$out3,v25
1012*4882a593Smuzhiyun	vncipher	$out4,$out4,v25
1013*4882a593Smuzhiyun	vncipher	$out5,$out5,v25
1014*4882a593Smuzhiyun	vncipher	$out6,$out6,v25
1015*4882a593Smuzhiyun	vncipher	$out7,$out7,v25
1016*4882a593Smuzhiyun	lvx		v25,$x10,$key_		# round[4]
1017*4882a593Smuzhiyun	bdnz		Loop_cbc_dec8x_tail
1018*4882a593Smuzhiyun
1019*4882a593Smuzhiyun	vncipher	$out1,$out1,v24
1020*4882a593Smuzhiyun	vncipher	$out2,$out2,v24
1021*4882a593Smuzhiyun	vncipher	$out3,$out3,v24
1022*4882a593Smuzhiyun	vncipher	$out4,$out4,v24
1023*4882a593Smuzhiyun	vncipher	$out5,$out5,v24
1024*4882a593Smuzhiyun	vncipher	$out6,$out6,v24
1025*4882a593Smuzhiyun	vncipher	$out7,$out7,v24
1026*4882a593Smuzhiyun
1027*4882a593Smuzhiyun	vncipher	$out1,$out1,v25
1028*4882a593Smuzhiyun	vncipher	$out2,$out2,v25
1029*4882a593Smuzhiyun	vncipher	$out3,$out3,v25
1030*4882a593Smuzhiyun	vncipher	$out4,$out4,v25
1031*4882a593Smuzhiyun	vncipher	$out5,$out5,v25
1032*4882a593Smuzhiyun	vncipher	$out6,$out6,v25
1033*4882a593Smuzhiyun	vncipher	$out7,$out7,v25
1034*4882a593Smuzhiyun
1035*4882a593Smuzhiyun	vncipher	$out1,$out1,v26
1036*4882a593Smuzhiyun	vncipher	$out2,$out2,v26
1037*4882a593Smuzhiyun	vncipher	$out3,$out3,v26
1038*4882a593Smuzhiyun	vncipher	$out4,$out4,v26
1039*4882a593Smuzhiyun	vncipher	$out5,$out5,v26
1040*4882a593Smuzhiyun	vncipher	$out6,$out6,v26
1041*4882a593Smuzhiyun	vncipher	$out7,$out7,v26
1042*4882a593Smuzhiyun
1043*4882a593Smuzhiyun	vncipher	$out1,$out1,v27
1044*4882a593Smuzhiyun	vncipher	$out2,$out2,v27
1045*4882a593Smuzhiyun	vncipher	$out3,$out3,v27
1046*4882a593Smuzhiyun	vncipher	$out4,$out4,v27
1047*4882a593Smuzhiyun	vncipher	$out5,$out5,v27
1048*4882a593Smuzhiyun	vncipher	$out6,$out6,v27
1049*4882a593Smuzhiyun	vncipher	$out7,$out7,v27
1050*4882a593Smuzhiyun
1051*4882a593Smuzhiyun	vncipher	$out1,$out1,v28
1052*4882a593Smuzhiyun	vncipher	$out2,$out2,v28
1053*4882a593Smuzhiyun	vncipher	$out3,$out3,v28
1054*4882a593Smuzhiyun	vncipher	$out4,$out4,v28
1055*4882a593Smuzhiyun	vncipher	$out5,$out5,v28
1056*4882a593Smuzhiyun	vncipher	$out6,$out6,v28
1057*4882a593Smuzhiyun	vncipher	$out7,$out7,v28
1058*4882a593Smuzhiyun
1059*4882a593Smuzhiyun	vncipher	$out1,$out1,v29
1060*4882a593Smuzhiyun	vncipher	$out2,$out2,v29
1061*4882a593Smuzhiyun	vncipher	$out3,$out3,v29
1062*4882a593Smuzhiyun	vncipher	$out4,$out4,v29
1063*4882a593Smuzhiyun	vncipher	$out5,$out5,v29
1064*4882a593Smuzhiyun	vncipher	$out6,$out6,v29
1065*4882a593Smuzhiyun	vncipher	$out7,$out7,v29
1066*4882a593Smuzhiyun
1067*4882a593Smuzhiyun	vncipher	$out1,$out1,v30
1068*4882a593Smuzhiyun	 vxor		$ivec,$ivec,v31		# last round key
1069*4882a593Smuzhiyun	vncipher	$out2,$out2,v30
1070*4882a593Smuzhiyun	 vxor		$in1,$in1,v31
1071*4882a593Smuzhiyun	vncipher	$out3,$out3,v30
1072*4882a593Smuzhiyun	 vxor		$in2,$in2,v31
1073*4882a593Smuzhiyun	vncipher	$out4,$out4,v30
1074*4882a593Smuzhiyun	 vxor		$in3,$in3,v31
1075*4882a593Smuzhiyun	vncipher	$out5,$out5,v30
1076*4882a593Smuzhiyun	 vxor		$in4,$in4,v31
1077*4882a593Smuzhiyun	vncipher	$out6,$out6,v30
1078*4882a593Smuzhiyun	 vxor		$in5,$in5,v31
1079*4882a593Smuzhiyun	vncipher	$out7,$out7,v30
1080*4882a593Smuzhiyun	 vxor		$in6,$in6,v31
1081*4882a593Smuzhiyun
# Dispatch on the remaining byte count: below 32 goes to the one-block
# case, exactly 32 to two, below 64 to three, exactly 64 to four, below 96
# to five, exactly 96 to six; anything larger falls through to the code
# that follows this switch.
1082*4882a593Smuzhiyun	cmplwi		$len,32			# switch($len)
1083*4882a593Smuzhiyun	blt		Lcbc_dec8x_one
1084*4882a593Smuzhiyun	nop
1085*4882a593Smuzhiyun	beq		Lcbc_dec8x_two
1086*4882a593Smuzhiyun	cmplwi		$len,64
1087*4882a593Smuzhiyun	blt		Lcbc_dec8x_three
1088*4882a593Smuzhiyun	nop
1089*4882a593Smuzhiyun	beq		Lcbc_dec8x_four
1090*4882a593Smuzhiyun	cmplwi		$len,96
1091*4882a593Smuzhiyun	blt		Lcbc_dec8x_five
1092*4882a593Smuzhiyun	nop
1093*4882a593Smuzhiyun	beq		Lcbc_dec8x_six
1094*4882a593Smuzhiyun
# Tail cases for seven down to one remaining block.  Each case issues the
# final round for its N blocks, which sit in the highest-numbered out
# registers, with the IV as the key operand of the first of them; it then
# copies in7 into the IV register, stores the N blocks and advances the
# output pointer by 16*N.  All but the last case branch to the common exit;
# the one-block case falls through to whatever follows it.
1095*4882a593SmuzhiyunLcbc_dec8x_seven:
1096*4882a593Smuzhiyun	vncipherlast	$out1,$out1,$ivec
1097*4882a593Smuzhiyun	vncipherlast	$out2,$out2,$in1
1098*4882a593Smuzhiyun	vncipherlast	$out3,$out3,$in2
1099*4882a593Smuzhiyun	vncipherlast	$out4,$out4,$in3
1100*4882a593Smuzhiyun	vncipherlast	$out5,$out5,$in4
1101*4882a593Smuzhiyun	vncipherlast	$out6,$out6,$in5
1102*4882a593Smuzhiyun	vncipherlast	$out7,$out7,$in6
1103*4882a593Smuzhiyun	vmr		$ivec,$in7
1104*4882a593Smuzhiyun
1105*4882a593Smuzhiyun	le?vperm	$out1,$out1,$out1,$inpperm
1106*4882a593Smuzhiyun	le?vperm	$out2,$out2,$out2,$inpperm
1107*4882a593Smuzhiyun	stvx_u		$out1,$x00,$out
1108*4882a593Smuzhiyun	le?vperm	$out3,$out3,$out3,$inpperm
1109*4882a593Smuzhiyun	stvx_u		$out2,$x10,$out
1110*4882a593Smuzhiyun	le?vperm	$out4,$out4,$out4,$inpperm
1111*4882a593Smuzhiyun	stvx_u		$out3,$x20,$out
1112*4882a593Smuzhiyun	le?vperm	$out5,$out5,$out5,$inpperm
1113*4882a593Smuzhiyun	stvx_u		$out4,$x30,$out
1114*4882a593Smuzhiyun	le?vperm	$out6,$out6,$out6,$inpperm
1115*4882a593Smuzhiyun	stvx_u		$out5,$x40,$out
1116*4882a593Smuzhiyun	le?vperm	$out7,$out7,$out7,$inpperm
1117*4882a593Smuzhiyun	stvx_u		$out6,$x50,$out
1118*4882a593Smuzhiyun	stvx_u		$out7,$x60,$out
1119*4882a593Smuzhiyun	addi		$out,$out,0x70
1120*4882a593Smuzhiyun	b		Lcbc_dec8x_done
1121*4882a593Smuzhiyun
1122*4882a593Smuzhiyun.align	5
1123*4882a593SmuzhiyunLcbc_dec8x_six:
1124*4882a593Smuzhiyun	vncipherlast	$out2,$out2,$ivec
1125*4882a593Smuzhiyun	vncipherlast	$out3,$out3,$in2
1126*4882a593Smuzhiyun	vncipherlast	$out4,$out4,$in3
1127*4882a593Smuzhiyun	vncipherlast	$out5,$out5,$in4
1128*4882a593Smuzhiyun	vncipherlast	$out6,$out6,$in5
1129*4882a593Smuzhiyun	vncipherlast	$out7,$out7,$in6
1130*4882a593Smuzhiyun	vmr		$ivec,$in7
1131*4882a593Smuzhiyun
1132*4882a593Smuzhiyun	le?vperm	$out2,$out2,$out2,$inpperm
1133*4882a593Smuzhiyun	le?vperm	$out3,$out3,$out3,$inpperm
1134*4882a593Smuzhiyun	stvx_u		$out2,$x00,$out
1135*4882a593Smuzhiyun	le?vperm	$out4,$out4,$out4,$inpperm
1136*4882a593Smuzhiyun	stvx_u		$out3,$x10,$out
1137*4882a593Smuzhiyun	le?vperm	$out5,$out5,$out5,$inpperm
1138*4882a593Smuzhiyun	stvx_u		$out4,$x20,$out
1139*4882a593Smuzhiyun	le?vperm	$out6,$out6,$out6,$inpperm
1140*4882a593Smuzhiyun	stvx_u		$out5,$x30,$out
1141*4882a593Smuzhiyun	le?vperm	$out7,$out7,$out7,$inpperm
1142*4882a593Smuzhiyun	stvx_u		$out6,$x40,$out
1143*4882a593Smuzhiyun	stvx_u		$out7,$x50,$out
1144*4882a593Smuzhiyun	addi		$out,$out,0x60
1145*4882a593Smuzhiyun	b		Lcbc_dec8x_done
1146*4882a593Smuzhiyun
1147*4882a593Smuzhiyun.align	5
1148*4882a593SmuzhiyunLcbc_dec8x_five:
1149*4882a593Smuzhiyun	vncipherlast	$out3,$out3,$ivec
1150*4882a593Smuzhiyun	vncipherlast	$out4,$out4,$in3
1151*4882a593Smuzhiyun	vncipherlast	$out5,$out5,$in4
1152*4882a593Smuzhiyun	vncipherlast	$out6,$out6,$in5
1153*4882a593Smuzhiyun	vncipherlast	$out7,$out7,$in6
1154*4882a593Smuzhiyun	vmr		$ivec,$in7
1155*4882a593Smuzhiyun
1156*4882a593Smuzhiyun	le?vperm	$out3,$out3,$out3,$inpperm
1157*4882a593Smuzhiyun	le?vperm	$out4,$out4,$out4,$inpperm
1158*4882a593Smuzhiyun	stvx_u		$out3,$x00,$out
1159*4882a593Smuzhiyun	le?vperm	$out5,$out5,$out5,$inpperm
1160*4882a593Smuzhiyun	stvx_u		$out4,$x10,$out
1161*4882a593Smuzhiyun	le?vperm	$out6,$out6,$out6,$inpperm
1162*4882a593Smuzhiyun	stvx_u		$out5,$x20,$out
1163*4882a593Smuzhiyun	le?vperm	$out7,$out7,$out7,$inpperm
1164*4882a593Smuzhiyun	stvx_u		$out6,$x30,$out
1165*4882a593Smuzhiyun	stvx_u		$out7,$x40,$out
1166*4882a593Smuzhiyun	addi		$out,$out,0x50
1167*4882a593Smuzhiyun	b		Lcbc_dec8x_done
1168*4882a593Smuzhiyun
1169*4882a593Smuzhiyun.align	5
1170*4882a593SmuzhiyunLcbc_dec8x_four:
1171*4882a593Smuzhiyun	vncipherlast	$out4,$out4,$ivec
1172*4882a593Smuzhiyun	vncipherlast	$out5,$out5,$in4
1173*4882a593Smuzhiyun	vncipherlast	$out6,$out6,$in5
1174*4882a593Smuzhiyun	vncipherlast	$out7,$out7,$in6
1175*4882a593Smuzhiyun	vmr		$ivec,$in7
1176*4882a593Smuzhiyun
1177*4882a593Smuzhiyun	le?vperm	$out4,$out4,$out4,$inpperm
1178*4882a593Smuzhiyun	le?vperm	$out5,$out5,$out5,$inpperm
1179*4882a593Smuzhiyun	stvx_u		$out4,$x00,$out
1180*4882a593Smuzhiyun	le?vperm	$out6,$out6,$out6,$inpperm
1181*4882a593Smuzhiyun	stvx_u		$out5,$x10,$out
1182*4882a593Smuzhiyun	le?vperm	$out7,$out7,$out7,$inpperm
1183*4882a593Smuzhiyun	stvx_u		$out6,$x20,$out
1184*4882a593Smuzhiyun	stvx_u		$out7,$x30,$out
1185*4882a593Smuzhiyun	addi		$out,$out,0x40
1186*4882a593Smuzhiyun	b		Lcbc_dec8x_done
1187*4882a593Smuzhiyun
1188*4882a593Smuzhiyun.align	5
1189*4882a593SmuzhiyunLcbc_dec8x_three:
1190*4882a593Smuzhiyun	vncipherlast	$out5,$out5,$ivec
1191*4882a593Smuzhiyun	vncipherlast	$out6,$out6,$in5
1192*4882a593Smuzhiyun	vncipherlast	$out7,$out7,$in6
1193*4882a593Smuzhiyun	vmr		$ivec,$in7
1194*4882a593Smuzhiyun
1195*4882a593Smuzhiyun	le?vperm	$out5,$out5,$out5,$inpperm
1196*4882a593Smuzhiyun	le?vperm	$out6,$out6,$out6,$inpperm
1197*4882a593Smuzhiyun	stvx_u		$out5,$x00,$out
1198*4882a593Smuzhiyun	le?vperm	$out7,$out7,$out7,$inpperm
1199*4882a593Smuzhiyun	stvx_u		$out6,$x10,$out
1200*4882a593Smuzhiyun	stvx_u		$out7,$x20,$out
1201*4882a593Smuzhiyun	addi		$out,$out,0x30
1202*4882a593Smuzhiyun	b		Lcbc_dec8x_done
1203*4882a593Smuzhiyun
1204*4882a593Smuzhiyun.align	5
1205*4882a593SmuzhiyunLcbc_dec8x_two:
1206*4882a593Smuzhiyun	vncipherlast	$out6,$out6,$ivec
1207*4882a593Smuzhiyun	vncipherlast	$out7,$out7,$in6
1208*4882a593Smuzhiyun	vmr		$ivec,$in7
1209*4882a593Smuzhiyun
1210*4882a593Smuzhiyun	le?vperm	$out6,$out6,$out6,$inpperm
1211*4882a593Smuzhiyun	le?vperm	$out7,$out7,$out7,$inpperm
1212*4882a593Smuzhiyun	stvx_u		$out6,$x00,$out
1213*4882a593Smuzhiyun	stvx_u		$out7,$x10,$out
1214*4882a593Smuzhiyun	addi		$out,$out,0x20
1215*4882a593Smuzhiyun	b		Lcbc_dec8x_done
1216*4882a593Smuzhiyun
1217*4882a593Smuzhiyun.align	5
1218*4882a593SmuzhiyunLcbc_dec8x_one:
1219*4882a593Smuzhiyun	vncipherlast	$out7,$out7,$ivec
1220*4882a593Smuzhiyun	vmr		$ivec,$in7
1221*4882a593Smuzhiyun
1222*4882a593Smuzhiyun	le?vperm	$out7,$out7,$out7,$inpperm
1223*4882a593Smuzhiyun	stvx_u		$out7,0,$out
1224*4882a593Smuzhiyun	addi		$out,$out,0x10
1225*4882a593Smuzhiyun
# Common exit of the 8x path: store the IV through the IV pointer, overwrite
# the eight stack slots that held round-key copies, then restore SPR 256,
# v20-v31 and r26-r31 and release the stack frame.
1226*4882a593SmuzhiyunLcbc_dec8x_done:
1227*4882a593Smuzhiyun	le?vperm	$ivec,$ivec,$ivec,$inpperm
1228*4882a593Smuzhiyun	stvx_u		$ivec,0,$ivp		# write [unaligned] iv
1229*4882a593Smuzhiyun
1230*4882a593Smuzhiyun	li		r10,`$FRAME+15`
1231*4882a593Smuzhiyun	li		r11,`$FRAME+31`
1232*4882a593Smuzhiyun	stvx		$inpperm,r10,$sp	# wipe copies of round keys
1233*4882a593Smuzhiyun	addi		r10,r10,32
1234*4882a593Smuzhiyun	stvx		$inpperm,r11,$sp
1235*4882a593Smuzhiyun	addi		r11,r11,32
1236*4882a593Smuzhiyun	stvx		$inpperm,r10,$sp
1237*4882a593Smuzhiyun	addi		r10,r10,32
1238*4882a593Smuzhiyun	stvx		$inpperm,r11,$sp
1239*4882a593Smuzhiyun	addi		r11,r11,32
1240*4882a593Smuzhiyun	stvx		$inpperm,r10,$sp
1241*4882a593Smuzhiyun	addi		r10,r10,32
1242*4882a593Smuzhiyun	stvx		$inpperm,r11,$sp
1243*4882a593Smuzhiyun	addi		r11,r11,32
1244*4882a593Smuzhiyun	stvx		$inpperm,r10,$sp
1245*4882a593Smuzhiyun	addi		r10,r10,32
1246*4882a593Smuzhiyun	stvx		$inpperm,r11,$sp
1247*4882a593Smuzhiyun	addi		r11,r11,32
# r10 and r11 have each advanced by 4*32 bytes, so they now hold the same
# offsets the prologue used for the v20 and v21 save slots.
1248*4882a593Smuzhiyun
1249*4882a593Smuzhiyun	mtspr		256,$vrsave
1250*4882a593Smuzhiyun	lvx		v20,r10,$sp		# ABI says so
1251*4882a593Smuzhiyun	addi		r10,r10,32
1252*4882a593Smuzhiyun	lvx		v21,r11,$sp
1253*4882a593Smuzhiyun	addi		r11,r11,32
1254*4882a593Smuzhiyun	lvx		v22,r10,$sp
1255*4882a593Smuzhiyun	addi		r10,r10,32
1256*4882a593Smuzhiyun	lvx		v23,r11,$sp
1257*4882a593Smuzhiyun	addi		r11,r11,32
1258*4882a593Smuzhiyun	lvx		v24,r10,$sp
1259*4882a593Smuzhiyun	addi		r10,r10,32
1260*4882a593Smuzhiyun	lvx		v25,r11,$sp
1261*4882a593Smuzhiyun	addi		r11,r11,32
1262*4882a593Smuzhiyun	lvx		v26,r10,$sp
1263*4882a593Smuzhiyun	addi		r10,r10,32
1264*4882a593Smuzhiyun	lvx		v27,r11,$sp
1265*4882a593Smuzhiyun	addi		r11,r11,32
1266*4882a593Smuzhiyun	lvx		v28,r10,$sp
1267*4882a593Smuzhiyun	addi		r10,r10,32
1268*4882a593Smuzhiyun	lvx		v29,r11,$sp
1269*4882a593Smuzhiyun	addi		r11,r11,32
1270*4882a593Smuzhiyun	lvx		v30,r10,$sp
1271*4882a593Smuzhiyun	lvx		v31,r11,$sp
1272*4882a593Smuzhiyun	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1273*4882a593Smuzhiyun	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1274*4882a593Smuzhiyun	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1275*4882a593Smuzhiyun	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1276*4882a593Smuzhiyun	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1277*4882a593Smuzhiyun	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1278*4882a593Smuzhiyun	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
1279*4882a593Smuzhiyun	blr
1280*4882a593Smuzhiyun	.long		0
1281*4882a593Smuzhiyun	.byte		0,12,0x14,0,0x80,6,6,0
1282*4882a593Smuzhiyun	.long		0
1283*4882a593Smuzhiyun.size	.${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt
1284*4882a593Smuzhiyun___
1285*4882a593Smuzhiyun}}	}}}
1286*4882a593Smuzhiyun
1287*4882a593Smuzhiyun#########################################################################
1288*4882a593Smuzhiyun{{{	# CTR procedure[s]						#
1289*4882a593Smuzhiyun
1290*4882a593Smuzhiyun####################### WARNING: Here be dragons! #######################
1291*4882a593Smuzhiyun#
1292*4882a593Smuzhiyun# This code is written as 'ctr32', based on a 32-bit counter used
1293*4882a593Smuzhiyun# upstream. The kernel does *not* use a 32-bit counter. The kernel uses
1294*4882a593Smuzhiyun# a 128-bit counter.
1295*4882a593Smuzhiyun#
1296*4882a593Smuzhiyun# This leads to subtle changes from the upstream code: the counter
1297*4882a593Smuzhiyun# is incremented with vadduqm rather than vadduwm. This occurs in
1298*4882a593Smuzhiyun# both the bulk (8 blocks at a time) path, and in the individual block
1299*4882a593Smuzhiyun# path. Be aware of this when doing updates.
1300*4882a593Smuzhiyun#
1301*4882a593Smuzhiyun# See:
1302*4882a593Smuzhiyun# 1d4aa0b4c181 ("crypto: vmx - Fixing AES-CTR counter bug")
1303*4882a593Smuzhiyun# 009b30ac7444 ("crypto: vmx - CTR: always increment IV as quadword")
1304*4882a593Smuzhiyun# https://github.com/openssl/openssl/pull/8942
1305*4882a593Smuzhiyun#
1306*4882a593Smuzhiyun#########################################################################
1307*4882a593Smuzhiyunmy ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10));
1308*4882a593Smuzhiyunmy ($rndkey0,$rndkey1,$inout,$tmp)=		map("v$_",(0..3));
1309*4882a593Smuzhiyunmy ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)=
1310*4882a593Smuzhiyun						map("v$_",(4..11));
1311*4882a593Smuzhiyunmy $dat=$tmp;
1312*4882a593Smuzhiyun
1313*4882a593Smuzhiyun$code.=<<___;
1314*4882a593Smuzhiyun.globl	.${prefix}_ctr32_encrypt_blocks
1315*4882a593Smuzhiyun	${UCMP}i	$len,1
1316*4882a593Smuzhiyun	bltlr-
1317*4882a593Smuzhiyun
1318*4882a593Smuzhiyun	lis		r0,0xfff0
1319*4882a593Smuzhiyun	mfspr		$vrsave,256
1320*4882a593Smuzhiyun	mtspr		256,r0
1321*4882a593Smuzhiyun
1322*4882a593Smuzhiyun	li		$idx,15
1323*4882a593Smuzhiyun	vxor		$rndkey0,$rndkey0,$rndkey0
1324*4882a593Smuzhiyun	le?vspltisb	$tmp,0x0f
1325*4882a593Smuzhiyun
1326*4882a593Smuzhiyun	lvx		$ivec,0,$ivp		# load [unaligned] iv
1327*4882a593Smuzhiyun	lvsl		$inpperm,0,$ivp
1328*4882a593Smuzhiyun	lvx		$inptail,$idx,$ivp
1329*4882a593Smuzhiyun	 vspltisb	$one,1
1330*4882a593Smuzhiyun	le?vxor		$inpperm,$inpperm,$tmp
1331*4882a593Smuzhiyun	vperm		$ivec,$ivec,$inptail,$inpperm
1332*4882a593Smuzhiyun	 vsldoi		$one,$rndkey0,$one,1
1333*4882a593Smuzhiyun
1334*4882a593Smuzhiyun	neg		r11,$inp
1335*4882a593Smuzhiyun	?lvsl		$keyperm,0,$key		# prepare for unaligned key
1336*4882a593Smuzhiyun	lwz		$rounds,240($key)
1337*4882a593Smuzhiyun
1338*4882a593Smuzhiyun	lvsr		$inpperm,0,r11		# prepare for unaligned load
1339*4882a593Smuzhiyun	lvx		$inptail,0,$inp
1340*4882a593Smuzhiyun	addi		$inp,$inp,15		# 15 is not typo
1341*4882a593Smuzhiyun	le?vxor		$inpperm,$inpperm,$tmp
1342*4882a593Smuzhiyun
1343*4882a593Smuzhiyun	srwi		$rounds,$rounds,1
1344*4882a593Smuzhiyun	li		$idx,16
1345*4882a593Smuzhiyun	subi		$rounds,$rounds,1
1346*4882a593Smuzhiyun
1347*4882a593Smuzhiyun	${UCMP}i	$len,8
1348*4882a593Smuzhiyun	bge		_aesp8_ctr32_encrypt8x
1349*4882a593Smuzhiyun
1350*4882a593Smuzhiyun	?lvsr		$outperm,0,$out		# prepare for unaligned store
1351*4882a593Smuzhiyun	vspltisb	$outmask,-1
1352*4882a593Smuzhiyun	lvx		$outhead,0,$out
1353*4882a593Smuzhiyun	?vperm		$outmask,$rndkey0,$outmask,$outperm
1354*4882a593Smuzhiyun	le?vxor		$outperm,$outperm,$tmp
1355*4882a593Smuzhiyun
1356*4882a593Smuzhiyun	lvx		$rndkey0,0,$key
1357*4882a593Smuzhiyun	mtctr		$rounds
1358*4882a593Smuzhiyun	lvx		$rndkey1,$idx,$key
1359*4882a593Smuzhiyun	addi		$idx,$idx,16
1360*4882a593Smuzhiyun	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1361*4882a593Smuzhiyun	vxor		$inout,$ivec,$rndkey0
1362*4882a593Smuzhiyun	lvx		$rndkey0,$idx,$key
1363*4882a593Smuzhiyun	addi		$idx,$idx,16
1364*4882a593Smuzhiyun	b		Loop_ctr32_enc
1365*4882a593Smuzhiyun
1366*4882a593Smuzhiyun.align	5
1367*4882a593SmuzhiyunLoop_ctr32_enc:
1368*4882a593Smuzhiyun	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
1369*4882a593Smuzhiyun	vcipher		$inout,$inout,$rndkey1
1370*4882a593Smuzhiyun	lvx		$rndkey1,$idx,$key
1371*4882a593Smuzhiyun	addi		$idx,$idx,16
1372*4882a593Smuzhiyun	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1373*4882a593Smuzhiyun	vcipher		$inout,$inout,$rndkey0
1374*4882a593Smuzhiyun	lvx		$rndkey0,$idx,$key
1375*4882a593Smuzhiyun	addi		$idx,$idx,16
1376*4882a593Smuzhiyun	bdnz		Loop_ctr32_enc
1377*4882a593Smuzhiyun
1378*4882a593Smuzhiyun	vadduqm		$ivec,$ivec,$one	# Kernel change for 128-bit
1379*4882a593Smuzhiyun	 vmr		$dat,$inptail
1380*4882a593Smuzhiyun	 lvx		$inptail,0,$inp
1381*4882a593Smuzhiyun	 addi		$inp,$inp,16
1382*4882a593Smuzhiyun	 subic.		$len,$len,1		# blocks--
1383*4882a593Smuzhiyun
1384*4882a593Smuzhiyun	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
1385*4882a593Smuzhiyun	vcipher		$inout,$inout,$rndkey1
1386*4882a593Smuzhiyun	lvx		$rndkey1,$idx,$key
1387*4882a593Smuzhiyun	 vperm		$dat,$dat,$inptail,$inpperm
1388*4882a593Smuzhiyun	 li		$idx,16
1389*4882a593Smuzhiyun	?vperm		$rndkey1,$rndkey0,$rndkey1,$keyperm
1390*4882a593Smuzhiyun	 lvx		$rndkey0,0,$key
1391*4882a593Smuzhiyun	vxor		$dat,$dat,$rndkey1	# last round key
1392*4882a593Smuzhiyun	vcipherlast	$inout,$inout,$dat
1393*4882a593Smuzhiyun
1394*4882a593Smuzhiyun	 lvx		$rndkey1,$idx,$key
1395*4882a593Smuzhiyun	 addi		$idx,$idx,16
1396*4882a593Smuzhiyun	vperm		$inout,$inout,$inout,$outperm
1397*4882a593Smuzhiyun	vsel		$dat,$outhead,$inout,$outmask
1398*4882a593Smuzhiyun	 mtctr		$rounds
1399*4882a593Smuzhiyun	 ?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1400*4882a593Smuzhiyun	vmr		$outhead,$inout
1401*4882a593Smuzhiyun	 vxor		$inout,$ivec,$rndkey0
1402*4882a593Smuzhiyun	 lvx		$rndkey0,$idx,$key
1403*4882a593Smuzhiyun	 addi		$idx,$idx,16
1404*4882a593Smuzhiyun	stvx		$dat,0,$out
1405*4882a593Smuzhiyun	addi		$out,$out,16
1406*4882a593Smuzhiyun	bne		Loop_ctr32_enc
1407*4882a593Smuzhiyun
1408*4882a593Smuzhiyun	addi		$out,$out,-1
1409*4882a593Smuzhiyun	lvx		$inout,0,$out		# redundant in aligned case
1410*4882a593Smuzhiyun	vsel		$inout,$outhead,$inout,$outmask
1411*4882a593Smuzhiyun	stvx		$inout,0,$out
1412*4882a593Smuzhiyun
1413*4882a593Smuzhiyun	mtspr		256,$vrsave
1414*4882a593Smuzhiyun	blr
1415*4882a593Smuzhiyun	.long		0
1416*4882a593Smuzhiyun	.byte		0,12,0x14,0,0,0,6,0
1417*4882a593Smuzhiyun	.long		0
1418*4882a593Smuzhiyun___
1419*4882a593Smuzhiyun#########################################################################
1420*4882a593Smuzhiyun{{	# Optimized CTR procedure					#
1421*4882a593Smuzhiyunmy $key_="r11";		# GPR that walks the on-stack copies of the round keys
# Offset registers: $x00 is r0 and $x10 is r8; $x20..$x70 are r26..r31,
# which the prologue below pushes before loading 0x20..0x70 into them.
1422*4882a593Smuzhiyunmy ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
# Eight input blocks live in v0-v3, v10 and v12-v14.
1423*4882a593Smuzhiyunmy ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10,12..14));
# Counter / cipher state for the eight parallel blocks lives in v15-v22.
1424*4882a593Smuzhiyunmy ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(15..22));
1425*4882a593Smuzhiyunmy $rndkey0="v23";	# v24-v25 rotating buffer for first round keys
1426*4882a593Smuzhiyun			# v26-v31 last 6 round keys
1427*4882a593Smuzhiyunmy ($tmp,$keyperm)=($in3,$in4);	# aliases with "caller", redundant assignment
# $two reuses the register named by $outhead.  $three and $four are bound
# here as well but are not referenced anywhere in the code below.
1428*4882a593Smuzhiyunmy ($two,$three,$four)=($outhead,$outperm,$outmask);
1429*4882a593Smuzhiyun
1430*4882a593Smuzhiyun$code.=<<___;
1431*4882a593Smuzhiyun.align	5
1432*4882a593Smuzhiyun_aesp8_ctr32_encrypt8x:
	# Eight-blocks-per-pass CTR path; reached from the entry code when
	# at least eight blocks were requested.  Stack layout above FRAME:
	# eight 16-byte slots for aligned round-key copies, twelve slots
	# for v20-v31, the vrsave word, then r26-r31.
1433*4882a593Smuzhiyun	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
1434*4882a593Smuzhiyun	li		r10,`$FRAME+8*16+15`
1435*4882a593Smuzhiyun	li		r11,`$FRAME+8*16+31`
	# r10 and r11 leapfrog in steps of 32, so consecutive saves use
	# alternating offset registers.
1436*4882a593Smuzhiyun	stvx		v20,r10,$sp		# ABI says so
1437*4882a593Smuzhiyun	addi		r10,r10,32
1438*4882a593Smuzhiyun	stvx		v21,r11,$sp
1439*4882a593Smuzhiyun	addi		r11,r11,32
1440*4882a593Smuzhiyun	stvx		v22,r10,$sp
1441*4882a593Smuzhiyun	addi		r10,r10,32
1442*4882a593Smuzhiyun	stvx		v23,r11,$sp
1443*4882a593Smuzhiyun	addi		r11,r11,32
1444*4882a593Smuzhiyun	stvx		v24,r10,$sp
1445*4882a593Smuzhiyun	addi		r10,r10,32
1446*4882a593Smuzhiyun	stvx		v25,r11,$sp
1447*4882a593Smuzhiyun	addi		r11,r11,32
1448*4882a593Smuzhiyun	stvx		v26,r10,$sp
1449*4882a593Smuzhiyun	addi		r10,r10,32
1450*4882a593Smuzhiyun	stvx		v27,r11,$sp
1451*4882a593Smuzhiyun	addi		r11,r11,32
1452*4882a593Smuzhiyun	stvx		v28,r10,$sp
1453*4882a593Smuzhiyun	addi		r10,r10,32
1454*4882a593Smuzhiyun	stvx		v29,r11,$sp
1455*4882a593Smuzhiyun	addi		r11,r11,32
1456*4882a593Smuzhiyun	stvx		v30,r10,$sp
1457*4882a593Smuzhiyun	stvx		v31,r11,$sp
1458*4882a593Smuzhiyun	li		r0,-1			# all-ones value for VRSAVE
1459*4882a593Smuzhiyun	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
	# Load the fixed offsets 0x10..0x70.  Each of r26-r31 is pushed
	# immediately before it is overwritten with its offset.
1460*4882a593Smuzhiyun	li		$x10,0x10
1461*4882a593Smuzhiyun	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1462*4882a593Smuzhiyun	li		$x20,0x20
1463*4882a593Smuzhiyun	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1464*4882a593Smuzhiyun	li		$x30,0x30
1465*4882a593Smuzhiyun	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1466*4882a593Smuzhiyun	li		$x40,0x40
1467*4882a593Smuzhiyun	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1468*4882a593Smuzhiyun	li		$x50,0x50
1469*4882a593Smuzhiyun	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1470*4882a593Smuzhiyun	li		$x60,0x60
1471*4882a593Smuzhiyun	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1472*4882a593Smuzhiyun	li		$x70,0x70
1473*4882a593Smuzhiyun	mtspr		256,r0			# VRSAVE = -1
1474*4882a593Smuzhiyun
1475*4882a593Smuzhiyun	subi		$rounds,$rounds,3	# -4 in total
1476*4882a593Smuzhiyun
	# The x00 alias is r0.  As the base operand of lvx and stvx, r0
	# reads as zero, so it acts as offset 0 whatever value r0 holds.
1477*4882a593Smuzhiyun	lvx		$rndkey0,$x00,$key	# load key schedule
1478*4882a593Smuzhiyun	lvx		v30,$x10,$key
1479*4882a593Smuzhiyun	addi		$key,$key,0x20
1480*4882a593Smuzhiyun	lvx		v31,$x00,$key
1481*4882a593Smuzhiyun	?vperm		$rndkey0,$rndkey0,v30,$keyperm	# aligned round key 0
1482*4882a593Smuzhiyun	addi		$key_,$sp,$FRAME+15	# first stack slot for key copies
1483*4882a593Smuzhiyun	mtctr		$rounds			# pass count for the key loop
1484*4882a593Smuzhiyun
1485*4882a593SmuzhiyunLoad_ctr32_enc_key:
	# Each pass aligns two round keys with the key permute vector and
	# parks them in the next two stack slots.
1486*4882a593Smuzhiyun	?vperm		v24,v30,v31,$keyperm
1487*4882a593Smuzhiyun	lvx		v30,$x10,$key
1488*4882a593Smuzhiyun	addi		$key,$key,0x20
1489*4882a593Smuzhiyun	stvx		v24,$x00,$key_		# off-load round[1]
1490*4882a593Smuzhiyun	?vperm		v25,v31,v30,$keyperm
1491*4882a593Smuzhiyun	lvx		v31,$x00,$key
1492*4882a593Smuzhiyun	stvx		v25,$x10,$key_		# off-load round[2]
1493*4882a593Smuzhiyun	addi		$key_,$key_,0x20
1494*4882a593Smuzhiyun	bdnz		Load_ctr32_enc_key
1495*4882a593Smuzhiyun
	# Two more aligned keys go to the stack.  The six keys after them
	# are aligned in place and stay resident in v26-v31; v24 and v25
	# are then primed with the first two stack copies.
1496*4882a593Smuzhiyun	lvx		v26,$x10,$key
1497*4882a593Smuzhiyun	?vperm		v24,v30,v31,$keyperm
1498*4882a593Smuzhiyun	lvx		v27,$x20,$key
1499*4882a593Smuzhiyun	stvx		v24,$x00,$key_		# off-load round[3]
1500*4882a593Smuzhiyun	?vperm		v25,v31,v26,$keyperm
1501*4882a593Smuzhiyun	lvx		v28,$x30,$key
1502*4882a593Smuzhiyun	stvx		v25,$x10,$key_		# off-load round[4]
1503*4882a593Smuzhiyun	addi		$key_,$sp,$FRAME+15	# rewind $key_
1504*4882a593Smuzhiyun	?vperm		v26,v26,v27,$keyperm
1505*4882a593Smuzhiyun	lvx		v29,$x40,$key
1506*4882a593Smuzhiyun	?vperm		v27,v27,v28,$keyperm
1507*4882a593Smuzhiyun	lvx		v30,$x50,$key
1508*4882a593Smuzhiyun	?vperm		v28,v28,v29,$keyperm
1509*4882a593Smuzhiyun	lvx		v31,$x60,$key
1510*4882a593Smuzhiyun	?vperm		v29,v29,v30,$keyperm
1511*4882a593Smuzhiyun	lvx		$out0,$x70,$key		# borrow $out0
1512*4882a593Smuzhiyun	?vperm		v30,v30,v31,$keyperm
1513*4882a593Smuzhiyun	lvx		v24,$x00,$key_		# pre-load round[1]
1514*4882a593Smuzhiyun	?vperm		v31,v31,$out0,$keyperm
1515*4882a593Smuzhiyun	lvx		v25,$x10,$key_		# pre-load round[2]
1516*4882a593Smuzhiyun
1517*4882a593Smuzhiyun	vadduqm		$two,$one,$one		# two = one + one
1518*4882a593Smuzhiyun	subi		$inp,$inp,15		# undo "caller"
1519*4882a593Smuzhiyun	$SHL		$len,$len,4		# block count to byte count
1520*4882a593Smuzhiyun
	# Build counter blocks n .. n+7.  Odd and even counters form two
	# chains that each step by two, and every counter is xored with
	# round key 0 only after it has seeded the counter two ahead of it.
1521*4882a593Smuzhiyun	vadduqm		$out1,$ivec,$one	# counter values ...
1522*4882a593Smuzhiyun	vadduqm		$out2,$ivec,$two	# (do all ctr adds as 128-bit)
1523*4882a593Smuzhiyun	vxor		$out0,$ivec,$rndkey0	# ... xored with rndkey[0]
1524*4882a593Smuzhiyun	 le?li		$idx,8
1525*4882a593Smuzhiyun	vadduqm		$out3,$out1,$two
1526*4882a593Smuzhiyun	vxor		$out1,$out1,$rndkey0
1527*4882a593Smuzhiyun	 le?lvsl	$inpperm,0,$idx
1528*4882a593Smuzhiyun	vadduqm		$out4,$out2,$two
1529*4882a593Smuzhiyun	vxor		$out2,$out2,$rndkey0
1530*4882a593Smuzhiyun	 le?vspltisb	$tmp,0x0f
1531*4882a593Smuzhiyun	vadduqm		$out5,$out3,$two
1532*4882a593Smuzhiyun	vxor		$out3,$out3,$rndkey0
1533*4882a593Smuzhiyun	 le?vxor	$inpperm,$inpperm,$tmp	# transform for lvx_u/stvx_u
1534*4882a593Smuzhiyun	vadduqm		$out6,$out4,$two
1535*4882a593Smuzhiyun	vxor		$out4,$out4,$rndkey0
1536*4882a593Smuzhiyun	vadduqm		$out7,$out5,$two
1537*4882a593Smuzhiyun	vxor		$out5,$out5,$rndkey0
1538*4882a593Smuzhiyun	vadduqm		$ivec,$out6,$two	# next counter value
1539*4882a593Smuzhiyun	vxor		$out6,$out6,$rndkey0
1540*4882a593Smuzhiyun	vxor		$out7,$out7,$rndkey0
1541*4882a593Smuzhiyun
1542*4882a593Smuzhiyun	mtctr		$rounds			# passes of the two-round loop
1543*4882a593Smuzhiyun	b		Loop_ctr32_enc8x
1544*4882a593Smuzhiyun.align	5
1545*4882a593SmuzhiyunLoop_ctr32_enc8x:
	# One pass applies two AES rounds (v24, then v25) to all eight
	# blocks while the following pair of round keys is fetched from
	# the stack copies.
1546*4882a593Smuzhiyun	vcipher 	$out0,$out0,v24
1547*4882a593Smuzhiyun	vcipher 	$out1,$out1,v24
1548*4882a593Smuzhiyun	vcipher 	$out2,$out2,v24
1549*4882a593Smuzhiyun	vcipher 	$out3,$out3,v24
1550*4882a593Smuzhiyun	vcipher 	$out4,$out4,v24
1551*4882a593Smuzhiyun	vcipher 	$out5,$out5,v24
1552*4882a593Smuzhiyun	vcipher 	$out6,$out6,v24
1553*4882a593Smuzhiyun	vcipher 	$out7,$out7,v24
1554*4882a593SmuzhiyunLoop_ctr32_enc8x_middle:
	# Also the re-entry point after a full batch has been written
	# out; on that path the v24 round of the next batch was already
	# issued alongside the stores.
1555*4882a593Smuzhiyun	lvx		v24,$x20,$key_		# round[3]
1556*4882a593Smuzhiyun	addi		$key_,$key_,0x20
1557*4882a593Smuzhiyun
1558*4882a593Smuzhiyun	vcipher 	$out0,$out0,v25
1559*4882a593Smuzhiyun	vcipher 	$out1,$out1,v25
1560*4882a593Smuzhiyun	vcipher 	$out2,$out2,v25
1561*4882a593Smuzhiyun	vcipher 	$out3,$out3,v25
1562*4882a593Smuzhiyun	vcipher 	$out4,$out4,v25
1563*4882a593Smuzhiyun	vcipher 	$out5,$out5,v25
1564*4882a593Smuzhiyun	vcipher 	$out6,$out6,v25
1565*4882a593Smuzhiyun	vcipher 	$out7,$out7,v25
1566*4882a593Smuzhiyun	lvx		v25,$x10,$key_		# round[4]
1567*4882a593Smuzhiyun	bdnz		Loop_ctr32_enc8x
1568*4882a593Smuzhiyun
1569*4882a593Smuzhiyun	subic		r11,$len,256		# $len-256, borrow $key_
	# Remaining rounds for this batch, interleaved with the scalar
	# length bookkeeping and the loads of eight input blocks.
1570*4882a593Smuzhiyun	vcipher 	$out0,$out0,v24
1571*4882a593Smuzhiyun	vcipher 	$out1,$out1,v24
1572*4882a593Smuzhiyun	vcipher 	$out2,$out2,v24
1573*4882a593Smuzhiyun	vcipher 	$out3,$out3,v24
1574*4882a593Smuzhiyun	vcipher 	$out4,$out4,v24
1575*4882a593Smuzhiyun	vcipher 	$out5,$out5,v24
1576*4882a593Smuzhiyun	vcipher 	$out6,$out6,v24
1577*4882a593Smuzhiyun	vcipher 	$out7,$out7,v24
1578*4882a593Smuzhiyun
1579*4882a593Smuzhiyun	subfe		r0,r0,r0		# borrow?-1:0
1580*4882a593Smuzhiyun	vcipher 	$out0,$out0,v25
1581*4882a593Smuzhiyun	vcipher 	$out1,$out1,v25
1582*4882a593Smuzhiyun	vcipher 	$out2,$out2,v25
1583*4882a593Smuzhiyun	vcipher 	$out3,$out3,v25
1584*4882a593Smuzhiyun	vcipher 	$out4,$out4,v25
1585*4882a593Smuzhiyun	vcipher		$out5,$out5,v25
1586*4882a593Smuzhiyun	vcipher		$out6,$out6,v25
1587*4882a593Smuzhiyun	vcipher		$out7,$out7,v25
1588*4882a593Smuzhiyun
1589*4882a593Smuzhiyun	and		r0,r0,r11		# r0 = len-256 when that borrowed, else 0
1590*4882a593Smuzhiyun	addi		$key_,$sp,$FRAME+15	# rewind $key_
1591*4882a593Smuzhiyun	vcipher		$out0,$out0,v26
1592*4882a593Smuzhiyun	vcipher		$out1,$out1,v26
1593*4882a593Smuzhiyun	vcipher		$out2,$out2,v26
1594*4882a593Smuzhiyun	vcipher		$out3,$out3,v26
1595*4882a593Smuzhiyun	vcipher		$out4,$out4,v26
1596*4882a593Smuzhiyun	vcipher		$out5,$out5,v26
1597*4882a593Smuzhiyun	vcipher		$out6,$out6,v26
1598*4882a593Smuzhiyun	vcipher		$out7,$out7,v26
1599*4882a593Smuzhiyun	lvx		v24,$x00,$key_		# re-pre-load round[1]
1600*4882a593Smuzhiyun
	# Subtracting 129 makes the carry report a borrow for every len
	# up to and including 128, i.e. whenever this is the last batch.
	# The addi then corrects the off-by-one without touching the
	# carry, which the record-form subfe further down consumes.
1601*4882a593Smuzhiyun	subic		$len,$len,129		# $len-=129
1602*4882a593Smuzhiyun	vcipher		$out0,$out0,v27
1603*4882a593Smuzhiyun	addi		$len,$len,1		# $len-=128 really
1604*4882a593Smuzhiyun	vcipher		$out1,$out1,v27
1605*4882a593Smuzhiyun	vcipher		$out2,$out2,v27
1606*4882a593Smuzhiyun	vcipher		$out3,$out3,v27
1607*4882a593Smuzhiyun	vcipher		$out4,$out4,v27
1608*4882a593Smuzhiyun	vcipher		$out5,$out5,v27
1609*4882a593Smuzhiyun	vcipher		$out6,$out6,v27
1610*4882a593Smuzhiyun	vcipher		$out7,$out7,v27
1611*4882a593Smuzhiyun	lvx		v25,$x10,$key_		# re-pre-load round[2]
1612*4882a593Smuzhiyun
1613*4882a593Smuzhiyun	vcipher		$out0,$out0,v28
1614*4882a593Smuzhiyun	 lvx_u		$in0,$x00,$inp		# load input
1615*4882a593Smuzhiyun	vcipher		$out1,$out1,v28
1616*4882a593Smuzhiyun	 lvx_u		$in1,$x10,$inp
1617*4882a593Smuzhiyun	vcipher		$out2,$out2,v28
1618*4882a593Smuzhiyun	 lvx_u		$in2,$x20,$inp
1619*4882a593Smuzhiyun	vcipher		$out3,$out3,v28
1620*4882a593Smuzhiyun	 lvx_u		$in3,$x30,$inp
1621*4882a593Smuzhiyun	vcipher		$out4,$out4,v28
1622*4882a593Smuzhiyun	 lvx_u		$in4,$x40,$inp
1623*4882a593Smuzhiyun	vcipher		$out5,$out5,v28
1624*4882a593Smuzhiyun	 lvx_u		$in5,$x50,$inp
1625*4882a593Smuzhiyun	vcipher		$out6,$out6,v28
1626*4882a593Smuzhiyun	 lvx_u		$in6,$x60,$inp
1627*4882a593Smuzhiyun	vcipher		$out7,$out7,v28
1628*4882a593Smuzhiyun	 lvx_u		$in7,$x70,$inp
1629*4882a593Smuzhiyun	 addi		$inp,$inp,0x80
1630*4882a593Smuzhiyun
1631*4882a593Smuzhiyun	vcipher		$out0,$out0,v29
1632*4882a593Smuzhiyun	 le?vperm	$in0,$in0,$in0,$inpperm
1633*4882a593Smuzhiyun	vcipher		$out1,$out1,v29
1634*4882a593Smuzhiyun	 le?vperm	$in1,$in1,$in1,$inpperm
1635*4882a593Smuzhiyun	vcipher		$out2,$out2,v29
1636*4882a593Smuzhiyun	 le?vperm	$in2,$in2,$in2,$inpperm
1637*4882a593Smuzhiyun	vcipher		$out3,$out3,v29
1638*4882a593Smuzhiyun	 le?vperm	$in3,$in3,$in3,$inpperm
1639*4882a593Smuzhiyun	vcipher		$out4,$out4,v29
1640*4882a593Smuzhiyun	 le?vperm	$in4,$in4,$in4,$inpperm
1641*4882a593Smuzhiyun	vcipher		$out5,$out5,v29
1642*4882a593Smuzhiyun	 le?vperm	$in5,$in5,$in5,$inpperm
1643*4882a593Smuzhiyun	vcipher		$out6,$out6,v29
1644*4882a593Smuzhiyun	 le?vperm	$in6,$in6,$in6,$inpperm
1645*4882a593Smuzhiyun	vcipher		$out7,$out7,v29
1646*4882a593Smuzhiyun	 le?vperm	$in7,$in7,$in7,$inpperm
1647*4882a593Smuzhiyun
1648*4882a593Smuzhiyun	add		$inp,$inp,r0		# $inp is adjusted in such
1649*4882a593Smuzhiyun						# way that at exit from the
1650*4882a593Smuzhiyun						# loop inX-in7 are loaded
1651*4882a593Smuzhiyun						# with last "words"
1652*4882a593Smuzhiyun	subfe.		r0,r0,r0		# borrow?-1:0
1653*4882a593Smuzhiyun	vcipher		$out0,$out0,v30
1654*4882a593Smuzhiyun	 vxor		$in0,$in0,v31		# xor with last round key
1655*4882a593Smuzhiyun	vcipher		$out1,$out1,v30
1656*4882a593Smuzhiyun	 vxor		$in1,$in1,v31
1657*4882a593Smuzhiyun	vcipher		$out2,$out2,v30
1658*4882a593Smuzhiyun	 vxor		$in2,$in2,v31
1659*4882a593Smuzhiyun	vcipher		$out3,$out3,v30
1660*4882a593Smuzhiyun	 vxor		$in3,$in3,v31
1661*4882a593Smuzhiyun	vcipher		$out4,$out4,v30
1662*4882a593Smuzhiyun	 vxor		$in4,$in4,v31
1663*4882a593Smuzhiyun	vcipher		$out5,$out5,v30
1664*4882a593Smuzhiyun	 vxor		$in5,$in5,v31
1665*4882a593Smuzhiyun	vcipher		$out6,$out6,v30
1666*4882a593Smuzhiyun	 vxor		$in6,$in6,v31
1667*4882a593Smuzhiyun	vcipher		$out7,$out7,v30
1668*4882a593Smuzhiyun	 vxor		$in7,$in7,v31
1669*4882a593Smuzhiyun
1670*4882a593Smuzhiyun	bne		Lctr32_enc8x_break	# did $len-129 borrow?
1671*4882a593Smuzhiyun
1672*4882a593Smuzhiyun	vcipherlast	$in0,$out0,$in0		# last round keyed with input xor last key
	# A full batch with more data to follow.  The final round of this
	# batch is interleaved with building the next eight counters,
	# using the same two-chain scheme as before the loop.
1673*4882a593Smuzhiyun	vcipherlast	$in1,$out1,$in1
1674*4882a593Smuzhiyun	 vadduqm	$out1,$ivec,$one	# counter values ...
1675*4882a593Smuzhiyun	vcipherlast	$in2,$out2,$in2
1676*4882a593Smuzhiyun	 vadduqm	$out2,$ivec,$two
1677*4882a593Smuzhiyun	 vxor		$out0,$ivec,$rndkey0	# ... xored with rndkey[0]
1678*4882a593Smuzhiyun	vcipherlast	$in3,$out3,$in3
1679*4882a593Smuzhiyun	 vadduqm	$out3,$out1,$two
1680*4882a593Smuzhiyun	 vxor		$out1,$out1,$rndkey0
1681*4882a593Smuzhiyun	vcipherlast	$in4,$out4,$in4
1682*4882a593Smuzhiyun	 vadduqm	$out4,$out2,$two
1683*4882a593Smuzhiyun	 vxor		$out2,$out2,$rndkey0
1684*4882a593Smuzhiyun	vcipherlast	$in5,$out5,$in5
1685*4882a593Smuzhiyun	 vadduqm	$out5,$out3,$two
1686*4882a593Smuzhiyun	 vxor		$out3,$out3,$rndkey0
1687*4882a593Smuzhiyun	vcipherlast	$in6,$out6,$in6
1688*4882a593Smuzhiyun	 vadduqm	$out6,$out4,$two
1689*4882a593Smuzhiyun	 vxor		$out4,$out4,$rndkey0
1690*4882a593Smuzhiyun	vcipherlast	$in7,$out7,$in7
1691*4882a593Smuzhiyun	 vadduqm	$out7,$out5,$two
1692*4882a593Smuzhiyun	 vxor		$out5,$out5,$rndkey0
1693*4882a593Smuzhiyun	le?vperm	$in0,$in0,$in0,$inpperm
1694*4882a593Smuzhiyun	 vadduqm	$ivec,$out6,$two	# next counter value
1695*4882a593Smuzhiyun	 vxor		$out6,$out6,$rndkey0
1696*4882a593Smuzhiyun	le?vperm	$in1,$in1,$in1,$inpperm
1697*4882a593Smuzhiyun	 vxor		$out7,$out7,$rndkey0
1698*4882a593Smuzhiyun	mtctr		$rounds
1699*4882a593Smuzhiyun
	# Round 1 of the next batch overlaps the stores of this batch.
1700*4882a593Smuzhiyun	 vcipher	$out0,$out0,v24
1701*4882a593Smuzhiyun	stvx_u		$in0,$x00,$out
1702*4882a593Smuzhiyun	le?vperm	$in2,$in2,$in2,$inpperm
1703*4882a593Smuzhiyun	 vcipher	$out1,$out1,v24
1704*4882a593Smuzhiyun	stvx_u		$in1,$x10,$out
1705*4882a593Smuzhiyun	le?vperm	$in3,$in3,$in3,$inpperm
1706*4882a593Smuzhiyun	 vcipher	$out2,$out2,v24
1707*4882a593Smuzhiyun	stvx_u		$in2,$x20,$out
1708*4882a593Smuzhiyun	le?vperm	$in4,$in4,$in4,$inpperm
1709*4882a593Smuzhiyun	 vcipher	$out3,$out3,v24
1710*4882a593Smuzhiyun	stvx_u		$in3,$x30,$out
1711*4882a593Smuzhiyun	le?vperm	$in5,$in5,$in5,$inpperm
1712*4882a593Smuzhiyun	 vcipher	$out4,$out4,v24
1713*4882a593Smuzhiyun	stvx_u		$in4,$x40,$out
1714*4882a593Smuzhiyun	le?vperm	$in6,$in6,$in6,$inpperm
1715*4882a593Smuzhiyun	 vcipher	$out5,$out5,v24
1716*4882a593Smuzhiyun	stvx_u		$in5,$x50,$out
1717*4882a593Smuzhiyun	le?vperm	$in7,$in7,$in7,$inpperm
1718*4882a593Smuzhiyun	 vcipher	$out6,$out6,v24
1719*4882a593Smuzhiyun	stvx_u		$in6,$x60,$out
1720*4882a593Smuzhiyun	 vcipher	$out7,$out7,v24
1721*4882a593Smuzhiyun	stvx_u		$in7,$x70,$out
1722*4882a593Smuzhiyun	addi		$out,$out,0x80
1723*4882a593Smuzhiyun
1724*4882a593Smuzhiyun	b		Loop_ctr32_enc8x_middle	# the v24 round is already done
1725*4882a593Smuzhiyun
1726*4882a593Smuzhiyun.align	5
1727*4882a593SmuzhiyunLctr32_enc8x_break:
	# Here len equals the bytes still owed minus 128, so it is one of
	# -0x70, -0x60, ... 0.  Each compare splits off two cases (below
	# and equal), selecting the handler for one to eight blocks.
1728*4882a593Smuzhiyun	cmpwi		$len,-0x60
1729*4882a593Smuzhiyun	blt		Lctr32_enc8x_one
1730*4882a593Smuzhiyun	nop
1731*4882a593Smuzhiyun	beq		Lctr32_enc8x_two
1732*4882a593Smuzhiyun	cmpwi		$len,-0x40
1733*4882a593Smuzhiyun	blt		Lctr32_enc8x_three
1734*4882a593Smuzhiyun	nop
1735*4882a593Smuzhiyun	beq		Lctr32_enc8x_four
1736*4882a593Smuzhiyun	cmpwi		$len,-0x20
1737*4882a593Smuzhiyun	blt		Lctr32_enc8x_five
1738*4882a593Smuzhiyun	nop
1739*4882a593Smuzhiyun	beq		Lctr32_enc8x_six
1740*4882a593Smuzhiyun	cmpwi		$len,0x00
1741*4882a593Smuzhiyun	blt		Lctr32_enc8x_seven	# otherwise fall through to eight
1742*4882a593Smuzhiyun
1743*4882a593SmuzhiyunLctr32_enc8x_eight:
	# Full final batch: out0..out7 pair with in0..in7, and the
	# results are left in the out registers before being stored.
1744*4882a593Smuzhiyun	vcipherlast	$out0,$out0,$in0
1745*4882a593Smuzhiyun	vcipherlast	$out1,$out1,$in1
1746*4882a593Smuzhiyun	vcipherlast	$out2,$out2,$in2
1747*4882a593Smuzhiyun	vcipherlast	$out3,$out3,$in3
1748*4882a593Smuzhiyun	vcipherlast	$out4,$out4,$in4
1749*4882a593Smuzhiyun	vcipherlast	$out5,$out5,$in5
1750*4882a593Smuzhiyun	vcipherlast	$out6,$out6,$in6
1751*4882a593Smuzhiyun	vcipherlast	$out7,$out7,$in7
1752*4882a593Smuzhiyun
1753*4882a593Smuzhiyun	le?vperm	$out0,$out0,$out0,$inpperm
1754*4882a593Smuzhiyun	le?vperm	$out1,$out1,$out1,$inpperm
1755*4882a593Smuzhiyun	stvx_u		$out0,$x00,$out
1756*4882a593Smuzhiyun	le?vperm	$out2,$out2,$out2,$inpperm
1757*4882a593Smuzhiyun	stvx_u		$out1,$x10,$out
1758*4882a593Smuzhiyun	le?vperm	$out3,$out3,$out3,$inpperm
1759*4882a593Smuzhiyun	stvx_u		$out2,$x20,$out
1760*4882a593Smuzhiyun	le?vperm	$out4,$out4,$out4,$inpperm
1761*4882a593Smuzhiyun	stvx_u		$out3,$x30,$out
1762*4882a593Smuzhiyun	le?vperm	$out5,$out5,$out5,$inpperm
1763*4882a593Smuzhiyun	stvx_u		$out4,$x40,$out
1764*4882a593Smuzhiyun	le?vperm	$out6,$out6,$out6,$inpperm
1765*4882a593Smuzhiyun	stvx_u		$out5,$x50,$out
1766*4882a593Smuzhiyun	le?vperm	$out7,$out7,$out7,$inpperm
1767*4882a593Smuzhiyun	stvx_u		$out6,$x60,$out
1768*4882a593Smuzhiyun	stvx_u		$out7,$x70,$out
1769*4882a593Smuzhiyun	addi		$out,$out,0x80
1770*4882a593Smuzhiyun	b		Lctr32_enc8x_done
1771*4882a593Smuzhiyun
1772*4882a593Smuzhiyun.align	5
1773*4882a593SmuzhiyunLctr32_enc8x_seven:
	# Seven blocks left.  The last loads were slid back by one block,
	# so the wanted inputs sit in in1..in7 and pair with out0..out6.
1774*4882a593Smuzhiyun	vcipherlast	$out0,$out0,$in1
1775*4882a593Smuzhiyun	vcipherlast	$out1,$out1,$in2
1776*4882a593Smuzhiyun	vcipherlast	$out2,$out2,$in3
1777*4882a593Smuzhiyun	vcipherlast	$out3,$out3,$in4
1778*4882a593Smuzhiyun	vcipherlast	$out4,$out4,$in5
1779*4882a593Smuzhiyun	vcipherlast	$out5,$out5,$in6
1780*4882a593Smuzhiyun	vcipherlast	$out6,$out6,$in7
1781*4882a593Smuzhiyun
1782*4882a593Smuzhiyun	le?vperm	$out0,$out0,$out0,$inpperm
1783*4882a593Smuzhiyun	le?vperm	$out1,$out1,$out1,$inpperm
1784*4882a593Smuzhiyun	stvx_u		$out0,$x00,$out
1785*4882a593Smuzhiyun	le?vperm	$out2,$out2,$out2,$inpperm
1786*4882a593Smuzhiyun	stvx_u		$out1,$x10,$out
1787*4882a593Smuzhiyun	le?vperm	$out3,$out3,$out3,$inpperm
1788*4882a593Smuzhiyun	stvx_u		$out2,$x20,$out
1789*4882a593Smuzhiyun	le?vperm	$out4,$out4,$out4,$inpperm
1790*4882a593Smuzhiyun	stvx_u		$out3,$x30,$out
1791*4882a593Smuzhiyun	le?vperm	$out5,$out5,$out5,$inpperm
1792*4882a593Smuzhiyun	stvx_u		$out4,$x40,$out
1793*4882a593Smuzhiyun	le?vperm	$out6,$out6,$out6,$inpperm
1794*4882a593Smuzhiyun	stvx_u		$out5,$x50,$out
1795*4882a593Smuzhiyun	stvx_u		$out6,$x60,$out
1796*4882a593Smuzhiyun	addi		$out,$out,0x70
1797*4882a593Smuzhiyun	b		Lctr32_enc8x_done
1798*4882a593Smuzhiyun
1799*4882a593Smuzhiyun.align	5
1800*4882a593SmuzhiyunLctr32_enc8x_six:
	# Six blocks left: the wanted inputs sit in in2..in7 and pair
	# with out0..out5.
1801*4882a593Smuzhiyun	vcipherlast	$out0,$out0,$in2
1802*4882a593Smuzhiyun	vcipherlast	$out1,$out1,$in3
1803*4882a593Smuzhiyun	vcipherlast	$out2,$out2,$in4
1804*4882a593Smuzhiyun	vcipherlast	$out3,$out3,$in5
1805*4882a593Smuzhiyun	vcipherlast	$out4,$out4,$in6
1806*4882a593Smuzhiyun	vcipherlast	$out5,$out5,$in7
1807*4882a593Smuzhiyun
1808*4882a593Smuzhiyun	le?vperm	$out0,$out0,$out0,$inpperm
1809*4882a593Smuzhiyun	le?vperm	$out1,$out1,$out1,$inpperm
1810*4882a593Smuzhiyun	stvx_u		$out0,$x00,$out
1811*4882a593Smuzhiyun	le?vperm	$out2,$out2,$out2,$inpperm
1812*4882a593Smuzhiyun	stvx_u		$out1,$x10,$out
1813*4882a593Smuzhiyun	le?vperm	$out3,$out3,$out3,$inpperm
1814*4882a593Smuzhiyun	stvx_u		$out2,$x20,$out
1815*4882a593Smuzhiyun	le?vperm	$out4,$out4,$out4,$inpperm
1816*4882a593Smuzhiyun	stvx_u		$out3,$x30,$out
1817*4882a593Smuzhiyun	le?vperm	$out5,$out5,$out5,$inpperm
1818*4882a593Smuzhiyun	stvx_u		$out4,$x40,$out
1819*4882a593Smuzhiyun	stvx_u		$out5,$x50,$out
1820*4882a593Smuzhiyun	addi		$out,$out,0x60
1821*4882a593Smuzhiyun	b		Lctr32_enc8x_done
1822*4882a593Smuzhiyun
1823*4882a593Smuzhiyun.align	5
1824*4882a593SmuzhiyunLctr32_enc8x_five:
	# Five blocks left: the wanted inputs sit in in3..in7 and pair
	# with out0..out4.
1825*4882a593Smuzhiyun	vcipherlast	$out0,$out0,$in3
1826*4882a593Smuzhiyun	vcipherlast	$out1,$out1,$in4
1827*4882a593Smuzhiyun	vcipherlast	$out2,$out2,$in5
1828*4882a593Smuzhiyun	vcipherlast	$out3,$out3,$in6
1829*4882a593Smuzhiyun	vcipherlast	$out4,$out4,$in7
1830*4882a593Smuzhiyun
1831*4882a593Smuzhiyun	le?vperm	$out0,$out0,$out0,$inpperm
1832*4882a593Smuzhiyun	le?vperm	$out1,$out1,$out1,$inpperm
1833*4882a593Smuzhiyun	stvx_u		$out0,$x00,$out
1834*4882a593Smuzhiyun	le?vperm	$out2,$out2,$out2,$inpperm
1835*4882a593Smuzhiyun	stvx_u		$out1,$x10,$out
1836*4882a593Smuzhiyun	le?vperm	$out3,$out3,$out3,$inpperm
1837*4882a593Smuzhiyun	stvx_u		$out2,$x20,$out
1838*4882a593Smuzhiyun	le?vperm	$out4,$out4,$out4,$inpperm
1839*4882a593Smuzhiyun	stvx_u		$out3,$x30,$out
1840*4882a593Smuzhiyun	stvx_u		$out4,$x40,$out
1841*4882a593Smuzhiyun	addi		$out,$out,0x50
1842*4882a593Smuzhiyun	b		Lctr32_enc8x_done
1843*4882a593Smuzhiyun
1844*4882a593Smuzhiyun.align	5
1845*4882a593SmuzhiyunLctr32_enc8x_four:
	# Four blocks left: the wanted inputs sit in in4..in7 and pair
	# with out0..out3.
1846*4882a593Smuzhiyun	vcipherlast	$out0,$out0,$in4
1847*4882a593Smuzhiyun	vcipherlast	$out1,$out1,$in5
1848*4882a593Smuzhiyun	vcipherlast	$out2,$out2,$in6
1849*4882a593Smuzhiyun	vcipherlast	$out3,$out3,$in7
1850*4882a593Smuzhiyun
1851*4882a593Smuzhiyun	le?vperm	$out0,$out0,$out0,$inpperm
1852*4882a593Smuzhiyun	le?vperm	$out1,$out1,$out1,$inpperm
1853*4882a593Smuzhiyun	stvx_u		$out0,$x00,$out
1854*4882a593Smuzhiyun	le?vperm	$out2,$out2,$out2,$inpperm
1855*4882a593Smuzhiyun	stvx_u		$out1,$x10,$out
1856*4882a593Smuzhiyun	le?vperm	$out3,$out3,$out3,$inpperm
1857*4882a593Smuzhiyun	stvx_u		$out2,$x20,$out
1858*4882a593Smuzhiyun	stvx_u		$out3,$x30,$out
1859*4882a593Smuzhiyun	addi		$out,$out,0x40
1860*4882a593Smuzhiyun	b		Lctr32_enc8x_done
1861*4882a593Smuzhiyun
1862*4882a593Smuzhiyun.align	5
1863*4882a593SmuzhiyunLctr32_enc8x_three:
	# Three blocks left: the wanted inputs sit in in5..in7 and pair
	# with out0..out2.
1864*4882a593Smuzhiyun	vcipherlast	$out0,$out0,$in5
1865*4882a593Smuzhiyun	vcipherlast	$out1,$out1,$in6
1866*4882a593Smuzhiyun	vcipherlast	$out2,$out2,$in7
1867*4882a593Smuzhiyun
1868*4882a593Smuzhiyun	le?vperm	$out0,$out0,$out0,$inpperm
1869*4882a593Smuzhiyun	le?vperm	$out1,$out1,$out1,$inpperm
1870*4882a593Smuzhiyun	stvx_u		$out0,$x00,$out
1871*4882a593Smuzhiyun	le?vperm	$out2,$out2,$out2,$inpperm
1872*4882a593Smuzhiyun	stvx_u		$out1,$x10,$out
1873*4882a593Smuzhiyun	stvx_u		$out2,$x20,$out
1874*4882a593Smuzhiyun	addi		$out,$out,0x30
1875*4882a593Smuzhiyun	b		Lctr32_enc8x_done
1876*4882a593Smuzhiyun
1877*4882a593Smuzhiyun.align	5
1878*4882a593SmuzhiyunLctr32_enc8x_two:
	# Two blocks left: the wanted inputs sit in in6 and in7 and pair
	# with out0 and out1.
1879*4882a593Smuzhiyun	vcipherlast	$out0,$out0,$in6
1880*4882a593Smuzhiyun	vcipherlast	$out1,$out1,$in7
1881*4882a593Smuzhiyun
1882*4882a593Smuzhiyun	le?vperm	$out0,$out0,$out0,$inpperm
1883*4882a593Smuzhiyun	le?vperm	$out1,$out1,$out1,$inpperm
1884*4882a593Smuzhiyun	stvx_u		$out0,$x00,$out
1885*4882a593Smuzhiyun	stvx_u		$out1,$x10,$out
1886*4882a593Smuzhiyun	addi		$out,$out,0x20
1887*4882a593Smuzhiyun	b		Lctr32_enc8x_done
1888*4882a593Smuzhiyun
1889*4882a593Smuzhiyun.align	5
1890*4882a593SmuzhiyunLctr32_enc8x_one:
	# One block left: the wanted input sits in in7 and pairs with
	# out0.  Control falls through to the common exit below.
1891*4882a593Smuzhiyun	vcipherlast	$out0,$out0,$in7
1892*4882a593Smuzhiyun
1893*4882a593Smuzhiyun	le?vperm	$out0,$out0,$out0,$inpperm
1894*4882a593Smuzhiyun	stvx_u		$out0,0,$out
1895*4882a593Smuzhiyun	addi		$out,$out,0x10
1896*4882a593Smuzhiyun
1897*4882a593SmuzhiyunLctr32_enc8x_done:
	# Common exit: overwrite the eight stack slots that held round-key
	# copies, then restore vrsave, v20-v31 and r26-r31 and release
	# the frame.
1898*4882a593Smuzhiyun	li		r10,`$FRAME+15`
1899*4882a593Smuzhiyun	li		r11,`$FRAME+31`
1900*4882a593Smuzhiyun	stvx		$inpperm,r10,$sp	# wipe copies of round keys
1901*4882a593Smuzhiyun	addi		r10,r10,32
1902*4882a593Smuzhiyun	stvx		$inpperm,r11,$sp
1903*4882a593Smuzhiyun	addi		r11,r11,32
1904*4882a593Smuzhiyun	stvx		$inpperm,r10,$sp
1905*4882a593Smuzhiyun	addi		r10,r10,32
1906*4882a593Smuzhiyun	stvx		$inpperm,r11,$sp
1907*4882a593Smuzhiyun	addi		r11,r11,32
1908*4882a593Smuzhiyun	stvx		$inpperm,r10,$sp
1909*4882a593Smuzhiyun	addi		r10,r10,32
1910*4882a593Smuzhiyun	stvx		$inpperm,r11,$sp
1911*4882a593Smuzhiyun	addi		r11,r11,32
1912*4882a593Smuzhiyun	stvx		$inpperm,r10,$sp
1913*4882a593Smuzhiyun	addi		r10,r10,32
1914*4882a593Smuzhiyun	stvx		$inpperm,r11,$sp
1915*4882a593Smuzhiyun	addi		r11,r11,32
1916*4882a593Smuzhiyun
	# After four increments each, r10 and r11 now address the v20
	# and v21 save slots, so the restore carries straight on.
1917*4882a593Smuzhiyun	mtspr		256,$vrsave
1918*4882a593Smuzhiyun	lvx		v20,r10,$sp		# ABI says so
1919*4882a593Smuzhiyun	addi		r10,r10,32
1920*4882a593Smuzhiyun	lvx		v21,r11,$sp
1921*4882a593Smuzhiyun	addi		r11,r11,32
1922*4882a593Smuzhiyun	lvx		v22,r10,$sp
1923*4882a593Smuzhiyun	addi		r10,r10,32
1924*4882a593Smuzhiyun	lvx		v23,r11,$sp
1925*4882a593Smuzhiyun	addi		r11,r11,32
1926*4882a593Smuzhiyun	lvx		v24,r10,$sp
1927*4882a593Smuzhiyun	addi		r10,r10,32
1928*4882a593Smuzhiyun	lvx		v25,r11,$sp
1929*4882a593Smuzhiyun	addi		r11,r11,32
1930*4882a593Smuzhiyun	lvx		v26,r10,$sp
1931*4882a593Smuzhiyun	addi		r10,r10,32
1932*4882a593Smuzhiyun	lvx		v27,r11,$sp
1933*4882a593Smuzhiyun	addi		r11,r11,32
1934*4882a593Smuzhiyun	lvx		v28,r10,$sp
1935*4882a593Smuzhiyun	addi		r10,r10,32
1936*4882a593Smuzhiyun	lvx		v29,r11,$sp
1937*4882a593Smuzhiyun	addi		r11,r11,32
1938*4882a593Smuzhiyun	lvx		v30,r10,$sp
1939*4882a593Smuzhiyun	lvx		v31,r11,$sp
1940*4882a593Smuzhiyun	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1941*4882a593Smuzhiyun	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1942*4882a593Smuzhiyun	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1943*4882a593Smuzhiyun	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1944*4882a593Smuzhiyun	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1945*4882a593Smuzhiyun	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1946*4882a593Smuzhiyun	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`	# release the frame
1947*4882a593Smuzhiyun	blr
1948*4882a593Smuzhiyun	.long		0
1949*4882a593Smuzhiyun	.byte		0,12,0x14,0,0x80,6,6,0
1950*4882a593Smuzhiyun	.long		0
1951*4882a593Smuzhiyun.size	.${prefix}_ctr32_encrypt_blocks,.-.${prefix}_ctr32_encrypt_blocks
1952*4882a593Smuzhiyun___
1953*4882a593Smuzhiyun}}	}}}
1954*4882a593Smuzhiyun
1955*4882a593Smuzhiyun#########################################################################
1956*4882a593Smuzhiyun{{{	# XTS procedures						#
1957*4882a593Smuzhiyun# int aes_p8_xts_[en|de]crypt(const char *inp, char *out, size_t len,	#
1958*4882a593Smuzhiyun#                             const AES_KEY *key1, const AES_KEY *key2,	#
1959*4882a593Smuzhiyun#                             [const] unsigned char iv[16]);		#
1960*4882a593Smuzhiyun# If $key2 is NULL, then a "tweak chaining" mode is engaged, in which	#
1961*4882a593Smuzhiyun# input tweak value is assumed to be encrypted already, and last tweak	#
1962*4882a593Smuzhiyun# value, one suitable for consecutive call on same chunk of data, is	#
1963*4882a593Smuzhiyun# written back to original buffer. In addition, in "tweak chaining"	#
1964*4882a593Smuzhiyun# mode only complete input blocks are processed.			#
1965*4882a593Smuzhiyun
1966*4882a593Smuzhiyunmy ($inp,$out,$len,$key1,$key2,$ivp,$rounds,$idx) =	map("r$_",(3..10));
1967*4882a593Smuzhiyunmy ($rndkey0,$rndkey1,$inout) =				map("v$_",(0..2));
1968*4882a593Smuzhiyunmy ($output,$inptail,$inpperm,$leperm,$keyperm) =	map("v$_",(3..7));
1969*4882a593Smuzhiyunmy ($tweak,$seven,$eighty7,$tmp,$tweak1) =		map("v$_",(8..12));
1970*4882a593Smuzhiyunmy $taillen = $key2;
1971*4882a593Smuzhiyun
1972*4882a593Smuzhiyun   ($inp,$idx) = ($idx,$inp);				# reassign
1973*4882a593Smuzhiyun
1974*4882a593Smuzhiyun$code.=<<___;
1975*4882a593Smuzhiyun.globl	.${prefix}_xts_encrypt
1976*4882a593Smuzhiyun	mr		$inp,r3				# reassign
1977*4882a593Smuzhiyun	li		r3,-1
1978*4882a593Smuzhiyun	${UCMP}i	$len,16
1979*4882a593Smuzhiyun	bltlr-
1980*4882a593Smuzhiyun
1981*4882a593Smuzhiyun	lis		r0,0xfff0
1982*4882a593Smuzhiyun	mfspr		r12,256				# save vrsave
1983*4882a593Smuzhiyun	li		r11,0
1984*4882a593Smuzhiyun	mtspr		256,r0
1985*4882a593Smuzhiyun
1986*4882a593Smuzhiyun	vspltisb	$seven,0x07			# 0x070707..07
1987*4882a593Smuzhiyun	le?lvsl		$leperm,r11,r11
1988*4882a593Smuzhiyun	le?vspltisb	$tmp,0x0f
1989*4882a593Smuzhiyun	le?vxor		$leperm,$leperm,$seven
1990*4882a593Smuzhiyun
1991*4882a593Smuzhiyun	li		$idx,15
1992*4882a593Smuzhiyun	lvx		$tweak,0,$ivp			# load [unaligned] iv
1993*4882a593Smuzhiyun	lvsl		$inpperm,0,$ivp
1994*4882a593Smuzhiyun	lvx		$inptail,$idx,$ivp
1995*4882a593Smuzhiyun	le?vxor		$inpperm,$inpperm,$tmp
1996*4882a593Smuzhiyun	vperm		$tweak,$tweak,$inptail,$inpperm
1997*4882a593Smuzhiyun
1998*4882a593Smuzhiyun	neg		r11,$inp
1999*4882a593Smuzhiyun	lvsr		$inpperm,0,r11			# prepare for unaligned load
2000*4882a593Smuzhiyun	lvx		$inout,0,$inp
2001*4882a593Smuzhiyun	addi		$inp,$inp,15			# 15 is not typo
2002*4882a593Smuzhiyun	le?vxor		$inpperm,$inpperm,$tmp
2003*4882a593Smuzhiyun
2004*4882a593Smuzhiyun	${UCMP}i	$key2,0				# key2==NULL?
2005*4882a593Smuzhiyun	beq		Lxts_enc_no_key2
2006*4882a593Smuzhiyun
2007*4882a593Smuzhiyun	?lvsl		$keyperm,0,$key2		# prepare for unaligned key
2008*4882a593Smuzhiyun	lwz		$rounds,240($key2)
2009*4882a593Smuzhiyun	srwi		$rounds,$rounds,1
2010*4882a593Smuzhiyun	subi		$rounds,$rounds,1
2011*4882a593Smuzhiyun	li		$idx,16
2012*4882a593Smuzhiyun
2013*4882a593Smuzhiyun	lvx		$rndkey0,0,$key2
2014*4882a593Smuzhiyun	lvx		$rndkey1,$idx,$key2
2015*4882a593Smuzhiyun	addi		$idx,$idx,16
2016*4882a593Smuzhiyun	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2017*4882a593Smuzhiyun	vxor		$tweak,$tweak,$rndkey0
2018*4882a593Smuzhiyun	lvx		$rndkey0,$idx,$key2
2019*4882a593Smuzhiyun	addi		$idx,$idx,16
2020*4882a593Smuzhiyun	mtctr		$rounds
2021*4882a593Smuzhiyun
2022*4882a593SmuzhiyunLtweak_xts_enc:
2023*4882a593Smuzhiyun	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2024*4882a593Smuzhiyun	vcipher		$tweak,$tweak,$rndkey1
2025*4882a593Smuzhiyun	lvx		$rndkey1,$idx,$key2
2026*4882a593Smuzhiyun	addi		$idx,$idx,16
2027*4882a593Smuzhiyun	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2028*4882a593Smuzhiyun	vcipher		$tweak,$tweak,$rndkey0
2029*4882a593Smuzhiyun	lvx		$rndkey0,$idx,$key2
2030*4882a593Smuzhiyun	addi		$idx,$idx,16
2031*4882a593Smuzhiyun	bdnz		Ltweak_xts_enc
2032*4882a593Smuzhiyun
2033*4882a593Smuzhiyun	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2034*4882a593Smuzhiyun	vcipher		$tweak,$tweak,$rndkey1
2035*4882a593Smuzhiyun	lvx		$rndkey1,$idx,$key2
2036*4882a593Smuzhiyun	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2037*4882a593Smuzhiyun	vcipherlast	$tweak,$tweak,$rndkey0
2038*4882a593Smuzhiyun
2039*4882a593Smuzhiyun	li		$ivp,0				# don't chain the tweak
2040*4882a593Smuzhiyun	b		Lxts_enc
2041*4882a593Smuzhiyun
2042*4882a593SmuzhiyunLxts_enc_no_key2:
2043*4882a593Smuzhiyun	li		$idx,-16
2044*4882a593Smuzhiyun	and		$len,$len,$idx			# in "tweak chaining"
2045*4882a593Smuzhiyun							# mode only complete
2046*4882a593Smuzhiyun							# blocks are processed
2047*4882a593SmuzhiyunLxts_enc:
2048*4882a593Smuzhiyun	lvx		$inptail,0,$inp
2049*4882a593Smuzhiyun	addi		$inp,$inp,16
2050*4882a593Smuzhiyun
2051*4882a593Smuzhiyun	?lvsl		$keyperm,0,$key1		# prepare for unaligned key
2052*4882a593Smuzhiyun	lwz		$rounds,240($key1)
2053*4882a593Smuzhiyun	srwi		$rounds,$rounds,1
2054*4882a593Smuzhiyun	subi		$rounds,$rounds,1
2055*4882a593Smuzhiyun	li		$idx,16
2056*4882a593Smuzhiyun
2057*4882a593Smuzhiyun	vslb		$eighty7,$seven,$seven		# 0x808080..80
2058*4882a593Smuzhiyun	vor		$eighty7,$eighty7,$seven	# 0x878787..87
2059*4882a593Smuzhiyun	vspltisb	$tmp,1				# 0x010101..01
2060*4882a593Smuzhiyun	vsldoi		$eighty7,$eighty7,$tmp,15	# 0x870101..01
2061*4882a593Smuzhiyun
2062*4882a593Smuzhiyun	${UCMP}i	$len,96
2063*4882a593Smuzhiyun	bge		_aesp8_xts_encrypt6x
2064*4882a593Smuzhiyun
2065*4882a593Smuzhiyun	andi.		$taillen,$len,15
2066*4882a593Smuzhiyun	subic		r0,$len,32
2067*4882a593Smuzhiyun	subi		$taillen,$taillen,16
2068*4882a593Smuzhiyun	subfe		r0,r0,r0
2069*4882a593Smuzhiyun	and		r0,r0,$taillen
2070*4882a593Smuzhiyun	add		$inp,$inp,r0
2071*4882a593Smuzhiyun
2072*4882a593Smuzhiyun	lvx		$rndkey0,0,$key1
2073*4882a593Smuzhiyun	lvx		$rndkey1,$idx,$key1
2074*4882a593Smuzhiyun	addi		$idx,$idx,16
2075*4882a593Smuzhiyun	vperm		$inout,$inout,$inptail,$inpperm
2076*4882a593Smuzhiyun	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2077*4882a593Smuzhiyun	vxor		$inout,$inout,$tweak
2078*4882a593Smuzhiyun	vxor		$inout,$inout,$rndkey0
2079*4882a593Smuzhiyun	lvx		$rndkey0,$idx,$key1
2080*4882a593Smuzhiyun	addi		$idx,$idx,16
2081*4882a593Smuzhiyun	mtctr		$rounds
2082*4882a593Smuzhiyun	b		Loop_xts_enc
2083*4882a593Smuzhiyun
2084*4882a593Smuzhiyun.align	5
2085*4882a593SmuzhiyunLoop_xts_enc:
2086*4882a593Smuzhiyun	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2087*4882a593Smuzhiyun	vcipher		$inout,$inout,$rndkey1
2088*4882a593Smuzhiyun	lvx		$rndkey1,$idx,$key1
2089*4882a593Smuzhiyun	addi		$idx,$idx,16
2090*4882a593Smuzhiyun	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2091*4882a593Smuzhiyun	vcipher		$inout,$inout,$rndkey0
2092*4882a593Smuzhiyun	lvx		$rndkey0,$idx,$key1
2093*4882a593Smuzhiyun	addi		$idx,$idx,16
2094*4882a593Smuzhiyun	bdnz		Loop_xts_enc
2095*4882a593Smuzhiyun
2096*4882a593Smuzhiyun	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2097*4882a593Smuzhiyun	vcipher		$inout,$inout,$rndkey1
2098*4882a593Smuzhiyun	lvx		$rndkey1,$idx,$key1
2099*4882a593Smuzhiyun	li		$idx,16
2100*4882a593Smuzhiyun	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2101*4882a593Smuzhiyun	vxor		$rndkey0,$rndkey0,$tweak
2102*4882a593Smuzhiyun	vcipherlast	$output,$inout,$rndkey0
2103*4882a593Smuzhiyun
2104*4882a593Smuzhiyun	le?vperm	$tmp,$output,$output,$leperm
2105*4882a593Smuzhiyun	be?nop
2106*4882a593Smuzhiyun	le?stvx_u	$tmp,0,$out
2107*4882a593Smuzhiyun	be?stvx_u	$output,0,$out
2108*4882a593Smuzhiyun	addi		$out,$out,16
2109*4882a593Smuzhiyun
2110*4882a593Smuzhiyun	subic.		$len,$len,16
2111*4882a593Smuzhiyun	beq		Lxts_enc_done
2112*4882a593Smuzhiyun
2113*4882a593Smuzhiyun	vmr		$inout,$inptail
2114*4882a593Smuzhiyun	lvx		$inptail,0,$inp
2115*4882a593Smuzhiyun	addi		$inp,$inp,16
2116*4882a593Smuzhiyun	lvx		$rndkey0,0,$key1
2117*4882a593Smuzhiyun	lvx		$rndkey1,$idx,$key1
2118*4882a593Smuzhiyun	addi		$idx,$idx,16
2119*4882a593Smuzhiyun
2120*4882a593Smuzhiyun	subic		r0,$len,32
2121*4882a593Smuzhiyun	subfe		r0,r0,r0
2122*4882a593Smuzhiyun	and		r0,r0,$taillen
2123*4882a593Smuzhiyun	add		$inp,$inp,r0
2124*4882a593Smuzhiyun
2125*4882a593Smuzhiyun	vsrab		$tmp,$tweak,$seven		# next tweak value
2126*4882a593Smuzhiyun	vaddubm		$tweak,$tweak,$tweak
2127*4882a593Smuzhiyun	vsldoi		$tmp,$tmp,$tmp,15
2128*4882a593Smuzhiyun	vand		$tmp,$tmp,$eighty7
2129*4882a593Smuzhiyun	vxor		$tweak,$tweak,$tmp
2130*4882a593Smuzhiyun
2131*4882a593Smuzhiyun	vperm		$inout,$inout,$inptail,$inpperm
2132*4882a593Smuzhiyun	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2133*4882a593Smuzhiyun	vxor		$inout,$inout,$tweak
2134*4882a593Smuzhiyun	vxor		$output,$output,$rndkey0	# just in case $len<16
2135*4882a593Smuzhiyun	vxor		$inout,$inout,$rndkey0
2136*4882a593Smuzhiyun	lvx		$rndkey0,$idx,$key1
2137*4882a593Smuzhiyun	addi		$idx,$idx,16
2138*4882a593Smuzhiyun
2139*4882a593Smuzhiyun	mtctr		$rounds
2140*4882a593Smuzhiyun	${UCMP}i	$len,16
2141*4882a593Smuzhiyun	bge		Loop_xts_enc
2142*4882a593Smuzhiyun
2143*4882a593Smuzhiyun	vxor		$output,$output,$tweak
2144*4882a593Smuzhiyun	lvsr		$inpperm,0,$len			# $inpperm is no longer needed
2145*4882a593Smuzhiyun	vxor		$inptail,$inptail,$inptail	# $inptail is no longer needed
2146*4882a593Smuzhiyun	vspltisb	$tmp,-1
2147*4882a593Smuzhiyun	vperm		$inptail,$inptail,$tmp,$inpperm
2148*4882a593Smuzhiyun	vsel		$inout,$inout,$output,$inptail
2149*4882a593Smuzhiyun
2150*4882a593Smuzhiyun	subi		r11,$out,17
2151*4882a593Smuzhiyun	subi		$out,$out,16
2152*4882a593Smuzhiyun	mtctr		$len
2153*4882a593Smuzhiyun	li		$len,16
2154*4882a593SmuzhiyunLoop_xts_enc_steal:
2155*4882a593Smuzhiyun	lbzu		r0,1(r11)
2156*4882a593Smuzhiyun	stb		r0,16(r11)
2157*4882a593Smuzhiyun	bdnz		Loop_xts_enc_steal
2158*4882a593Smuzhiyun
2159*4882a593Smuzhiyun	mtctr		$rounds
2160*4882a593Smuzhiyun	b		Loop_xts_enc			# one more time...
2161*4882a593Smuzhiyun
2162*4882a593SmuzhiyunLxts_enc_done:
2163*4882a593Smuzhiyun	${UCMP}i	$ivp,0
2164*4882a593Smuzhiyun	beq		Lxts_enc_ret
2165*4882a593Smuzhiyun
2166*4882a593Smuzhiyun	vsrab		$tmp,$tweak,$seven		# next tweak value
2167*4882a593Smuzhiyun	vaddubm		$tweak,$tweak,$tweak
2168*4882a593Smuzhiyun	vsldoi		$tmp,$tmp,$tmp,15
2169*4882a593Smuzhiyun	vand		$tmp,$tmp,$eighty7
2170*4882a593Smuzhiyun	vxor		$tweak,$tweak,$tmp
2171*4882a593Smuzhiyun
2172*4882a593Smuzhiyun	le?vperm	$tweak,$tweak,$tweak,$leperm
2173*4882a593Smuzhiyun	stvx_u		$tweak,0,$ivp
2174*4882a593Smuzhiyun
2175*4882a593SmuzhiyunLxts_enc_ret:
2176*4882a593Smuzhiyun	mtspr		256,r12				# restore vrsave
2177*4882a593Smuzhiyun	li		r3,0
2178*4882a593Smuzhiyun	blr
2179*4882a593Smuzhiyun	.long		0
2180*4882a593Smuzhiyun	.byte		0,12,0x04,0,0x80,6,6,0
2181*4882a593Smuzhiyun	.long		0
2182*4882a593Smuzhiyun.size	.${prefix}_xts_encrypt,.-.${prefix}_xts_encrypt
2183*4882a593Smuzhiyun
# AES-XTS decryption, one block at a time. Inputs of 96 bytes or more are
# handed to _aesp8_xts_decrypt6x once the common setup below is done.
#
# Registers as used in this routine:
#   r3     input pointer on entry (copied to $inp ), return value on exit
#   $out   output pointer
#   $len   byte count
#   $key1  key schedule used to decrypt the data (round count at offset 240)
#   $key2  key schedule used to encrypt the initial tweak, or NULL
#   $ivp   16-byte tweak/IV buffer (need not be aligned)
#
# When $key2 is NULL the tweak is used as supplied, only complete blocks
# are processed, and the next tweak value is written back through $ivp .
# Otherwise the tweak is first encrypted with $key2 and nothing is written
# back. A trailing partial block is handled by the stealing code at
# Ltail_xts_dec.
#
# Returns -1 in r3 if $len is below 16, otherwise 0.
2184*4882a593Smuzhiyun.globl	.${prefix}_xts_decrypt
2185*4882a593Smuzhiyun	mr		$inp,r3				# reassign
	# Preload the failure code and return at once when there is less than
	# one full block. NOTE(review): UCMP is defined outside this chunk,
	# presumably an unsigned compare of the native word size - confirm.
2186*4882a593Smuzhiyun	li		r3,-1
2187*4882a593Smuzhiyun	${UCMP}i	$len,16
2188*4882a593Smuzhiyun	bltlr-
2189*4882a593Smuzhiyun
	# Keep the caller value of SPR 256 in r12 (restored at Lxts_dec_ret)
	# and install the mask built by lis r0,0xfff8 for the duration.
	# r11 = 0 is consumed by the lvsl that builds $leperm below.
2190*4882a593Smuzhiyun	lis		r0,0xfff8
2191*4882a593Smuzhiyun	mfspr		r12,256				# save vrsave
2192*4882a593Smuzhiyun	li		r11,0
2193*4882a593Smuzhiyun	mtspr		256,r0
2194*4882a593Smuzhiyun
	# r0 = 16 when $len is not a multiple of 16, else 0. Subtracting it
	# holds back one full block, so that after the main loop $len equals
	# the number of tail bytes and the held-back block is still pending
	# when Ltail_xts_dec is reached.
2195*4882a593Smuzhiyun	andi.		r0,$len,15
2196*4882a593Smuzhiyun	neg		r0,r0
2197*4882a593Smuzhiyun	andi.		r0,r0,16
2198*4882a593Smuzhiyun	sub		$len,$len,r0
2199*4882a593Smuzhiyun
	# $seven is used both as the shift count in the tweak update and,
	# on the le-prefixed lines, to turn the identity permutation from
	# lvsl into $leperm . $tmp (0x0f..0f) adjusts the load permutations
	# on those same builds.
2200*4882a593Smuzhiyun	vspltisb	$seven,0x07			# 0x070707..07
2201*4882a593Smuzhiyun	le?lvsl		$leperm,r11,r11
2202*4882a593Smuzhiyun	le?vspltisb	$tmp,0x0f
2203*4882a593Smuzhiyun	le?vxor		$leperm,$leperm,$seven
2204*4882a593Smuzhiyun
	# Unaligned 16-byte load of the tweak: fetch the two quadwords that
	# cover it (offsets 0 and 15) and merge them with vperm.
2205*4882a593Smuzhiyun	li		$idx,15
2206*4882a593Smuzhiyun	lvx		$tweak,0,$ivp			# load [unaligned] iv
2207*4882a593Smuzhiyun	lvsl		$inpperm,0,$ivp
2208*4882a593Smuzhiyun	lvx		$inptail,$idx,$ivp
2209*4882a593Smuzhiyun	le?vxor		$inpperm,$inpperm,$tmp
2210*4882a593Smuzhiyun	vperm		$tweak,$tweak,$inptail,$inpperm
2211*4882a593Smuzhiyun
	# Start the streaming unaligned input load: $inout holds the first
	# quadword, $inpperm the merge permutation. $inp is advanced by 15 so
	# that the lvx at Lxts_dec fetches the quadword holding the last byte
	# of the first block.
2212*4882a593Smuzhiyun	neg		r11,$inp
2213*4882a593Smuzhiyun	lvsr		$inpperm,0,r11			# prepare for unaligned load
2214*4882a593Smuzhiyun	lvx		$inout,0,$inp
2215*4882a593Smuzhiyun	addi		$inp,$inp,15			# 15 is not typo
2216*4882a593Smuzhiyun	le?vxor		$inpperm,$inpperm,$tmp
2217*4882a593Smuzhiyun
2218*4882a593Smuzhiyun	${UCMP}i	$key2,0				# key2==NULL?
2219*4882a593Smuzhiyun	beq		Lxts_dec_no_key2
2220*4882a593Smuzhiyun
	# Encrypt the tweak with $key2 . Note that vcipher (encryption) is
	# used here even though the data path below uses vncipher.
	# CTR = (round count at offset 240) / 2 - 1, and each loop iteration
	# performs two rounds.
2221*4882a593Smuzhiyun	?lvsl		$keyperm,0,$key2		# prepare for unaligned key
2222*4882a593Smuzhiyun	lwz		$rounds,240($key2)
2223*4882a593Smuzhiyun	srwi		$rounds,$rounds,1
2224*4882a593Smuzhiyun	subi		$rounds,$rounds,1
2225*4882a593Smuzhiyun	li		$idx,16
2226*4882a593Smuzhiyun
2227*4882a593Smuzhiyun	lvx		$rndkey0,0,$key2
2228*4882a593Smuzhiyun	lvx		$rndkey1,$idx,$key2
2229*4882a593Smuzhiyun	addi		$idx,$idx,16
2230*4882a593Smuzhiyun	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2231*4882a593Smuzhiyun	vxor		$tweak,$tweak,$rndkey0
2232*4882a593Smuzhiyun	lvx		$rndkey0,$idx,$key2
2233*4882a593Smuzhiyun	addi		$idx,$idx,16
2234*4882a593Smuzhiyun	mtctr		$rounds
2235*4882a593Smuzhiyun
2236*4882a593SmuzhiyunLtweak_xts_dec:
2237*4882a593Smuzhiyun	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2238*4882a593Smuzhiyun	vcipher		$tweak,$tweak,$rndkey1
2239*4882a593Smuzhiyun	lvx		$rndkey1,$idx,$key2
2240*4882a593Smuzhiyun	addi		$idx,$idx,16
2241*4882a593Smuzhiyun	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2242*4882a593Smuzhiyun	vcipher		$tweak,$tweak,$rndkey0
2243*4882a593Smuzhiyun	lvx		$rndkey0,$idx,$key2
2244*4882a593Smuzhiyun	addi		$idx,$idx,16
2245*4882a593Smuzhiyun	bdnz		Ltweak_xts_dec
2246*4882a593Smuzhiyun
	# Last two rounds of the tweak encryption.
2247*4882a593Smuzhiyun	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2248*4882a593Smuzhiyun	vcipher		$tweak,$tweak,$rndkey1
2249*4882a593Smuzhiyun	lvx		$rndkey1,$idx,$key2
2250*4882a593Smuzhiyun	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2251*4882a593Smuzhiyun	vcipherlast	$tweak,$tweak,$rndkey0
2252*4882a593Smuzhiyun
	# Zeroing $ivp makes Lxts_dec_done skip the tweak write-back.
2253*4882a593Smuzhiyun	li		$ivp,0				# don't chain the tweak
2254*4882a593Smuzhiyun	b		Lxts_dec
2255*4882a593Smuzhiyun
2256*4882a593SmuzhiyunLxts_dec_no_key2:
	# Round $len up to a multiple of 16. Together with the 16 bytes held
	# back at entry for a ragged length, this leaves the original length
	# rounded down to whole blocks, so the stealing path is never taken.
2257*4882a593Smuzhiyun	neg		$idx,$len
2258*4882a593Smuzhiyun	andi.		$idx,$idx,15
2259*4882a593Smuzhiyun	add		$len,$len,$idx			# in "tweak chaining"
2260*4882a593Smuzhiyun							# mode only complete
2261*4882a593Smuzhiyun							# blocks are processed
2262*4882a593SmuzhiyunLxts_dec:
	# Second quadword of the first input block. From here on $inp stays
	# one quadword ahead of the block being processed.
2263*4882a593Smuzhiyun	lvx		$inptail,0,$inp
2264*4882a593Smuzhiyun	addi		$inp,$inp,16
2265*4882a593Smuzhiyun
	# Same loop-counter derivation as for the tweak key, now for $key1 .
2266*4882a593Smuzhiyun	?lvsl		$keyperm,0,$key1		# prepare for unaligned key
2267*4882a593Smuzhiyun	lwz		$rounds,240($key1)
2268*4882a593Smuzhiyun	srwi		$rounds,$rounds,1
2269*4882a593Smuzhiyun	subi		$rounds,$rounds,1
2270*4882a593Smuzhiyun	li		$idx,16
2271*4882a593Smuzhiyun
	# Constant for the tweak update: 0x87 in the first byte, 0x01 in
	# every other byte.
2272*4882a593Smuzhiyun	vslb		$eighty7,$seven,$seven		# 0x808080..80
2273*4882a593Smuzhiyun	vor		$eighty7,$eighty7,$seven	# 0x878787..87
2274*4882a593Smuzhiyun	vspltisb	$tmp,1				# 0x010101..01
2275*4882a593Smuzhiyun	vsldoi		$eighty7,$eighty7,$tmp,15	# 0x870101..01
2276*4882a593Smuzhiyun
	# Six blocks or more: continue in the 6x routine (defined elsewhere
	# in this file).
2277*4882a593Smuzhiyun	${UCMP}i	$len,96
2278*4882a593Smuzhiyun	bge		_aesp8_xts_decrypt6x
2279*4882a593Smuzhiyun
	# Assemble the first input block and apply the tweak together with
	# round key 0. $rndkey1 and $rndkey0 are left holding the raw
	# quadwords needed to form the next round keys.
2280*4882a593Smuzhiyun	lvx		$rndkey0,0,$key1
2281*4882a593Smuzhiyun	lvx		$rndkey1,$idx,$key1
2282*4882a593Smuzhiyun	addi		$idx,$idx,16
2283*4882a593Smuzhiyun	vperm		$inout,$inout,$inptail,$inpperm
2284*4882a593Smuzhiyun	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2285*4882a593Smuzhiyun	vxor		$inout,$inout,$tweak
2286*4882a593Smuzhiyun	vxor		$inout,$inout,$rndkey0
2287*4882a593Smuzhiyun	lvx		$rndkey0,$idx,$key1
2288*4882a593Smuzhiyun	addi		$idx,$idx,16
2289*4882a593Smuzhiyun	mtctr		$rounds
2290*4882a593Smuzhiyun
	# $len below 16 here means the input was a single full block plus a
	# partial tail, so go straight to the stealing code.
	# NOTE(review): the be-prefixed branch targets the label that follows
	# the alignment directive; presumably it only exists to skip padding
	# on big-endian builds - confirm against the prefix handling at the
	# end of the file.
2291*4882a593Smuzhiyun	${UCMP}i	$len,16
2292*4882a593Smuzhiyun	blt		Ltail_xts_dec
2293*4882a593Smuzhiyun	be?b		Loop_xts_dec
2294*4882a593Smuzhiyun
2295*4882a593Smuzhiyun.align	5
2296*4882a593SmuzhiyunLoop_xts_dec:
	# Two decryption rounds per iteration; the next round key is loaded
	# and merged while the current one is in use.
2297*4882a593Smuzhiyun	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2298*4882a593Smuzhiyun	vncipher	$inout,$inout,$rndkey1
2299*4882a593Smuzhiyun	lvx		$rndkey1,$idx,$key1
2300*4882a593Smuzhiyun	addi		$idx,$idx,16
2301*4882a593Smuzhiyun	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2302*4882a593Smuzhiyun	vncipher	$inout,$inout,$rndkey0
2303*4882a593Smuzhiyun	lvx		$rndkey0,$idx,$key1
2304*4882a593Smuzhiyun	addi		$idx,$idx,16
2305*4882a593Smuzhiyun	bdnz		Loop_xts_dec
2306*4882a593Smuzhiyun
	# Final two rounds. The tweak is folded into the last round key so
	# that vncipherlast applies the output whitening as well.
	# $idx is reset to 16 for the next pass over the key schedule.
2307*4882a593Smuzhiyun	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2308*4882a593Smuzhiyun	vncipher	$inout,$inout,$rndkey1
2309*4882a593Smuzhiyun	lvx		$rndkey1,$idx,$key1
2310*4882a593Smuzhiyun	li		$idx,16
2311*4882a593Smuzhiyun	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2312*4882a593Smuzhiyun	vxor		$rndkey0,$rndkey0,$tweak
2313*4882a593Smuzhiyun	vncipherlast	$output,$inout,$rndkey0
2314*4882a593Smuzhiyun
	# Store the block; on the le-prefixed path a byte-permuted copy is
	# stored and $output itself is left untouched.
2315*4882a593Smuzhiyun	le?vperm	$tmp,$output,$output,$leperm
2316*4882a593Smuzhiyun	be?nop
2317*4882a593Smuzhiyun	le?stvx_u	$tmp,0,$out
2318*4882a593Smuzhiyun	be?stvx_u	$output,0,$out
2319*4882a593Smuzhiyun	addi		$out,$out,16
2320*4882a593Smuzhiyun
	# One block consumed; finished when nothing is left.
2321*4882a593Smuzhiyun	subic.		$len,$len,16
2322*4882a593Smuzhiyun	beq		Lxts_dec_done
2323*4882a593Smuzhiyun
	# Slide the input window forward by one quadword and restart the key
	# schedule.
2324*4882a593Smuzhiyun	vmr		$inout,$inptail
2325*4882a593Smuzhiyun	lvx		$inptail,0,$inp
2326*4882a593Smuzhiyun	addi		$inp,$inp,16
2327*4882a593Smuzhiyun	lvx		$rndkey0,0,$key1
2328*4882a593Smuzhiyun	lvx		$rndkey1,$idx,$key1
2329*4882a593Smuzhiyun	addi		$idx,$idx,16
2330*4882a593Smuzhiyun
	# Tweak update: vaddubm doubles every byte, vsrab yields a per-byte
	# mask of the bits shifted out, vsldoi rotates that mask by one byte
	# and the and with $eighty7 reduces it to a 0x01 carry per byte, or
	# 0x87 for the carry that wraps around.
2331*4882a593Smuzhiyun	vsrab		$tmp,$tweak,$seven		# next tweak value
2332*4882a593Smuzhiyun	vaddubm		$tweak,$tweak,$tweak
2333*4882a593Smuzhiyun	vsldoi		$tmp,$tmp,$tmp,15
2334*4882a593Smuzhiyun	vand		$tmp,$tmp,$eighty7
2335*4882a593Smuzhiyun	vxor		$tweak,$tweak,$tmp
2336*4882a593Smuzhiyun
	# Assemble the next block and apply the new tweak and round key 0.
2337*4882a593Smuzhiyun	vperm		$inout,$inout,$inptail,$inpperm
2338*4882a593Smuzhiyun	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2339*4882a593Smuzhiyun	vxor		$inout,$inout,$tweak
2340*4882a593Smuzhiyun	vxor		$inout,$inout,$rndkey0
2341*4882a593Smuzhiyun	lvx		$rndkey0,$idx,$key1
2342*4882a593Smuzhiyun	addi		$idx,$idx,16
2343*4882a593Smuzhiyun
	# Keep looping while at least one full block remains; otherwise fall
	# through with $len equal to the number of tail bytes.
2344*4882a593Smuzhiyun	mtctr		$rounds
2345*4882a593Smuzhiyun	${UCMP}i	$len,16
2346*4882a593Smuzhiyun	bge		Loop_xts_dec
2347*4882a593Smuzhiyun
2348*4882a593SmuzhiyunLtail_xts_dec:
	# Stealing path. The pending full block must be decrypted with the
	# tweak that follows the current one, so compute that value into
	# $tweak1 and leave $tweak unchanged for the final block.
2349*4882a593Smuzhiyun	vsrab		$tmp,$tweak,$seven		# next tweak value
2350*4882a593Smuzhiyun	vaddubm		$tweak1,$tweak,$tweak
2351*4882a593Smuzhiyun	vsldoi		$tmp,$tmp,$tmp,15
2352*4882a593Smuzhiyun	vand		$tmp,$tmp,$eighty7
2353*4882a593Smuzhiyun	vxor		$tweak1,$tweak1,$tmp
2354*4882a593Smuzhiyun
	# Step $inp back one block and forward by the tail length, ready for
	# the lvx that fetches the end of the partial block below.
2355*4882a593Smuzhiyun	subi		$inp,$inp,16
2356*4882a593Smuzhiyun	add		$inp,$inp,$len
2357*4882a593Smuzhiyun
	# $inout was already whitened with $tweak on the way here: remove it
	# and apply $tweak1 instead.
2358*4882a593Smuzhiyun	vxor		$inout,$inout,$tweak		# :-(
2359*4882a593Smuzhiyun	vxor		$inout,$inout,$tweak1		# :-)
2360*4882a593Smuzhiyun
2361*4882a593SmuzhiyunLoop_xts_dec_short:
	# Same round loop as Loop_xts_dec, for the block that uses $tweak1 .
2362*4882a593Smuzhiyun	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2363*4882a593Smuzhiyun	vncipher	$inout,$inout,$rndkey1
2364*4882a593Smuzhiyun	lvx		$rndkey1,$idx,$key1
2365*4882a593Smuzhiyun	addi		$idx,$idx,16
2366*4882a593Smuzhiyun	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2367*4882a593Smuzhiyun	vncipher	$inout,$inout,$rndkey0
2368*4882a593Smuzhiyun	lvx		$rndkey0,$idx,$key1
2369*4882a593Smuzhiyun	addi		$idx,$idx,16
2370*4882a593Smuzhiyun	bdnz		Loop_xts_dec_short
2371*4882a593Smuzhiyun
2372*4882a593Smuzhiyun	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2373*4882a593Smuzhiyun	vncipher	$inout,$inout,$rndkey1
2374*4882a593Smuzhiyun	lvx		$rndkey1,$idx,$key1
2375*4882a593Smuzhiyun	li		$idx,16
2376*4882a593Smuzhiyun	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2377*4882a593Smuzhiyun	vxor		$rndkey0,$rndkey0,$tweak1
2378*4882a593Smuzhiyun	vncipherlast	$output,$inout,$rndkey0
2379*4882a593Smuzhiyun
	# Store the result at $out without advancing the pointer: the steal
	# loop below copies part of it forward and the final block then
	# overwrites this location.
2380*4882a593Smuzhiyun	le?vperm	$tmp,$output,$output,$leperm
2381*4882a593Smuzhiyun	be?nop
2382*4882a593Smuzhiyun	le?stvx_u	$tmp,0,$out
2383*4882a593Smuzhiyun	be?stvx_u	$output,0,$out
2384*4882a593Smuzhiyun
	# Fetch the partial block. $inp is deliberately not advanced (see the
	# disabled addi), as no further input is read after this.
2385*4882a593Smuzhiyun	vmr		$inout,$inptail
2386*4882a593Smuzhiyun	lvx		$inptail,0,$inp
2387*4882a593Smuzhiyun	#addi		$inp,$inp,16
2388*4882a593Smuzhiyun	lvx		$rndkey0,0,$key1
2389*4882a593Smuzhiyun	lvx		$rndkey1,$idx,$key1
2390*4882a593Smuzhiyun	addi		$idx,$idx,16
2391*4882a593Smuzhiyun	vperm		$inout,$inout,$inptail,$inpperm
2392*4882a593Smuzhiyun	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2393*4882a593Smuzhiyun
	# Build a select mask that is 0x00 in the first $len bytes and 0xff
	# in the rest, then merge: the leading bytes stay from $inout (the
	# tail input) and the remaining bytes are taken from $output (the
	# block just decrypted).
2394*4882a593Smuzhiyun	lvsr		$inpperm,0,$len			# $inpperm is no longer needed
2395*4882a593Smuzhiyun	vxor		$inptail,$inptail,$inptail	# $inptail is no longer needed
2396*4882a593Smuzhiyun	vspltisb	$tmp,-1
2397*4882a593Smuzhiyun	vperm		$inptail,$inptail,$tmp,$inpperm
2398*4882a593Smuzhiyun	vsel		$inout,$inout,$output,$inptail
2399*4882a593Smuzhiyun
	# Whiten the merged block with the original $tweak and round key 0.
2400*4882a593Smuzhiyun	vxor		$rndkey0,$rndkey0,$tweak
2401*4882a593Smuzhiyun	vxor		$inout,$inout,$rndkey0
2402*4882a593Smuzhiyun	lvx		$rndkey0,$idx,$key1
2403*4882a593Smuzhiyun	addi		$idx,$idx,16
2404*4882a593Smuzhiyun
	# Copy the first $len bytes of the block stored at $out to the 16
	# bytes after it, one byte per iteration. $len is then set to 16 so
	# that the pass through Loop_xts_dec below ends at Lxts_dec_done.
2405*4882a593Smuzhiyun	subi		r11,$out,1
2406*4882a593Smuzhiyun	mtctr		$len
2407*4882a593Smuzhiyun	li		$len,16
2408*4882a593SmuzhiyunLoop_xts_dec_steal:
2409*4882a593Smuzhiyun	lbzu		r0,1(r11)
2410*4882a593Smuzhiyun	stb		r0,16(r11)
2411*4882a593Smuzhiyun	bdnz		Loop_xts_dec_steal
2412*4882a593Smuzhiyun
2413*4882a593Smuzhiyun	mtctr		$rounds
2414*4882a593Smuzhiyun	b		Loop_xts_dec			# one more time...
2415*4882a593Smuzhiyun
2416*4882a593SmuzhiyunLxts_dec_done:
	# $ivp is zero when the tweak was derived from $key2 above; in that
	# case nothing is written back.
2417*4882a593Smuzhiyun	${UCMP}i	$ivp,0
2418*4882a593Smuzhiyun	beq		Lxts_dec_ret
2419*4882a593Smuzhiyun
	# Chaining mode: advance the tweak once more and store it through
	# $ivp for the next call.
2420*4882a593Smuzhiyun	vsrab		$tmp,$tweak,$seven		# next tweak value
2421*4882a593Smuzhiyun	vaddubm		$tweak,$tweak,$tweak
2422*4882a593Smuzhiyun	vsldoi		$tmp,$tmp,$tmp,15
2423*4882a593Smuzhiyun	vand		$tmp,$tmp,$eighty7
2424*4882a593Smuzhiyun	vxor		$tweak,$tweak,$tmp
2425*4882a593Smuzhiyun
2426*4882a593Smuzhiyun	le?vperm	$tweak,$tweak,$tweak,$leperm
2427*4882a593Smuzhiyun	stvx_u		$tweak,0,$ivp
2428*4882a593Smuzhiyun
2429*4882a593SmuzhiyunLxts_dec_ret:
	# Success: put back the value saved in r12 at entry and return 0.
2430*4882a593Smuzhiyun	mtspr		256,r12				# restore vrsave
2431*4882a593Smuzhiyun	li		r3,0
2432*4882a593Smuzhiyun	blr
	# NOTE(review): the three data directives below look like a
	# traceback table following the return - confirm against the ABI.
2433*4882a593Smuzhiyun	.long		0
2434*4882a593Smuzhiyun	.byte		0,12,0x04,0,0x80,6,6,0
2435*4882a593Smuzhiyun	.long		0
2436*4882a593Smuzhiyun.size	.${prefix}_xts_decrypt,.-.${prefix}_xts_decrypt
2437*4882a593Smuzhiyun___
2438*4882a593Smuzhiyun#########################################################################
2439*4882a593Smuzhiyun{{	# Optimized XTS procedures					#
2440*4882a593Smuzhiyunmy $key_=$key2;
2441*4882a593Smuzhiyunmy ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,3,26..31));
2442*4882a593Smuzhiyun    $x00=0 if ($flavour =~ /osx/);
2443*4882a593Smuzhiyunmy ($in0,  $in1,  $in2,  $in3,  $in4,  $in5 )=map("v$_",(0..5));
2444*4882a593Smuzhiyunmy ($out0, $out1, $out2, $out3, $out4, $out5)=map("v$_",(7,12..16));
2445*4882a593Smuzhiyunmy ($twk0, $twk1, $twk2, $twk3, $twk4, $twk5)=map("v$_",(17..22));
2446*4882a593Smuzhiyunmy $rndkey0="v23";	# v24-v25 rotating buffer for first found keys
2447*4882a593Smuzhiyun			# v26-v31 last 6 round keys
2448*4882a593Smuzhiyunmy ($keyperm)=($out0);	# aliases with "caller", redundant assignment
2449*4882a593Smuzhiyunmy $taillen=$x70;
2450*4882a593Smuzhiyun
2451*4882a593Smuzhiyun$code.=<<___;
2452*4882a593Smuzhiyun.align	5
2453*4882a593Smuzhiyun_aesp8_xts_encrypt6x:
2454*4882a593Smuzhiyun	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
2455*4882a593Smuzhiyun	mflr		r11
2456*4882a593Smuzhiyun	li		r7,`$FRAME+8*16+15`
2457*4882a593Smuzhiyun	li		r3,`$FRAME+8*16+31`
2458*4882a593Smuzhiyun	$PUSH		r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
2459*4882a593Smuzhiyun	stvx		v20,r7,$sp		# ABI says so
2460*4882a593Smuzhiyun	addi		r7,r7,32
2461*4882a593Smuzhiyun	stvx		v21,r3,$sp
2462*4882a593Smuzhiyun	addi		r3,r3,32
2463*4882a593Smuzhiyun	stvx		v22,r7,$sp
2464*4882a593Smuzhiyun	addi		r7,r7,32
2465*4882a593Smuzhiyun	stvx		v23,r3,$sp
2466*4882a593Smuzhiyun	addi		r3,r3,32
2467*4882a593Smuzhiyun	stvx		v24,r7,$sp
2468*4882a593Smuzhiyun	addi		r7,r7,32
2469*4882a593Smuzhiyun	stvx		v25,r3,$sp
2470*4882a593Smuzhiyun	addi		r3,r3,32
2471*4882a593Smuzhiyun	stvx		v26,r7,$sp
2472*4882a593Smuzhiyun	addi		r7,r7,32
2473*4882a593Smuzhiyun	stvx		v27,r3,$sp
2474*4882a593Smuzhiyun	addi		r3,r3,32
2475*4882a593Smuzhiyun	stvx		v28,r7,$sp
2476*4882a593Smuzhiyun	addi		r7,r7,32
2477*4882a593Smuzhiyun	stvx		v29,r3,$sp
2478*4882a593Smuzhiyun	addi		r3,r3,32
2479*4882a593Smuzhiyun	stvx		v30,r7,$sp
2480*4882a593Smuzhiyun	stvx		v31,r3,$sp
2481*4882a593Smuzhiyun	li		r0,-1
2482*4882a593Smuzhiyun	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
2483*4882a593Smuzhiyun	li		$x10,0x10
2484*4882a593Smuzhiyun	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
2485*4882a593Smuzhiyun	li		$x20,0x20
2486*4882a593Smuzhiyun	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
2487*4882a593Smuzhiyun	li		$x30,0x30
2488*4882a593Smuzhiyun	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
2489*4882a593Smuzhiyun	li		$x40,0x40
2490*4882a593Smuzhiyun	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
2491*4882a593Smuzhiyun	li		$x50,0x50
2492*4882a593Smuzhiyun	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
2493*4882a593Smuzhiyun	li		$x60,0x60
2494*4882a593Smuzhiyun	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
2495*4882a593Smuzhiyun	li		$x70,0x70
2496*4882a593Smuzhiyun	mtspr		256,r0
2497*4882a593Smuzhiyun
2498*4882a593Smuzhiyun	subi		$rounds,$rounds,3	# -4 in total
2499*4882a593Smuzhiyun
2500*4882a593Smuzhiyun	lvx		$rndkey0,$x00,$key1	# load key schedule
2501*4882a593Smuzhiyun	lvx		v30,$x10,$key1
2502*4882a593Smuzhiyun	addi		$key1,$key1,0x20
2503*4882a593Smuzhiyun	lvx		v31,$x00,$key1
2504*4882a593Smuzhiyun	?vperm		$rndkey0,$rndkey0,v30,$keyperm
2505*4882a593Smuzhiyun	addi		$key_,$sp,$FRAME+15
2506*4882a593Smuzhiyun	mtctr		$rounds
2507*4882a593Smuzhiyun
2508*4882a593SmuzhiyunLoad_xts_enc_key:
2509*4882a593Smuzhiyun	?vperm		v24,v30,v31,$keyperm
2510*4882a593Smuzhiyun	lvx		v30,$x10,$key1
2511*4882a593Smuzhiyun	addi		$key1,$key1,0x20
2512*4882a593Smuzhiyun	stvx		v24,$x00,$key_		# off-load round[1]
2513*4882a593Smuzhiyun	?vperm		v25,v31,v30,$keyperm
2514*4882a593Smuzhiyun	lvx		v31,$x00,$key1
2515*4882a593Smuzhiyun	stvx		v25,$x10,$key_		# off-load round[2]
2516*4882a593Smuzhiyun	addi		$key_,$key_,0x20
2517*4882a593Smuzhiyun	bdnz		Load_xts_enc_key
2518*4882a593Smuzhiyun
2519*4882a593Smuzhiyun	lvx		v26,$x10,$key1
2520*4882a593Smuzhiyun	?vperm		v24,v30,v31,$keyperm
2521*4882a593Smuzhiyun	lvx		v27,$x20,$key1
2522*4882a593Smuzhiyun	stvx		v24,$x00,$key_		# off-load round[3]
2523*4882a593Smuzhiyun	?vperm		v25,v31,v26,$keyperm
2524*4882a593Smuzhiyun	lvx		v28,$x30,$key1
2525*4882a593Smuzhiyun	stvx		v25,$x10,$key_		# off-load round[4]
2526*4882a593Smuzhiyun	addi		$key_,$sp,$FRAME+15	# rewind $key_
2527*4882a593Smuzhiyun	?vperm		v26,v26,v27,$keyperm
2528*4882a593Smuzhiyun	lvx		v29,$x40,$key1
2529*4882a593Smuzhiyun	?vperm		v27,v27,v28,$keyperm
2530*4882a593Smuzhiyun	lvx		v30,$x50,$key1
2531*4882a593Smuzhiyun	?vperm		v28,v28,v29,$keyperm
2532*4882a593Smuzhiyun	lvx		v31,$x60,$key1
2533*4882a593Smuzhiyun	?vperm		v29,v29,v30,$keyperm
2534*4882a593Smuzhiyun	lvx		$twk5,$x70,$key1	# borrow $twk5
2535*4882a593Smuzhiyun	?vperm		v30,v30,v31,$keyperm
2536*4882a593Smuzhiyun	lvx		v24,$x00,$key_		# pre-load round[1]
2537*4882a593Smuzhiyun	?vperm		v31,v31,$twk5,$keyperm
2538*4882a593Smuzhiyun	lvx		v25,$x10,$key_		# pre-load round[2]
2539*4882a593Smuzhiyun
2540*4882a593Smuzhiyun	 vperm		$in0,$inout,$inptail,$inpperm
2541*4882a593Smuzhiyun	 subi		$inp,$inp,31		# undo "caller"
2542*4882a593Smuzhiyun	vxor		$twk0,$tweak,$rndkey0
2543*4882a593Smuzhiyun	vsrab		$tmp,$tweak,$seven	# next tweak value
2544*4882a593Smuzhiyun	vaddubm		$tweak,$tweak,$tweak
2545*4882a593Smuzhiyun	vsldoi		$tmp,$tmp,$tmp,15
2546*4882a593Smuzhiyun	vand		$tmp,$tmp,$eighty7
2547*4882a593Smuzhiyun	 vxor		$out0,$in0,$twk0
2548*4882a593Smuzhiyun	vxor		$tweak,$tweak,$tmp
2549*4882a593Smuzhiyun
2550*4882a593Smuzhiyun	 lvx_u		$in1,$x10,$inp
2551*4882a593Smuzhiyun	vxor		$twk1,$tweak,$rndkey0
2552*4882a593Smuzhiyun	vsrab		$tmp,$tweak,$seven	# next tweak value
2553*4882a593Smuzhiyun	vaddubm		$tweak,$tweak,$tweak
2554*4882a593Smuzhiyun	vsldoi		$tmp,$tmp,$tmp,15
2555*4882a593Smuzhiyun	 le?vperm	$in1,$in1,$in1,$leperm
2556*4882a593Smuzhiyun	vand		$tmp,$tmp,$eighty7
2557*4882a593Smuzhiyun	 vxor		$out1,$in1,$twk1
2558*4882a593Smuzhiyun	vxor		$tweak,$tweak,$tmp
2559*4882a593Smuzhiyun
2560*4882a593Smuzhiyun	 lvx_u		$in2,$x20,$inp
2561*4882a593Smuzhiyun	 andi.		$taillen,$len,15
2562*4882a593Smuzhiyun	vxor		$twk2,$tweak,$rndkey0
2563*4882a593Smuzhiyun	vsrab		$tmp,$tweak,$seven	# next tweak value
2564*4882a593Smuzhiyun	vaddubm		$tweak,$tweak,$tweak
2565*4882a593Smuzhiyun	vsldoi		$tmp,$tmp,$tmp,15
2566*4882a593Smuzhiyun	 le?vperm	$in2,$in2,$in2,$leperm
2567*4882a593Smuzhiyun	vand		$tmp,$tmp,$eighty7
2568*4882a593Smuzhiyun	 vxor		$out2,$in2,$twk2
2569*4882a593Smuzhiyun	vxor		$tweak,$tweak,$tmp
2570*4882a593Smuzhiyun
2571*4882a593Smuzhiyun	 lvx_u		$in3,$x30,$inp
2572*4882a593Smuzhiyun	 sub		$len,$len,$taillen
2573*4882a593Smuzhiyun	vxor		$twk3,$tweak,$rndkey0
2574*4882a593Smuzhiyun	vsrab		$tmp,$tweak,$seven	# next tweak value
2575*4882a593Smuzhiyun	vaddubm		$tweak,$tweak,$tweak
2576*4882a593Smuzhiyun	vsldoi		$tmp,$tmp,$tmp,15
2577*4882a593Smuzhiyun	 le?vperm	$in3,$in3,$in3,$leperm
2578*4882a593Smuzhiyun	vand		$tmp,$tmp,$eighty7
2579*4882a593Smuzhiyun	 vxor		$out3,$in3,$twk3
2580*4882a593Smuzhiyun	vxor		$tweak,$tweak,$tmp
2581*4882a593Smuzhiyun
2582*4882a593Smuzhiyun	 lvx_u		$in4,$x40,$inp
2583*4882a593Smuzhiyun	 subi		$len,$len,0x60
2584*4882a593Smuzhiyun	vxor		$twk4,$tweak,$rndkey0
2585*4882a593Smuzhiyun	vsrab		$tmp,$tweak,$seven	# next tweak value
2586*4882a593Smuzhiyun	vaddubm		$tweak,$tweak,$tweak
2587*4882a593Smuzhiyun	vsldoi		$tmp,$tmp,$tmp,15
2588*4882a593Smuzhiyun	 le?vperm	$in4,$in4,$in4,$leperm
2589*4882a593Smuzhiyun	vand		$tmp,$tmp,$eighty7
2590*4882a593Smuzhiyun	 vxor		$out4,$in4,$twk4
2591*4882a593Smuzhiyun	vxor		$tweak,$tweak,$tmp
2592*4882a593Smuzhiyun
2593*4882a593Smuzhiyun	 lvx_u		$in5,$x50,$inp
2594*4882a593Smuzhiyun	 addi		$inp,$inp,0x60
2595*4882a593Smuzhiyun	vxor		$twk5,$tweak,$rndkey0
2596*4882a593Smuzhiyun	vsrab		$tmp,$tweak,$seven	# next tweak value
2597*4882a593Smuzhiyun	vaddubm		$tweak,$tweak,$tweak
2598*4882a593Smuzhiyun	vsldoi		$tmp,$tmp,$tmp,15
2599*4882a593Smuzhiyun	 le?vperm	$in5,$in5,$in5,$leperm
2600*4882a593Smuzhiyun	vand		$tmp,$tmp,$eighty7
2601*4882a593Smuzhiyun	 vxor		$out5,$in5,$twk5
2602*4882a593Smuzhiyun	vxor		$tweak,$tweak,$tmp
2603*4882a593Smuzhiyun
2604*4882a593Smuzhiyun	vxor		v31,v31,$rndkey0
2605*4882a593Smuzhiyun	mtctr		$rounds
2606*4882a593Smuzhiyun	b		Loop_xts_enc6x
2607*4882a593Smuzhiyun
2608*4882a593Smuzhiyun.align	5
2609*4882a593SmuzhiyunLoop_xts_enc6x:
2610*4882a593Smuzhiyun	vcipher		$out0,$out0,v24
2611*4882a593Smuzhiyun	vcipher		$out1,$out1,v24
2612*4882a593Smuzhiyun	vcipher		$out2,$out2,v24
2613*4882a593Smuzhiyun	vcipher		$out3,$out3,v24
2614*4882a593Smuzhiyun	vcipher		$out4,$out4,v24
2615*4882a593Smuzhiyun	vcipher		$out5,$out5,v24
2616*4882a593Smuzhiyun	lvx		v24,$x20,$key_		# round[3]
2617*4882a593Smuzhiyun	addi		$key_,$key_,0x20
2618*4882a593Smuzhiyun
2619*4882a593Smuzhiyun	vcipher		$out0,$out0,v25
2620*4882a593Smuzhiyun	vcipher		$out1,$out1,v25
2621*4882a593Smuzhiyun	vcipher		$out2,$out2,v25
2622*4882a593Smuzhiyun	vcipher		$out3,$out3,v25
2623*4882a593Smuzhiyun	vcipher		$out4,$out4,v25
2624*4882a593Smuzhiyun	vcipher		$out5,$out5,v25
2625*4882a593Smuzhiyun	lvx		v25,$x10,$key_		# round[4]
2626*4882a593Smuzhiyun	bdnz		Loop_xts_enc6x
2627*4882a593Smuzhiyun
2628*4882a593Smuzhiyun	subic		$len,$len,96		# $len-=96
2629*4882a593Smuzhiyun	 vxor		$in0,$twk0,v31		# xor with last round key
2630*4882a593Smuzhiyun	vcipher		$out0,$out0,v24
2631*4882a593Smuzhiyun	vcipher		$out1,$out1,v24
2632*4882a593Smuzhiyun	 vsrab		$tmp,$tweak,$seven	# next tweak value
2633*4882a593Smuzhiyun	 vxor		$twk0,$tweak,$rndkey0
2634*4882a593Smuzhiyun	 vaddubm	$tweak,$tweak,$tweak
2635*4882a593Smuzhiyun	vcipher		$out2,$out2,v24
2636*4882a593Smuzhiyun	vcipher		$out3,$out3,v24
2637*4882a593Smuzhiyun	 vsldoi		$tmp,$tmp,$tmp,15
2638*4882a593Smuzhiyun	vcipher		$out4,$out4,v24
2639*4882a593Smuzhiyun	vcipher		$out5,$out5,v24
2640*4882a593Smuzhiyun
2641*4882a593Smuzhiyun	subfe.		r0,r0,r0		# borrow?-1:0
2642*4882a593Smuzhiyun	 vand		$tmp,$tmp,$eighty7
2643*4882a593Smuzhiyun	vcipher		$out0,$out0,v25
2644*4882a593Smuzhiyun	vcipher		$out1,$out1,v25
2645*4882a593Smuzhiyun	 vxor		$tweak,$tweak,$tmp
2646*4882a593Smuzhiyun	vcipher		$out2,$out2,v25
2647*4882a593Smuzhiyun	vcipher		$out3,$out3,v25
2648*4882a593Smuzhiyun	 vxor		$in1,$twk1,v31
2649*4882a593Smuzhiyun	 vsrab		$tmp,$tweak,$seven	# next tweak value
2650*4882a593Smuzhiyun	 vxor		$twk1,$tweak,$rndkey0
2651*4882a593Smuzhiyun	vcipher		$out4,$out4,v25
2652*4882a593Smuzhiyun	vcipher		$out5,$out5,v25
2653*4882a593Smuzhiyun
2654*4882a593Smuzhiyun	and		r0,r0,$len
2655*4882a593Smuzhiyun	 vaddubm	$tweak,$tweak,$tweak
2656*4882a593Smuzhiyun	 vsldoi		$tmp,$tmp,$tmp,15
2657*4882a593Smuzhiyun	vcipher		$out0,$out0,v26
2658*4882a593Smuzhiyun	vcipher		$out1,$out1,v26
2659*4882a593Smuzhiyun	 vand		$tmp,$tmp,$eighty7
2660*4882a593Smuzhiyun	vcipher		$out2,$out2,v26
2661*4882a593Smuzhiyun	vcipher		$out3,$out3,v26
2662*4882a593Smuzhiyun	 vxor		$tweak,$tweak,$tmp
2663*4882a593Smuzhiyun	vcipher		$out4,$out4,v26
2664*4882a593Smuzhiyun	vcipher		$out5,$out5,v26
2665*4882a593Smuzhiyun
2666*4882a593Smuzhiyun	add		$inp,$inp,r0		# $inp is adjusted in such
2667*4882a593Smuzhiyun						# way that at exit from the
2668*4882a593Smuzhiyun						# loop inX-in5 are loaded
2669*4882a593Smuzhiyun						# with last "words"
2670*4882a593Smuzhiyun	 vxor		$in2,$twk2,v31
2671*4882a593Smuzhiyun	 vsrab		$tmp,$tweak,$seven	# next tweak value
2672*4882a593Smuzhiyun	 vxor		$twk2,$tweak,$rndkey0
2673*4882a593Smuzhiyun	 vaddubm	$tweak,$tweak,$tweak
2674*4882a593Smuzhiyun	vcipher		$out0,$out0,v27
2675*4882a593Smuzhiyun	vcipher		$out1,$out1,v27
2676*4882a593Smuzhiyun	 vsldoi		$tmp,$tmp,$tmp,15
2677*4882a593Smuzhiyun	vcipher		$out2,$out2,v27
2678*4882a593Smuzhiyun	vcipher		$out3,$out3,v27
2679*4882a593Smuzhiyun	 vand		$tmp,$tmp,$eighty7
2680*4882a593Smuzhiyun	vcipher		$out4,$out4,v27
2681*4882a593Smuzhiyun	vcipher		$out5,$out5,v27
2682*4882a593Smuzhiyun
2683*4882a593Smuzhiyun	addi		$key_,$sp,$FRAME+15	# rewind $key_
2684*4882a593Smuzhiyun	 vxor		$tweak,$tweak,$tmp
2685*4882a593Smuzhiyun	vcipher		$out0,$out0,v28
2686*4882a593Smuzhiyun	vcipher		$out1,$out1,v28
2687*4882a593Smuzhiyun	 vxor		$in3,$twk3,v31
2688*4882a593Smuzhiyun	 vsrab		$tmp,$tweak,$seven	# next tweak value
2689*4882a593Smuzhiyun	 vxor		$twk3,$tweak,$rndkey0
2690*4882a593Smuzhiyun	vcipher		$out2,$out2,v28
2691*4882a593Smuzhiyun	vcipher		$out3,$out3,v28
2692*4882a593Smuzhiyun	 vaddubm	$tweak,$tweak,$tweak
2693*4882a593Smuzhiyun	 vsldoi		$tmp,$tmp,$tmp,15
2694*4882a593Smuzhiyun	vcipher		$out4,$out4,v28
2695*4882a593Smuzhiyun	vcipher		$out5,$out5,v28
2696*4882a593Smuzhiyun	lvx		v24,$x00,$key_		# re-pre-load round[1]
2697*4882a593Smuzhiyun	 vand		$tmp,$tmp,$eighty7
2698*4882a593Smuzhiyun
2699*4882a593Smuzhiyun	vcipher		$out0,$out0,v29
2700*4882a593Smuzhiyun	vcipher		$out1,$out1,v29
2701*4882a593Smuzhiyun	 vxor		$tweak,$tweak,$tmp
2702*4882a593Smuzhiyun	vcipher		$out2,$out2,v29
2703*4882a593Smuzhiyun	vcipher		$out3,$out3,v29
2704*4882a593Smuzhiyun	 vxor		$in4,$twk4,v31
2705*4882a593Smuzhiyun	 vsrab		$tmp,$tweak,$seven	# next tweak value
2706*4882a593Smuzhiyun	 vxor		$twk4,$tweak,$rndkey0
2707*4882a593Smuzhiyun	vcipher		$out4,$out4,v29
2708*4882a593Smuzhiyun	vcipher		$out5,$out5,v29
2709*4882a593Smuzhiyun	lvx		v25,$x10,$key_		# re-pre-load round[2]
2710*4882a593Smuzhiyun	 vaddubm	$tweak,$tweak,$tweak
2711*4882a593Smuzhiyun	 vsldoi		$tmp,$tmp,$tmp,15
2712*4882a593Smuzhiyun
2713*4882a593Smuzhiyun	vcipher		$out0,$out0,v30
2714*4882a593Smuzhiyun	vcipher		$out1,$out1,v30
2715*4882a593Smuzhiyun	 vand		$tmp,$tmp,$eighty7
2716*4882a593Smuzhiyun	vcipher		$out2,$out2,v30
2717*4882a593Smuzhiyun	vcipher		$out3,$out3,v30
2718*4882a593Smuzhiyun	 vxor		$tweak,$tweak,$tmp
2719*4882a593Smuzhiyun	vcipher		$out4,$out4,v30
2720*4882a593Smuzhiyun	vcipher		$out5,$out5,v30
2721*4882a593Smuzhiyun	 vxor		$in5,$twk5,v31
2722*4882a593Smuzhiyun	 vsrab		$tmp,$tweak,$seven	# next tweak value
2723*4882a593Smuzhiyun	 vxor		$twk5,$tweak,$rndkey0
2724*4882a593Smuzhiyun
2725*4882a593Smuzhiyun	vcipherlast	$out0,$out0,$in0
2726*4882a593Smuzhiyun	 lvx_u		$in0,$x00,$inp		# load next input block
2727*4882a593Smuzhiyun	 vaddubm	$tweak,$tweak,$tweak
2728*4882a593Smuzhiyun	 vsldoi		$tmp,$tmp,$tmp,15
2729*4882a593Smuzhiyun	vcipherlast	$out1,$out1,$in1
2730*4882a593Smuzhiyun	 lvx_u		$in1,$x10,$inp
2731*4882a593Smuzhiyun	vcipherlast	$out2,$out2,$in2
2732*4882a593Smuzhiyun	 le?vperm	$in0,$in0,$in0,$leperm
2733*4882a593Smuzhiyun	 lvx_u		$in2,$x20,$inp
2734*4882a593Smuzhiyun	 vand		$tmp,$tmp,$eighty7
2735*4882a593Smuzhiyun	vcipherlast	$out3,$out3,$in3
2736*4882a593Smuzhiyun	 le?vperm	$in1,$in1,$in1,$leperm
2737*4882a593Smuzhiyun	 lvx_u		$in3,$x30,$inp
2738*4882a593Smuzhiyun	vcipherlast	$out4,$out4,$in4
2739*4882a593Smuzhiyun	 le?vperm	$in2,$in2,$in2,$leperm
2740*4882a593Smuzhiyun	 lvx_u		$in4,$x40,$inp
2741*4882a593Smuzhiyun	 vxor		$tweak,$tweak,$tmp
2742*4882a593Smuzhiyun	vcipherlast	$tmp,$out5,$in5		# last block might be needed
2743*4882a593Smuzhiyun						# in stealing mode
2744*4882a593Smuzhiyun	 le?vperm	$in3,$in3,$in3,$leperm
2745*4882a593Smuzhiyun	 lvx_u		$in5,$x50,$inp
2746*4882a593Smuzhiyun	 addi		$inp,$inp,0x60
2747*4882a593Smuzhiyun	 le?vperm	$in4,$in4,$in4,$leperm
2748*4882a593Smuzhiyun	 le?vperm	$in5,$in5,$in5,$leperm
2749*4882a593Smuzhiyun
2750*4882a593Smuzhiyun	le?vperm	$out0,$out0,$out0,$leperm
2751*4882a593Smuzhiyun	le?vperm	$out1,$out1,$out1,$leperm
2752*4882a593Smuzhiyun	stvx_u		$out0,$x00,$out		# store output
2753*4882a593Smuzhiyun	 vxor		$out0,$in0,$twk0
2754*4882a593Smuzhiyun	le?vperm	$out2,$out2,$out2,$leperm
2755*4882a593Smuzhiyun	stvx_u		$out1,$x10,$out
2756*4882a593Smuzhiyun	 vxor		$out1,$in1,$twk1
2757*4882a593Smuzhiyun	le?vperm	$out3,$out3,$out3,$leperm
2758*4882a593Smuzhiyun	stvx_u		$out2,$x20,$out
2759*4882a593Smuzhiyun	 vxor		$out2,$in2,$twk2
2760*4882a593Smuzhiyun	le?vperm	$out4,$out4,$out4,$leperm
2761*4882a593Smuzhiyun	stvx_u		$out3,$x30,$out
2762*4882a593Smuzhiyun	 vxor		$out3,$in3,$twk3
2763*4882a593Smuzhiyun	le?vperm	$out5,$tmp,$tmp,$leperm
2764*4882a593Smuzhiyun	stvx_u		$out4,$x40,$out
2765*4882a593Smuzhiyun	 vxor		$out4,$in4,$twk4
2766*4882a593Smuzhiyun	le?stvx_u	$out5,$x50,$out
2767*4882a593Smuzhiyun	be?stvx_u	$tmp, $x50,$out
2768*4882a593Smuzhiyun	 vxor		$out5,$in5,$twk5
2769*4882a593Smuzhiyun	addi		$out,$out,0x60
2770*4882a593Smuzhiyun
2771*4882a593Smuzhiyun	mtctr		$rounds
2772*4882a593Smuzhiyun	beq		Loop_xts_enc6x		# did $len-=96 borrow?
2773*4882a593Smuzhiyun
2774*4882a593Smuzhiyun	addic.		$len,$len,0x60
2775*4882a593Smuzhiyun	beq		Lxts_enc6x_zero
2776*4882a593Smuzhiyun	cmpwi		$len,0x20
2777*4882a593Smuzhiyun	blt		Lxts_enc6x_one
2778*4882a593Smuzhiyun	nop
2779*4882a593Smuzhiyun	beq		Lxts_enc6x_two
2780*4882a593Smuzhiyun	cmpwi		$len,0x40
2781*4882a593Smuzhiyun	blt		Lxts_enc6x_three
2782*4882a593Smuzhiyun	nop
2783*4882a593Smuzhiyun	beq		Lxts_enc6x_four
2784*4882a593Smuzhiyun
2785*4882a593SmuzhiyunLxts_enc6x_five:
2786*4882a593Smuzhiyun	vxor		$out0,$in1,$twk0
2787*4882a593Smuzhiyun	vxor		$out1,$in2,$twk1
2788*4882a593Smuzhiyun	vxor		$out2,$in3,$twk2
2789*4882a593Smuzhiyun	vxor		$out3,$in4,$twk3
2790*4882a593Smuzhiyun	vxor		$out4,$in5,$twk4
2791*4882a593Smuzhiyun
2792*4882a593Smuzhiyun	bl		_aesp8_xts_enc5x
2793*4882a593Smuzhiyun
2794*4882a593Smuzhiyun	le?vperm	$out0,$out0,$out0,$leperm
2795*4882a593Smuzhiyun	vmr		$twk0,$twk5		# unused tweak
2796*4882a593Smuzhiyun	le?vperm	$out1,$out1,$out1,$leperm
2797*4882a593Smuzhiyun	stvx_u		$out0,$x00,$out		# store output
2798*4882a593Smuzhiyun	le?vperm	$out2,$out2,$out2,$leperm
2799*4882a593Smuzhiyun	stvx_u		$out1,$x10,$out
2800*4882a593Smuzhiyun	le?vperm	$out3,$out3,$out3,$leperm
2801*4882a593Smuzhiyun	stvx_u		$out2,$x20,$out
2802*4882a593Smuzhiyun	vxor		$tmp,$out4,$twk5	# last block prep for stealing
2803*4882a593Smuzhiyun	le?vperm	$out4,$out4,$out4,$leperm
2804*4882a593Smuzhiyun	stvx_u		$out3,$x30,$out
2805*4882a593Smuzhiyun	stvx_u		$out4,$x40,$out
2806*4882a593Smuzhiyun	addi		$out,$out,0x50
2807*4882a593Smuzhiyun	bne		Lxts_enc6x_steal
2808*4882a593Smuzhiyun	b		Lxts_enc6x_done
2809*4882a593Smuzhiyun
2810*4882a593Smuzhiyun.align	4
2811*4882a593SmuzhiyunLxts_enc6x_four:
2812*4882a593Smuzhiyun	vxor		$out0,$in2,$twk0
2813*4882a593Smuzhiyun	vxor		$out1,$in3,$twk1
2814*4882a593Smuzhiyun	vxor		$out2,$in4,$twk2
2815*4882a593Smuzhiyun	vxor		$out3,$in5,$twk3
2816*4882a593Smuzhiyun	vxor		$out4,$out4,$out4
2817*4882a593Smuzhiyun
2818*4882a593Smuzhiyun	bl		_aesp8_xts_enc5x
2819*4882a593Smuzhiyun
2820*4882a593Smuzhiyun	le?vperm	$out0,$out0,$out0,$leperm
2821*4882a593Smuzhiyun	vmr		$twk0,$twk4		# unused tweak
2822*4882a593Smuzhiyun	le?vperm	$out1,$out1,$out1,$leperm
2823*4882a593Smuzhiyun	stvx_u		$out0,$x00,$out		# store output
2824*4882a593Smuzhiyun	le?vperm	$out2,$out2,$out2,$leperm
2825*4882a593Smuzhiyun	stvx_u		$out1,$x10,$out
2826*4882a593Smuzhiyun	vxor		$tmp,$out3,$twk4	# last block prep for stealing
2827*4882a593Smuzhiyun	le?vperm	$out3,$out3,$out3,$leperm
2828*4882a593Smuzhiyun	stvx_u		$out2,$x20,$out
2829*4882a593Smuzhiyun	stvx_u		$out3,$x30,$out
2830*4882a593Smuzhiyun	addi		$out,$out,0x40
2831*4882a593Smuzhiyun	bne		Lxts_enc6x_steal
2832*4882a593Smuzhiyun	b		Lxts_enc6x_done
2833*4882a593Smuzhiyun
2834*4882a593Smuzhiyun.align	4
2835*4882a593SmuzhiyunLxts_enc6x_three:
2836*4882a593Smuzhiyun	vxor		$out0,$in3,$twk0
2837*4882a593Smuzhiyun	vxor		$out1,$in4,$twk1
2838*4882a593Smuzhiyun	vxor		$out2,$in5,$twk2
2839*4882a593Smuzhiyun	vxor		$out3,$out3,$out3
2840*4882a593Smuzhiyun	vxor		$out4,$out4,$out4
2841*4882a593Smuzhiyun
2842*4882a593Smuzhiyun	bl		_aesp8_xts_enc5x
2843*4882a593Smuzhiyun
2844*4882a593Smuzhiyun	le?vperm	$out0,$out0,$out0,$leperm
2845*4882a593Smuzhiyun	vmr		$twk0,$twk3		# unused tweak
2846*4882a593Smuzhiyun	le?vperm	$out1,$out1,$out1,$leperm
2847*4882a593Smuzhiyun	stvx_u		$out0,$x00,$out		# store output
2848*4882a593Smuzhiyun	vxor		$tmp,$out2,$twk3	# last block prep for stealing
2849*4882a593Smuzhiyun	le?vperm	$out2,$out2,$out2,$leperm
2850*4882a593Smuzhiyun	stvx_u		$out1,$x10,$out
2851*4882a593Smuzhiyun	stvx_u		$out2,$x20,$out
2852*4882a593Smuzhiyun	addi		$out,$out,0x30
2853*4882a593Smuzhiyun	bne		Lxts_enc6x_steal
2854*4882a593Smuzhiyun	b		Lxts_enc6x_done
2855*4882a593Smuzhiyun
2856*4882a593Smuzhiyun.align	4
2857*4882a593SmuzhiyunLxts_enc6x_two:
2858*4882a593Smuzhiyun	vxor		$out0,$in4,$twk0
2859*4882a593Smuzhiyun	vxor		$out1,$in5,$twk1
2860*4882a593Smuzhiyun	vxor		$out2,$out2,$out2
2861*4882a593Smuzhiyun	vxor		$out3,$out3,$out3
2862*4882a593Smuzhiyun	vxor		$out4,$out4,$out4
2863*4882a593Smuzhiyun
2864*4882a593Smuzhiyun	bl		_aesp8_xts_enc5x
2865*4882a593Smuzhiyun
2866*4882a593Smuzhiyun	le?vperm	$out0,$out0,$out0,$leperm
2867*4882a593Smuzhiyun	vmr		$twk0,$twk2		# unused tweak
2868*4882a593Smuzhiyun	vxor		$tmp,$out1,$twk2	# last block prep for stealing
2869*4882a593Smuzhiyun	le?vperm	$out1,$out1,$out1,$leperm
2870*4882a593Smuzhiyun	stvx_u		$out0,$x00,$out		# store output
2871*4882a593Smuzhiyun	stvx_u		$out1,$x10,$out
2872*4882a593Smuzhiyun	addi		$out,$out,0x20
2873*4882a593Smuzhiyun	bne		Lxts_enc6x_steal
2874*4882a593Smuzhiyun	b		Lxts_enc6x_done
2875*4882a593Smuzhiyun
2876*4882a593Smuzhiyun.align	4
2877*4882a593SmuzhiyunLxts_enc6x_one:
2878*4882a593Smuzhiyun	vxor		$out0,$in5,$twk0
2879*4882a593Smuzhiyun	nop
2880*4882a593SmuzhiyunLoop_xts_enc1x:
2881*4882a593Smuzhiyun	vcipher		$out0,$out0,v24
2882*4882a593Smuzhiyun	lvx		v24,$x20,$key_		# round[3]
2883*4882a593Smuzhiyun	addi		$key_,$key_,0x20
2884*4882a593Smuzhiyun
2885*4882a593Smuzhiyun	vcipher		$out0,$out0,v25
2886*4882a593Smuzhiyun	lvx		v25,$x10,$key_		# round[4]
2887*4882a593Smuzhiyun	bdnz		Loop_xts_enc1x
2888*4882a593Smuzhiyun
2889*4882a593Smuzhiyun	add		$inp,$inp,$taillen
2890*4882a593Smuzhiyun	cmpwi		$taillen,0
2891*4882a593Smuzhiyun	vcipher		$out0,$out0,v24
2892*4882a593Smuzhiyun
2893*4882a593Smuzhiyun	subi		$inp,$inp,16
2894*4882a593Smuzhiyun	vcipher		$out0,$out0,v25
2895*4882a593Smuzhiyun
2896*4882a593Smuzhiyun	lvsr		$inpperm,0,$taillen
2897*4882a593Smuzhiyun	vcipher		$out0,$out0,v26
2898*4882a593Smuzhiyun
2899*4882a593Smuzhiyun	lvx_u		$in0,0,$inp
2900*4882a593Smuzhiyun	vcipher		$out0,$out0,v27
2901*4882a593Smuzhiyun
2902*4882a593Smuzhiyun	addi		$key_,$sp,$FRAME+15	# rewind $key_
2903*4882a593Smuzhiyun	vcipher		$out0,$out0,v28
2904*4882a593Smuzhiyun	lvx		v24,$x00,$key_		# re-pre-load round[1]
2905*4882a593Smuzhiyun
2906*4882a593Smuzhiyun	vcipher		$out0,$out0,v29
2907*4882a593Smuzhiyun	lvx		v25,$x10,$key_		# re-pre-load round[2]
2908*4882a593Smuzhiyun	 vxor		$twk0,$twk0,v31
2909*4882a593Smuzhiyun
2910*4882a593Smuzhiyun	le?vperm	$in0,$in0,$in0,$leperm
2911*4882a593Smuzhiyun	vcipher		$out0,$out0,v30
2912*4882a593Smuzhiyun
2913*4882a593Smuzhiyun	vperm		$in0,$in0,$in0,$inpperm
2914*4882a593Smuzhiyun	vcipherlast	$out0,$out0,$twk0
2915*4882a593Smuzhiyun
2916*4882a593Smuzhiyun	vmr		$twk0,$twk1		# unused tweak
2917*4882a593Smuzhiyun	vxor		$tmp,$out0,$twk1	# last block prep for stealing
2918*4882a593Smuzhiyun	le?vperm	$out0,$out0,$out0,$leperm
2919*4882a593Smuzhiyun	stvx_u		$out0,$x00,$out		# store output
2920*4882a593Smuzhiyun	addi		$out,$out,0x10
2921*4882a593Smuzhiyun	bne		Lxts_enc6x_steal
2922*4882a593Smuzhiyun	b		Lxts_enc6x_done
2923*4882a593Smuzhiyun
2924*4882a593Smuzhiyun.align	4
2925*4882a593SmuzhiyunLxts_enc6x_zero:
2926*4882a593Smuzhiyun	cmpwi		$taillen,0
2927*4882a593Smuzhiyun	beq		Lxts_enc6x_done
2928*4882a593Smuzhiyun
2929*4882a593Smuzhiyun	add		$inp,$inp,$taillen
2930*4882a593Smuzhiyun	subi		$inp,$inp,16
2931*4882a593Smuzhiyun	lvx_u		$in0,0,$inp
2932*4882a593Smuzhiyun	lvsr		$inpperm,0,$taillen	# $in5 is no more
2933*4882a593Smuzhiyun	le?vperm	$in0,$in0,$in0,$leperm
2934*4882a593Smuzhiyun	vperm		$in0,$in0,$in0,$inpperm
2935*4882a593Smuzhiyun	vxor		$tmp,$tmp,$twk0
2936*4882a593SmuzhiyunLxts_enc6x_steal:
2937*4882a593Smuzhiyun	vxor		$in0,$in0,$twk0
2938*4882a593Smuzhiyun	vxor		$out0,$out0,$out0
2939*4882a593Smuzhiyun	vspltisb	$out1,-1
2940*4882a593Smuzhiyun	vperm		$out0,$out0,$out1,$inpperm
2941*4882a593Smuzhiyun	vsel		$out0,$in0,$tmp,$out0	# $tmp is last block, remember?
2942*4882a593Smuzhiyun
2943*4882a593Smuzhiyun	subi		r30,$out,17
2944*4882a593Smuzhiyun	subi		$out,$out,16
2945*4882a593Smuzhiyun	mtctr		$taillen
2946*4882a593SmuzhiyunLoop_xts_enc6x_steal:
2947*4882a593Smuzhiyun	lbzu		r0,1(r30)
2948*4882a593Smuzhiyun	stb		r0,16(r30)
2949*4882a593Smuzhiyun	bdnz		Loop_xts_enc6x_steal
2950*4882a593Smuzhiyun
2951*4882a593Smuzhiyun	li		$taillen,0
2952*4882a593Smuzhiyun	mtctr		$rounds
2953*4882a593Smuzhiyun	b		Loop_xts_enc1x		# one more time...
2954*4882a593Smuzhiyun
2955*4882a593Smuzhiyun.align	4
2956*4882a593SmuzhiyunLxts_enc6x_done:
2957*4882a593Smuzhiyun	${UCMP}i	$ivp,0
2958*4882a593Smuzhiyun	beq		Lxts_enc6x_ret
2959*4882a593Smuzhiyun
2960*4882a593Smuzhiyun	vxor		$tweak,$twk0,$rndkey0
2961*4882a593Smuzhiyun	le?vperm	$tweak,$tweak,$tweak,$leperm
2962*4882a593Smuzhiyun	stvx_u		$tweak,0,$ivp
2963*4882a593Smuzhiyun
2964*4882a593SmuzhiyunLxts_enc6x_ret:
2965*4882a593Smuzhiyun	mtlr		r11
2966*4882a593Smuzhiyun	li		r10,`$FRAME+15`
2967*4882a593Smuzhiyun	li		r11,`$FRAME+31`
2968*4882a593Smuzhiyun	stvx		$seven,r10,$sp		# wipe copies of round keys
2969*4882a593Smuzhiyun	addi		r10,r10,32
2970*4882a593Smuzhiyun	stvx		$seven,r11,$sp
2971*4882a593Smuzhiyun	addi		r11,r11,32
2972*4882a593Smuzhiyun	stvx		$seven,r10,$sp
2973*4882a593Smuzhiyun	addi		r10,r10,32
2974*4882a593Smuzhiyun	stvx		$seven,r11,$sp
2975*4882a593Smuzhiyun	addi		r11,r11,32
2976*4882a593Smuzhiyun	stvx		$seven,r10,$sp
2977*4882a593Smuzhiyun	addi		r10,r10,32
2978*4882a593Smuzhiyun	stvx		$seven,r11,$sp
2979*4882a593Smuzhiyun	addi		r11,r11,32
2980*4882a593Smuzhiyun	stvx		$seven,r10,$sp
2981*4882a593Smuzhiyun	addi		r10,r10,32
2982*4882a593Smuzhiyun	stvx		$seven,r11,$sp
2983*4882a593Smuzhiyun	addi		r11,r11,32
2984*4882a593Smuzhiyun
2985*4882a593Smuzhiyun	mtspr		256,$vrsave
2986*4882a593Smuzhiyun	lvx		v20,r10,$sp		# ABI says so
2987*4882a593Smuzhiyun	addi		r10,r10,32
2988*4882a593Smuzhiyun	lvx		v21,r11,$sp
2989*4882a593Smuzhiyun	addi		r11,r11,32
2990*4882a593Smuzhiyun	lvx		v22,r10,$sp
2991*4882a593Smuzhiyun	addi		r10,r10,32
2992*4882a593Smuzhiyun	lvx		v23,r11,$sp
2993*4882a593Smuzhiyun	addi		r11,r11,32
2994*4882a593Smuzhiyun	lvx		v24,r10,$sp
2995*4882a593Smuzhiyun	addi		r10,r10,32
2996*4882a593Smuzhiyun	lvx		v25,r11,$sp
2997*4882a593Smuzhiyun	addi		r11,r11,32
2998*4882a593Smuzhiyun	lvx		v26,r10,$sp
2999*4882a593Smuzhiyun	addi		r10,r10,32
3000*4882a593Smuzhiyun	lvx		v27,r11,$sp
3001*4882a593Smuzhiyun	addi		r11,r11,32
3002*4882a593Smuzhiyun	lvx		v28,r10,$sp
3003*4882a593Smuzhiyun	addi		r10,r10,32
3004*4882a593Smuzhiyun	lvx		v29,r11,$sp
3005*4882a593Smuzhiyun	addi		r11,r11,32
3006*4882a593Smuzhiyun	lvx		v30,r10,$sp
3007*4882a593Smuzhiyun	lvx		v31,r11,$sp
3008*4882a593Smuzhiyun	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
3009*4882a593Smuzhiyun	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
3010*4882a593Smuzhiyun	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
3011*4882a593Smuzhiyun	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
3012*4882a593Smuzhiyun	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
3013*4882a593Smuzhiyun	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
3014*4882a593Smuzhiyun	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
3015*4882a593Smuzhiyun	blr
3016*4882a593Smuzhiyun	.long		0
3017*4882a593Smuzhiyun	.byte		0,12,0x04,1,0x80,6,6,0
3018*4882a593Smuzhiyun	.long		0
3019*4882a593Smuzhiyun
# _aesp8_xts_enc5x: finish the AES rounds for the five blocks held in
# $out0-$out4.  The callers xor each input block with its tweak before
# the call and zero the slots they do not need.
#
# On entry CTR holds the count for the two-rounds-per-pass loop (the
# callers do mtctr $rounds beforehand), v24/v25 hold the pre-loaded
# round[1]/round[2] keys and $key_ points at the stack copy of the key
# schedule.
#
# Besides producing $out0-$out4 the routine also
#   - moves $inp to 16 bytes before the end of the tail, that is
#     $inp += $taillen - 16, and loads that block into $in0, permuted
#     by $inpperm, for the ciphertext stealing code
#   - compares $taillen with 0, the callers branch on that result
#     right after the call returns
#   - replaces $twk0 and $in1-$in4 with the tweaks xored with v31 and
#     uses them as the vcipherlast operands
#   - rewinds $key_ and re-pre-loads v24/v25 for the next pass.
#
# NOTE(review): v31 is presumably the last round key with round key 0
# already folded in, as the decrypt setup in this file does with
# vxor v31,v31,$rndkey0 - confirm against the encrypt prologue.
3020*4882a593Smuzhiyun.align	5
3021*4882a593Smuzhiyun_aesp8_xts_enc5x:
3022*4882a593Smuzhiyun	vcipher		$out0,$out0,v24
3023*4882a593Smuzhiyun	vcipher		$out1,$out1,v24
3024*4882a593Smuzhiyun	vcipher		$out2,$out2,v24
3025*4882a593Smuzhiyun	vcipher		$out3,$out3,v24
3026*4882a593Smuzhiyun	vcipher		$out4,$out4,v24
3027*4882a593Smuzhiyun	lvx		v24,$x20,$key_		# round[3]
3028*4882a593Smuzhiyun	addi		$key_,$key_,0x20
3029*4882a593Smuzhiyun
3030*4882a593Smuzhiyun	vcipher		$out0,$out0,v25
3031*4882a593Smuzhiyun	vcipher		$out1,$out1,v25
3032*4882a593Smuzhiyun	vcipher		$out2,$out2,v25
3033*4882a593Smuzhiyun	vcipher		$out3,$out3,v25
3034*4882a593Smuzhiyun	vcipher		$out4,$out4,v25
3035*4882a593Smuzhiyun	lvx		v25,$x10,$key_		# round[4]
3036*4882a593Smuzhiyun	bdnz		_aesp8_xts_enc5x
3037*4882a593Smuzhiyun
	# CTR is exhausted: seven more rounds follow (v24 through v30)
	# and then the final one.  Scalar and tweak work is interleaved
	# with the vcipher groups.
3038*4882a593Smuzhiyun	add		$inp,$inp,$taillen
3039*4882a593Smuzhiyun	cmpwi		$taillen,0		# tested by the caller after return
3040*4882a593Smuzhiyun	vcipher		$out0,$out0,v24
3041*4882a593Smuzhiyun	vcipher		$out1,$out1,v24
3042*4882a593Smuzhiyun	vcipher		$out2,$out2,v24
3043*4882a593Smuzhiyun	vcipher		$out3,$out3,v24
3044*4882a593Smuzhiyun	vcipher		$out4,$out4,v24
3045*4882a593Smuzhiyun
3046*4882a593Smuzhiyun	subi		$inp,$inp,16		# last 16 bytes of the input
3047*4882a593Smuzhiyun	vcipher		$out0,$out0,v25
3048*4882a593Smuzhiyun	vcipher		$out1,$out1,v25
3049*4882a593Smuzhiyun	vcipher		$out2,$out2,v25
3050*4882a593Smuzhiyun	vcipher		$out3,$out3,v25
3051*4882a593Smuzhiyun	vcipher		$out4,$out4,v25
3052*4882a593Smuzhiyun	 vxor		$twk0,$twk0,v31
3053*4882a593Smuzhiyun
	# The base operand of lvsr is written as 0: a zero RA field means
	# a literal zero, not the contents of r0, so the permute control
	# depends on $taillen alone.  The other lvsr sites in this file
	# spell it the same way.
3054*4882a593Smuzhiyun	vcipher		$out0,$out0,v26
3055*4882a593Smuzhiyun	lvsr		$inpperm,0,$taillen	# $in5 is no more
3056*4882a593Smuzhiyun	vcipher		$out1,$out1,v26
3057*4882a593Smuzhiyun	vcipher		$out2,$out2,v26
3058*4882a593Smuzhiyun	vcipher		$out3,$out3,v26
3059*4882a593Smuzhiyun	vcipher		$out4,$out4,v26
3060*4882a593Smuzhiyun	 vxor		$in1,$twk1,v31
3061*4882a593Smuzhiyun
3062*4882a593Smuzhiyun	vcipher		$out0,$out0,v27
3063*4882a593Smuzhiyun	lvx_u		$in0,0,$inp		# tail block for stealing
3064*4882a593Smuzhiyun	vcipher		$out1,$out1,v27
3065*4882a593Smuzhiyun	vcipher		$out2,$out2,v27
3066*4882a593Smuzhiyun	vcipher		$out3,$out3,v27
3067*4882a593Smuzhiyun	vcipher		$out4,$out4,v27
3068*4882a593Smuzhiyun	 vxor		$in2,$twk2,v31
3069*4882a593Smuzhiyun
3070*4882a593Smuzhiyun	addi		$key_,$sp,$FRAME+15	# rewind $key_
3071*4882a593Smuzhiyun	vcipher		$out0,$out0,v28
3072*4882a593Smuzhiyun	vcipher		$out1,$out1,v28
3073*4882a593Smuzhiyun	vcipher		$out2,$out2,v28
3074*4882a593Smuzhiyun	vcipher		$out3,$out3,v28
3075*4882a593Smuzhiyun	vcipher		$out4,$out4,v28
3076*4882a593Smuzhiyun	lvx		v24,$x00,$key_		# re-pre-load round[1]
3077*4882a593Smuzhiyun	 vxor		$in3,$twk3,v31
3078*4882a593Smuzhiyun
3079*4882a593Smuzhiyun	vcipher		$out0,$out0,v29
3080*4882a593Smuzhiyun	le?vperm	$in0,$in0,$in0,$leperm
3081*4882a593Smuzhiyun	vcipher		$out1,$out1,v29
3082*4882a593Smuzhiyun	vcipher		$out2,$out2,v29
3083*4882a593Smuzhiyun	vcipher		$out3,$out3,v29
3084*4882a593Smuzhiyun	vcipher		$out4,$out4,v29
3085*4882a593Smuzhiyun	lvx		v25,$x10,$key_		# re-pre-load round[2]
3086*4882a593Smuzhiyun	 vxor		$in4,$twk4,v31
3087*4882a593Smuzhiyun
3088*4882a593Smuzhiyun	vcipher		$out0,$out0,v30
3089*4882a593Smuzhiyun	vperm		$in0,$in0,$in0,$inpperm
3090*4882a593Smuzhiyun	vcipher		$out1,$out1,v30
3091*4882a593Smuzhiyun	vcipher		$out2,$out2,v30
3092*4882a593Smuzhiyun	vcipher		$out3,$out3,v30
3093*4882a593Smuzhiyun	vcipher		$out4,$out4,v30
3094*4882a593Smuzhiyun
	# Final round: the third operand is the tweak xored with v31, so
	# the output tweak is applied by the same instruction.
3095*4882a593Smuzhiyun	vcipherlast	$out0,$out0,$twk0
3096*4882a593Smuzhiyun	vcipherlast	$out1,$out1,$in1
3097*4882a593Smuzhiyun	vcipherlast	$out2,$out2,$in2
3098*4882a593Smuzhiyun	vcipherlast	$out3,$out3,$in3
3099*4882a593Smuzhiyun	vcipherlast	$out4,$out4,$in4
3100*4882a593Smuzhiyun	blr
3101*4882a593Smuzhiyun	.long		0
3102*4882a593Smuzhiyun	.byte		0,12,0x14,0,0,0,0,0
3103*4882a593Smuzhiyun
3104*4882a593Smuzhiyun.align	5
3105*4882a593Smuzhiyun_aesp8_xts_decrypt6x:
3106*4882a593Smuzhiyun	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
3107*4882a593Smuzhiyun	mflr		r11
3108*4882a593Smuzhiyun	li		r7,`$FRAME+8*16+15`
3109*4882a593Smuzhiyun	li		r3,`$FRAME+8*16+31`
3110*4882a593Smuzhiyun	$PUSH		r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
3111*4882a593Smuzhiyun	stvx		v20,r7,$sp		# ABI says so
3112*4882a593Smuzhiyun	addi		r7,r7,32
3113*4882a593Smuzhiyun	stvx		v21,r3,$sp
3114*4882a593Smuzhiyun	addi		r3,r3,32
3115*4882a593Smuzhiyun	stvx		v22,r7,$sp
3116*4882a593Smuzhiyun	addi		r7,r7,32
3117*4882a593Smuzhiyun	stvx		v23,r3,$sp
3118*4882a593Smuzhiyun	addi		r3,r3,32
3119*4882a593Smuzhiyun	stvx		v24,r7,$sp
3120*4882a593Smuzhiyun	addi		r7,r7,32
3121*4882a593Smuzhiyun	stvx		v25,r3,$sp
3122*4882a593Smuzhiyun	addi		r3,r3,32
3123*4882a593Smuzhiyun	stvx		v26,r7,$sp
3124*4882a593Smuzhiyun	addi		r7,r7,32
3125*4882a593Smuzhiyun	stvx		v27,r3,$sp
3126*4882a593Smuzhiyun	addi		r3,r3,32
3127*4882a593Smuzhiyun	stvx		v28,r7,$sp
3128*4882a593Smuzhiyun	addi		r7,r7,32
3129*4882a593Smuzhiyun	stvx		v29,r3,$sp
3130*4882a593Smuzhiyun	addi		r3,r3,32
3131*4882a593Smuzhiyun	stvx		v30,r7,$sp
3132*4882a593Smuzhiyun	stvx		v31,r3,$sp
3133*4882a593Smuzhiyun	li		r0,-1
3134*4882a593Smuzhiyun	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
3135*4882a593Smuzhiyun	li		$x10,0x10
3136*4882a593Smuzhiyun	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
3137*4882a593Smuzhiyun	li		$x20,0x20
3138*4882a593Smuzhiyun	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
3139*4882a593Smuzhiyun	li		$x30,0x30
3140*4882a593Smuzhiyun	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
3141*4882a593Smuzhiyun	li		$x40,0x40
3142*4882a593Smuzhiyun	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
3143*4882a593Smuzhiyun	li		$x50,0x50
3144*4882a593Smuzhiyun	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
3145*4882a593Smuzhiyun	li		$x60,0x60
3146*4882a593Smuzhiyun	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
3147*4882a593Smuzhiyun	li		$x70,0x70
3148*4882a593Smuzhiyun	mtspr		256,r0
3149*4882a593Smuzhiyun
3150*4882a593Smuzhiyun	subi		$rounds,$rounds,3	# -4 in total
3151*4882a593Smuzhiyun
3152*4882a593Smuzhiyun	lvx		$rndkey0,$x00,$key1	# load key schedule
3153*4882a593Smuzhiyun	lvx		v30,$x10,$key1
3154*4882a593Smuzhiyun	addi		$key1,$key1,0x20
3155*4882a593Smuzhiyun	lvx		v31,$x00,$key1
3156*4882a593Smuzhiyun	?vperm		$rndkey0,$rndkey0,v30,$keyperm
3157*4882a593Smuzhiyun	addi		$key_,$sp,$FRAME+15
3158*4882a593Smuzhiyun	mtctr		$rounds
3159*4882a593Smuzhiyun
3160*4882a593SmuzhiyunLoad_xts_dec_key:
3161*4882a593Smuzhiyun	?vperm		v24,v30,v31,$keyperm
3162*4882a593Smuzhiyun	lvx		v30,$x10,$key1
3163*4882a593Smuzhiyun	addi		$key1,$key1,0x20
3164*4882a593Smuzhiyun	stvx		v24,$x00,$key_		# off-load round[1]
3165*4882a593Smuzhiyun	?vperm		v25,v31,v30,$keyperm
3166*4882a593Smuzhiyun	lvx		v31,$x00,$key1
3167*4882a593Smuzhiyun	stvx		v25,$x10,$key_		# off-load round[2]
3168*4882a593Smuzhiyun	addi		$key_,$key_,0x20
3169*4882a593Smuzhiyun	bdnz		Load_xts_dec_key
3170*4882a593Smuzhiyun
3171*4882a593Smuzhiyun	lvx		v26,$x10,$key1
3172*4882a593Smuzhiyun	?vperm		v24,v30,v31,$keyperm
3173*4882a593Smuzhiyun	lvx		v27,$x20,$key1
3174*4882a593Smuzhiyun	stvx		v24,$x00,$key_		# off-load round[3]
3175*4882a593Smuzhiyun	?vperm		v25,v31,v26,$keyperm
3176*4882a593Smuzhiyun	lvx		v28,$x30,$key1
3177*4882a593Smuzhiyun	stvx		v25,$x10,$key_		# off-load round[4]
3178*4882a593Smuzhiyun	addi		$key_,$sp,$FRAME+15	# rewind $key_
3179*4882a593Smuzhiyun	?vperm		v26,v26,v27,$keyperm
3180*4882a593Smuzhiyun	lvx		v29,$x40,$key1
3181*4882a593Smuzhiyun	?vperm		v27,v27,v28,$keyperm
3182*4882a593Smuzhiyun	lvx		v30,$x50,$key1
3183*4882a593Smuzhiyun	?vperm		v28,v28,v29,$keyperm
3184*4882a593Smuzhiyun	lvx		v31,$x60,$key1
3185*4882a593Smuzhiyun	?vperm		v29,v29,v30,$keyperm
3186*4882a593Smuzhiyun	lvx		$twk5,$x70,$key1	# borrow $twk5
3187*4882a593Smuzhiyun	?vperm		v30,v30,v31,$keyperm
3188*4882a593Smuzhiyun	lvx		v24,$x00,$key_		# pre-load round[1]
3189*4882a593Smuzhiyun	?vperm		v31,v31,$twk5,$keyperm
3190*4882a593Smuzhiyun	lvx		v25,$x10,$key_		# pre-load round[2]
3191*4882a593Smuzhiyun
3192*4882a593Smuzhiyun	 vperm		$in0,$inout,$inptail,$inpperm
3193*4882a593Smuzhiyun	 subi		$inp,$inp,31		# undo "caller"
3194*4882a593Smuzhiyun	vxor		$twk0,$tweak,$rndkey0
3195*4882a593Smuzhiyun	vsrab		$tmp,$tweak,$seven	# next tweak value
3196*4882a593Smuzhiyun	vaddubm		$tweak,$tweak,$tweak
3197*4882a593Smuzhiyun	vsldoi		$tmp,$tmp,$tmp,15
3198*4882a593Smuzhiyun	vand		$tmp,$tmp,$eighty7
3199*4882a593Smuzhiyun	 vxor		$out0,$in0,$twk0
3200*4882a593Smuzhiyun	vxor		$tweak,$tweak,$tmp
3201*4882a593Smuzhiyun
3202*4882a593Smuzhiyun	 lvx_u		$in1,$x10,$inp
3203*4882a593Smuzhiyun	vxor		$twk1,$tweak,$rndkey0
3204*4882a593Smuzhiyun	vsrab		$tmp,$tweak,$seven	# next tweak value
3205*4882a593Smuzhiyun	vaddubm		$tweak,$tweak,$tweak
3206*4882a593Smuzhiyun	vsldoi		$tmp,$tmp,$tmp,15
3207*4882a593Smuzhiyun	 le?vperm	$in1,$in1,$in1,$leperm
3208*4882a593Smuzhiyun	vand		$tmp,$tmp,$eighty7
3209*4882a593Smuzhiyun	 vxor		$out1,$in1,$twk1
3210*4882a593Smuzhiyun	vxor		$tweak,$tweak,$tmp
3211*4882a593Smuzhiyun
3212*4882a593Smuzhiyun	 lvx_u		$in2,$x20,$inp
3213*4882a593Smuzhiyun	 andi.		$taillen,$len,15
3214*4882a593Smuzhiyun	vxor		$twk2,$tweak,$rndkey0
3215*4882a593Smuzhiyun	vsrab		$tmp,$tweak,$seven	# next tweak value
3216*4882a593Smuzhiyun	vaddubm		$tweak,$tweak,$tweak
3217*4882a593Smuzhiyun	vsldoi		$tmp,$tmp,$tmp,15
3218*4882a593Smuzhiyun	 le?vperm	$in2,$in2,$in2,$leperm
3219*4882a593Smuzhiyun	vand		$tmp,$tmp,$eighty7
3220*4882a593Smuzhiyun	 vxor		$out2,$in2,$twk2
3221*4882a593Smuzhiyun	vxor		$tweak,$tweak,$tmp
3222*4882a593Smuzhiyun
3223*4882a593Smuzhiyun	 lvx_u		$in3,$x30,$inp
3224*4882a593Smuzhiyun	 sub		$len,$len,$taillen
3225*4882a593Smuzhiyun	vxor		$twk3,$tweak,$rndkey0
3226*4882a593Smuzhiyun	vsrab		$tmp,$tweak,$seven	# next tweak value
3227*4882a593Smuzhiyun	vaddubm		$tweak,$tweak,$tweak
3228*4882a593Smuzhiyun	vsldoi		$tmp,$tmp,$tmp,15
3229*4882a593Smuzhiyun	 le?vperm	$in3,$in3,$in3,$leperm
3230*4882a593Smuzhiyun	vand		$tmp,$tmp,$eighty7
3231*4882a593Smuzhiyun	 vxor		$out3,$in3,$twk3
3232*4882a593Smuzhiyun	vxor		$tweak,$tweak,$tmp
3233*4882a593Smuzhiyun
3234*4882a593Smuzhiyun	 lvx_u		$in4,$x40,$inp
3235*4882a593Smuzhiyun	 subi		$len,$len,0x60
3236*4882a593Smuzhiyun	vxor		$twk4,$tweak,$rndkey0
3237*4882a593Smuzhiyun	vsrab		$tmp,$tweak,$seven	# next tweak value
3238*4882a593Smuzhiyun	vaddubm		$tweak,$tweak,$tweak
3239*4882a593Smuzhiyun	vsldoi		$tmp,$tmp,$tmp,15
3240*4882a593Smuzhiyun	 le?vperm	$in4,$in4,$in4,$leperm
3241*4882a593Smuzhiyun	vand		$tmp,$tmp,$eighty7
3242*4882a593Smuzhiyun	 vxor		$out4,$in4,$twk4
3243*4882a593Smuzhiyun	vxor		$tweak,$tweak,$tmp
3244*4882a593Smuzhiyun
3245*4882a593Smuzhiyun	 lvx_u		$in5,$x50,$inp
3246*4882a593Smuzhiyun	 addi		$inp,$inp,0x60
3247*4882a593Smuzhiyun	vxor		$twk5,$tweak,$rndkey0
3248*4882a593Smuzhiyun	vsrab		$tmp,$tweak,$seven	# next tweak value
3249*4882a593Smuzhiyun	vaddubm		$tweak,$tweak,$tweak
3250*4882a593Smuzhiyun	vsldoi		$tmp,$tmp,$tmp,15
3251*4882a593Smuzhiyun	 le?vperm	$in5,$in5,$in5,$leperm
3252*4882a593Smuzhiyun	vand		$tmp,$tmp,$eighty7
3253*4882a593Smuzhiyun	 vxor		$out5,$in5,$twk5
3254*4882a593Smuzhiyun	vxor		$tweak,$tweak,$tmp
3255*4882a593Smuzhiyun
3256*4882a593Smuzhiyun	vxor		v31,v31,$rndkey0
3257*4882a593Smuzhiyun	mtctr		$rounds
3258*4882a593Smuzhiyun	b		Loop_xts_dec6x
3259*4882a593Smuzhiyun
3260*4882a593Smuzhiyun.align	5
3261*4882a593SmuzhiyunLoop_xts_dec6x:
3262*4882a593Smuzhiyun	vncipher	$out0,$out0,v24
3263*4882a593Smuzhiyun	vncipher	$out1,$out1,v24
3264*4882a593Smuzhiyun	vncipher	$out2,$out2,v24
3265*4882a593Smuzhiyun	vncipher	$out3,$out3,v24
3266*4882a593Smuzhiyun	vncipher	$out4,$out4,v24
3267*4882a593Smuzhiyun	vncipher	$out5,$out5,v24
3268*4882a593Smuzhiyun	lvx		v24,$x20,$key_		# round[3]
3269*4882a593Smuzhiyun	addi		$key_,$key_,0x20
3270*4882a593Smuzhiyun
3271*4882a593Smuzhiyun	vncipher	$out0,$out0,v25
3272*4882a593Smuzhiyun	vncipher	$out1,$out1,v25
3273*4882a593Smuzhiyun	vncipher	$out2,$out2,v25
3274*4882a593Smuzhiyun	vncipher	$out3,$out3,v25
3275*4882a593Smuzhiyun	vncipher	$out4,$out4,v25
3276*4882a593Smuzhiyun	vncipher	$out5,$out5,v25
3277*4882a593Smuzhiyun	lvx		v25,$x10,$key_		# round[4]
3278*4882a593Smuzhiyun	bdnz		Loop_xts_dec6x
3279*4882a593Smuzhiyun
3280*4882a593Smuzhiyun	subic		$len,$len,96		# $len-=96
3281*4882a593Smuzhiyun	 vxor		$in0,$twk0,v31		# xor with last round key
3282*4882a593Smuzhiyun	vncipher	$out0,$out0,v24
3283*4882a593Smuzhiyun	vncipher	$out1,$out1,v24
3284*4882a593Smuzhiyun	 vsrab		$tmp,$tweak,$seven	# next tweak value
3285*4882a593Smuzhiyun	 vxor		$twk0,$tweak,$rndkey0
3286*4882a593Smuzhiyun	 vaddubm	$tweak,$tweak,$tweak
3287*4882a593Smuzhiyun	vncipher	$out2,$out2,v24
3288*4882a593Smuzhiyun	vncipher	$out3,$out3,v24
3289*4882a593Smuzhiyun	 vsldoi		$tmp,$tmp,$tmp,15
3290*4882a593Smuzhiyun	vncipher	$out4,$out4,v24
3291*4882a593Smuzhiyun	vncipher	$out5,$out5,v24
3292*4882a593Smuzhiyun
3293*4882a593Smuzhiyun	subfe.		r0,r0,r0		# borrow?-1:0
3294*4882a593Smuzhiyun	 vand		$tmp,$tmp,$eighty7
3295*4882a593Smuzhiyun	vncipher	$out0,$out0,v25
3296*4882a593Smuzhiyun	vncipher	$out1,$out1,v25
3297*4882a593Smuzhiyun	 vxor		$tweak,$tweak,$tmp
3298*4882a593Smuzhiyun	vncipher	$out2,$out2,v25
3299*4882a593Smuzhiyun	vncipher	$out3,$out3,v25
3300*4882a593Smuzhiyun	 vxor		$in1,$twk1,v31
3301*4882a593Smuzhiyun	 vsrab		$tmp,$tweak,$seven	# next tweak value
3302*4882a593Smuzhiyun	 vxor		$twk1,$tweak,$rndkey0
3303*4882a593Smuzhiyun	vncipher	$out4,$out4,v25
3304*4882a593Smuzhiyun	vncipher	$out5,$out5,v25
3305*4882a593Smuzhiyun
3306*4882a593Smuzhiyun	and		r0,r0,$len
3307*4882a593Smuzhiyun	 vaddubm	$tweak,$tweak,$tweak
3308*4882a593Smuzhiyun	 vsldoi		$tmp,$tmp,$tmp,15
3309*4882a593Smuzhiyun	vncipher	$out0,$out0,v26
3310*4882a593Smuzhiyun	vncipher	$out1,$out1,v26
3311*4882a593Smuzhiyun	 vand		$tmp,$tmp,$eighty7
3312*4882a593Smuzhiyun	vncipher	$out2,$out2,v26
3313*4882a593Smuzhiyun	vncipher	$out3,$out3,v26
3314*4882a593Smuzhiyun	 vxor		$tweak,$tweak,$tmp
3315*4882a593Smuzhiyun	vncipher	$out4,$out4,v26
3316*4882a593Smuzhiyun	vncipher	$out5,$out5,v26
3317*4882a593Smuzhiyun
3318*4882a593Smuzhiyun	add		$inp,$inp,r0		# $inp is adjusted in such
3319*4882a593Smuzhiyun						# way that at exit from the
3320*4882a593Smuzhiyun						# loop inX-in5 are loaded
3321*4882a593Smuzhiyun						# with last "words"
3322*4882a593Smuzhiyun	 vxor		$in2,$twk2,v31
3323*4882a593Smuzhiyun	 vsrab		$tmp,$tweak,$seven	# next tweak value
3324*4882a593Smuzhiyun	 vxor		$twk2,$tweak,$rndkey0
3325*4882a593Smuzhiyun	 vaddubm	$tweak,$tweak,$tweak
3326*4882a593Smuzhiyun	vncipher	$out0,$out0,v27
3327*4882a593Smuzhiyun	vncipher	$out1,$out1,v27
3328*4882a593Smuzhiyun	 vsldoi		$tmp,$tmp,$tmp,15
3329*4882a593Smuzhiyun	vncipher	$out2,$out2,v27
3330*4882a593Smuzhiyun	vncipher	$out3,$out3,v27
3331*4882a593Smuzhiyun	 vand		$tmp,$tmp,$eighty7
3332*4882a593Smuzhiyun	vncipher	$out4,$out4,v27
3333*4882a593Smuzhiyun	vncipher	$out5,$out5,v27
3334*4882a593Smuzhiyun
3335*4882a593Smuzhiyun	addi		$key_,$sp,$FRAME+15	# rewind $key_
3336*4882a593Smuzhiyun	 vxor		$tweak,$tweak,$tmp
3337*4882a593Smuzhiyun	vncipher	$out0,$out0,v28
3338*4882a593Smuzhiyun	vncipher	$out1,$out1,v28
3339*4882a593Smuzhiyun	 vxor		$in3,$twk3,v31
3340*4882a593Smuzhiyun	 vsrab		$tmp,$tweak,$seven	# next tweak value
3341*4882a593Smuzhiyun	 vxor		$twk3,$tweak,$rndkey0
3342*4882a593Smuzhiyun	vncipher	$out2,$out2,v28
3343*4882a593Smuzhiyun	vncipher	$out3,$out3,v28
3344*4882a593Smuzhiyun	 vaddubm	$tweak,$tweak,$tweak
3345*4882a593Smuzhiyun	 vsldoi		$tmp,$tmp,$tmp,15
3346*4882a593Smuzhiyun	vncipher	$out4,$out4,v28
3347*4882a593Smuzhiyun	vncipher	$out5,$out5,v28
3348*4882a593Smuzhiyun	lvx		v24,$x00,$key_		# re-pre-load round[1]
3349*4882a593Smuzhiyun	 vand		$tmp,$tmp,$eighty7
3350*4882a593Smuzhiyun
3351*4882a593Smuzhiyun	vncipher	$out0,$out0,v29
3352*4882a593Smuzhiyun	vncipher	$out1,$out1,v29
3353*4882a593Smuzhiyun	 vxor		$tweak,$tweak,$tmp
3354*4882a593Smuzhiyun	vncipher	$out2,$out2,v29
3355*4882a593Smuzhiyun	vncipher	$out3,$out3,v29
3356*4882a593Smuzhiyun	 vxor		$in4,$twk4,v31
3357*4882a593Smuzhiyun	 vsrab		$tmp,$tweak,$seven	# next tweak value
3358*4882a593Smuzhiyun	 vxor		$twk4,$tweak,$rndkey0
3359*4882a593Smuzhiyun	vncipher	$out4,$out4,v29
3360*4882a593Smuzhiyun	vncipher	$out5,$out5,v29
3361*4882a593Smuzhiyun	lvx		v25,$x10,$key_		# re-pre-load round[2]
3362*4882a593Smuzhiyun	 vaddubm	$tweak,$tweak,$tweak
3363*4882a593Smuzhiyun	 vsldoi		$tmp,$tmp,$tmp,15
3364*4882a593Smuzhiyun
3365*4882a593Smuzhiyun	vncipher	$out0,$out0,v30
3366*4882a593Smuzhiyun	vncipher	$out1,$out1,v30
3367*4882a593Smuzhiyun	 vand		$tmp,$tmp,$eighty7
3368*4882a593Smuzhiyun	vncipher	$out2,$out2,v30
3369*4882a593Smuzhiyun	vncipher	$out3,$out3,v30
3370*4882a593Smuzhiyun	 vxor		$tweak,$tweak,$tmp
3371*4882a593Smuzhiyun	vncipher	$out4,$out4,v30
3372*4882a593Smuzhiyun	vncipher	$out5,$out5,v30
3373*4882a593Smuzhiyun	 vxor		$in5,$twk5,v31
3374*4882a593Smuzhiyun	 vsrab		$tmp,$tweak,$seven	# next tweak value
3375*4882a593Smuzhiyun	 vxor		$twk5,$tweak,$rndkey0
3376*4882a593Smuzhiyun
3377*4882a593Smuzhiyun	vncipherlast	$out0,$out0,$in0
3378*4882a593Smuzhiyun	 lvx_u		$in0,$x00,$inp		# load next input block
3379*4882a593Smuzhiyun	 vaddubm	$tweak,$tweak,$tweak
3380*4882a593Smuzhiyun	 vsldoi		$tmp,$tmp,$tmp,15
3381*4882a593Smuzhiyun	vncipherlast	$out1,$out1,$in1
3382*4882a593Smuzhiyun	 lvx_u		$in1,$x10,$inp
3383*4882a593Smuzhiyun	vncipherlast	$out2,$out2,$in2
3384*4882a593Smuzhiyun	 le?vperm	$in0,$in0,$in0,$leperm
3385*4882a593Smuzhiyun	 lvx_u		$in2,$x20,$inp
3386*4882a593Smuzhiyun	 vand		$tmp,$tmp,$eighty7
3387*4882a593Smuzhiyun	vncipherlast	$out3,$out3,$in3
3388*4882a593Smuzhiyun	 le?vperm	$in1,$in1,$in1,$leperm
3389*4882a593Smuzhiyun	 lvx_u		$in3,$x30,$inp
3390*4882a593Smuzhiyun	vncipherlast	$out4,$out4,$in4
3391*4882a593Smuzhiyun	 le?vperm	$in2,$in2,$in2,$leperm
3392*4882a593Smuzhiyun	 lvx_u		$in4,$x40,$inp
3393*4882a593Smuzhiyun	 vxor		$tweak,$tweak,$tmp
3394*4882a593Smuzhiyun	vncipherlast	$out5,$out5,$in5
3395*4882a593Smuzhiyun	 le?vperm	$in3,$in3,$in3,$leperm
3396*4882a593Smuzhiyun	 lvx_u		$in5,$x50,$inp
3397*4882a593Smuzhiyun	 addi		$inp,$inp,0x60
3398*4882a593Smuzhiyun	 le?vperm	$in4,$in4,$in4,$leperm
3399*4882a593Smuzhiyun	 le?vperm	$in5,$in5,$in5,$leperm
3400*4882a593Smuzhiyun
3401*4882a593Smuzhiyun	le?vperm	$out0,$out0,$out0,$leperm
3402*4882a593Smuzhiyun	le?vperm	$out1,$out1,$out1,$leperm
3403*4882a593Smuzhiyun	stvx_u		$out0,$x00,$out		# store output
3404*4882a593Smuzhiyun	 vxor		$out0,$in0,$twk0
3405*4882a593Smuzhiyun	le?vperm	$out2,$out2,$out2,$leperm
3406*4882a593Smuzhiyun	stvx_u		$out1,$x10,$out
3407*4882a593Smuzhiyun	 vxor		$out1,$in1,$twk1
3408*4882a593Smuzhiyun	le?vperm	$out3,$out3,$out3,$leperm
3409*4882a593Smuzhiyun	stvx_u		$out2,$x20,$out
3410*4882a593Smuzhiyun	 vxor		$out2,$in2,$twk2
3411*4882a593Smuzhiyun	le?vperm	$out4,$out4,$out4,$leperm
3412*4882a593Smuzhiyun	stvx_u		$out3,$x30,$out
3413*4882a593Smuzhiyun	 vxor		$out3,$in3,$twk3
3414*4882a593Smuzhiyun	le?vperm	$out5,$out5,$out5,$leperm
3415*4882a593Smuzhiyun	stvx_u		$out4,$x40,$out
3416*4882a593Smuzhiyun	 vxor		$out4,$in4,$twk4
3417*4882a593Smuzhiyun	stvx_u		$out5,$x50,$out
3418*4882a593Smuzhiyun	 vxor		$out5,$in5,$twk5
3419*4882a593Smuzhiyun	addi		$out,$out,0x60
3420*4882a593Smuzhiyun
3421*4882a593Smuzhiyun	mtctr		$rounds
3422*4882a593Smuzhiyun	beq		Loop_xts_dec6x		# did $len-=96 borrow?
3423*4882a593Smuzhiyun
3424*4882a593Smuzhiyun	addic.		$len,$len,0x60
3425*4882a593Smuzhiyun	beq		Lxts_dec6x_zero
3426*4882a593Smuzhiyun	cmpwi		$len,0x20
3427*4882a593Smuzhiyun	blt		Lxts_dec6x_one
3428*4882a593Smuzhiyun	nop
3429*4882a593Smuzhiyun	beq		Lxts_dec6x_two
3430*4882a593Smuzhiyun	cmpwi		$len,0x40
3431*4882a593Smuzhiyun	blt		Lxts_dec6x_three
3432*4882a593Smuzhiyun	nop
3433*4882a593Smuzhiyun	beq		Lxts_dec6x_four
3434*4882a593Smuzhiyun
3435*4882a593SmuzhiyunLxts_dec6x_five:
3436*4882a593Smuzhiyun	vxor		$out0,$in1,$twk0
3437*4882a593Smuzhiyun	vxor		$out1,$in2,$twk1
3438*4882a593Smuzhiyun	vxor		$out2,$in3,$twk2
3439*4882a593Smuzhiyun	vxor		$out3,$in4,$twk3
3440*4882a593Smuzhiyun	vxor		$out4,$in5,$twk4
3441*4882a593Smuzhiyun
3442*4882a593Smuzhiyun	bl		_aesp8_xts_dec5x
3443*4882a593Smuzhiyun
3444*4882a593Smuzhiyun	le?vperm	$out0,$out0,$out0,$leperm
3445*4882a593Smuzhiyun	vmr		$twk0,$twk5		# unused tweak
3446*4882a593Smuzhiyun	vxor		$twk1,$tweak,$rndkey0
3447*4882a593Smuzhiyun	le?vperm	$out1,$out1,$out1,$leperm
3448*4882a593Smuzhiyun	stvx_u		$out0,$x00,$out		# store output
3449*4882a593Smuzhiyun	vxor		$out0,$in0,$twk1
3450*4882a593Smuzhiyun	le?vperm	$out2,$out2,$out2,$leperm
3451*4882a593Smuzhiyun	stvx_u		$out1,$x10,$out
3452*4882a593Smuzhiyun	le?vperm	$out3,$out3,$out3,$leperm
3453*4882a593Smuzhiyun	stvx_u		$out2,$x20,$out
3454*4882a593Smuzhiyun	le?vperm	$out4,$out4,$out4,$leperm
3455*4882a593Smuzhiyun	stvx_u		$out3,$x30,$out
3456*4882a593Smuzhiyun	stvx_u		$out4,$x40,$out
3457*4882a593Smuzhiyun	addi		$out,$out,0x50
3458*4882a593Smuzhiyun	bne		Lxts_dec6x_steal
3459*4882a593Smuzhiyun	b		Lxts_dec6x_done
3460*4882a593Smuzhiyun
3461*4882a593Smuzhiyun.align	4
3462*4882a593SmuzhiyunLxts_dec6x_four:
3463*4882a593Smuzhiyun	vxor		$out0,$in2,$twk0
3464*4882a593Smuzhiyun	vxor		$out1,$in3,$twk1
3465*4882a593Smuzhiyun	vxor		$out2,$in4,$twk2
3466*4882a593Smuzhiyun	vxor		$out3,$in5,$twk3
3467*4882a593Smuzhiyun	vxor		$out4,$out4,$out4
3468*4882a593Smuzhiyun
3469*4882a593Smuzhiyun	bl		_aesp8_xts_dec5x
3470*4882a593Smuzhiyun
3471*4882a593Smuzhiyun	le?vperm	$out0,$out0,$out0,$leperm
3472*4882a593Smuzhiyun	vmr		$twk0,$twk4		# unused tweak
3473*4882a593Smuzhiyun	vmr		$twk1,$twk5
3474*4882a593Smuzhiyun	le?vperm	$out1,$out1,$out1,$leperm
3475*4882a593Smuzhiyun	stvx_u		$out0,$x00,$out		# store output
3476*4882a593Smuzhiyun	vxor		$out0,$in0,$twk5
3477*4882a593Smuzhiyun	le?vperm	$out2,$out2,$out2,$leperm
3478*4882a593Smuzhiyun	stvx_u		$out1,$x10,$out
3479*4882a593Smuzhiyun	le?vperm	$out3,$out3,$out3,$leperm
3480*4882a593Smuzhiyun	stvx_u		$out2,$x20,$out
3481*4882a593Smuzhiyun	stvx_u		$out3,$x30,$out
3482*4882a593Smuzhiyun	addi		$out,$out,0x40
3483*4882a593Smuzhiyun	bne		Lxts_dec6x_steal
3484*4882a593Smuzhiyun	b		Lxts_dec6x_done
3485*4882a593Smuzhiyun
3486*4882a593Smuzhiyun.align	4
3487*4882a593SmuzhiyunLxts_dec6x_three:
3488*4882a593Smuzhiyun	vxor		$out0,$in3,$twk0
3489*4882a593Smuzhiyun	vxor		$out1,$in4,$twk1
3490*4882a593Smuzhiyun	vxor		$out2,$in5,$twk2
3491*4882a593Smuzhiyun	vxor		$out3,$out3,$out3
3492*4882a593Smuzhiyun	vxor		$out4,$out4,$out4
3493*4882a593Smuzhiyun
3494*4882a593Smuzhiyun	bl		_aesp8_xts_dec5x
3495*4882a593Smuzhiyun
3496*4882a593Smuzhiyun	le?vperm	$out0,$out0,$out0,$leperm
3497*4882a593Smuzhiyun	vmr		$twk0,$twk3		# unused tweak
3498*4882a593Smuzhiyun	vmr		$twk1,$twk4
3499*4882a593Smuzhiyun	le?vperm	$out1,$out1,$out1,$leperm
3500*4882a593Smuzhiyun	stvx_u		$out0,$x00,$out		# store output
3501*4882a593Smuzhiyun	vxor		$out0,$in0,$twk4
3502*4882a593Smuzhiyun	le?vperm	$out2,$out2,$out2,$leperm
3503*4882a593Smuzhiyun	stvx_u		$out1,$x10,$out
3504*4882a593Smuzhiyun	stvx_u		$out2,$x20,$out
3505*4882a593Smuzhiyun	addi		$out,$out,0x30
3506*4882a593Smuzhiyun	bne		Lxts_dec6x_steal
3507*4882a593Smuzhiyun	b		Lxts_dec6x_done
3508*4882a593Smuzhiyun
3509*4882a593Smuzhiyun.align	4
3510*4882a593SmuzhiyunLxts_dec6x_two:
3511*4882a593Smuzhiyun	vxor		$out0,$in4,$twk0
3512*4882a593Smuzhiyun	vxor		$out1,$in5,$twk1
3513*4882a593Smuzhiyun	vxor		$out2,$out2,$out2
3514*4882a593Smuzhiyun	vxor		$out3,$out3,$out3
3515*4882a593Smuzhiyun	vxor		$out4,$out4,$out4
3516*4882a593Smuzhiyun
3517*4882a593Smuzhiyun	bl		_aesp8_xts_dec5x
3518*4882a593Smuzhiyun
3519*4882a593Smuzhiyun	le?vperm	$out0,$out0,$out0,$leperm
3520*4882a593Smuzhiyun	vmr		$twk0,$twk2		# unused tweak
3521*4882a593Smuzhiyun	vmr		$twk1,$twk3
3522*4882a593Smuzhiyun	le?vperm	$out1,$out1,$out1,$leperm
3523*4882a593Smuzhiyun	stvx_u		$out0,$x00,$out		# store output
3524*4882a593Smuzhiyun	vxor		$out0,$in0,$twk3
3525*4882a593Smuzhiyun	stvx_u		$out1,$x10,$out
3526*4882a593Smuzhiyun	addi		$out,$out,0x20
3527*4882a593Smuzhiyun	bne		Lxts_dec6x_steal
3528*4882a593Smuzhiyun	b		Lxts_dec6x_done
3529*4882a593Smuzhiyun
3530*4882a593Smuzhiyun.align	4
3531*4882a593SmuzhiyunLxts_dec6x_one:
3532*4882a593Smuzhiyun	vxor		$out0,$in5,$twk0
3533*4882a593Smuzhiyun	nop
3534*4882a593SmuzhiyunLoop_xts_dec1x:
3535*4882a593Smuzhiyun	vncipher	$out0,$out0,v24
3536*4882a593Smuzhiyun	lvx		v24,$x20,$key_		# round[3]
3537*4882a593Smuzhiyun	addi		$key_,$key_,0x20
3538*4882a593Smuzhiyun
3539*4882a593Smuzhiyun	vncipher	$out0,$out0,v25
3540*4882a593Smuzhiyun	lvx		v25,$x10,$key_		# round[4]
3541*4882a593Smuzhiyun	bdnz		Loop_xts_dec1x
3542*4882a593Smuzhiyun
3543*4882a593Smuzhiyun	subi		r0,$taillen,1
3544*4882a593Smuzhiyun	vncipher	$out0,$out0,v24
3545*4882a593Smuzhiyun
3546*4882a593Smuzhiyun	andi.		r0,r0,16
3547*4882a593Smuzhiyun	cmpwi		$taillen,0
3548*4882a593Smuzhiyun	vncipher	$out0,$out0,v25
3549*4882a593Smuzhiyun
3550*4882a593Smuzhiyun	sub		$inp,$inp,r0
3551*4882a593Smuzhiyun	vncipher	$out0,$out0,v26
3552*4882a593Smuzhiyun
3553*4882a593Smuzhiyun	lvx_u		$in0,0,$inp
3554*4882a593Smuzhiyun	vncipher	$out0,$out0,v27
3555*4882a593Smuzhiyun
3556*4882a593Smuzhiyun	addi		$key_,$sp,$FRAME+15	# rewind $key_
3557*4882a593Smuzhiyun	vncipher	$out0,$out0,v28
3558*4882a593Smuzhiyun	lvx		v24,$x00,$key_		# re-pre-load round[1]
3559*4882a593Smuzhiyun
3560*4882a593Smuzhiyun	vncipher	$out0,$out0,v29
3561*4882a593Smuzhiyun	lvx		v25,$x10,$key_		# re-pre-load round[2]
3562*4882a593Smuzhiyun	 vxor		$twk0,$twk0,v31
3563*4882a593Smuzhiyun
3564*4882a593Smuzhiyun	le?vperm	$in0,$in0,$in0,$leperm
3565*4882a593Smuzhiyun	vncipher	$out0,$out0,v30
3566*4882a593Smuzhiyun
3567*4882a593Smuzhiyun	mtctr		$rounds
3568*4882a593Smuzhiyun	vncipherlast	$out0,$out0,$twk0
3569*4882a593Smuzhiyun
3570*4882a593Smuzhiyun	vmr		$twk0,$twk1		# unused tweak
3571*4882a593Smuzhiyun	vmr		$twk1,$twk2
3572*4882a593Smuzhiyun	le?vperm	$out0,$out0,$out0,$leperm
3573*4882a593Smuzhiyun	stvx_u		$out0,$x00,$out		# store output
3574*4882a593Smuzhiyun	addi		$out,$out,0x10
3575*4882a593Smuzhiyun	vxor		$out0,$in0,$twk2
3576*4882a593Smuzhiyun	bne		Lxts_dec6x_steal
3577*4882a593Smuzhiyun	b		Lxts_dec6x_done
3578*4882a593Smuzhiyun
3579*4882a593Smuzhiyun.align	4
3580*4882a593SmuzhiyunLxts_dec6x_zero:
3581*4882a593Smuzhiyun	cmpwi		$taillen,0
3582*4882a593Smuzhiyun	beq		Lxts_dec6x_done
3583*4882a593Smuzhiyun
3584*4882a593Smuzhiyun	lvx_u		$in0,0,$inp
3585*4882a593Smuzhiyun	le?vperm	$in0,$in0,$in0,$leperm
3586*4882a593Smuzhiyun	vxor		$out0,$in0,$twk1
3587*4882a593SmuzhiyunLxts_dec6x_steal:
3588*4882a593Smuzhiyun	vncipher	$out0,$out0,v24
3589*4882a593Smuzhiyun	lvx		v24,$x20,$key_		# round[3]
3590*4882a593Smuzhiyun	addi		$key_,$key_,0x20
3591*4882a593Smuzhiyun
3592*4882a593Smuzhiyun	vncipher	$out0,$out0,v25
3593*4882a593Smuzhiyun	lvx		v25,$x10,$key_		# round[4]
3594*4882a593Smuzhiyun	bdnz		Lxts_dec6x_steal
3595*4882a593Smuzhiyun
3596*4882a593Smuzhiyun	add		$inp,$inp,$taillen
3597*4882a593Smuzhiyun	vncipher	$out0,$out0,v24
3598*4882a593Smuzhiyun
3599*4882a593Smuzhiyun	cmpwi		$taillen,0
3600*4882a593Smuzhiyun	vncipher	$out0,$out0,v25
3601*4882a593Smuzhiyun
3602*4882a593Smuzhiyun	lvx_u		$in0,0,$inp
3603*4882a593Smuzhiyun	vncipher	$out0,$out0,v26
3604*4882a593Smuzhiyun
3605*4882a593Smuzhiyun	lvsr		$inpperm,0,$taillen	# $in5 is no more
3606*4882a593Smuzhiyun	vncipher	$out0,$out0,v27
3607*4882a593Smuzhiyun
3608*4882a593Smuzhiyun	addi		$key_,$sp,$FRAME+15	# rewind $key_
3609*4882a593Smuzhiyun	vncipher	$out0,$out0,v28
3610*4882a593Smuzhiyun	lvx		v24,$x00,$key_		# re-pre-load round[1]
3611*4882a593Smuzhiyun
3612*4882a593Smuzhiyun	vncipher	$out0,$out0,v29
3613*4882a593Smuzhiyun	lvx		v25,$x10,$key_		# re-pre-load round[2]
3614*4882a593Smuzhiyun	 vxor		$twk1,$twk1,v31
3615*4882a593Smuzhiyun
3616*4882a593Smuzhiyun	le?vperm	$in0,$in0,$in0,$leperm
3617*4882a593Smuzhiyun	vncipher	$out0,$out0,v30
3618*4882a593Smuzhiyun
3619*4882a593Smuzhiyun	vperm		$in0,$in0,$in0,$inpperm
3620*4882a593Smuzhiyun	vncipherlast	$tmp,$out0,$twk1
3621*4882a593Smuzhiyun
3622*4882a593Smuzhiyun	le?vperm	$out0,$tmp,$tmp,$leperm
3623*4882a593Smuzhiyun	le?stvx_u	$out0,0,$out
3624*4882a593Smuzhiyun	be?stvx_u	$tmp,0,$out
3625*4882a593Smuzhiyun
3626*4882a593Smuzhiyun	vxor		$out0,$out0,$out0
3627*4882a593Smuzhiyun	vspltisb	$out1,-1
3628*4882a593Smuzhiyun	vperm		$out0,$out0,$out1,$inpperm
3629*4882a593Smuzhiyun	vsel		$out0,$in0,$tmp,$out0
3630*4882a593Smuzhiyun	vxor		$out0,$out0,$twk0
3631*4882a593Smuzhiyun
3632*4882a593Smuzhiyun	subi		r30,$out,1
3633*4882a593Smuzhiyun	mtctr		$taillen
3634*4882a593SmuzhiyunLoop_xts_dec6x_steal:
3635*4882a593Smuzhiyun	lbzu		r0,1(r30)
3636*4882a593Smuzhiyun	stb		r0,16(r30)
3637*4882a593Smuzhiyun	bdnz		Loop_xts_dec6x_steal
3638*4882a593Smuzhiyun
3639*4882a593Smuzhiyun	li		$taillen,0
3640*4882a593Smuzhiyun	mtctr		$rounds
3641*4882a593Smuzhiyun	b		Loop_xts_dec1x		# one more time...
3642*4882a593Smuzhiyun
3643*4882a593Smuzhiyun.align	4
3644*4882a593SmuzhiyunLxts_dec6x_done:
3645*4882a593Smuzhiyun	${UCMP}i	$ivp,0
3646*4882a593Smuzhiyun	beq		Lxts_dec6x_ret
3647*4882a593Smuzhiyun
3648*4882a593Smuzhiyun	vxor		$tweak,$twk0,$rndkey0
3649*4882a593Smuzhiyun	le?vperm	$tweak,$tweak,$tweak,$leperm
3650*4882a593Smuzhiyun	stvx_u		$tweak,0,$ivp
3651*4882a593Smuzhiyun
3652*4882a593SmuzhiyunLxts_dec6x_ret:
3653*4882a593Smuzhiyun	mtlr		r11
3654*4882a593Smuzhiyun	li		r10,`$FRAME+15`
3655*4882a593Smuzhiyun	li		r11,`$FRAME+31`
3656*4882a593Smuzhiyun	stvx		$seven,r10,$sp		# wipe copies of round keys
3657*4882a593Smuzhiyun	addi		r10,r10,32
3658*4882a593Smuzhiyun	stvx		$seven,r11,$sp
3659*4882a593Smuzhiyun	addi		r11,r11,32
3660*4882a593Smuzhiyun	stvx		$seven,r10,$sp
3661*4882a593Smuzhiyun	addi		r10,r10,32
3662*4882a593Smuzhiyun	stvx		$seven,r11,$sp
3663*4882a593Smuzhiyun	addi		r11,r11,32
3664*4882a593Smuzhiyun	stvx		$seven,r10,$sp
3665*4882a593Smuzhiyun	addi		r10,r10,32
3666*4882a593Smuzhiyun	stvx		$seven,r11,$sp
3667*4882a593Smuzhiyun	addi		r11,r11,32
3668*4882a593Smuzhiyun	stvx		$seven,r10,$sp
3669*4882a593Smuzhiyun	addi		r10,r10,32
3670*4882a593Smuzhiyun	stvx		$seven,r11,$sp
3671*4882a593Smuzhiyun	addi		r11,r11,32
3672*4882a593Smuzhiyun
3673*4882a593Smuzhiyun	mtspr		256,$vrsave
3674*4882a593Smuzhiyun	lvx		v20,r10,$sp		# ABI says so
3675*4882a593Smuzhiyun	addi		r10,r10,32
3676*4882a593Smuzhiyun	lvx		v21,r11,$sp
3677*4882a593Smuzhiyun	addi		r11,r11,32
3678*4882a593Smuzhiyun	lvx		v22,r10,$sp
3679*4882a593Smuzhiyun	addi		r10,r10,32
3680*4882a593Smuzhiyun	lvx		v23,r11,$sp
3681*4882a593Smuzhiyun	addi		r11,r11,32
3682*4882a593Smuzhiyun	lvx		v24,r10,$sp
3683*4882a593Smuzhiyun	addi		r10,r10,32
3684*4882a593Smuzhiyun	lvx		v25,r11,$sp
3685*4882a593Smuzhiyun	addi		r11,r11,32
3686*4882a593Smuzhiyun	lvx		v26,r10,$sp
3687*4882a593Smuzhiyun	addi		r10,r10,32
3688*4882a593Smuzhiyun	lvx		v27,r11,$sp
3689*4882a593Smuzhiyun	addi		r11,r11,32
3690*4882a593Smuzhiyun	lvx		v28,r10,$sp
3691*4882a593Smuzhiyun	addi		r10,r10,32
3692*4882a593Smuzhiyun	lvx		v29,r11,$sp
3693*4882a593Smuzhiyun	addi		r11,r11,32
3694*4882a593Smuzhiyun	lvx		v30,r10,$sp
3695*4882a593Smuzhiyun	lvx		v31,r11,$sp
3696*4882a593Smuzhiyun	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
3697*4882a593Smuzhiyun	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
3698*4882a593Smuzhiyun	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
3699*4882a593Smuzhiyun	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
3700*4882a593Smuzhiyun	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
3701*4882a593Smuzhiyun	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
3702*4882a593Smuzhiyun	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
3703*4882a593Smuzhiyun	blr
3704*4882a593Smuzhiyun	.long		0
3705*4882a593Smuzhiyun	.byte		0,12,0x04,1,0x80,6,6,0
3706*4882a593Smuzhiyun	.long		0
3707*4882a593Smuzhiyun
3708*4882a593Smuzhiyun.align	5
3709*4882a593Smuzhiyun_aesp8_xts_dec5x:
3710*4882a593Smuzhiyun	vncipher	$out0,$out0,v24
3711*4882a593Smuzhiyun	vncipher	$out1,$out1,v24
3712*4882a593Smuzhiyun	vncipher	$out2,$out2,v24
3713*4882a593Smuzhiyun	vncipher	$out3,$out3,v24
3714*4882a593Smuzhiyun	vncipher	$out4,$out4,v24
3715*4882a593Smuzhiyun	lvx		v24,$x20,$key_		# round[3]
3716*4882a593Smuzhiyun	addi		$key_,$key_,0x20
3717*4882a593Smuzhiyun
3718*4882a593Smuzhiyun	vncipher	$out0,$out0,v25
3719*4882a593Smuzhiyun	vncipher	$out1,$out1,v25
3720*4882a593Smuzhiyun	vncipher	$out2,$out2,v25
3721*4882a593Smuzhiyun	vncipher	$out3,$out3,v25
3722*4882a593Smuzhiyun	vncipher	$out4,$out4,v25
3723*4882a593Smuzhiyun	lvx		v25,$x10,$key_		# round[4]
3724*4882a593Smuzhiyun	bdnz		_aesp8_xts_dec5x
3725*4882a593Smuzhiyun
3726*4882a593Smuzhiyun	subi		r0,$taillen,1
3727*4882a593Smuzhiyun	vncipher	$out0,$out0,v24
3728*4882a593Smuzhiyun	vncipher	$out1,$out1,v24
3729*4882a593Smuzhiyun	vncipher	$out2,$out2,v24
3730*4882a593Smuzhiyun	vncipher	$out3,$out3,v24
3731*4882a593Smuzhiyun	vncipher	$out4,$out4,v24
3732*4882a593Smuzhiyun
3733*4882a593Smuzhiyun	andi.		r0,r0,16
3734*4882a593Smuzhiyun	cmpwi		$taillen,0
3735*4882a593Smuzhiyun	vncipher	$out0,$out0,v25
3736*4882a593Smuzhiyun	vncipher	$out1,$out1,v25
3737*4882a593Smuzhiyun	vncipher	$out2,$out2,v25
3738*4882a593Smuzhiyun	vncipher	$out3,$out3,v25
3739*4882a593Smuzhiyun	vncipher	$out4,$out4,v25
3740*4882a593Smuzhiyun	 vxor		$twk0,$twk0,v31
3741*4882a593Smuzhiyun
3742*4882a593Smuzhiyun	sub		$inp,$inp,r0
3743*4882a593Smuzhiyun	vncipher	$out0,$out0,v26
3744*4882a593Smuzhiyun	vncipher	$out1,$out1,v26
3745*4882a593Smuzhiyun	vncipher	$out2,$out2,v26
3746*4882a593Smuzhiyun	vncipher	$out3,$out3,v26
3747*4882a593Smuzhiyun	vncipher	$out4,$out4,v26
3748*4882a593Smuzhiyun	 vxor		$in1,$twk1,v31
3749*4882a593Smuzhiyun
3750*4882a593Smuzhiyun	vncipher	$out0,$out0,v27
3751*4882a593Smuzhiyun	lvx_u		$in0,0,$inp
3752*4882a593Smuzhiyun	vncipher	$out1,$out1,v27
3753*4882a593Smuzhiyun	vncipher	$out2,$out2,v27
3754*4882a593Smuzhiyun	vncipher	$out3,$out3,v27
3755*4882a593Smuzhiyun	vncipher	$out4,$out4,v27
3756*4882a593Smuzhiyun	 vxor		$in2,$twk2,v31
3757*4882a593Smuzhiyun
3758*4882a593Smuzhiyun	addi		$key_,$sp,$FRAME+15	# rewind $key_
3759*4882a593Smuzhiyun	vncipher	$out0,$out0,v28
3760*4882a593Smuzhiyun	vncipher	$out1,$out1,v28
3761*4882a593Smuzhiyun	vncipher	$out2,$out2,v28
3762*4882a593Smuzhiyun	vncipher	$out3,$out3,v28
3763*4882a593Smuzhiyun	vncipher	$out4,$out4,v28
3764*4882a593Smuzhiyun	lvx		v24,$x00,$key_		# re-pre-load round[1]
3765*4882a593Smuzhiyun	 vxor		$in3,$twk3,v31
3766*4882a593Smuzhiyun
3767*4882a593Smuzhiyun	vncipher	$out0,$out0,v29
3768*4882a593Smuzhiyun	le?vperm	$in0,$in0,$in0,$leperm
3769*4882a593Smuzhiyun	vncipher	$out1,$out1,v29
3770*4882a593Smuzhiyun	vncipher	$out2,$out2,v29
3771*4882a593Smuzhiyun	vncipher	$out3,$out3,v29
3772*4882a593Smuzhiyun	vncipher	$out4,$out4,v29
3773*4882a593Smuzhiyun	lvx		v25,$x10,$key_		# re-pre-load round[2]
3774*4882a593Smuzhiyun	 vxor		$in4,$twk4,v31
3775*4882a593Smuzhiyun
3776*4882a593Smuzhiyun	vncipher	$out0,$out0,v30
3777*4882a593Smuzhiyun	vncipher	$out1,$out1,v30
3778*4882a593Smuzhiyun	vncipher	$out2,$out2,v30
3779*4882a593Smuzhiyun	vncipher	$out3,$out3,v30
3780*4882a593Smuzhiyun	vncipher	$out4,$out4,v30
3781*4882a593Smuzhiyun
3782*4882a593Smuzhiyun	vncipherlast	$out0,$out0,$twk0
3783*4882a593Smuzhiyun	vncipherlast	$out1,$out1,$in1
3784*4882a593Smuzhiyun	vncipherlast	$out2,$out2,$in2
3785*4882a593Smuzhiyun	vncipherlast	$out3,$out3,$in3
3786*4882a593Smuzhiyun	vncipherlast	$out4,$out4,$in4
3787*4882a593Smuzhiyun	mtctr		$rounds
3788*4882a593Smuzhiyun	blr
3789*4882a593Smuzhiyun        .long   	0
3790*4882a593Smuzhiyun        .byte   	0,12,0x14,0,0,0,0,0
3791*4882a593Smuzhiyun___
3792*4882a593Smuzhiyun}}	}}}
3793*4882a593Smuzhiyun
# Post-process the accumulated $code one line at a time and print it:
#   1. evaluate every `...` expression embedded in the line,
#   2. rewrite tagged constant-table data for the target endianness,
#   3. resolve the le?/be?/? endian markers on instructions.
# $consts stays true until a line matching "Lconsts:" has been seen; only
# while it is true are .long/.byte lines considered table data.
3794*4882a593Smuzhiyunmy $consts=1;
3795*4882a593Smuzhiyunforeach(split("\n",$code)) {
	# Replace each backtick-delimited expression with the value eval()
	# returns for its text (e.g. frame-offset arithmetic in backticks).
3796*4882a593Smuzhiyun        s/\`([^\`]*)\`/eval($1)/geo;
3797*4882a593Smuzhiyun
3798*4882a593Smuzhiyun	# constants table endian-specific conversion
	# A table line is ".long" or ".byte" ($1), a comma-separated value
	# list ($2) and a trailing "?tag" ($3) naming the conversion.
3799*4882a593Smuzhiyun	if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) {
3800*4882a593Smuzhiyun	    my $conv=$3;
3801*4882a593Smuzhiyun	    my @bytes=();
3802*4882a593Smuzhiyun
3803*4882a593Smuzhiyun	    # convert to endian-agnostic format
	    # Each .long value is split into four bytes, most significant
	    # first.  Values with a leading 0 are parsed by oct() (which also
	    # accepts 0x hex), all others by int().
3804*4882a593Smuzhiyun	    if ($1 eq "long") {
3805*4882a593Smuzhiyun	      foreach (split(/,\s*/,$2)) {
3806*4882a593Smuzhiyun		my $l = /^0/?oct:int;
3807*4882a593Smuzhiyun		push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
3808*4882a593Smuzhiyun	      }
3809*4882a593Smuzhiyun	    } else {
3810*4882a593Smuzhiyun		@bytes = map(/^0/?oct:int,split(/,\s*/,$2));
3811*4882a593Smuzhiyun	    }
3812*4882a593Smuzhiyun
3813*4882a593Smuzhiyun	    # little-endian conversion
	    # Applied only when $flavour ends in "le": ?inv XORs every byte
	    # with 0xf, ?rev reverses the byte order; any other tag leaves
	    # the bytes as they are.
3814*4882a593Smuzhiyun	    if ($flavour =~ /le$/o) {
3815*4882a593Smuzhiyun		SWITCH: for($conv)  {
3816*4882a593Smuzhiyun		    /\?inv/ && do   { @bytes=map($_^0xf,@bytes); last; };
3817*4882a593Smuzhiyun		    /\?rev/ && do   { @bytes=reverse(@bytes);    last; };
3818*4882a593Smuzhiyun		}
3819*4882a593Smuzhiyun	    }
3820*4882a593Smuzhiyun
3821*4882a593Smuzhiyun	    #emit
	    # The data is always printed as a .byte list; "next" skips the
	    # generic marker handling and the plain print further down.
3822*4882a593Smuzhiyun	    print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
3823*4882a593Smuzhiyun	    next;
3824*4882a593Smuzhiyun	}
3825*4882a593Smuzhiyun	$consts=0 if (m/Lconsts:/o);	# end of table
3826*4882a593Smuzhiyun
3827*4882a593Smuzhiyun	# instructions prefixed with '?' are endian-specific and need
3828*4882a593Smuzhiyun	# to be adjusted accordingly...
	# The substitutions of each chain are joined with "or", so at most
	# one of them (the first that matches) is applied to a line.
	#   little-endian: drop "le?", turn "be?" into "#be#" (presumably so
	#     the assembler sees a comment), swap ?lvsr and ?lvsl, swap the
	#     first two source operands of ?vperm and ?vsldoi (the latter
	#     with its shift N rewritten as "16-N"), and rewrite the
	#     ?vspltw word index N as "3-N";
	#   big-endian: turn "le?" into "#le#", drop "be?", and otherwise
	#     just strip the "?" in front of a lowercase mnemonic.
3829*4882a593Smuzhiyun	if ($flavour =~ /le$/o) {	# little-endian
3830*4882a593Smuzhiyun	    s/le\?//o		or
3831*4882a593Smuzhiyun	    s/be\?/#be#/o	or
3832*4882a593Smuzhiyun	    s/\?lvsr/lvsl/o	or
3833*4882a593Smuzhiyun	    s/\?lvsl/lvsr/o	or
3834*4882a593Smuzhiyun	    s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
3835*4882a593Smuzhiyun	    s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
3836*4882a593Smuzhiyun	    s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
3837*4882a593Smuzhiyun	} else {			# big-endian
3838*4882a593Smuzhiyun	    s/le\?/#le#/o	or
3839*4882a593Smuzhiyun	    s/be\?//o		or
3840*4882a593Smuzhiyun	    s/\?([a-z]+)/$1/o;
3841*4882a593Smuzhiyun	}
3842*4882a593Smuzhiyun
	# Emit the (possibly rewritten) line.
3843*4882a593Smuzhiyun        print $_,"\n";
3844*4882a593Smuzhiyun}
3845*4882a593Smuzhiyun
# Flush and close the generated output.  close() is where buffered write
# errors surface and, should STDOUT be a pipe to a translator script, where
# a failing child process is reported.  Abort with a diagnostic instead of
# silently leaving truncated or missing assembly behind.
close STDOUT or die "error closing STDOUT: $!";
3847