#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0

# This code is taken from the OpenSSL project but the author (Andy Polyakov)
# has relicensed it under the GPLv2. Therefore this program is free software;
# you can redistribute it and/or modify it under the terms of the GNU General
# Public License version 2 as published by the Free Software Foundation.
#
# The original headers, including the original license headers, are
# included below for completeness.

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see https://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for PowerISA v2.07.
#
# July 2014
#
# Accurate performance measurements are problematic, because it's
# always virtualized setup with possibly throttled processor.
# Relative comparison is therefore more informative. This initial
# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
# faster than "4-bit" integer-only compiler-generated 64-bit code.
# "Initial version" means that there is room for further improvement.

# Command line: <flavour> [<output>]
#   flavour - must contain "64" or "32"; a trailing "le" selects the
#             little-endian variant of the generated code.
#   output  - handed on to ppc-xlate.pl unchanged.
$flavour=shift;
$output =shift;

# Word-size dependent settings. The assembly template in this file does
# not reference them; an unrecognised flavour is rejected here.
if ($flavour =~ /64/) {
	$SIZE_T=8;
	$LRSAVE=2*$SIZE_T;
	$STU="stdu";
	$POP="ld";
	$PUSH="std";
} elsif ($flavour =~ /32/) {
	$SIZE_T=4;
	$LRSAVE=$SIZE_T;
	$STU="stwu";
	$POP="lwz";
	$PUSH="stw";
} else { die "nonsense $flavour"; }

# Directory part of the script path (with trailing separator), or the
# empty string when the script was invoked without a directory component.
# Test the match explicitly instead of reading a possibly stale $1.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

# Look for the translator next to this script first, then in the
# perlasm directory two levels up.
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed below is piped through ppc-xlate.pl.
# "or" (not "||") is required here: "||" would bind to the command string,
# which is always true, so a failed open would never reach die.
open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";

my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6));	# argument block

my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
my $vrsave="r12";

# Assembly template. Three routines are emitted; each one saves SPR 256
# in $vrsave (r12), loads a new value into it and restores it before blr.
#
#   .gcm_init_p8   reads H via r4, derives the "twisted" H and stores four
#                  16-byte table entries via r3 (0xc2 constant, Hl, H, Hh).
#   .gcm_gmult_p8  loads Xi via r3 and the table via r4, forms three vpmsumd
#                  products, reduces them in two phases against the 0xc2
#                  constant and writes Xi back.
#   .gcm_ghash_p8  same table and Xi handling, but folds input read via r5
#                  into Xi 16 bytes at a time, with r6 as the length counter.
#
# Lines tagged "le?" / "be?" are endian-specific; the loop after the
# template enables or comments them out according to the flavour.
$code=<<___;
.machine	"any"

.text

.globl	.gcm_init_p8
	lis		r0,0xfff0
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$H,0,r4			# load H
	le?xor		r7,r7,r7
	le?addi		r7,r7,0x8		# need a vperm start with 08
	le?lvsr		5,0,r7
	le?vspltisb	6,0x0f
	le?vxor		5,5,6			# set a b-endian mask
	le?vperm	$H,$H,$H,5

	vspltisb	$xC2,-16		# 0xf0
	vspltisb	$t0,1			# one
	vaddubm		$xC2,$xC2,$xC2		# 0xe0
	vxor		$zero,$zero,$zero
	vor		$xC2,$xC2,$t0		# 0xe1
	vsldoi		$xC2,$xC2,$zero,15	# 0xe1...
	vsldoi		$t1,$zero,$t0,1		# ...1
	vaddubm		$xC2,$xC2,$xC2		# 0xc2...
	vspltisb	$t2,7
	vor		$xC2,$xC2,$t1		# 0xc2....01
	vspltb		$t1,$H,0		# most significant byte
	vsl		$H,$H,$t0		# H<<=1
	vsrab		$t1,$t1,$t2		# broadcast carry bit
	vand		$t1,$t1,$xC2
	vxor		$H,$H,$t1		# twisted H

	vsldoi		$H,$H,$H,8		# twist even more ...
	vsldoi		$xC2,$zero,$xC2,8	# 0xc2.0
	vsldoi		$Hl,$zero,$H,8		# ... and split
	vsldoi		$Hh,$H,$zero,8

	stvx_u		$xC2,0,r3		# save pre-computed table
	stvx_u		$Hl,r8,r3
	stvx_u		$H, r9,r3
	stvx_u		$Hh,r10,r3

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,2,0
	.long		0
.size	.gcm_init_p8,.-.gcm_init_p8

.globl	.gcm_gmult_p8
	lis		r0,0xfff8
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$IN,0,$Xip		# load Xi

	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
	 le?lvsl	$lemask,r0,r0
	lvx_u		$H, r9,$Htbl
	 le?vspltisb	$t0,0x07
	lvx_u		$Hh,r10,$Htbl
	 le?vxor	$lemask,$lemask,$t0
	lvx_u		$xC2,0,$Htbl
	 le?vperm	$IN,$IN,$IN,$lemask
	vxor		$zero,$zero,$zero

	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd phase
	vpmsumd		$Xl,$Xl,$xC2
	vxor		$t1,$t1,$Xh
	vxor		$Xl,$Xl,$t1

	le?vperm	$Xl,$Xl,$Xl,$lemask
	stvx_u		$Xl,0,$Xip		# write out Xi

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,2,0
	.long		0
.size	.gcm_gmult_p8,.-.gcm_gmult_p8

.globl	.gcm_ghash_p8
	lis		r0,0xfff8
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$Xl,0,$Xip		# load Xi

	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
	 le?lvsl	$lemask,r0,r0
	lvx_u		$H, r9,$Htbl
	 le?vspltisb	$t0,0x07
	lvx_u		$Hh,r10,$Htbl
	 le?vxor	$lemask,$lemask,$t0
	lvx_u		$xC2,0,$Htbl
	 le?vperm	$Xl,$Xl,$Xl,$lemask
	vxor		$zero,$zero,$zero

	lvx_u		$IN,0,$inp
	addi		$inp,$inp,16
	subi		$len,$len,16
	 le?vperm	$IN,$IN,$IN,$lemask
	vxor		$IN,$IN,$Xl
	b		Loop

.align	5
Loop:
	 subic		$len,$len,16
	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
	 subfe.		r0,r0,r0		# borrow?-1:0
	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
	 and		r0,r0,$len
	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi
	 add		$inp,$inp,r0

	vpmsumd		$t2,$Xl,$xC2		# 1st phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2
	 lvx_u		$IN,0,$inp
	 addi		$inp,$inp,16

	vsldoi		$t1,$Xl,$Xl,8		# 2nd phase
	vpmsumd		$Xl,$Xl,$xC2
	 le?vperm	$IN,$IN,$IN,$lemask
	vxor		$t1,$t1,$Xh
	vxor		$IN,$IN,$t1
	vxor		$IN,$IN,$Xl
	beq		Loop			# did $len-=16 borrow?

	vxor		$Xl,$Xl,$t1
	le?vperm	$Xl,$Xl,$Xl,$lemask
	stvx_u		$Xl,0,$Xip		# write out Xi

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,4,0
	.long		0
.size	.gcm_ghash_p8,.-.gcm_ghash_p8

.asciz  "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
___

# The endianness of the target depends only on the flavour, so decide once.
my $little_endian = ($flavour =~ /le$/);

# Emit the template line by line. On little-endian the first "le?" tag on a
# line is stripped (enabling it), otherwise the first "be?" tag becomes
# "#be#"; on big-endian the roles are swapped.
foreach (split("\n",$code)) {
	if ($little_endian) {
	    s/le\?//		or
	    s/be\?/#be#/;
	} else {
	    s/le\?/#le#/	or
	    s/be\?//;
	}
	print $_,"\n";
}

# Closing the pipe flushes the output and waits for ppc-xlate.pl; a false
# return means the flush failed or the translator exited non-zero.
close STDOUT or die "error closing pipe to $xlate: $! (exit status $?)";