#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0

# This code is taken from the OpenSSL project but the author (Andy Polyakov)
# has relicensed it under the GPLv2. Therefore this program is free software;
# you can redistribute it and/or modify it under the terms of the GNU General
# Public License version 2 as published by the Free Software Foundation.
#
# The original headers, including the original license headers, are
# included below for completeness.

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see https://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for PowerISA v2.07.
#
# July 2014
#
# Accurate performance measurements are problematic, because it's
# always a virtualized setup with a possibly throttled processor.
# Relative comparison is therefore more informative. This initial
# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
# faster than "4-bit" integer-only compiler-generated 64-bit code.
# "Initial version" means that there is room for further improvement.

use strict;
use warnings;

# Usage: <this script> <flavour> [<output>]
#
# Generates PowerPC assembly for three routines (.gcm_init_p8,
# .gcm_gmult_p8, .gcm_ghash_p8) and pipes it through ppc-xlate.pl, which is
# started with <flavour> and <output> as its own arguments.
#
# Missing arguments are normalised to the empty string so that the command
# line and error text below interpolate exactly as they would for undef,
# without tripping "uninitialized value" warnings.
my $flavour = shift;
my $output  = shift;
$flavour = "" unless defined $flavour;
$output  = "" unless defined $output;

# ABI-dependent sizes and load/store mnemonics.  None of them is referenced
# by the assembly in this file; the only observable effect of this chain is
# rejecting a flavour that names neither a 64- nor a 32-bit target.
my ($SIZE_T, $LRSAVE, $STU, $POP, $PUSH);
if ($flavour =~ /64/) {
	$SIZE_T = 8;
	$LRSAVE = 2*$SIZE_T;
	$STU    = "stdu";
	$POP    = "ld";
	$PUSH   = "std";
} elsif ($flavour =~ /32/) {
	$SIZE_T = 4;
	$LRSAVE = $SIZE_T;
	$STU    = "stwu";
	$POP    = "lwz";
	$PUSH   = "stw";
} else {
	die "nonsense $flavour";
}

# Directory part of the path this script was invoked with, including the
# trailing separator.  When $0 carries no separator the match fails, so use
# an empty prefix explicitly instead of reading $1 from a failed match.
my $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

# Look for the translator next to this script first, then in the perlasm
# directory two levels up.
my $xlate;
foreach my $candidate ("${dir}ppc-xlate.pl",
		       "${dir}../../perlasm/ppc-xlate.pl") {
	if (-f $candidate) {
		$xlate = $candidate;
		last;
	}
}
defined $xlate or die "can't locate ppc-xlate.pl";

# Everything printed from here on is fed to the translator.  Low-precedence
# "or" is required: with "||" the die would attach to the (always true)
# command string and a failed open would go unnoticed.
open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";

my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6));	# argument block

my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
my $vrsave="r12";

# Assembly template.  Perl variables are interpolated to register names;
# lines tagged "le?" / "be?" are endian-specific and are resolved by the
# filter loop after the template.
#
#   .gcm_init_p8   loads H via r4, derives the "twisted" H and stores a
#                  four-entry pre-computed table via r3.
#   .gcm_gmult_p8  loads Xi via $Xip and the table via $Htbl, multiplies,
#                  reduces in two phases and writes Xi back.
#   .gcm_ghash_p8  same multiply/reduce, looping over the input at $inp in
#                  16-byte steps counted down in $len.
my $code=<<___;
.machine	"any"

.text

.globl	.gcm_init_p8
	lis		r0,0xfff0
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$H,0,r4			# load H
	le?xor		r7,r7,r7
	le?addi		r7,r7,0x8		# need a vperm start with 08
	le?lvsr		5,0,r7
	le?vspltisb	6,0x0f
	le?vxor		5,5,6			# set a b-endian mask
	le?vperm	$H,$H,$H,5

	vspltisb	$xC2,-16		# 0xf0
	vspltisb	$t0,1			# one
	vaddubm		$xC2,$xC2,$xC2		# 0xe0
	vxor		$zero,$zero,$zero
	vor		$xC2,$xC2,$t0		# 0xe1
	vsldoi		$xC2,$xC2,$zero,15	# 0xe1...
	vsldoi		$t1,$zero,$t0,1		# ...1
	vaddubm		$xC2,$xC2,$xC2		# 0xc2...
	vspltisb	$t2,7
	vor		$xC2,$xC2,$t1		# 0xc2....01
	vspltb		$t1,$H,0		# most significant byte
	vsl		$H,$H,$t0		# H<<=1
	vsrab		$t1,$t1,$t2		# broadcast carry bit
	vand		$t1,$t1,$xC2
	vxor		$H,$H,$t1		# twisted H

	vsldoi		$H,$H,$H,8		# twist even more ...
	vsldoi		$xC2,$zero,$xC2,8	# 0xc2.0
	vsldoi		$Hl,$zero,$H,8		# ... and split
	vsldoi		$Hh,$H,$zero,8

	stvx_u		$xC2,0,r3		# save pre-computed table
	stvx_u		$Hl,r8,r3
	stvx_u		$H, r9,r3
	stvx_u		$Hh,r10,r3

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,2,0
	.long		0
.size	.gcm_init_p8,.-.gcm_init_p8

.globl	.gcm_gmult_p8
	lis		r0,0xfff8
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$IN,0,$Xip		# load Xi

	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
	le?lvsl		$lemask,r0,r0
	lvx_u		$H, r9,$Htbl
	le?vspltisb	$t0,0x07
	lvx_u		$Hh,r10,$Htbl
	le?vxor		$lemask,$lemask,$t0
	lvx_u		$xC2,0,$Htbl
	le?vperm	$IN,$IN,$IN,$lemask
	vxor		$zero,$zero,$zero

	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd phase
	vpmsumd		$Xl,$Xl,$xC2
	vxor		$t1,$t1,$Xh
	vxor		$Xl,$Xl,$t1

	le?vperm	$Xl,$Xl,$Xl,$lemask
	stvx_u		$Xl,0,$Xip		# write out Xi

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,2,0
	.long		0
.size	.gcm_gmult_p8,.-.gcm_gmult_p8

.globl	.gcm_ghash_p8
	lis		r0,0xfff8
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$Xl,0,$Xip		# load Xi

	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
	le?lvsl		$lemask,r0,r0
	lvx_u		$H, r9,$Htbl
	le?vspltisb	$t0,0x07
	lvx_u		$Hh,r10,$Htbl
	le?vxor		$lemask,$lemask,$t0
	lvx_u		$xC2,0,$Htbl
	le?vperm	$Xl,$Xl,$Xl,$lemask
	vxor		$zero,$zero,$zero

	lvx_u		$IN,0,$inp
	addi		$inp,$inp,16
	subi		$len,$len,16
	le?vperm	$IN,$IN,$IN,$lemask
	vxor		$IN,$IN,$Xl
	b		Loop

.align	5
Loop:
	subic		$len,$len,16
	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
	subfe.		r0,r0,r0		# borrow?-1:0
	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
	and		r0,r0,$len
	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi
	add		$inp,$inp,r0

	vpmsumd		$t2,$Xl,$xC2		# 1st phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2
	lvx_u		$IN,0,$inp
	addi		$inp,$inp,16

	vsldoi		$t1,$Xl,$Xl,8		# 2nd phase
	vpmsumd		$Xl,$Xl,$xC2
	le?vperm	$IN,$IN,$IN,$lemask
	vxor		$t1,$t1,$Xh
	vxor		$IN,$IN,$t1
	vxor		$IN,$IN,$Xl
	beq		Loop			# did $len-=16 borrow?

	vxor		$Xl,$Xl,$t1
	le?vperm	$Xl,$Xl,$Xl,$lemask
	stvx_u		$Xl,0,$Xip		# write out Xi

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,4,0
	.long		0
.size	.gcm_ghash_p8,.-.gcm_ghash_p8

.asciz  "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
___

# Resolve the endian-conditional tags, at most one per line: for a flavour
# ending in "le" the "le?" tag is stripped (line kept) and "be?" becomes
# "#be#"; otherwise "le?" becomes "#le#" and "be?" is stripped.
my $little_endian = ($flavour =~ /le$/) ? 1 : 0;

foreach my $line (split("\n",$code)) {
	if ($little_endian) {
		$line =~ s/le\?//	or
		$line =~ s/be\?/#be#/;
	} else {
		$line =~ s/le\?/#le#/	or
		$line =~ s/be\?//;
	}
	print $line,"\n";
}

# Closing a piped handle flushes it and waits for the child; a false return
# means the flush failed or the translator exited with a non-zero status,
# so do not let that pass as success.
close STDOUT
    or die "error closing pipe to $xlate: $! (child status $?)";