xref: /OK3568_Linux_fs/kernel/arch/powerpc/crypto/aes-spe-core.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun/* SPDX-License-Identifier: GPL-2.0-or-later */
2*4882a593Smuzhiyun/*
3*4882a593Smuzhiyun * Fast AES implementation for SPE instruction set (PPC)
4*4882a593Smuzhiyun *
5*4882a593Smuzhiyun * This code makes use of the SPE SIMD instruction set as defined in
6*4882a593Smuzhiyun * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
7*4882a593Smuzhiyun * Implementation is based on optimization guide notes from
8*4882a593Smuzhiyun * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
9*4882a593Smuzhiyun *
10*4882a593Smuzhiyun * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
11*4882a593Smuzhiyun */
12*4882a593Smuzhiyun
13*4882a593Smuzhiyun#include <asm/ppc_asm.h>
14*4882a593Smuzhiyun#include "aes-spe-regs.h"
15*4882a593Smuzhiyun
/*
 * EAD(in, bpos): select one byte of 'in' and insert it into bits 20-27 of
 * rT0 (mask 0x00000ff0), i.e. the byte value ends up multiplied by 16.
 * bpos = 0 picks the least significant byte of 'in', bpos = 3 the most
 * significant one. All other bits of rT0 are left unchanged, so rT0 then
 * addresses a 16-byte slot selected by that byte.
 */
16*4882a593Smuzhiyun#define	EAD(in, bpos) \
17*4882a593Smuzhiyun	rlwimi		rT0,in,28-((bpos+3)%4)*8,20,27;
18*4882a593Smuzhiyun
/*
 * DAD(in, bpos): select one byte of 'in' (bpos = 0 is the least significant
 * byte, bpos = 3 the most significant) and insert it unscaled into the low
 * 8 bits (bits 24-31) of rT1. The upper 24 bits of rT1 are kept, so rT1
 * then addresses a single byte selected by that value.
 * NOTE(review): presumably rT1 is expected to hold a 256-byte aligned table
 * base - confirm against the callers.
 */
19*4882a593Smuzhiyun#define DAD(in, bpos) \
20*4882a593Smuzhiyun	rlwimi		rT1,in,24-((bpos+3)%4)*8,24,31;
21*4882a593Smuzhiyun
/*
 * LWH(out, off): load the 32-bit word at off(rT0) with the SPE splat load,
 * which places the word in both halves of the 64-bit register 'out'. It is
 * used here to fill the upper half; a later LWL on the same register is
 * expected to replace only the lower half.
 */
22*4882a593Smuzhiyun#define LWH(out, off) \
23*4882a593Smuzhiyun	evlwwsplat	out,off(rT0);	/* load word high		*/
24*4882a593Smuzhiyun
/*
 * LWL(out, off): plain 32-bit load of the word at off(rT0) into 'out'.
 * NOTE(review): the code relies on lwz leaving the upper 32 bits of the
 * 64-bit SPE register untouched - confirm against the SPE programming model.
 */
25*4882a593Smuzhiyun#define LWL(out, off) \
26*4882a593Smuzhiyun	lwz		out,off(rT0);	/* load word low		*/
27*4882a593Smuzhiyun
/*
 * LBZ(out, tab, off): load the byte at off(tab) into 'out', zero-extended.
 * 'tab' is the register holding the address (rT0 or rT1 in this file).
 */
28*4882a593Smuzhiyun#define LBZ(out, tab, off) \
29*4882a593Smuzhiyun	lbz		out,off(tab);	/* load byte			*/
30*4882a593Smuzhiyun
/*
 * LAH(out, in, bpos, off): EAD followed by LWH - point rT0 at the 16-byte
 * slot selected by byte 'bpos' of 'in', then splat-load the word at offset
 * 'off' of that slot into 'out'.
 */
31*4882a593Smuzhiyun#define LAH(out, in, bpos, off) \
32*4882a593Smuzhiyun	EAD(in, bpos)			/* calc addr + load word high	*/ \
33*4882a593Smuzhiyun	LWH(out, off)
34*4882a593Smuzhiyun
/*
 * LAL(out, in, bpos, off): EAD followed by LWL - point rT0 at the 16-byte
 * slot selected by byte 'bpos' of 'in', then load the word at offset 'off'
 * of that slot into 'out' with a 32-bit load.
 */
35*4882a593Smuzhiyun#define LAL(out, in, bpos, off) \
36*4882a593Smuzhiyun	EAD(in, bpos)			/* calc addr + load word low	*/ \
37*4882a593Smuzhiyun	LWL(out, off)
38*4882a593Smuzhiyun
/*
 * LAE(out, in, bpos): EAD followed by a byte load - point rT0 at the
 * 16-byte slot selected by byte 'bpos' of 'in', then load the single byte
 * at offset 8 of that slot into 'out' (zero-extended).
 */
39*4882a593Smuzhiyun#define LAE(out, in, bpos) \
40*4882a593Smuzhiyun	EAD(in, bpos)			/* calc addr + load enc byte	*/ \
41*4882a593Smuzhiyun	LBZ(out, rT0, 8)
42*4882a593Smuzhiyun
/*
 * LBE(out): byte load from offset 8 of the slot rT0 currently points at.
 * Unlike LAE it does not update rT0; it uses whatever address the most
 * recent EAD left there.
 */
43*4882a593Smuzhiyun#define LBE(out) \
44*4882a593Smuzhiyun	LBZ(out, rT0, 8)		/* load enc byte		*/
45*4882a593Smuzhiyun
/*
 * LAD(out, in, bpos): DAD followed by a byte load - put byte 'bpos' of 'in'
 * into the low 8 bits of rT1, then load the byte rT1 points at into 'out'
 * (zero-extended).
 */
46*4882a593Smuzhiyun#define LAD(out, in, bpos) \
47*4882a593Smuzhiyun	DAD(in, bpos)			/* calc addr + load dec byte	*/ \
48*4882a593Smuzhiyun	LBZ(out, rT1, 0)
49*4882a593Smuzhiyun
/*
 * LBD(out): byte load from the address currently held in rT1. Unlike LAD
 * it does not update rT1; it uses whatever address the most recent DAD
 * left there.
 */
50*4882a593Smuzhiyun#define LBD(out) \
51*4882a593Smuzhiyun	LBZ(out, rT1, 0)
52*4882a593Smuzhiyun
53*4882a593Smuzhiyun/*
54*4882a593Smuzhiyun * ppc_encrypt_block: The central encryption function for a single 16 bytes
55*4882a593Smuzhiyun * block. It does no stack handling or register saving to support fast calls
56*4882a593Smuzhiyun * via bl/blr. It expects that caller has pre-xored input data with first
57*4882a593Smuzhiyun * 4 words of encryption key into rD0-rD3. Pointer/counter registers must
58*4882a593Smuzhiyun * have also been set up before (rT0, rKP, CTR). Output is stored in rD0-rD3
59*4882a593Smuzhiyun * and rW0-rW3 and caller must execute a final xor on the output registers.
60*4882a593Smuzhiyun * All working registers rD0-rD3 & rW0-rW7 are overwritten during processing.
61*4882a593Smuzhiyun *
62*4882a593Smuzhiyun */
/*
 * Supplementary notes, derived from the instruction stream below:
 *
 *  - Structure: three table loads are started ahead of the loop. Each loop
 *    pass then performs two rounds (key doublewords at 16/24(rKP) and at
 *    32/40(rKP)) and advances rKP by 32; bdnz repeats the pass until CTR
 *    reaches zero. After the loop there is one more word-table round and a
 *    closing round built from byte loads, i.e. 2 * CTR + 2 rounds in total.
 *    NOTE(review): that would make CTR = 4/5/6 for the 10/12/14 rounds of
 *    AES-128/192/256 - confirm against the callers.
 *  - Side effects beyond rD0-rD3/rW0-rW7: EAD rewrites bits 20-27 of rT0,
 *    rKP is left advanced by 32 bytes per loop pass, and CTR ends at zero.
 *  - On return rW0-rW3 hold the four words packed from the byte loads and
 *    rD0-rD3 hold the four words read from 32..44 of the advanced rKP.
 */
63*4882a593Smuzhiyun_GLOBAL(ppc_encrypt_block)
	/*
	 * Start the first three lookups ahead of the loop; inside the loop the
	 * same three loads (upper halves of rW4, rW6, rW3) are issued at the
	 * tail of each round on behalf of the round that follows.
	 */
64*4882a593Smuzhiyun	LAH(rW4, rD1, 2, 4)
65*4882a593Smuzhiyun	LAH(rW6, rD0, 3, 0)
66*4882a593Smuzhiyun	LAH(rW3, rD0, 1, 8)
67*4882a593Smuzhiyunppc_encrypt_block_loop:
	/* loop pass, first round: remaining word-table lookups through rT0 */
68*4882a593Smuzhiyun	LAH(rW0, rD3, 0, 12)
69*4882a593Smuzhiyun	LAL(rW0, rD0, 0, 12)
70*4882a593Smuzhiyun	LAH(rW1, rD1, 0, 12)
71*4882a593Smuzhiyun	LAH(rW2, rD2, 1, 8)
72*4882a593Smuzhiyun	LAL(rW2, rD3, 1, 8)
73*4882a593Smuzhiyun	LAL(rW3, rD1, 1, 8)
74*4882a593Smuzhiyun	LAL(rW4, rD2, 2, 4)
75*4882a593Smuzhiyun	LAL(rW6, rD1, 3, 0)
76*4882a593Smuzhiyun	LAH(rW5, rD3, 2, 4)
77*4882a593Smuzhiyun	LAL(rW5, rD0, 2, 4)
78*4882a593Smuzhiyun	LAH(rW7, rD2, 3, 0)
	/*
	 * Combine: xor the looked-up words with the key doublewords loaded from
	 * 16(rKP) and 24(rKP); evmergehi then copies the upper word of rD1/rD3
	 * into the lower word of rD0/rD2. Interleaved with that, the last two
	 * lookups of this round (lower halves of rW7 and rW1) are completed
	 * before rD3/rD2 are overwritten, and the first three lookups of the
	 * next round are started from the freshly computed rD1/rD0.
	 */
79*4882a593Smuzhiyun	evldw		rD1,16(rKP)
80*4882a593Smuzhiyun	EAD(rD3, 3)
81*4882a593Smuzhiyun	evxor		rW2,rW2,rW4
82*4882a593Smuzhiyun	LWL(rW7, 0)
83*4882a593Smuzhiyun	evxor		rW2,rW2,rW6
84*4882a593Smuzhiyun	EAD(rD2, 0)
85*4882a593Smuzhiyun	evxor		rD1,rD1,rW2
86*4882a593Smuzhiyun	LWL(rW1, 12)
87*4882a593Smuzhiyun	evxor		rD1,rD1,rW0
88*4882a593Smuzhiyun	evldw		rD3,24(rKP)
89*4882a593Smuzhiyun	evmergehi	rD0,rD0,rD1
90*4882a593Smuzhiyun	EAD(rD1, 2)
91*4882a593Smuzhiyun	evxor		rW3,rW3,rW5
92*4882a593Smuzhiyun	LWH(rW4, 4)
93*4882a593Smuzhiyun	evxor		rW3,rW3,rW7
94*4882a593Smuzhiyun	EAD(rD0, 3)
95*4882a593Smuzhiyun	evxor		rD3,rD3,rW3
96*4882a593Smuzhiyun	LWH(rW6, 0)
97*4882a593Smuzhiyun	evxor		rD3,rD3,rW1
98*4882a593Smuzhiyun	EAD(rD0, 1)
99*4882a593Smuzhiyun	evmergehi	rD2,rD2,rD3
100*4882a593Smuzhiyun	LWH(rW3, 8)
	/* loop pass, second round: same sequence, key doublewords 32/40(rKP) */
101*4882a593Smuzhiyun	LAH(rW0, rD3, 0, 12)
102*4882a593Smuzhiyun	LAL(rW0, rD0, 0, 12)
103*4882a593Smuzhiyun	LAH(rW1, rD1, 0, 12)
104*4882a593Smuzhiyun	LAH(rW2, rD2, 1, 8)
105*4882a593Smuzhiyun	LAL(rW2, rD3, 1, 8)
106*4882a593Smuzhiyun	LAL(rW3, rD1, 1, 8)
107*4882a593Smuzhiyun	LAL(rW4, rD2, 2, 4)
108*4882a593Smuzhiyun	LAL(rW6, rD1, 3, 0)
109*4882a593Smuzhiyun	LAH(rW5, rD3, 2, 4)
110*4882a593Smuzhiyun	LAL(rW5, rD0, 2, 4)
111*4882a593Smuzhiyun	LAH(rW7, rD2, 3, 0)
112*4882a593Smuzhiyun	evldw		rD1,32(rKP)
113*4882a593Smuzhiyun	EAD(rD3, 3)
114*4882a593Smuzhiyun	evxor		rW2,rW2,rW4
115*4882a593Smuzhiyun	LWL(rW7, 0)
116*4882a593Smuzhiyun	evxor		rW2,rW2,rW6
117*4882a593Smuzhiyun	EAD(rD2, 0)
118*4882a593Smuzhiyun	evxor		rD1,rD1,rW2
119*4882a593Smuzhiyun	LWL(rW1, 12)
120*4882a593Smuzhiyun	evxor		rD1,rD1,rW0
121*4882a593Smuzhiyun	evldw		rD3,40(rKP)
122*4882a593Smuzhiyun	evmergehi	rD0,rD0,rD1
123*4882a593Smuzhiyun	EAD(rD1, 2)
124*4882a593Smuzhiyun	evxor		rW3,rW3,rW5
125*4882a593Smuzhiyun	LWH(rW4, 4)
126*4882a593Smuzhiyun	evxor		rW3,rW3,rW7
127*4882a593Smuzhiyun	EAD(rD0, 3)
128*4882a593Smuzhiyun	evxor		rD3,rD3,rW3
129*4882a593Smuzhiyun	LWH(rW6, 0)
130*4882a593Smuzhiyun	evxor		rD3,rD3,rW1
131*4882a593Smuzhiyun	EAD(rD0, 1)
132*4882a593Smuzhiyun	evmergehi	rD2,rD2,rD3
133*4882a593Smuzhiyun	LWH(rW3, 8)
	/* step rKP past the two round keys just used; CTR counts loop passes */
134*4882a593Smuzhiyun	addi		rKP,rKP,32
135*4882a593Smuzhiyun	bdnz		ppc_encrypt_block_loop
	/* after the loop: one more word-table round, key doublewords 16/24(rKP) */
136*4882a593Smuzhiyun	LAH(rW0, rD3, 0, 12)
137*4882a593Smuzhiyun	LAL(rW0, rD0, 0, 12)
138*4882a593Smuzhiyun	LAH(rW1, rD1, 0, 12)
139*4882a593Smuzhiyun	LAH(rW2, rD2, 1, 8)
140*4882a593Smuzhiyun	LAL(rW2, rD3, 1, 8)
141*4882a593Smuzhiyun	LAL(rW3, rD1, 1, 8)
142*4882a593Smuzhiyun	LAL(rW4, rD2, 2, 4)
143*4882a593Smuzhiyun	LAH(rW5, rD3, 2, 4)
144*4882a593Smuzhiyun	LAL(rW6, rD1, 3, 0)
145*4882a593Smuzhiyun	LAL(rW5, rD0, 2, 4)
146*4882a593Smuzhiyun	LAH(rW7, rD2, 3, 0)
	/*
	 * Combine as in the loop, but the look-ahead loads interleaved here
	 * fetch single bytes (LBE, offset 8 of the table slot) for the closing
	 * round instead of words.
	 */
147*4882a593Smuzhiyun	evldw		rD1,16(rKP)
148*4882a593Smuzhiyun	EAD(rD3, 3)
149*4882a593Smuzhiyun	evxor		rW2,rW2,rW4
150*4882a593Smuzhiyun	LWL(rW7, 0)
151*4882a593Smuzhiyun	evxor		rW2,rW2,rW6
152*4882a593Smuzhiyun	EAD(rD2, 0)
153*4882a593Smuzhiyun	evxor		rD1,rD1,rW2
154*4882a593Smuzhiyun	LWL(rW1, 12)
155*4882a593Smuzhiyun	evxor		rD1,rD1,rW0
156*4882a593Smuzhiyun	evldw		rD3,24(rKP)
157*4882a593Smuzhiyun	evmergehi	rD0,rD0,rD1
158*4882a593Smuzhiyun	EAD(rD1, 0)
159*4882a593Smuzhiyun	evxor		rW3,rW3,rW5
160*4882a593Smuzhiyun	LBE(rW2)
161*4882a593Smuzhiyun	evxor		rW3,rW3,rW7
162*4882a593Smuzhiyun	EAD(rD0, 1)
163*4882a593Smuzhiyun	evxor		rD3,rD3,rW3
164*4882a593Smuzhiyun	LBE(rW6)
165*4882a593Smuzhiyun	evxor		rD3,rD3,rW1
166*4882a593Smuzhiyun	EAD(rD0, 0)
167*4882a593Smuzhiyun	evmergehi	rD2,rD2,rD3
168*4882a593Smuzhiyun	LBE(rW1)
	/*
	 * Closing round: fetch one byte per lookup (LAE) and pack four of them
	 * into each of rW0-rW3 with rlwimi (bits 16-23, then 8-15, then 0-7 on
	 * top of the byte already in bits 24-31). rD0-rD3 are reloaded from
	 * 32..44(rKP), each one only after its last use as a lookup source.
	 */
169*4882a593Smuzhiyun	LAE(rW0, rD3, 0)
170*4882a593Smuzhiyun	LAE(rW1, rD0, 0)
171*4882a593Smuzhiyun	LAE(rW4, rD2, 1)
172*4882a593Smuzhiyun	LAE(rW5, rD3, 1)
173*4882a593Smuzhiyun	LAE(rW3, rD2, 0)
174*4882a593Smuzhiyun	LAE(rW7, rD1, 1)
175*4882a593Smuzhiyun	rlwimi		rW0,rW4,8,16,23
176*4882a593Smuzhiyun	rlwimi		rW1,rW5,8,16,23
177*4882a593Smuzhiyun	LAE(rW4, rD1, 2)
178*4882a593Smuzhiyun	LAE(rW5, rD2, 2)
179*4882a593Smuzhiyun	rlwimi		rW2,rW6,8,16,23
180*4882a593Smuzhiyun	rlwimi		rW3,rW7,8,16,23
181*4882a593Smuzhiyun	LAE(rW6, rD3, 2)
182*4882a593Smuzhiyun	LAE(rW7, rD0, 2)
183*4882a593Smuzhiyun	rlwimi		rW0,rW4,16,8,15
184*4882a593Smuzhiyun	rlwimi		rW1,rW5,16,8,15
185*4882a593Smuzhiyun	LAE(rW4, rD0, 3)
186*4882a593Smuzhiyun	LAE(rW5, rD1, 3)
187*4882a593Smuzhiyun	rlwimi		rW2,rW6,16,8,15
188*4882a593Smuzhiyun	lwz		rD0,32(rKP)
189*4882a593Smuzhiyun	rlwimi		rW3,rW7,16,8,15
190*4882a593Smuzhiyun	lwz		rD1,36(rKP)
191*4882a593Smuzhiyun	LAE(rW6, rD2, 3)
192*4882a593Smuzhiyun	LAE(rW7, rD3, 3)
193*4882a593Smuzhiyun	rlwimi		rW0,rW4,24,0,7
194*4882a593Smuzhiyun	lwz		rD2,40(rKP)
195*4882a593Smuzhiyun	rlwimi		rW1,rW5,24,0,7
196*4882a593Smuzhiyun	lwz		rD3,44(rKP)
197*4882a593Smuzhiyun	rlwimi		rW2,rW6,24,0,7
198*4882a593Smuzhiyun	rlwimi		rW3,rW7,24,0,7
199*4882a593Smuzhiyun	blr
200*4882a593Smuzhiyun
201*4882a593Smuzhiyun/*
202*4882a593Smuzhiyun * ppc_decrypt_block: The central decryption function for a single 16 bytes
203*4882a593Smuzhiyun * block. It does no stack handling or register saving to support fast calls
204*4882a593Smuzhiyun * via bl/blr. It expects that caller has pre-xored input data with first
205*4882a593Smuzhiyun * 4 words of encryption key into rD0-rD3. Pointer/counter registers must
206*4882a593Smuzhiyun * have also been set up before (rT0, rKP, CTR). Output is stored in rD0-rD3
207*4882a593Smuzhiyun * and rW0-rW3 and caller must execute a final xor on the output registers.
208*4882a593Smuzhiyun * All working registers rD0-rD3 & rW0-rW7 are overwritten during processing.
209*4882a593Smuzhiyun *
210*4882a593Smuzhiyun */
/*
 * Supplementary notes, derived from the instruction stream below:
 *
 *  - rT1 must be set up by the caller as well: the closing round loads its
 *    bytes through rT1 (DAD/LBD/LAD replace only the low 8 bits of rT1),
 *    although the register list in the header above names rT0 only.
 *  - NOTE(review): the header above speaks of the "encryption key"; the
 *    wording looks copied from ppc_encrypt_block and presumably the key
 *    schedule used for decryption is meant - confirm against the callers.
 *  - Structure: three table loads are started ahead of the loop. Each loop
 *    pass then performs two rounds (key doublewords at 16/24(rKP) and at
 *    32/40(rKP)) and advances rKP by 32; bdnz repeats the pass until CTR
 *    reaches zero. After the loop there is one more word-table round and a
 *    closing round built from byte loads, i.e. 2 * CTR + 2 rounds in total.
 *  - Side effects beyond rD0-rD3/rW0-rW7: EAD rewrites bits 20-27 of rT0,
 *    DAD rewrites the low 8 bits of rT1, rKP is left advanced by 32 bytes
 *    per loop pass, and CTR ends at zero.
 *  - On return rW0-rW3 hold the four words packed from the byte loads and
 *    rD0-rD3 hold the four words read from 32..44 of the advanced rKP.
 */
211*4882a593Smuzhiyun_GLOBAL(ppc_decrypt_block)
	/*
	 * Start the first three lookups ahead of the loop; inside the loop the
	 * same three loads (upper halves of rW0, rW6, rW3) are issued at the
	 * tail of each round on behalf of the round that follows.
	 */
212*4882a593Smuzhiyun	LAH(rW0, rD1, 0, 12)
213*4882a593Smuzhiyun	LAH(rW6, rD0, 3, 0)
214*4882a593Smuzhiyun	LAH(rW3, rD0, 1, 8)
215*4882a593Smuzhiyunppc_decrypt_block_loop:
	/* loop pass, first round: remaining word-table lookups through rT0 */
216*4882a593Smuzhiyun	LAH(rW1, rD3, 0, 12)
217*4882a593Smuzhiyun	LAL(rW0, rD2, 0, 12)
218*4882a593Smuzhiyun	LAH(rW2, rD2, 1, 8)
219*4882a593Smuzhiyun	LAL(rW2, rD3, 1, 8)
220*4882a593Smuzhiyun	LAH(rW4, rD3, 2, 4)
221*4882a593Smuzhiyun	LAL(rW4, rD0, 2, 4)
222*4882a593Smuzhiyun	LAL(rW6, rD1, 3, 0)
223*4882a593Smuzhiyun	LAH(rW5, rD1, 2, 4)
224*4882a593Smuzhiyun	LAH(rW7, rD2, 3, 0)
225*4882a593Smuzhiyun	LAL(rW7, rD3, 3, 0)
226*4882a593Smuzhiyun	LAL(rW3, rD1, 1, 8)
	/*
	 * Combine: xor the looked-up words with the key doublewords loaded from
	 * 16(rKP) and 24(rKP); evmergehi then copies the upper word of rD1/rD3
	 * into the lower word of rD0/rD2. Interleaved with that, the last two
	 * lookups of this round (lower halves of rW1 and rW5) are completed
	 * before rD0/rD2 are overwritten, and the first three lookups of the
	 * next round are started from the freshly computed rD1/rD0.
	 */
227*4882a593Smuzhiyun	evldw		rD1,16(rKP)
228*4882a593Smuzhiyun	EAD(rD0, 0)
229*4882a593Smuzhiyun	evxor		rW4,rW4,rW6
230*4882a593Smuzhiyun	LWL(rW1, 12)
231*4882a593Smuzhiyun	evxor		rW0,rW0,rW4
232*4882a593Smuzhiyun	EAD(rD2, 2)
233*4882a593Smuzhiyun	evxor		rW0,rW0,rW2
234*4882a593Smuzhiyun	LWL(rW5, 4)
235*4882a593Smuzhiyun	evxor		rD1,rD1,rW0
236*4882a593Smuzhiyun	evldw		rD3,24(rKP)
237*4882a593Smuzhiyun	evmergehi	rD0,rD0,rD1
238*4882a593Smuzhiyun	EAD(rD1, 0)
239*4882a593Smuzhiyun	evxor		rW3,rW3,rW7
240*4882a593Smuzhiyun	LWH(rW0, 12)
241*4882a593Smuzhiyun	evxor		rW3,rW3,rW1
242*4882a593Smuzhiyun	EAD(rD0, 3)
243*4882a593Smuzhiyun	evxor		rD3,rD3,rW3
244*4882a593Smuzhiyun	LWH(rW6, 0)
245*4882a593Smuzhiyun	evxor		rD3,rD3,rW5
246*4882a593Smuzhiyun	EAD(rD0, 1)
247*4882a593Smuzhiyun	evmergehi	rD2,rD2,rD3
248*4882a593Smuzhiyun	LWH(rW3, 8)
	/* loop pass, second round: same sequence, key doublewords 32/40(rKP) */
249*4882a593Smuzhiyun	LAH(rW1, rD3, 0, 12)
250*4882a593Smuzhiyun	LAL(rW0, rD2, 0, 12)
251*4882a593Smuzhiyun	LAH(rW2, rD2, 1, 8)
252*4882a593Smuzhiyun	LAL(rW2, rD3, 1, 8)
253*4882a593Smuzhiyun	LAH(rW4, rD3, 2, 4)
254*4882a593Smuzhiyun	LAL(rW4, rD0, 2, 4)
255*4882a593Smuzhiyun	LAL(rW6, rD1, 3, 0)
256*4882a593Smuzhiyun	LAH(rW5, rD1, 2, 4)
257*4882a593Smuzhiyun	LAH(rW7, rD2, 3, 0)
258*4882a593Smuzhiyun	LAL(rW7, rD3, 3, 0)
259*4882a593Smuzhiyun	LAL(rW3, rD1, 1, 8)
260*4882a593Smuzhiyun	evldw		 rD1,32(rKP)
261*4882a593Smuzhiyun	EAD(rD0, 0)
262*4882a593Smuzhiyun	evxor		rW4,rW4,rW6
263*4882a593Smuzhiyun	LWL(rW1, 12)
264*4882a593Smuzhiyun	evxor		rW0,rW0,rW4
265*4882a593Smuzhiyun	EAD(rD2, 2)
266*4882a593Smuzhiyun	evxor		rW0,rW0,rW2
267*4882a593Smuzhiyun	LWL(rW5, 4)
268*4882a593Smuzhiyun	evxor		rD1,rD1,rW0
269*4882a593Smuzhiyun	evldw		rD3,40(rKP)
270*4882a593Smuzhiyun	evmergehi	rD0,rD0,rD1
271*4882a593Smuzhiyun	EAD(rD1, 0)
272*4882a593Smuzhiyun	evxor		rW3,rW3,rW7
273*4882a593Smuzhiyun	LWH(rW0, 12)
274*4882a593Smuzhiyun	evxor		rW3,rW3,rW1
275*4882a593Smuzhiyun	EAD(rD0, 3)
276*4882a593Smuzhiyun	evxor		rD3,rD3,rW3
277*4882a593Smuzhiyun	LWH(rW6, 0)
278*4882a593Smuzhiyun	evxor		rD3,rD3,rW5
279*4882a593Smuzhiyun	EAD(rD0, 1)
280*4882a593Smuzhiyun	evmergehi	rD2,rD2,rD3
281*4882a593Smuzhiyun	LWH(rW3, 8)
	/* step rKP past the two round keys just used; CTR counts loop passes */
282*4882a593Smuzhiyun	addi		rKP,rKP,32
283*4882a593Smuzhiyun	bdnz		ppc_decrypt_block_loop
	/* after the loop: one more word-table round, key doublewords 16/24(rKP) */
284*4882a593Smuzhiyun	LAH(rW1, rD3, 0, 12)
285*4882a593Smuzhiyun	LAL(rW0, rD2, 0, 12)
286*4882a593Smuzhiyun	LAH(rW2, rD2, 1, 8)
287*4882a593Smuzhiyun	LAL(rW2, rD3, 1, 8)
288*4882a593Smuzhiyun	LAH(rW4, rD3, 2, 4)
289*4882a593Smuzhiyun	LAL(rW4, rD0, 2, 4)
290*4882a593Smuzhiyun	LAL(rW6, rD1, 3, 0)
291*4882a593Smuzhiyun	LAH(rW5, rD1, 2, 4)
292*4882a593Smuzhiyun	LAH(rW7, rD2, 3, 0)
293*4882a593Smuzhiyun	LAL(rW7, rD3, 3, 0)
294*4882a593Smuzhiyun	LAL(rW3, rD1, 1, 8)
	/*
	 * Combine as in the loop, but the look-ahead loads interleaved here
	 * fetch single bytes through rT1 (DAD + LBD) for the closing round
	 * instead of words through rT0.
	 */
295*4882a593Smuzhiyun	evldw		 rD1,16(rKP)
296*4882a593Smuzhiyun	EAD(rD0, 0)
297*4882a593Smuzhiyun	evxor		rW4,rW4,rW6
298*4882a593Smuzhiyun	LWL(rW1, 12)
299*4882a593Smuzhiyun	evxor		rW0,rW0,rW4
300*4882a593Smuzhiyun	EAD(rD2, 2)
301*4882a593Smuzhiyun	evxor		rW0,rW0,rW2
302*4882a593Smuzhiyun	LWL(rW5, 4)
303*4882a593Smuzhiyun	evxor		rD1,rD1,rW0
304*4882a593Smuzhiyun	evldw		rD3,24(rKP)
305*4882a593Smuzhiyun	evmergehi	rD0,rD0,rD1
306*4882a593Smuzhiyun	DAD(rD1, 0)
307*4882a593Smuzhiyun	evxor		rW3,rW3,rW7
308*4882a593Smuzhiyun	LBD(rW0)
309*4882a593Smuzhiyun	evxor		rW3,rW3,rW1
310*4882a593Smuzhiyun	DAD(rD0, 1)
311*4882a593Smuzhiyun	evxor		rD3,rD3,rW3
312*4882a593Smuzhiyun	LBD(rW6)
313*4882a593Smuzhiyun	evxor		rD3,rD3,rW5
314*4882a593Smuzhiyun	DAD(rD0, 0)
315*4882a593Smuzhiyun	evmergehi	rD2,rD2,rD3
316*4882a593Smuzhiyun	LBD(rW3)
	/*
	 * Closing round: fetch one byte per lookup (LAD) and pack four of them
	 * into each of rW0-rW3 with rlwimi (bits 16-23, then 8-15, then 0-7 on
	 * top of the byte already in bits 24-31). rD0-rD3 are reloaded from
	 * 32..44(rKP), each one only after its last use as a lookup source.
	 */
317*4882a593Smuzhiyun	LAD(rW2, rD3, 0)
318*4882a593Smuzhiyun	LAD(rW1, rD2, 0)
319*4882a593Smuzhiyun	LAD(rW4, rD2, 1)
320*4882a593Smuzhiyun	LAD(rW5, rD3, 1)
321*4882a593Smuzhiyun	LAD(rW7, rD1, 1)
322*4882a593Smuzhiyun	rlwimi		rW0,rW4,8,16,23
323*4882a593Smuzhiyun	rlwimi		rW1,rW5,8,16,23
324*4882a593Smuzhiyun	LAD(rW4, rD3, 2)
325*4882a593Smuzhiyun	LAD(rW5, rD0, 2)
326*4882a593Smuzhiyun	rlwimi		rW2,rW6,8,16,23
327*4882a593Smuzhiyun	rlwimi		rW3,rW7,8,16,23
328*4882a593Smuzhiyun	LAD(rW6, rD1, 2)
329*4882a593Smuzhiyun	LAD(rW7, rD2, 2)
330*4882a593Smuzhiyun	rlwimi		rW0,rW4,16,8,15
331*4882a593Smuzhiyun	rlwimi		rW1,rW5,16,8,15
332*4882a593Smuzhiyun	LAD(rW4, rD0, 3)
333*4882a593Smuzhiyun	LAD(rW5, rD1, 3)
334*4882a593Smuzhiyun	rlwimi		rW2,rW6,16,8,15
335*4882a593Smuzhiyun	lwz		rD0,32(rKP)
336*4882a593Smuzhiyun	rlwimi		rW3,rW7,16,8,15
337*4882a593Smuzhiyun	lwz		rD1,36(rKP)
338*4882a593Smuzhiyun	LAD(rW6, rD2, 3)
339*4882a593Smuzhiyun	LAD(rW7, rD3, 3)
340*4882a593Smuzhiyun	rlwimi		rW0,rW4,24,0,7
341*4882a593Smuzhiyun	lwz		rD2,40(rKP)
342*4882a593Smuzhiyun	rlwimi		rW1,rW5,24,0,7
343*4882a593Smuzhiyun	lwz		rD3,44(rKP)
344*4882a593Smuzhiyun	rlwimi		rW2,rW6,24,0,7
345*4882a593Smuzhiyun	rlwimi		rW3,rW7,24,0,7
346*4882a593Smuzhiyun	blr
347