Lines Matching +full:8 +full:- +full:ch

2 # Implement fast SHA-512 with AVX2 instructions. (x86_64)
22 # - Redistributions of source code must retain the above
26 # - Redistributions in binary form must reproduce the above
42 # This code is described in an Intel White-Paper:
43 # "Fast SHA-512 Implementations on Intel Architecture Processors"
100 XFER_SIZE = 4*8
101 SRND_SIZE = 1*8
102 INP_SIZE = 1*8
103 INPEND_SIZE = 1*8
104 CTX_SIZE = 1*8
105 RSPSAVE_SIZE = 1*8
106 GPRSAVE_SIZE = 5*8
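The *_SIZE constants above describe the stack frame the routine builds for itself. A minimal C sketch of the implied layout, assuming the frame_* offsets accumulate in the order the sizes are declared (the offset definitions are not among the matched lines; the struct and field names are illustrative, and the field purposes are inferred from how the offsets are used later in this listing):

#include <stdint.h>

/* Hypothetical mirror of the stack frame implied by the *_SIZE constants. */
struct sha512_avx2_frame {
	uint64_t xfer[4];    /* XFER_SIZE    = 4*8: the k+w values read by the rounds */
	uint64_t srnd;       /* SRND_SIZE    = 1*8: round/loop counter (inferred)     */
	uint64_t inp;        /* INP_SIZE     = 1*8: input pointer (inferred)          */
	uint64_t inpend;     /* INPEND_SIZE  = 1*8: end-of-input pointer (inferred)   */
	uint64_t ctx;        /* CTX_SIZE     = 1*8: saved context pointer             */
	uint64_t rspsave;    /* RSPSAVE_SIZE = 1*8: original %rsp (inferred)          */
	uint64_t gprsave[5]; /* GPRSAVE_SIZE = 5*8: %rbx, %r12-%r15                   */
};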
121 # Add reg to mem using reg-mem add and store
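The addm helper this comment introduces is a read-modify-write: add a register into a memory operand and store the sum back. A one-line C equivalent (names are illustrative):

#include <stdint.h>

/* "Add reg to mem using reg-mem add and store": *mem = *mem + reg. */
static inline void addm(uint64_t *mem, uint64_t reg)
{
	*mem += reg;
}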
160 # YDST = {YSRC1, YSRC2} >> RVAL*8
163 vpalignr $\RVAL, \YSRC2, \YDST, \YDST # YDST = {YDST, YSRC2} >> RVAL*8
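Per the comment at line 160, MY_VPALIGNR treats {YSRC1, YSRC2} as a 512-bit value (YSRC1 in the upper half), shifts it right by RVAL*8 bits and keeps the low 256 bits in YDST; with RVAL = 8 this selects a four-qword window that straddles the two source registers, which is how W[-7] and W[-15] are extracted below. A scalar C model of that semantics (qword 0 is the least-significant lane; names are illustrative):

#include <stdint.h>

/* Scalar model of MY_VPALIGNR YDST, YSRC1, YSRC2, 8:
 * concatenate {src1:src2} as eight qwords and drop the lowest one. */
static void my_vpalignr_8(uint64_t dst[4],
                          const uint64_t src1[4], const uint64_t src2[4])
{
	uint64_t cat[8];
	int i;

	for (i = 0; i < 4; i++) {
		cat[i]     = src2[i];  /* low 256 bits of {YSRC1, YSRC2}  */
		cat[i + 4] = src1[i];  /* high 256 bits of {YSRC1, YSRC2} */
	}
	for (i = 0; i < 4; i++)
		dst[i] = cat[i + 1];   /* >> 8*8 bits == drop one qword   */
}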
169 # Extract w[t-7]
170 MY_VPALIGNR YTMP0, Y_3, Y_2, 8 # YTMP0 = W[-7]
171 # Calculate w[t-16] + w[t-7]
172 vpaddq Y_0, YTMP0, YTMP0 # YTMP0 = W[-7] + W[-16]
173 # Extract w[t-15]
174 MY_VPALIGNR YTMP1, Y_1, Y_0, 8 # YTMP1 = W[-15]
178 # Calculate w[t-15] ror 1
180 vpsllq $(64-1), YTMP1, YTMP3
181 vpor YTMP2, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1
182 # Calculate w[t-15] shr 7
183 vpsrlq $7, YTMP1, YTMP4 # YTMP4 = W[-15] >> 7
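The rotate steps in this file build each 64-bit rotate from a right shift, a complementary left shift and an OR (the complete triple is visible at file lines 225-227 below), because AVX2 has no packed 64-bit rotate instruction; VPROLQ/VPRORQ only arrive with AVX-512. A scalar sketch of the idiom, reused by the sketches further down:

#include <stdint.h>

/* Rotate right by n, 0 < n < 64, as the vpsrlq/vpsllq/vpor triples do. */
static inline uint64_t ror64(uint64_t x, unsigned n)
{
	return (x >> n) | (x << (64 - n));
}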
188 add frame_XFER(%rsp), h # h = k + w + h # --
190 mov f, y2 # y2 = f # CH
194 xor g, y2 # y2 = f^g # CH
197 and e, y2 # y2 = (f^g)&e # CH
200 add h, d # d = k + w + h + d # --
206 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
211 add y0, y2 # y2 = S1 + CH # --
213 add y1, h # h = k + w + h + S0 # --
215 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
217 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
218 add y3, h # h = t1 + S0 + MAJ # --
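Interleaved with the schedule, the lines at 188-218 trace one SHA-512 round: y2 accumulates CH, y0 and y1 hold the big sigmas S1 and S0, y3 holds MAJ, and the comments spell out t1 = k + w + h + S1 + CH, d += t1, h = t1 + S0 + MAJ. A scalar sketch of the same dataflow follows; it reuses the ror64 helper above, the Sigma0/Sigma1 rotation counts are the standard SHA-512 ones (not among the matched lines), and all names here are illustrative:

/* One round, matching the commented dataflow (reuses ror64 from above). */
static inline uint64_t ch(uint64_t e, uint64_t f, uint64_t g)
{
	return ((f ^ g) & e) ^ g;               /* y2 = CH = ((f^g)&e)^g */
}

static inline uint64_t maj(uint64_t a, uint64_t b, uint64_t c)
{
	return (a & b) | ((a | b) & c);         /* y3 = MAJ (one common formulation) */
}

static inline uint64_t Sigma1(uint64_t e)   /* y0 = S1 */
{
	return ror64(e, 14) ^ ror64(e, 18) ^ ror64(e, 41);
}

static inline uint64_t Sigma0(uint64_t a)   /* y1 = S0 */
{
	return ror64(a, 28) ^ ror64(a, 34) ^ ror64(a, 39);
}

static void sha512_round(uint64_t s[8], uint64_t k_plus_w)  /* k_plus_w = one frame_XFER slot */
{
	uint64_t a = s[0], b = s[1], c = s[2], d = s[3];
	uint64_t e = s[4], f = s[5], g = s[6], h = s[7];

	uint64_t t1 = h + k_plus_w + Sigma1(e) + ch(e, f, g); /* h = k + w + h + S1 + CH */
	uint64_t t2 = Sigma0(a) + maj(a, b, c);

	d += t1;                                              /* d = d + t1        */
	h  = t1 + t2;                                         /* h = t1 + S0 + MAJ */

	/* rotate the working variables; the assembly does this by renaming registers */
	s[0] = h; s[1] = a; s[2] = b; s[3] = c;
	s[4] = d; s[5] = e; s[6] = f; s[7] = g;
}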
224 # Calculate w[t-15] ror 8
225 vpsrlq $8, YTMP1, YTMP2
226 vpsllq $(64-8), YTMP1, YTMP1
227 vpor YTMP2, YTMP1, YTMP1 # YTMP1 = W[-15] ror 8
229 vpxor YTMP4, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 ^ W[-15] >> 7
233 # Add three components, w[t-16], w[t-7] and sigma0
234 vpaddq YTMP1, YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0
236 vperm2f128 $0x0, YTMP0, YTMP0, Y_0 # Y_0 = W[-16] + W[-7] + s0 {BABA}
238 vpand MASK_YMM_LO(%rip), YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 {DC00}
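Together with the ror-1 and shr-7 fragments earlier, the ror-8 triple and the xors above assemble the schedule's small sigma0 term and add it to W[-16] + W[-7]. Scalar form, reusing ror64 from the earlier sketch:

/* s0 as assembled above: (x ror 1) ^ (x ror 8) ^ (x >> 7), applied to W[t-15]. */
static inline uint64_t sigma0(uint64_t x)
{
	return ror64(x, 1) ^ ror64(x, 8) ^ (x >> 7);
}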
243 vperm2f128 $0x11, Y_3, Y_3, YTMP2 # YTMP2 = W[-2] {BABA}
244 vpsrlq $6, YTMP2, YTMP4 # YTMP4 = W[-2] >> 6 {BABA}
250 add 1*8+frame_XFER(%rsp), h # h = k + w + h # --
254 mov f, y2 # y2 = f # CH
257 xor g, y2 # y2 = f^g # CH
263 and e, y2 # y2 = (f^g)&e # CH
264 add h, d # d = k + w + h + d # --
270 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
275 add y0, y2 # y2 = S1 + CH # --
278 add y1, h # h = k + w + h + S0 # --
280 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
281 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
282 add y3, h # h = t1 + S0 + MAJ # --
289 vpsrlq $19, YTMP2, YTMP3 # YTMP3 = W[-2] >> 19 {BABA}
290 vpsllq $(64-19), YTMP2, YTMP1 # YTMP1 = W[-2] << 19 {BABA}
291 vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {BABA}
292 vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
293 vpsrlq $61, YTMP2, YTMP3 # YTMP3 = W[-2] >> 61 {BABA}
294 vpsllq $(64-61), YTMP2, YTMP1 # YTMP1 = W[-2] << 61 {BABA}
295 vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {BABA}
296 vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
297 # (W[-2] ror 61) ^ (W[-2] >> 6) {BABA}
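Lines 289-297 build the small sigma1 term for the first two lanes from a ror 19, a ror 61 and the shr 6 computed at line 244; lines 345-353 below repeat it for the remaining two lanes. Scalar form, again reusing ror64:

/* s1 as assembled above: (x ror 19) ^ (x ror 61) ^ (x >> 6), applied to W[t-2]. */
static inline uint64_t sigma1(uint64_t x)
{
	return ror64(x, 19) ^ ror64(x, 61) ^ (x >> 6);
}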
303 vpsrlq $6, Y_0, YTMP4 # YTMP4 = W[-2] >> 6 {DC--}
307 add 2*8+frame_XFER(%rsp), h # h = k + w + h # --
311 mov f, y2 # y2 = f # CH
312 xor g, y2 # y2 = f^g # CH
316 and e, y2 # y2 = (f^g)&e # CH
319 add h, d # d = k + w + h + d # --
324 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
332 add y0, y2 # y2 = S1 + CH # --
335 add y1, h # h = k + w + h + S0 # --
336 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
337 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
339 add y3, h # h = t1 + S0 + MAJ # --
345 vpsrlq $19, Y_0, YTMP3 # YTMP3 = W[-2] >> 19 {DC--}
346 vpsllq $(64-19), Y_0, YTMP1 # YTMP1 = W[-2] << 19 {DC--}
347 vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {DC--}
348 vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
349 vpsrlq $61, Y_0, YTMP3 # YTMP3 = W[-2] >> 61 {DC--}
350 vpsllq $(64-61), Y_0, YTMP1 # YTMP1 = W[-2] << 61 {DC--}
351 vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {DC--}
352 vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
353 # (W[-2] ror 61) ^ (W[-2] >> 6) {DC--}
355 # Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19]
357 vpaddq YTMP4, YTMP0, YTMP2 # YTMP2 = {W[3], W[2], --, --}
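The vpaddq at line 357 completes two more schedule words; four words are produced per pass (matching the 4*8-byte XFER area), and the recurrence being evaluated is the standard SHA-512 message expansion. A scalar sketch using the sigma0/sigma1 helpers above:

/* w[t] = s1(w[t-2]) + w[t-7] + s0(w[t-15]) + w[t-16], t = 16..79. */
static void sha512_expand(uint64_t w[80])
{
	int t;

	for (t = 16; t < 80; t++)
		w[t] = sigma1(w[t - 2]) + w[t - 7] + sigma0(w[t - 15]) + w[t - 16];
}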
365 add 3*8+frame_XFER(%rsp), h # h = k + w + h # --
369 mov f, y2 # y2 = f # CH
372 xor g, y2 # y2 = f^g # CH
376 and e, y2 # y2 = (f^g)&e # CH
377 add h, d # d = k + w + h + d # --
381 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
384 add y0, y2 # y2 = S1 + CH # --
387 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
396 add y1, h # h = k + w + h + S0 # --
397 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
398 add y3, h # h = t1 + S0 + MAJ # --
409 mov f, y2 # y2 = f # CH
412 xor g, y2 # y2 = f^g # CH
416 and e, y2 # y2 = (f^g)&e # CH
420 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
426 add frame_XFER(%rsp), h # h = k + w + h # --
433 add y0, y2 # y2 = S1 + CH # --
435 add h, d # d = k + w + h + d # --
437 add y1, h # h = k + w + h + S0 # --
439 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
445 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
446 mov f, y2 # y2 = f # CH
449 xor g, y2 # y2 = f^g # CH
453 and e, y2 # y2 = (f^g)&e # CH
454 add y3, old_h # h = t1 + S0 + MAJ # --
458 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
464 add 8*1+frame_XFER(%rsp), h # h = k + w + h # --
471 add y0, y2 # y2 = S1 + CH # --
473 add h, d # d = k + w + h + d # --
475 add y1, h # h = k + w + h + S0 # --
477 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
483 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
484 mov f, y2 # y2 = f # CH
487 xor g, y2 # y2 = f^g # CH
491 and e, y2 # y2 = (f^g)&e # CH
492 add y3, old_h # h = t1 + S0 + MAJ # --
496 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
502 add 8*2+frame_XFER(%rsp), h # h = k + w + h # --
509 add y0, y2 # y2 = S1 + CH # --
511 add h, d # d = k + w + h + d # --
513 add y1, h # h = k + w + h + S0 # --
515 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
521 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
522 mov f, y2 # y2 = f # CH
525 xor g, y2 # y2 = f^g # CH
529 and e, y2 # y2 = (f^g)&e # CH
530 add y3, old_h # h = t1 + S0 + MAJ # --
534 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
540 add 8*3+frame_XFER(%rsp), h # h = k + w + h # --
547 add y0, y2 # y2 = S1 + CH # --
550 add h, d # d = k + w + h + d # --
552 add y1, h # h = k + w + h + S0 # --
554 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
556 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
558 add y3, h # h = t1 + S0 + MAJ # --
576 and $~(0x20 - 1), %rsp
580 mov %rbx, 8*0+frame_GPRSAVE(%rsp)
581 mov %r12, 8*1+frame_GPRSAVE(%rsp)
582 mov %r13, 8*2+frame_GPRSAVE(%rsp)
583 mov %r14, 8*3+frame_GPRSAVE(%rsp)
584 mov %r15, 8*4+frame_GPRSAVE(%rsp)
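The prologue aligns %rsp down to a 32-byte boundary (the and with ~(0x20 - 1)) and saves the callee-saved registers it will clobber (%rbx, %r12-%r15, matching GPRSAVE_SIZE = 5*8); the original %rsp presumably goes into the frame_RSPSAVE slot so it can be restored on exit. The alignment step in C terms (illustrative only):

#include <stdint.h>

/* Align a value down to a 32-byte boundary, as "and $~(0x20 - 1), %rsp" does. */
static inline uintptr_t align_down_32(uintptr_t p)
{
	return p & ~(uintptr_t)(0x20 - 1);
}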
592 mov 8*0(CTX1), a
593 mov 8*1(CTX1), b
594 mov 8*2(CTX1), c
595 mov 8*3(CTX1), d
596 mov 8*4(CTX1), e
597 mov 8*5(CTX1), f
598 mov 8*6(CTX1), g
599 mov 8*7(CTX1), h
659 addm 8*0(CTX2), a
660 addm 8*1(CTX2), b
661 addm 8*2(CTX2), c
662 addm 8*3(CTX2), d
663 addm 8*4(CTX2), e
664 addm 8*5(CTX2), f
665 addm 8*6(CTX2), g
666 addm 8*7(CTX2), h
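Lines 592-599 load the eight 64-bit chaining words from the context into the working variables a..h, and after the block is processed lines 659-666 fold the updated variables back with addm, the usual feed-forward state[i] += work[i]. A scalar sketch, assuming the context stores the state as eight consecutive u64 words (as the 8*0..8*7 offsets suggest):

#include <stdint.h>

/* Feed-forward performed by the addm sequence at 8*0..8*7(CTX2). */
static void sha512_feed_forward(uint64_t state[8], const uint64_t work[8])
{
	int i;

	for (i = 0; i < 8; i++)
		state[i] += work[i];
}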
676 mov 8*0+frame_GPRSAVE(%rsp), %rbx
677 mov 8*1+frame_GPRSAVE(%rsp), %r12
678 mov 8*2+frame_GPRSAVE(%rsp), %r13
679 mov 8*3+frame_GPRSAVE(%rsp), %r14
680 mov 8*4+frame_GPRSAVE(%rsp), %r15
691 # Mergeable 640-byte rodata section. This allows the linker to merge the table
692 # with another, identical 640-byte fragment of some other rodata section
741 # Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
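SHA-512 message words are big-endian, so each loaded qword has to be byte-reversed before use; the mask named here drives a (v)pshufb that swaps the bytes within each 64-bit lane. An intrinsics sketch of the same operation on one XMM register (the mask value shown is the usual qword byte-reversal pattern; the actual constant is not among the matched lines, so treat the exact value as an assumption):

#include <immintrin.h>

/* Reverse the bytes of both qwords in an XMM register with pshufb,
 * as the comment above describes. */
static __m128i bswap_2xqword(__m128i v)
{
	const __m128i mask = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15,
	                                  0, 1, 2, 3, 4, 5, 6, 7);
	return _mm_shuffle_epi8(v, mask);
}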