Lines Matching +full:4 +full:- +full:ch
2 # Implement fast SHA-256 with AVX2 instructions. (x86_64)
21 # - Redistributions of source code must retain the above
25 # - Redistributions in binary form must reproduce the above
41 # This code is described in an Intel White-Paper:
42 # "Fast SHA-256 Implementations on Intel Architecture Processors"
48 # This code schedules 2 blocks at a time, with 4 lanes per block
59 # Add reg to mem using reg-mem add and store
86 SHUF_00BA = %ymm10 # shuffle xBxA -> 00BA
87 SHUF_DC00 = %ymm12 # shuffle xDxC -> DC00
115 _XFER_SIZE = 2*64*4 # 2 blocks, 64 rounds, 4 bytes/round
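
The two items described above can be summarized in C. This is an illustrative sketch only (names are not taken from the file): addm is the "reg-mem add and store" pattern applied to one 32-bit word, and _XFER_SIZE sizes the stack area that stages one 4-byte (w + k) value per round for each of the two blocks.

#include <stdint.h>

/* Illustrative: add a working variable into a 32-bit word in memory
 * and store the sum back, i.e. the "reg-mem add and store" helper.  */
static inline void addm_sketch(uint32_t *mem, uint32_t reg)
{
        *mem += reg;
}

/* Two 64-byte blocks are processed together, one 4-byte (w + k)
 * value staged per round and per block.                            */
enum { XFER_SIZE = 2 * 64 * 4 };        /* 512 bytes, cf. _XFER_SIZE */
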
162 addl \disp(%rsp, SRND), h # h = k + w + h # --
164 vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
165 mov f, y2 # y2 = f # CH
169 xor g, y2 # y2 = f^g # CH
170 vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16] # y1 = (e >> 6) # S1
173 and e, y2 # y2 = (f^g)&e # CH
176 add h, d # d = k + w + h + d # --
179 vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15]
183 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
189 add y0, y2 # y2 = S1 + CH # --
190 vpslld $(32-7), XTMP1, XTMP3
192 add y1, h # h = k + w + h + S0 # --
194 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
195 vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7
198 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
199 add y3, h # h = t1 + S0 + MAJ # --
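
The scalar instructions interleaved above are one round of the standard SHA-256 compression function; the comments track y2 = CH, y0 = S1, y1 = S0 and y3 = MAJ, where S0 and S1 are the "big" sigma functions of a and e. As a plain C reference for those formulas (a textbook sketch, not code from this file):

#include <stdint.h>

static inline uint32_t ror32(uint32_t x, unsigned int r)
{
        return (x >> r) | (x << (32 - r));
}

/* One round, using the same names as the comments above. */
static void sha256_round_sketch(uint32_t s[8], uint32_t k_plus_w)
{
        uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
        uint32_t e = s[4], f = s[5], g = s[6], h = s[7];

        uint32_t S1  = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
        uint32_t CH  = ((f ^ g) & e) ^ g;               /* Ch(e, f, g)    */
        uint32_t S0  = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
        uint32_t MAJ = (a & b) ^ (a & c) ^ (b & c);
        uint32_t t1  = h + S1 + CH + k_plus_w;

        s[7] = g; s[6] = f; s[5] = e;
        s[4] = d + t1;                                  /* d = d + t1     */
        s[3] = c; s[2] = b; s[1] = a;
        s[0] = t1 + S0 + MAJ;                           /* h = t1+S0+MAJ  */
}
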
209 offset = \disp + 1*4
210 addl offset(%rsp, SRND), h # h = k + w + h # --
214 vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
215 mov f, y2 # y2 = f # CH
218 xor g, y2 # y2 = f^g # CH
224 and e, y2 # y2 = (f^g)&e # CH
225 add h, d # d = k + w + h + d # --
227 vpslld $(32-18), XTMP1, XTMP1
233 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
235 vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
239 add y0, y2 # y2 = S1 + CH # --
242 vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
244 add y1, h # h = k + w + h + S0 # --
246 vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
247 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
248 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
249 add y3, h # h = t1 + S0 + MAJ # --
251 vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
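
The vector halves of these two groups compute σ0 of W[-15] (the ror 7, ror 18 and >> 3 terms) and fold it into W[-16] + W[-7]; the {BBAA} shuffle and the >> 10 step then start σ1 of W[-2]. The message-schedule recurrence they implement, as a plain C sketch (note these are the "small" sigma functions, not the S0/S1 of the round itself):

#include <stdint.h>

static inline uint32_t ror32(uint32_t x, unsigned int r)
{
        return (x >> r) | (x << (32 - r));
}

static inline uint32_t sigma0(uint32_t w)       /* applied to W[t-15] */
{
        return ror32(w, 7) ^ ror32(w, 18) ^ (w >> 3);
}

static inline uint32_t sigma1(uint32_t w)       /* applied to W[t-2]  */
{
        return ror32(w, 17) ^ ror32(w, 19) ^ (w >> 10);
}

/* W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16];
 * the AVX2 code evaluates four of these at once, for two blocks. */
static uint32_t next_w(const uint32_t W[], int t)
{
        return sigma1(W[t - 2]) + W[t - 7] + sigma0(W[t - 15]) + W[t - 16];
}
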
260 offset = \disp + 2*4
261 addl offset(%rsp, SRND), h # h = k + w + h # --
263 vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
266 mov f, y2 # y2 = f # CH
267 xor g, y2 # y2 = f^g # CH
271 vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
272 and e, y2 # y2 = (f^g)&e # CH
276 add h, d # d = k + w + h + d # --
282 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
292 add y0, y2 # y2 = S1 + CH # --
293 vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
296 add y1, h # h = k + w + h + S0 # --
297 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
298 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
300 add y3, h # h = t1 + S0 + MAJ # --
310 offset = \disp + 3*4
311 addl offset(%rsp, SRND), h # h = k + w + h # --
315 vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC}
316 mov f, y2 # y2 = f # CH
319 xor g, y2 # y2 = f^g # CH
322 vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
324 and e, y2 # y2 = (f^g)&e # CH
325 add h, d # d = k + w + h + d # --
328 vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
330 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
334 add y0, y2 # y2 = S1 + CH # --
338 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
349 add y1, h # h = k + w + h + S0 # --
350 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
351 add y3, h # h = t1 + S0 + MAJ # --
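
The vpsrlq $17 / $19 pairs above work because the {BBAA} and {DDCC} shuffles duplicated each W[-2] value into both halves of a 64-bit lane: a 64-bit logical right shift of that duplicate leaves exactly the 32-bit rotation in the low dword, so σ1 needs no rotate instruction. A minimal C check of the identity (illustrative):

#include <assert.h>
#include <stdint.h>

static inline uint32_t ror32(uint32_t x, unsigned int r)
{
        return (x >> r) | (x << (32 - r));
}

/* With w duplicated into both 32-bit halves of a 64-bit lane, a 64-bit
 * logical right shift by r leaves ror32(w, r) in the low 32 bits.      */
static uint32_t ror_via_qword_shift(uint32_t w, unsigned int r)
{
        uint64_t dup = ((uint64_t)w << 32) | w;

        return (uint32_t)(dup >> r);
}

int main(void)
{
        uint32_t w = 0x12345678u;

        assert(ror_via_qword_shift(w, 17) == ror32(w, 17));
        assert(ror_via_qword_shift(w, 19) == ror32(w, 19));
        return 0;
}
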
360 mov f, y2 # y2 = f # CH
363 xor g, y2 # y2 = f^g # CH
367 and e, y2 # y2 = (f^g)&e # CH
371 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
377 addl \disp(%rsp, SRND), h # h = k + w + h # --
384 add y0, y2 # y2 = S1 + CH # --
387 add h, d # d = k + w + h + d # --
389 add y1, h # h = k + w + h + S0 # --
390 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
396 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
397 mov f, y2 # y2 = f # CH
400 xor g, y2 # y2 = f^g # CH
404 and e, y2 # y2 = (f^g)&e # CH
405 add y3, old_h # h = t1 + S0 + MAJ # --
409 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
415 offset = 4*1 + \disp
416 addl offset(%rsp, SRND), h # h = k + w + h # --
423 add y0, y2 # y2 = S1 + CH # --
426 add h, d # d = k + w + h + d # --
428 add y1, h # h = k + w + h + S0 # --
430 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
436 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
437 mov f, y2 # y2 = f # CH
440 xor g, y2 # y2 = f^g # CH
444 and e, y2 # y2 = (f^g)&e # CH
445 add y3, old_h # h = t1 + S0 + MAJ # --
449 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
455 offset = 4*2 + \disp
456 addl offset(%rsp, SRND), h # h = k + w + h # --
463 add y0, y2 # y2 = S1 + CH # --
466 add h, d # d = k + w + h + d # --
468 add y1, h # h = k + w + h + S0 # --
470 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
476 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
477 mov f, y2 # y2 = f # CH
480 xor g, y2 # y2 = f^g # CH
484 and e, y2 # y2 = (f^g)&e # CH
485 add y3, old_h # h = t1 + S0 + MAJ # --
489 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
495 offset = 4*3 + \disp
496 addl offset(%rsp, SRND), h # h = k + w + h # --
503 add y0, y2 # y2 = S1 + CH # --
506 add h, d # d = k + w + h + d # --
508 add y1, h # h = k + w + h + S0 # --
510 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
513 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
515 add y3, h # h = t1 + S0 + MAJ # --
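
These last four round bodies do no message scheduling; each simply pulls its precomputed (w + k) dword from the stack transfer area at offset \disp + 4*i. Together with the 32-byte stride per group of four rounds seen in the SRND arithmetic, that suggests the layout sketched below; the second block's half of each 32-byte slot is not visible in the matched lines, so treat the +16 term as an assumption.

#include <stddef.h>

/* Assumed _XFER layout: per group of four rounds, one 32-byte slot
 * holding 4 dwords of (w + k) for the first block followed by 4
 * dwords for the second block.                                     */
static size_t xfer_offset(unsigned int round, unsigned int block)
{
        size_t group = round / 4;       /* 32-byte slot index      */
        size_t lane  = round % 4;       /* the \disp + 4*i dword   */

        return group * 32 + block * 16 + lane * 4;
}
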
538 and $-32, %rsp # align rsp to a 32-byte boundary
544 lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
552 mov 4*1(CTX), b
553 mov 4*2(CTX), c
554 mov 4*3(CTX), d
555 mov 4*4(CTX), e
556 mov 4*5(CTX), f
557 mov 4*6(CTX), g
558 mov 4*7(CTX), h
610 add $4*32, SRND
611 cmp $3*4*32, SRND
628 cmp $4*4*32, SRND
634 addm (4*0)(CTX),a
635 addm (4*1)(CTX),b
636 addm (4*2)(CTX),c
637 addm (4*3)(CTX),d
638 addm (4*4)(CTX),e
639 addm (4*5)(CTX),f
640 addm (4*6)(CTX),g
641 addm (4*7)(CTX),h
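
The driver lines above (538-641) wrap the round macros: align the stack for the 32-byte vector spills, point at the last 64-byte block, load the eight state dwords from CTX, run 48 scheduled rounds and then the remaining rounds without scheduling (the SRND comparisons against 3*4*32 and 4*4*32), and finally add the working variables back into CTX. A rough C outline of that control flow (illustrative names, round bodies elided; the real code keeps a..h in registers and handles two blocks per pass):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void sha256_blocks_outline(uint32_t state[8],
                                  const uint8_t *input, size_t nblocks)
{
        if (nblocks == 0)
                return;

        /* lea -64(INP, NUM_BLKS): pointer to the last block */
        const uint8_t *last = input + nblocks * 64 - 64;

        for (const uint8_t *blk = input; blk <= last; blk += 64) {
                uint32_t w[8];

                memcpy(w, state, sizeof(w));    /* mov 4*0(CTX),a .. 4*7(CTX),h */

                int srnd = 0;
                while (srnd < 3 * 4 * 32)       /* rounds 0-47: schedule+round  */
                        srnd += 4 * 32;         /* add $4*32, SRND              */
                /* rounds 48-63 (no scheduling) run until SRND reaches 4*4*32  */

                for (int i = 0; i < 8; i++)
                        state[i] += w[i];       /* addm (4*i)(CTX), a..h        */
        }
}
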
653 cmp $4*4*32, SRND
660 addm (4*0)(CTX),a
661 addm (4*1)(CTX),b
662 addm (4*2)(CTX),c
663 addm (4*3)(CTX),d
664 addm (4*4)(CTX),e
665 addm (4*5)(CTX),f
666 addm (4*6)(CTX),g
667 addm (4*7)(CTX),h
689 mov (4*0)(CTX),a
690 mov (4*1)(CTX),b
691 mov (4*2)(CTX),c
692 mov (4*3)(CTX),d
693 mov (4*4)(CTX),e
694 mov (4*5)(CTX),f
695 mov (4*6)(CTX),g
696 mov (4*7)(CTX),h
758 # shuffle xBxA -> 00BA
764 # shuffle xDxC -> DC00
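
For context, the two shuffle masks named here compact results that the 64-bit-shift trick leaves in alternating dword slots: σ1 computed for lanes A and B arrives as {xBxA}, for lanes C and D as {xDxC}, and the 00BA / DC00 shuffles move those values into non-overlapping halves of one 4-dword group so the two halves can be combined into {DCBA}. A small C sketch of that compaction (array index 0 is the lowest dword; illustrative only):

#include <stdint.h>

/* s1_ba holds {x, B, x, A} and s1_dc holds {x, D, x, C}, written high
 * dword to low; after the 00BA and DC00 shuffles the halves occupy
 * disjoint slots, so combining them yields {D, C, B, A}.             */
static void merge_halves(const uint32_t s1_ba[4], const uint32_t s1_dc[4],
                         uint32_t out[4])
{
        const uint32_t ba[4] = { s1_ba[0], s1_ba[2], 0, 0 };    /* 00BA */
        const uint32_t dc[4] = { 0, 0, s1_dc[0], s1_dc[2] };    /* DC00 */

        for (int i = 0; i < 4; i++)
                out[i] = ba[i] | dc[i]; /* zeroed slots: OR and add agree */
}
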