1*4882a593Smuzhiyun/* SPDX-License-Identifier: GPL-2.0 */ 2*4882a593Smuzhiyun/* 3*4882a593Smuzhiyun * arch/alpha/lib/ev6-memchr.S 4*4882a593Smuzhiyun * 5*4882a593Smuzhiyun * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com> 6*4882a593Smuzhiyun * 7*4882a593Smuzhiyun * Finds characters in a memory area. Optimized for the Alpha: 8*4882a593Smuzhiyun * 9*4882a593Smuzhiyun * - memory accessed as aligned quadwords only 10*4882a593Smuzhiyun * - uses cmpbge to compare 8 bytes in parallel 11*4882a593Smuzhiyun * - does binary search to find 0 byte in last 12*4882a593Smuzhiyun * quadword (HAKMEM needed 12 instructions to 13*4882a593Smuzhiyun * do this instead of the 9 instructions that 14*4882a593Smuzhiyun * binary search needs). 15*4882a593Smuzhiyun * 16*4882a593Smuzhiyun * For correctness consider that: 17*4882a593Smuzhiyun * 18*4882a593Smuzhiyun * - only minimum number of quadwords may be accessed 19*4882a593Smuzhiyun * - the third argument is an unsigned long 20*4882a593Smuzhiyun * 21*4882a593Smuzhiyun * Much of the information about 21264 scheduling/coding comes from: 22*4882a593Smuzhiyun * Compiler Writer's Guide for the Alpha 21264 23*4882a593Smuzhiyun * abbreviated as 'CWG' in other comments here 24*4882a593Smuzhiyun * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html 25*4882a593Smuzhiyun * Scheduling notation: 26*4882a593Smuzhiyun * E - either cluster 27*4882a593Smuzhiyun * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1 28*4882a593Smuzhiyun * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1 29*4882a593Smuzhiyun * Try not to change the actual algorithm if possible for consistency. 30*4882a593Smuzhiyun */ 31*4882a593Smuzhiyun#include <asm/export.h> 32*4882a593Smuzhiyun .set noreorder 33*4882a593Smuzhiyun .set noat 34*4882a593Smuzhiyun 35*4882a593Smuzhiyun .align 4 36*4882a593Smuzhiyun .globl memchr 37*4882a593Smuzhiyun .ent memchr 38*4882a593Smuzhiyunmemchr: 39*4882a593Smuzhiyun .frame $30,0,$26,0 40*4882a593Smuzhiyun .prologue 0 41*4882a593Smuzhiyun 42*4882a593Smuzhiyun # Hack -- if someone passes in (size_t)-1, hoping to just 43*4882a593Smuzhiyun # search til the end of the address space, we will overflow 44*4882a593Smuzhiyun # below when we find the address of the last byte. Given 45*4882a593Smuzhiyun # that we will never have a 56-bit address space, cropping 46*4882a593Smuzhiyun # the length is the easiest way to avoid trouble. 47*4882a593Smuzhiyun zap $18, 0x80, $5 # U : Bound length 48*4882a593Smuzhiyun beq $18, $not_found # U : 49*4882a593Smuzhiyun ldq_u $1, 0($16) # L : load first quadword Latency=3 50*4882a593Smuzhiyun and $17, 0xff, $17 # E : L L U U : 00000000000000ch 51*4882a593Smuzhiyun 52*4882a593Smuzhiyun insbl $17, 1, $2 # U : 000000000000ch00 53*4882a593Smuzhiyun cmpult $18, 9, $4 # E : small (< 1 quad) string? 54*4882a593Smuzhiyun or $2, $17, $17 # E : 000000000000chch 55*4882a593Smuzhiyun lda $3, -1($31) # E : U L L U 56*4882a593Smuzhiyun 57*4882a593Smuzhiyun sll $17, 16, $2 # U : 00000000chch0000 58*4882a593Smuzhiyun addq $16, $5, $5 # E : Max search address 59*4882a593Smuzhiyun or $2, $17, $17 # E : 00000000chchchch 60*4882a593Smuzhiyun sll $17, 32, $2 # U : U L L U : chchchch00000000 61*4882a593Smuzhiyun 62*4882a593Smuzhiyun or $2, $17, $17 # E : chchchchchchchch 63*4882a593Smuzhiyun extql $1, $16, $7 # U : $7 is upper bits 64*4882a593Smuzhiyun beq $4, $first_quad # U : 65*4882a593Smuzhiyun ldq_u $6, -1($5) # L : L U U L : eight or less bytes to search Latency=3 66*4882a593Smuzhiyun 67*4882a593Smuzhiyun extqh $6, $16, $6 # U : 2 cycle stall for $6 68*4882a593Smuzhiyun mov $16, $0 # E : 69*4882a593Smuzhiyun nop # E : 70*4882a593Smuzhiyun or $7, $6, $1 # E : L U L U $1 = quadword starting at $16 71*4882a593Smuzhiyun 72*4882a593Smuzhiyun # Deal with the case where at most 8 bytes remain to be searched 73*4882a593Smuzhiyun # in $1. E.g.: 74*4882a593Smuzhiyun # $18 = 6 75*4882a593Smuzhiyun # $1 = ????c6c5c4c3c2c1 76*4882a593Smuzhiyun$last_quad: 77*4882a593Smuzhiyun negq $18, $6 # E : 78*4882a593Smuzhiyun xor $17, $1, $1 # E : 79*4882a593Smuzhiyun srl $3, $6, $6 # U : $6 = mask of $18 bits set 80*4882a593Smuzhiyun cmpbge $31, $1, $2 # E : L U L U 81*4882a593Smuzhiyun 82*4882a593Smuzhiyun nop 83*4882a593Smuzhiyun nop 84*4882a593Smuzhiyun and $2, $6, $2 # E : 85*4882a593Smuzhiyun beq $2, $not_found # U : U L U L 86*4882a593Smuzhiyun 87*4882a593Smuzhiyun$found_it: 88*4882a593Smuzhiyun#ifdef CONFIG_ALPHA_EV67 89*4882a593Smuzhiyun /* 90*4882a593Smuzhiyun * Since we are guaranteed to have set one of the bits, we don't 91*4882a593Smuzhiyun * have to worry about coming back with a 0x40 out of cttz... 92*4882a593Smuzhiyun */ 93*4882a593Smuzhiyun cttz $2, $3 # U0 : 94*4882a593Smuzhiyun addq $0, $3, $0 # E : All done 95*4882a593Smuzhiyun nop # E : 96*4882a593Smuzhiyun ret # L0 : L U L U 97*4882a593Smuzhiyun#else 98*4882a593Smuzhiyun /* 99*4882a593Smuzhiyun * Slow and clunky. It can probably be improved. 100*4882a593Smuzhiyun * An exercise left for others. 101*4882a593Smuzhiyun */ 102*4882a593Smuzhiyun negq $2, $3 # E : 103*4882a593Smuzhiyun and $2, $3, $2 # E : 104*4882a593Smuzhiyun and $2, 0x0f, $1 # E : 105*4882a593Smuzhiyun addq $0, 4, $3 # E : 106*4882a593Smuzhiyun 107*4882a593Smuzhiyun cmoveq $1, $3, $0 # E : Latency 2, extra map cycle 108*4882a593Smuzhiyun nop # E : keep with cmov 109*4882a593Smuzhiyun and $2, 0x33, $1 # E : 110*4882a593Smuzhiyun addq $0, 2, $3 # E : U L U L : 2 cycle stall on $0 111*4882a593Smuzhiyun 112*4882a593Smuzhiyun cmoveq $1, $3, $0 # E : Latency 2, extra map cycle 113*4882a593Smuzhiyun nop # E : keep with cmov 114*4882a593Smuzhiyun and $2, 0x55, $1 # E : 115*4882a593Smuzhiyun addq $0, 1, $3 # E : U L U L : 2 cycle stall on $0 116*4882a593Smuzhiyun 117*4882a593Smuzhiyun cmoveq $1, $3, $0 # E : Latency 2, extra map cycle 118*4882a593Smuzhiyun nop 119*4882a593Smuzhiyun nop 120*4882a593Smuzhiyun ret # L0 : L U L U 121*4882a593Smuzhiyun#endif 122*4882a593Smuzhiyun 123*4882a593Smuzhiyun # Deal with the case where $18 > 8 bytes remain to be 124*4882a593Smuzhiyun # searched. $16 may not be aligned. 125*4882a593Smuzhiyun .align 4 126*4882a593Smuzhiyun$first_quad: 127*4882a593Smuzhiyun andnot $16, 0x7, $0 # E : 128*4882a593Smuzhiyun insqh $3, $16, $2 # U : $2 = 0000ffffffffffff ($16<0:2> ff) 129*4882a593Smuzhiyun xor $1, $17, $1 # E : 130*4882a593Smuzhiyun or $1, $2, $1 # E : U L U L $1 = ====ffffffffffff 131*4882a593Smuzhiyun 132*4882a593Smuzhiyun cmpbge $31, $1, $2 # E : 133*4882a593Smuzhiyun bne $2, $found_it # U : 134*4882a593Smuzhiyun # At least one byte left to process. 135*4882a593Smuzhiyun ldq $1, 8($0) # L : 136*4882a593Smuzhiyun subq $5, 1, $18 # E : U L U L 137*4882a593Smuzhiyun 138*4882a593Smuzhiyun addq $0, 8, $0 # E : 139*4882a593Smuzhiyun # Make $18 point to last quad to be accessed (the 140*4882a593Smuzhiyun # last quad may or may not be partial). 141*4882a593Smuzhiyun andnot $18, 0x7, $18 # E : 142*4882a593Smuzhiyun cmpult $0, $18, $2 # E : 143*4882a593Smuzhiyun beq $2, $final # U : U L U L 144*4882a593Smuzhiyun 145*4882a593Smuzhiyun # At least two quads remain to be accessed. 146*4882a593Smuzhiyun 147*4882a593Smuzhiyun subq $18, $0, $4 # E : $4 <- nr quads to be processed 148*4882a593Smuzhiyun and $4, 8, $4 # E : odd number of quads? 149*4882a593Smuzhiyun bne $4, $odd_quad_count # U : 150*4882a593Smuzhiyun # At least three quads remain to be accessed 151*4882a593Smuzhiyun mov $1, $4 # E : L U L U : move prefetched value to correct reg 152*4882a593Smuzhiyun 153*4882a593Smuzhiyun .align 4 154*4882a593Smuzhiyun$unrolled_loop: 155*4882a593Smuzhiyun ldq $1, 8($0) # L : prefetch $1 156*4882a593Smuzhiyun xor $17, $4, $2 # E : 157*4882a593Smuzhiyun cmpbge $31, $2, $2 # E : 158*4882a593Smuzhiyun bne $2, $found_it # U : U L U L 159*4882a593Smuzhiyun 160*4882a593Smuzhiyun addq $0, 8, $0 # E : 161*4882a593Smuzhiyun nop # E : 162*4882a593Smuzhiyun nop # E : 163*4882a593Smuzhiyun nop # E : 164*4882a593Smuzhiyun 165*4882a593Smuzhiyun$odd_quad_count: 166*4882a593Smuzhiyun xor $17, $1, $2 # E : 167*4882a593Smuzhiyun ldq $4, 8($0) # L : prefetch $4 168*4882a593Smuzhiyun cmpbge $31, $2, $2 # E : 169*4882a593Smuzhiyun addq $0, 8, $6 # E : 170*4882a593Smuzhiyun 171*4882a593Smuzhiyun bne $2, $found_it # U : 172*4882a593Smuzhiyun cmpult $6, $18, $6 # E : 173*4882a593Smuzhiyun addq $0, 8, $0 # E : 174*4882a593Smuzhiyun nop # E : 175*4882a593Smuzhiyun 176*4882a593Smuzhiyun bne $6, $unrolled_loop # U : 177*4882a593Smuzhiyun mov $4, $1 # E : move prefetched value into $1 178*4882a593Smuzhiyun nop # E : 179*4882a593Smuzhiyun nop # E : 180*4882a593Smuzhiyun 181*4882a593Smuzhiyun$final: subq $5, $0, $18 # E : $18 <- number of bytes left to do 182*4882a593Smuzhiyun nop # E : 183*4882a593Smuzhiyun nop # E : 184*4882a593Smuzhiyun bne $18, $last_quad # U : 185*4882a593Smuzhiyun 186*4882a593Smuzhiyun$not_found: 187*4882a593Smuzhiyun mov $31, $0 # E : 188*4882a593Smuzhiyun nop # E : 189*4882a593Smuzhiyun nop # E : 190*4882a593Smuzhiyun ret # L0 : 191*4882a593Smuzhiyun 192*4882a593Smuzhiyun .end memchr 193*4882a593Smuzhiyun EXPORT_SYMBOL(memchr) 194