1*4882a593Smuzhiyun/* Copyright (C) 1996 Free Software Foundation, Inc. 2*4882a593Smuzhiyun This file is part of the GNU C Library. 3*4882a593Smuzhiyun Contributed by David Mosberger (davidm@cs.arizona.edu). 4*4882a593Smuzhiyun 5*4882a593Smuzhiyun The GNU C Library is free software; you can redistribute it and/or 6*4882a593Smuzhiyun modify it under the terms of the GNU Library General Public License as 7*4882a593Smuzhiyun published by the Free Software Foundation; either version 2 of the 8*4882a593Smuzhiyun License, or (at your option) any later version. 9*4882a593Smuzhiyun 10*4882a593Smuzhiyun The GNU C Library is distributed in the hope that it will be useful, 11*4882a593Smuzhiyun but WITHOUT ANY WARRANTY; without even the implied warranty of 12*4882a593Smuzhiyun MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13*4882a593Smuzhiyun Library General Public License for more details. 14*4882a593Smuzhiyun 15*4882a593Smuzhiyun You should have received a copy of the GNU Library General Public 16*4882a593Smuzhiyun License along with the GNU C Library; see the file COPYING.LIB. If not, 17*4882a593Smuzhiyun write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, 18*4882a593Smuzhiyun Boston, MA 02111-1307, USA. */ 19*4882a593Smuzhiyun 20*4882a593Smuzhiyun/* Finds characters in a memory area. Optimized for the Alpha: 21*4882a593Smuzhiyun 22*4882a593Smuzhiyun - memory accessed as aligned quadwords only 23*4882a593Smuzhiyun - uses cmpbge to compare 8 bytes in parallel 24*4882a593Smuzhiyun - does binary search to find 0 byte in last 25*4882a593Smuzhiyun quadword (HAKMEM needed 12 instructions to 26*4882a593Smuzhiyun do this instead of the 9 instructions that 27*4882a593Smuzhiyun binary search needs). 28*4882a593Smuzhiyun 29*4882a593SmuzhiyunFor correctness consider that: 30*4882a593Smuzhiyun 31*4882a593Smuzhiyun - only minimum number of quadwords may be accessed 32*4882a593Smuzhiyun - the third argument is an unsigned long 33*4882a593Smuzhiyun*/ 34*4882a593Smuzhiyun#include <asm/export.h> 35*4882a593Smuzhiyun .set noreorder 36*4882a593Smuzhiyun .set noat 37*4882a593Smuzhiyun 38*4882a593Smuzhiyun .globl memchr 39*4882a593Smuzhiyun .ent memchr 40*4882a593Smuzhiyunmemchr: 41*4882a593Smuzhiyun .frame $30,0,$26,0 42*4882a593Smuzhiyun .prologue 0 43*4882a593Smuzhiyun 44*4882a593Smuzhiyun # Hack -- if someone passes in (size_t)-1, hoping to just 45*4882a593Smuzhiyun # search til the end of the address space, we will overflow 46*4882a593Smuzhiyun # below when we find the address of the last byte. Given 47*4882a593Smuzhiyun # that we will never have a 56-bit address space, cropping 48*4882a593Smuzhiyun # the length is the easiest way to avoid trouble. 49*4882a593Smuzhiyun zap $18, 0x80, $5 #-e0 : 50*4882a593Smuzhiyun 51*4882a593Smuzhiyun beq $18, $not_found # .. e1 : 52*4882a593Smuzhiyun ldq_u $1, 0($16) # e1 : load first quadword 53*4882a593Smuzhiyun insbl $17, 1, $2 # .. e0 : $2 = 000000000000ch00 54*4882a593Smuzhiyun and $17, 0xff, $17 #-e0 : $17 = 00000000000000ch 55*4882a593Smuzhiyun cmpult $18, 9, $4 # .. e1 : 56*4882a593Smuzhiyun or $2, $17, $17 # e0 : $17 = 000000000000chch 57*4882a593Smuzhiyun lda $3, -1($31) # .. e1 : 58*4882a593Smuzhiyun sll $17, 16, $2 #-e0 : $2 = 00000000chch0000 59*4882a593Smuzhiyun addq $16, $5, $5 # .. e1 : 60*4882a593Smuzhiyun or $2, $17, $17 # e1 : $17 = 00000000chchchch 61*4882a593Smuzhiyun unop # : 62*4882a593Smuzhiyun sll $17, 32, $2 #-e0 : $2 = chchchch00000000 63*4882a593Smuzhiyun or $2, $17, $17 # e1 : $17 = chchchchchchchch 64*4882a593Smuzhiyun extql $1, $16, $7 # e0 : 65*4882a593Smuzhiyun beq $4, $first_quad # .. e1 : 66*4882a593Smuzhiyun 67*4882a593Smuzhiyun ldq_u $6, -1($5) #-e1 : eight or less bytes to search 68*4882a593Smuzhiyun extqh $6, $16, $6 # .. e0 : 69*4882a593Smuzhiyun mov $16, $0 # e0 : 70*4882a593Smuzhiyun or $7, $6, $1 # .. e1 : $1 = quadword starting at $16 71*4882a593Smuzhiyun 72*4882a593Smuzhiyun # Deal with the case where at most 8 bytes remain to be searched 73*4882a593Smuzhiyun # in $1. E.g.: 74*4882a593Smuzhiyun # $18 = 6 75*4882a593Smuzhiyun # $1 = ????c6c5c4c3c2c1 76*4882a593Smuzhiyun$last_quad: 77*4882a593Smuzhiyun negq $18, $6 #-e0 : 78*4882a593Smuzhiyun xor $17, $1, $1 # .. e1 : 79*4882a593Smuzhiyun srl $3, $6, $6 # e0 : $6 = mask of $18 bits set 80*4882a593Smuzhiyun cmpbge $31, $1, $2 # .. e1 : 81*4882a593Smuzhiyun and $2, $6, $2 #-e0 : 82*4882a593Smuzhiyun beq $2, $not_found # .. e1 : 83*4882a593Smuzhiyun 84*4882a593Smuzhiyun$found_it: 85*4882a593Smuzhiyun # Now, determine which byte matched: 86*4882a593Smuzhiyun negq $2, $3 # e0 : 87*4882a593Smuzhiyun and $2, $3, $2 # e1 : 88*4882a593Smuzhiyun 89*4882a593Smuzhiyun and $2, 0x0f, $1 #-e0 : 90*4882a593Smuzhiyun addq $0, 4, $3 # .. e1 : 91*4882a593Smuzhiyun cmoveq $1, $3, $0 # e0 : 92*4882a593Smuzhiyun 93*4882a593Smuzhiyun addq $0, 2, $3 # .. e1 : 94*4882a593Smuzhiyun and $2, 0x33, $1 #-e0 : 95*4882a593Smuzhiyun cmoveq $1, $3, $0 # .. e1 : 96*4882a593Smuzhiyun 97*4882a593Smuzhiyun and $2, 0x55, $1 # e0 : 98*4882a593Smuzhiyun addq $0, 1, $3 # .. e1 : 99*4882a593Smuzhiyun cmoveq $1, $3, $0 #-e0 : 100*4882a593Smuzhiyun 101*4882a593Smuzhiyun$done: ret # .. e1 : 102*4882a593Smuzhiyun 103*4882a593Smuzhiyun # Deal with the case where $18 > 8 bytes remain to be 104*4882a593Smuzhiyun # searched. $16 may not be aligned. 105*4882a593Smuzhiyun .align 4 106*4882a593Smuzhiyun$first_quad: 107*4882a593Smuzhiyun andnot $16, 0x7, $0 #-e1 : 108*4882a593Smuzhiyun insqh $3, $16, $2 # .. e0 : $2 = 0000ffffffffffff ($16<0:2> ff) 109*4882a593Smuzhiyun xor $1, $17, $1 # e0 : 110*4882a593Smuzhiyun or $1, $2, $1 # e1 : $1 = ====ffffffffffff 111*4882a593Smuzhiyun cmpbge $31, $1, $2 #-e0 : 112*4882a593Smuzhiyun bne $2, $found_it # .. e1 : 113*4882a593Smuzhiyun 114*4882a593Smuzhiyun # At least one byte left to process. 115*4882a593Smuzhiyun 116*4882a593Smuzhiyun ldq $1, 8($0) # e0 : 117*4882a593Smuzhiyun subq $5, 1, $18 # .. e1 : 118*4882a593Smuzhiyun addq $0, 8, $0 #-e0 : 119*4882a593Smuzhiyun 120*4882a593Smuzhiyun # Make $18 point to last quad to be accessed (the 121*4882a593Smuzhiyun # last quad may or may not be partial). 122*4882a593Smuzhiyun 123*4882a593Smuzhiyun andnot $18, 0x7, $18 # .. e1 : 124*4882a593Smuzhiyun cmpult $0, $18, $2 # e0 : 125*4882a593Smuzhiyun beq $2, $final # .. e1 : 126*4882a593Smuzhiyun 127*4882a593Smuzhiyun # At least two quads remain to be accessed. 128*4882a593Smuzhiyun 129*4882a593Smuzhiyun subq $18, $0, $4 #-e0 : $4 <- nr quads to be processed 130*4882a593Smuzhiyun and $4, 8, $4 # e1 : odd number of quads? 131*4882a593Smuzhiyun bne $4, $odd_quad_count # e1 : 132*4882a593Smuzhiyun 133*4882a593Smuzhiyun # At least three quads remain to be accessed 134*4882a593Smuzhiyun 135*4882a593Smuzhiyun mov $1, $4 # e0 : move prefetched value to correct reg 136*4882a593Smuzhiyun 137*4882a593Smuzhiyun .align 4 138*4882a593Smuzhiyun$unrolled_loop: 139*4882a593Smuzhiyun ldq $1, 8($0) #-e0 : prefetch $1 140*4882a593Smuzhiyun xor $17, $4, $2 # .. e1 : 141*4882a593Smuzhiyun cmpbge $31, $2, $2 # e0 : 142*4882a593Smuzhiyun bne $2, $found_it # .. e1 : 143*4882a593Smuzhiyun 144*4882a593Smuzhiyun addq $0, 8, $0 #-e0 : 145*4882a593Smuzhiyun$odd_quad_count: 146*4882a593Smuzhiyun xor $17, $1, $2 # .. e1 : 147*4882a593Smuzhiyun ldq $4, 8($0) # e0 : prefetch $4 148*4882a593Smuzhiyun cmpbge $31, $2, $2 # .. e1 : 149*4882a593Smuzhiyun addq $0, 8, $6 #-e0 : 150*4882a593Smuzhiyun bne $2, $found_it # .. e1 : 151*4882a593Smuzhiyun 152*4882a593Smuzhiyun cmpult $6, $18, $6 # e0 : 153*4882a593Smuzhiyun addq $0, 8, $0 # .. e1 : 154*4882a593Smuzhiyun bne $6, $unrolled_loop #-e1 : 155*4882a593Smuzhiyun 156*4882a593Smuzhiyun mov $4, $1 # e0 : move prefetched value into $1 157*4882a593Smuzhiyun$final: subq $5, $0, $18 # .. e1 : $18 <- number of bytes left to do 158*4882a593Smuzhiyun bne $18, $last_quad # e1 : 159*4882a593Smuzhiyun 160*4882a593Smuzhiyun$not_found: 161*4882a593Smuzhiyun mov $31, $0 #-e0 : 162*4882a593Smuzhiyun ret # .. e1 : 163*4882a593Smuzhiyun 164*4882a593Smuzhiyun .end memchr 165*4882a593Smuzhiyun EXPORT_SYMBOL(memchr) 166