/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com)
 */

/* This is optimized primarily for the ARC700.
   It would be possible to speed up the loops by one cycle / word
   respective one cycle / byte by forcing double source 1 alignment, unrolling
   by a factor of two, and speculatively loading the second word / byte of
   source 1; however, that would increase the overhead for loop setup / finish,
   and strcmp might often terminate early. */

#include <linux/linkage.h>

/*
 * int strcmp(const char *s1, const char *s2)
 *
 * In:   r0 = s1, r1 = s2
 * Out:  r0 < 0, == 0, or > 0 as s1 compares below, equal to, or above s2
 *       (byte comparison is unsigned: cmp + bset.lo / mov.hi below)
 * Return address is in blink; results are produced in branch delay slots.
 *
 * Strategy: when both pointers are word aligned, compare a word at a time
 * and detect an embedded NUL with the classic
 *     (x - 0x01010101) & ~x & 0x80808080
 * trick; otherwise fall back to a simple byte loop.
 */
ENTRY_CFI(strcmp)
	or	r2,r0,r1	; merge low address bits of both pointers
	bmsk_s	r2,r2,1		; keep bits 1..0: nonzero if either is unaligned
	brne	r2,0,.Lcharloop	; not both word-aligned -> byte-at-a-time loop
	mov_s	r12,0x01010101	; constant for the zero-in-word detection ...
	ror	r5,r12		; ... r5 = 0x80808080 (r12 rotated right by one)
.Lwordloop:
	ld.ab	r2,[r0,4]	; load word from s1, post-increment r0
	ld.ab	r3,[r1,4]	; load word from s2, post-increment r1
	nop_s			; scheduling filler (presumably covers load latency)
	sub	r4,r2,r12
	bic	r4,r4,r2
	and	r4,r4,r5	; r4 = (r2 - 0x01010101) & ~r2 & 0x80808080
	brne	r4,0,.Lfound0	; possible NUL byte in r2 (may be a false hit
				; from borrow propagation; resolved at .Lfound0)
	breq	r2,r3,.Lwordloop ; words equal, no NUL seen -> keep going
	/* Words differ and s1's word contains no NUL byte.  */
#ifdef __LITTLE_ENDIAN__
	xor	r0,r2,r3	; mask for difference
	sub_s	r1,r0,1
	bic_s	r0,r0,r1	; mask for least significant difference bit
	sub	r1,r5,r0
	xor	r0,r5,r1	; mask for least significant difference byte
	and_s	r2,r2,r0	; isolate the first differing byte of each word
	and_s	r3,r3,r0	; (on big endian the raw word compare suffices)
#endif /* LITTLE ENDIAN */
	cmp_s	r2,r3		; unsigned compare decides the result's sign
	mov_s	r0,1
	j_s.d	[blink]		; return; delay slot executes the bset below
	bset.lo	r0,r0,31	; if r2 < r3, set bit 31 -> negative result

	.balign 4
#ifdef __LITTLE_ENDIAN__
.Lfound0:
	xor	r0,r2,r3	; mask for difference
	or	r0,r0,r4	; or in zero indicator
	sub_s	r1,r0,1
	bic_s	r0,r0,r1	; mask for least significant difference bit
	sub	r1,r5,r0
	xor	r0,r5,r1	; mask for least significant difference byte
	and_s	r2,r2,r0	; keep only bytes up to the first difference
	and_s	r3,r3,r0	; or NUL; later bytes are don't-cares
	sub.f	r0,r2,r3	; compute difference and set flags
	mov.hi	r0,1		; r2 > r3 (unsigned) -> return 1
	j_s.d	[blink]		; return; delay slot executes the bset below
	bset.lo	r0,r0,31	; r2 < r3 -> force a negative result
#else /* BIG ENDIAN */
	/* The zero-detection above can mis-detect 0x01 bytes as zeroes
	   because of carry-propagation from a lower significant zero byte.
	   We can compensate for this by checking that bit0 is zero.
	   This compensation is not necessary in the step where we
	   get a low estimate for r2, because in any affected bytes
	   we already have 0x00 or 0x01, which will remain unchanged
	   when bit 7 is cleared. */
	.balign 4
.Lfound0:
	lsr	r0,r4,8
	lsr_s	r1,r2
	bic_s	r2,r2,r0	; get low estimate for r2 and get ...
	bic_s	r0,r0,r1	; <this is the adjusted mask for zeros>
	or_s	r3,r3,r0	; ... high estimate r3 so that r2 > r3 will ...
	cmp_s	r3,r2		; ... be independent of trailing garbage
	or_s	r2,r2,r0	; likewise for r3 > r2
	bic_s	r3,r3,r0
	rlc	r0,0		; r0 := r2 > r3 ? 1 : 0
	cmp_s	r2,r3
	j_s.d	[blink]		; return; delay slot executes the bset below
	bset.lo	r0,r0,31	; r2 < r3 -> make the result negative
#endif /* ENDIAN */

	.balign 4
.Lcharloop:
	ldb.ab	r2,[r0,1]	; load byte from s1, post-increment r0
	ldb.ab	r3,[r1,1]	; load byte from s2, post-increment r1
	nop_s			; scheduling filler (presumably covers load latency)
	breq	r2,0,.Lcmpend	; end of s1 reached
	breq	r2,r3,.Lcharloop ; bytes equal (and s1's byte nonzero) -> continue
.Lcmpend:
	j_s.d	[blink]		; return; delay slot computes the result
	sub	r0,r2,r3	; difference of the first mismatching bytes
END_CFI(strcmp)