/*
 * Copyright (C) 2004, 2007-2010, 2011-2014 Synopsys, Inc. All rights reserved.
 *
 * SPDX-License-Identifier: GPL-2.0+
 */

/*
 * This is optimized primarily for the ARC700.
 * It would be possible to speed up the loops by one cycle / word and
 * one cycle / byte, respectively, by forcing double source 1 alignment,
 * unrolling by a factor of two, and speculatively loading the second
 * word / byte of source 1; however, that would increase the overhead
 * for loop setup / finish, and strcmp might often terminate early.
 *
 * A plain C sketch of the word / byte loop strategy used here can be
 * found in the comment at the end of this file.
 */

.global strcmp
.align 4
strcmp:
	or	%r2, %r0, %r1
	bmsk_s	%r2, %r2, 1
	brne	%r2, 0, .Lcharloop
	mov_s	%r12, 0x01010101
	ror	%r5, %r12
.Lwordloop:
	ld.ab	%r2, [%r0, 4]
	ld.ab	%r3, [%r1, 4]
	nop_s
	sub	%r4, %r2, %r12
	bic	%r4, %r4, %r2
	and	%r4, %r4, %r5
	brne	%r4, 0, .Lfound0
	breq	%r2, %r3, .Lwordloop
#ifdef __LITTLE_ENDIAN__
	xor	%r0, %r2, %r3	/* mask for difference */
	sub_s	%r1, %r0, 1
	bic_s	%r0, %r0, %r1	/* mask for least significant difference bit */
	sub	%r1, %r5, %r0
	xor	%r0, %r5, %r1	/* mask for least significant difference byte */
	and_s	%r2, %r2, %r0
	and_s	%r3, %r3, %r0
#endif /* _ENDIAN__ */
	cmp_s	%r2, %r3
	mov_s	%r0, 1
	j_s.d	[%blink]
	bset.lo	%r0, %r0, 31

	.balign 4
#ifdef __LITTLE_ENDIAN__
.Lfound0:
	xor	%r0, %r2, %r3	/* mask for difference */
	or	%r0, %r0, %r4	/* or in zero indicator */
	sub_s	%r1, %r0, 1
	bic_s	%r0, %r0, %r1	/* mask for least significant difference bit */
	sub	%r1, %r5, %r0
	xor	%r0, %r5, %r1	/* mask for least significant difference byte */
	and_s	%r2, %r2, %r0
	and_s	%r3, %r3, %r0
	sub.f	%r0, %r2, %r3
	mov.hi	%r0, 1
	j_s.d	[%blink]
	bset.lo	%r0, %r0, 31
#else /* __BIG_ENDIAN__ */
	/*
	 * The zero-detection above can mis-detect 0x01 bytes as zeroes
	 * because of carry-propagation from a less significant zero byte.
	 * We can compensate for this by checking that bit 0 is zero.
	 * This compensation is not necessary in the step where we
	 * get a low estimate for r2, because in any affected bytes
	 * we already have 0x00 or 0x01, which will remain unchanged
	 * when bit 7 is cleared.
	 */
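	/*
	 * Worked example of that mis-detection: the zero test computed in
	 * .Lwordloop is
	 *	r4 = (word - 0x01010101) & ~word & 0x80808080
	 * and for word = 0x41010041 it yields r4 = 0x00808000, i.e. it
	 * flags both the genuine 0x00 byte and the 0x01 byte directly
	 * above it, because the borrow out of the zero byte propagates
	 * into the next more significant byte.  A bit is set spuriously
	 * only for a 0x01 byte above a real zero, so the test is still
	 * exact as a "word contains a zero byte" indicator.
	 */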
	.balign 4
.Lfound0:
	lsr	%r0, %r4, 8
	lsr_s	%r1, %r2
	bic_s	%r2, %r2, %r0	/* get low estimate for r2 and get ... */
	bic_s	%r0, %r0, %r1	/* <this is the adjusted mask for zeros> */
	or_s	%r3, %r3, %r0	/* ... high estimate r3 so that r2 > r3 will */
	cmp_s	%r3, %r2	/* ... be independent of trailing garbage */
	or_s	%r2, %r2, %r0	/* likewise for r3 > r2 */
	bic_s	%r3, %r3, %r0
	rlc	%r0, 0		/* r0 := r2 > r3 ? 1 : 0 */
	cmp_s	%r2, %r3
	j_s.d	[%blink]
	bset.lo	%r0, %r0, 31
#endif /* _ENDIAN__ */

	.balign 4
.Lcharloop:
	ldb.ab	%r2, [%r0, 1]
	ldb.ab	%r3, [%r1, 1]
	nop_s
	breq	%r2, 0, .Lcmpend
	breq	%r2, %r3, .Lcharloop
.Lcmpend:
	j_s.d	[%blink]
	sub	%r0, %r2, %r3
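
/*
 * For reference, a plain C model of the strategy implemented above.  This
 * is only an illustrative sketch, not a drop-in replacement for the tuned
 * assembly: the names strcmp_ref and zero_byte_mask are hypothetical, and
 * the word path's byte-extraction tricks are replaced here by simply
 * falling back to the byte loop.  Like the assembly, the word loop may
 * read up to three bytes past the terminating NUL; that is fine for the
 * aligned word loads used here.  The assembly also encodes its result
 * differently (1 or 0x80000001 from the word path, a byte difference from
 * the byte path), which is equally valid for strcmp's sign-only contract.
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	// Nonzero iff 'w' contains a 0x00 byte (bits may additionally be
 *	// set for 0x01 bytes above a real zero byte, see .Lfound0 above).
 *	static inline uint32_t zero_byte_mask(uint32_t w)
 *	{
 *		return (w - 0x01010101u) & ~w & 0x80808080u;
 *	}
 *
 *	int strcmp_ref(const char *s1, const char *s2)
 *	{
 *		// Word loop: only taken when both pointers are 4-byte
 *		// aligned (the or / bmsk_s / brne prologue above).
 *		if ((((uintptr_t)s1 | (uintptr_t)s2) & 3) == 0) {
 *			for (;;) {
 *				uint32_t w1, w2;
 *
 *				memcpy(&w1, s1, 4);
 *				memcpy(&w2, s2, 4);
 *				if (zero_byte_mask(w1) || w1 != w2)
 *					break;	// decide byte-wise below
 *				s1 += 4;
 *				s2 += 4;
 *			}
 *		}
 *		// Byte loop; also the sole path for unaligned inputs
 *		// (.Lcharloop above).
 *		while (*s1 && *s1 == *s2) {
 *			s1++;
 *			s2++;
 *		}
 *		return (unsigned char)*s1 - (unsigned char)*s2;
 *	}
 */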