/*
 * Copyright (C) 2004, 2007-2010, 2011-2014 Synopsys, Inc. All rights reserved.
 *
 * SPDX-License-Identifier: GPL-2.0+
 */

/*
 * This is optimized primarily for the ARC700.
 * It would be possible to speed up the loops by one cycle / word
 * (respectively one cycle / byte) by forcing double source 1 alignment,
 * unrolling by a factor of two, and speculatively loading the second
 * word / byte of source 1; however, that would increase the overhead for
 * loop setup / finish, and strcmp might often terminate early.
 */

.global strcmp
.align 4
strcmp:
	or	%r2, %r0, %r1
	bmsk_s	%r2, %r2, 1
	brne	%r2, 0, .Lcharloop
	mov_s	%r12, 0x01010101
	ror	%r5, %r12
.Lwordloop:
	ld.ab	%r2, [%r0, 4]
	ld.ab	%r3, [%r1, 4]
	nop_s
	sub	%r4, %r2, %r12
	bic	%r4, %r4, %r2
	and	%r4, %r4, %r5
	brne	%r4, 0, .Lfound0
	breq	%r2, %r3, .Lwordloop
#ifdef __LITTLE_ENDIAN__
	xor	%r0, %r2, %r3	/* mask for difference */
	sub_s	%r1, %r0, 1
	bic_s	%r0, %r0, %r1	/* mask for least significant difference bit */
	sub	%r1, %r5, %r0
	xor	%r0, %r5, %r1	/* mask for least significant difference byte */
	and_s	%r2, %r2, %r0
	and_s	%r3, %r3, %r0
#endif /* _ENDIAN__ */
	cmp_s	%r2, %r3
	mov_s	%r0, 1
	j_s.d	[%blink]
	bset.lo	%r0, %r0, 31

	.balign	4
#ifdef __LITTLE_ENDIAN__
.Lfound0:
	xor	%r0, %r2, %r3	/* mask for difference */
	or	%r0, %r0, %r4	/* or in zero indicator */
	sub_s	%r1, %r0, 1
	bic_s	%r0, %r0, %r1	/* mask for least significant difference bit */
	sub	%r1, %r5, %r0
	xor	%r0, %r5, %r1	/* mask for least significant difference byte */
	and_s	%r2, %r2, %r0
	and_s	%r3, %r3, %r0
	sub.f	%r0, %r2, %r3
	mov.hi	%r0, 1
	j_s.d	[%blink]
	bset.lo	%r0, %r0, 31
#else /* __BIG_ENDIAN__ */
	/*
	 * The zero-detection above can mis-detect 0x01 bytes as zeroes
	 * because of carry-propagation from a less significant zero byte.
	 * We can compensate for this by checking that bit 0 is zero.
	 * This compensation is not necessary in the step where we
	 * get a low estimate for r2, because in any affected bytes
	 * we already have 0x00 or 0x01, which will remain unchanged
	 * when bit 7 is cleared.
	 */
	.balign	4
.Lfound0:
	lsr	%r0, %r4, 8
	lsr_s	%r1, %r2
	bic_s	%r2, %r2, %r0	/* get low estimate for r2 and get ... */
	bic_s	%r0, %r0, %r1	/* <this is the adjusted mask for zeros> */
	or_s	%r3, %r3, %r0	/* ... high estimate r3 so that r2 > r3 will */
	cmp_s	%r3, %r2	/* ... be independent of trailing garbage */
	or_s	%r2, %r2, %r0	/* likewise for r3 > r2 */
	bic_s	%r3, %r3, %r0
	rlc	%r0, 0		/* r0 := r2 > r3 ? 1 : 0 */
	cmp_s	%r2, %r3
	j_s.d	[%blink]
	bset.lo	%r0, %r0, 31
#endif /* _ENDIAN__ */

	.balign	4
.Lcharloop:
	ldb.ab	%r2, [%r0, 1]
	ldb.ab	%r3, [%r1, 1]
	nop_s
	breq	%r2, 0, .Lcmpend
	breq	%r2, %r3, .Lcharloop
.Lcmpend:
	j_s.d	[%blink]
	sub	%r0, %r2, %r3
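
/*
 * Reference only, not assembled: a minimal C sketch (hypothetical helper
 * name) of the word-at-a-time NUL test that .Lwordloop performs with
 * r12 = 0x01010101 and r5 = r12 rotated right by one = 0x80808080:
 *
 *	static int word_has_zero_byte(unsigned int w)
 *	{
 *		return ((w - 0x01010101u) & ~w & 0x80808080u) != 0;
 *	}
 *
 * The expression is non-zero iff the 32-bit word w contains a zero byte.
 * As the big-endian .Lfound0 comment notes, a 0x01 byte directly above a
 * zero byte can also get its flag bit set through borrow propagation;
 * that is harmless for deciding whether a NUL was seen at all, but has to
 * be compensated for before the flag word is used to pick the deciding
 * byte.
 */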
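
/*
 * Reference only, not assembled: a minimal C sketch (hypothetical variable
 * names) of how the little-endian epilogues reduce the word compare to the
 * first differing byte, matching the "mask for ..." comments above:
 *
 *	unsigned int diff = w1 ^ w2;            // differing bits
 *	unsigned int bit  = diff & ~(diff - 1); // least significant diff bit
 *	unsigned int mask = 0x80808080u ^ (0x80808080u - bit);
 *
 * mask keeps the differing bits of the least significant byte in which w1
 * and w2 differ (in .Lfound0, diff is first or-ed with the zero indicator
 * so the NUL byte also ends the comparison).  The unsigned compare of
 * (w1 & mask) and (w2 & mask) then gives the result: a positive value
 * when w1 > w2, or, via bset.lo, a value with bit 31 set (negative) when
 * w1 < w2.
 */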