/*
 * Copyright (C) 2004, 2007-2010, 2011-2014 Synopsys, Inc. All rights reserved.
 *
 * SPDX-License-Identifier:	GPL-2.0+
 */

/*
 * This is optimized primarily for the ARC700.
 * It would be possible to speed up the loops by one cycle / word
 * (or one cycle / byte, respectively) by forcing double-word alignment
 * of source 1, unrolling by a factor of two, and speculatively loading
 * the second word / byte of source 1; however, that would increase the
 * overhead for loop setup / finish, and strcmp might often terminate
 * early.
 */

.global strcmp
.align 4
strcmp:
	or	%r2, %r0, %r1
	bmsk_s	%r2, %r2, 1
	brne	%r2, 0, .Lcharloop
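	/* Both s1 and s2 are 4-byte aligned from here on. */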
	mov_s	%r12, 0x01010101
	ror	%r5, %r12
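	/*
	 * %r12 = 0x01010101 and %r5 = ror(%r12) = 0x80808080 are the
	 * classic has-zero-byte constants.  A minimal C sketch of the
	 * test done in the loop below (illustrative only):
	 *
	 *	uint32_t z = (w - 0x01010101) & ~w & 0x80808080;
	 *
	 * z is nonzero iff some byte of w is zero.  Borrow propagation
	 * can also flag a 0x01 byte above a real zero byte; the
	 * big-endian path below compensates for that.
	 */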
.Lwordloop:
	ld.ab	%r2, [%r0, 4]
	ld.ab	%r3, [%r1, 4]
	nop_s
	sub	%r4, %r2, %r12
	bic	%r4, %r4, %r2
	and	%r4, %r4, %r5
	brne	%r4, 0, .Lfound0
	breq	%r2, %r3, .Lwordloop
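	/*
	 * The words differ but contain no NUL: isolate the first
	 * differing byte.  Roughly, in C (illustrative only):
	 *
	 *	uint32_t d = w1 ^ w2;		// differing bits
	 *	d &= ~(d - 1);			// lowest set bit
	 *	uint32_t m = (0x80808080 - d) ^ 0x80808080;
	 *
	 * m covers the least significant differing byte from its first
	 * differing bit upward, so comparing w1 & m against w2 & m is
	 * equivalent to comparing that byte alone.
	 */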
#ifdef	__LITTLE_ENDIAN__
	xor	%r0, %r2, %r3	/* mask for difference */
	sub_s	%r1, %r0, 1
	bic_s	%r0, %r0, %r1	/* mask for least significant difference bit */
	sub	%r1, %r5, %r0
	xor	%r0, %r5, %r1	/* mask for least significant difference byte */
	and_s	%r2, %r2, %r0
	and_s	%r3, %r3, %r0
#endif /* __LITTLE_ENDIAN__ */
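	/*
	 * Only the sign of the result matters to callers.  %r0 is
	 * preset to 1 (s1 > s2); if the unsigned compare sets carry
	 * (s1 < s2), bset.lo in the branch delay slot turns it into
	 * 0x80000001, a negative value.  The words cannot be equal
	 * here.
	 */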
	cmp_s	%r2, %r3
	mov_s	%r0, 1
	j_s.d	[%blink]
	bset.lo	%r0, %r0, 31

	.balign	4
#ifdef __LITTLE_ENDIAN__
.Lfound0:
	xor	%r0, %r2, %r3	/* mask for difference */
	or	%r0, %r0, %r4	/* or in zero indicator */
	sub_s	%r1, %r0, 1
	bic_s	%r0, %r0, %r1	/* mask for least significant difference bit */
	sub	%r1, %r5, %r0
	xor	%r0, %r5, %r1	/* mask for least significant difference byte */
	and_s	%r2, %r2, %r0
	and_s	%r3, %r3, %r0
	sub.f	%r0, %r2, %r3
	mov.hi	%r0, 1
	j_s.d	[%blink]
	bset.lo	%r0, %r0, 31
#else /* __BIG_ENDIAN__ */
	/*
	 * The zero-detection above can mis-detect 0x01 bytes as zeroes
	 * because of carry propagation from a less significant zero byte.
	 * We can compensate for this by checking that bit 0 is zero.
	 * This compensation is not necessary in the step where we
	 * get a low estimate for r2, because in any affected bytes
	 * we already have 0x00 or 0x01, which will remain unchanged
	 * when bit 7 is cleared.
	 */
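	/*
	 * Rough C sketch of the estimate trick, where z is the zero
	 * indicator from the word loop (illustrative only):
	 *
	 *	uint32_t mark = z >> 8;		// byte following a NUL
	 *	uint32_t lo = w1 & ~mark;	// clear its top bit in w1
	 *	uint32_t hi = w2 | mark;	// set its top bit in w2
	 *
	 * This pins the order of the post-NUL bytes, so garbage past
	 * the terminating NUL cannot flip the comparison.
	 */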
	.balign	4
.Lfound0:
	lsr	%r0, %r4, 8
	lsr_s	%r1, %r2
	bic_s	%r2, %r2, %r0	/* get low estimate for r2 and get ... */
	bic_s	%r0, %r0, %r1	/* <this is the adjusted mask for zeros> */
	or_s	%r3, %r3, %r0	/* ... high estimate r3 so that r2 > r3 will */
	cmp_s	%r3, %r2	/* ... be independent of trailing garbage */
	or_s	%r2, %r2, %r0	/* likewise for r3 > r2 */
	bic_s	%r3, %r3, %r0
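	/*
	 * or_s/bic_s do not update the flags, so rlc below still sees
	 * the carry produced by the cmp_s %r3, %r2 above.
	 */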
	rlc	%r0, 0		/* r0 := r2 > r3 ? 1 : 0 */
	cmp_s	%r2, %r3
	j_s.d	[%blink]
	bset.lo	%r0, %r0, 31
#endif /* __LITTLE_ENDIAN__ */

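	/*
	 * Unaligned case: plain byte-by-byte loop.  Returns the
	 * difference of the first pair of bytes that differ, or 0 when
	 * the strings match up to and including the terminating NUL.
	 */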
	.balign	4
.Lcharloop:
	ldb.ab	%r2, [%r0, 1]
	ldb.ab	%r3, [%r1, 1]
	nop_s
	breq	%r2, 0, .Lcmpend
	breq	%r2, %r3, .Lcharloop
.Lcmpend:
	j_s.d	[%blink]
	sub	%r0, %r2, %r3