/*
 * Copyright (C) 2004, 2007-2010, 2011-2014 Synopsys, Inc. All rights reserved.
 *
 * SPDX-License-Identifier:	GPL-2.0+
 */

/*
 * This is optimized primarily for the ARC700.
 * It would be possible to speed up the loops by one cycle / word
 * (respectively one cycle / byte) by forcing double source 1 alignment,
 * unrolling by a factor of two, and speculatively loading the second
 * word / byte of source 1; however, that would increase the overhead
 * for loop setup / finish, and strcmp might often terminate early.
 */
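/*
 * In C terms, the aligned fast path below is roughly equivalent to the
 * following sketch (illustrative only; the names are not from this file,
 * and 0x01010101 / 0x80808080 live in r12 / r5 below):
 *
 *	unsigned long x, y;
 *	for (;;) {
 *		x = *s1++;
 *		y = *s2++;
 *		if ((x - 0x01010101UL) & ~x & 0x80808080UL)
 *			break;	// some byte of x is NUL
 *		if (x != y)
 *			break;	// first differing word found
 *	}
 *	// ... then locate the first NUL or differing byte within x / y
 */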

.global strcmp
.align 4
strcmp:
	or	%r2, %r0, %r1
	bmsk_s	%r2, %r2, 1	/* low two bits of either pointer set? */
	brne	%r2, 0, .Lcharloop	/* not both word-aligned: byte loop */
	mov_s	%r12, 0x01010101
	ror	%r5, %r12	/* r5 = 0x80808080 */
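/*
 * Fast path: both sources are word-aligned.  Compare a word at a time;
 * branch to .Lfound0 as soon as the word from source 1 contains a NUL
 * byte, and fall through once the first differing word is seen.
 */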
.Lwordloop:
	ld.ab	%r2, [%r0, 4]
	ld.ab	%r3, [%r1, 4]
	nop_s
	sub	%r4, %r2, %r12	/* r4 = r2 - 0x01010101 */
	bic	%r4, %r4, %r2	/* r4 &= ~r2 */
	and	%r4, %r4, %r5	/* r4 &= 0x80808080: nonzero iff r2 has a NUL */
	brne	%r4, 0, .Lfound0
	breq	%r2, %r3, .Lwordloop
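	/*
	 * The words differ and r2 contains no NUL byte.  On little endian
	 * the first string byte is the least significant byte of the word,
	 * so mask both words down to their first (least significant)
	 * differing byte before comparing; on big endian an unsigned word
	 * compare already orders the strings by their first differing byte.
	 */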
#ifdef	__LITTLE_ENDIAN__
	xor	%r0, %r2, %r3	/* mask for difference */
	sub_s	%r1, %r0, 1
	bic_s	%r0, %r0, %r1	/* mask for least significant difference bit */
	sub	%r1, %r5, %r0
	xor	%r0, %r5, %r1	/* mask for least significant difference byte */
	and_s	%r2, %r2, %r0
	and_s	%r3, %r3, %r0
#endif /* __LITTLE_ENDIAN__ */
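	/*
	 * Return a positive or negative value as strcmp requires:
	 * r0 = 1 if r2 > r3 (unsigned); otherwise bit 31 is set in the
	 * branch delay slot, yielding 0x80000001, i.e. a negative result.
	 */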
	cmp_s	%r2, %r3
	mov_s	%r0, 1
	j_s.d	[%blink]
	bset.lo	%r0, %r0, 31	/* delay slot: r0 = 0x80000001 if r2 < r3 */

	.balign	4
#ifdef __LITTLE_ENDIAN__
.Lfound0:
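	/*
	 * A NUL byte was found in r2; r4 has bit 7 set in each flagged
	 * byte.  Folding r4 into the difference mask makes the terminator
	 * act like a difference, so the same least-significant-byte
	 * isolation as above also stops at the end of the string.  If the
	 * strings match through the NUL, sub.f below yields 0 and neither
	 * conditional fires, returning 0.
	 */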
	xor	%r0, %r2, %r3	/* mask for difference */
	or	%r0, %r0, %r4	/* or in zero indicator */
	sub_s	%r1, %r0, 1
	bic_s	%r0, %r0, %r1	/* mask for least significant difference bit */
	sub	%r1, %r5, %r0
	xor	%r0, %r5, %r1	/* mask for least significant difference byte */
	and_s	%r2, %r2, %r0
	and_s	%r3, %r3, %r0
	sub.f	%r0, %r2, %r3	/* 0 if equal (strings match through NUL) */
	mov.hi	%r0, 1		/* 1 if r2 > r3 (unsigned) */
	j_s.d	[%blink]
	bset.lo	%r0, %r0, 31	/* delay slot: negative if r2 < r3 */
#else /* __BIG_ENDIAN__ */
	/*
	 * The zero-detection above can mis-detect 0x01 bytes as zeroes
	 * because of carry propagation from a less significant zero byte.
	 * We can compensate for this by checking that bit 0 is zero.
	 * This compensation is not necessary in the step where we
	 * get a low estimate for r2, because in any affected bytes
	 * we already have 0x00 or 0x01, which will remain unchanged
	 * when bit 7 is cleared.
	 */
	.balign	4
.Lfound0:
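	/*
	 * Big endian: the first string byte is the most significant byte
	 * of the word, so an unsigned word compare already orders the
	 * strings by their first differing byte.  The fixup below only
	 * has to make sure that garbage bytes after the NUL terminator
	 * cannot influence the result.
	 */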
	lsr	%r0, %r4, 8
	lsr_s	%r1, %r2
	bic_s	%r2, %r2, %r0	/* get low estimate for r2 and get ... */
	bic_s	%r0, %r0, %r1	/* <this is the adjusted mask for zeros> */
	or_s	%r3, %r3, %r0	/* ... high estimate r3 so that r2 > r3 will */
	cmp_s	%r3, %r2	/* ... be independent of trailing garbage */
	or_s	%r2, %r2, %r0	/* likewise for r3 > r2 */
	bic_s	%r3, %r3, %r0
	rlc	%r0, 0		/* carry from cmp_s above: r0 := r2 > r3 ? 1 : 0 */
	cmp_s	%r2, %r3
	j_s.d	[%blink]
	bset.lo	%r0, %r0, 31
#endif /* __LITTLE_ENDIAN__ */

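/*
 * Byte-at-a-time fallback, used when the sources are not both
 * word-aligned.  The return value is simply the difference between the
 * first pair of bytes that differ, or between the NUL terminating
 * source 1 and the corresponding byte of source 2.
 */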
	.balign	4
.Lcharloop:
	ldb.ab	%r2, [%r0, 1]
	ldb.ab	%r3, [%r1, 1]
	nop_s
	breq	%r2, 0, .Lcmpend	/* end of source 1 */
	breq	%r2, %r3, .Lcharloop	/* bytes equal: next byte */
.Lcmpend:
	j_s.d	[%blink]
	sub	%r0, %r2, %r3	/* delay slot: return byte difference */