/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com)
 */

/* This is optimized primarily for the ARC700.
   It would be possible to speed up the loops by one cycle per word
   (respectively one cycle per byte) by forcing double source 1 alignment,
   unrolling by a factor of two, and speculatively loading the second word /
   byte of source 1; however, that would increase the overhead for loop
   setup / finish, and strcmp might often terminate early.  */

#include <linux/linkage.h>

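/* For reference, the aligned word loop below behaves roughly like this
   C sketch (illustrative only, not part of the build; the function and
   variable names are made up for this comment, and a 32-bit unsigned
   int and word-aligned inputs are assumed, as the code below requires):

	int strcmp_ref(const char *s1, const char *s2)
	{
		const unsigned int *a = (const unsigned int *)s1;
		const unsigned int *b = (const unsigned int *)s2;
		unsigned int x, y, z;

		do {
			x = *a++;
			y = *b++;
			// z is nonzero iff x contains a 0x00 byte:
			// the classic word-at-a-time zero test
			z = (x - 0x01010101) & ~x & 0x80808080;
		} while (z == 0 && x == y);

		// resolve the result byte by byte from the last word pair
		{
			const unsigned char *p = (const unsigned char *)(a - 1);
			const unsigned char *q = (const unsigned char *)(b - 1);

			while (*p && *p == *q) {
				p++;
				q++;
			}
			return *p - *q;
		}
	}

   The assembly instead resolves the final word pair with bit tricks and
   returns any positive / negative value rather than the exact byte
   difference, which is all strcmp requires.  */
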
ENTRY_CFI(strcmp)
	or	r2,r0,r1
	bmsk_s	r2,r2,1		; any of the low two address bits set?
	brne	r2,0,.Lcharloop	; unaligned -> byte-at-a-time loop
	mov_s	r12,0x01010101
	ror	r5,r12		; r5 := 0x80808080
.Lwordloop:
	ld.ab	r2,[r0,4]	; load word, post-increment pointer
	ld.ab	r3,[r1,4]
	nop_s			; scheduling nop
	sub	r4,r2,r12	; r4 := (r2 - 0x01010101) & ~r2 & 0x80808080,
	bic	r4,r4,r2	; ... nonzero iff r2 contains a zero byte
	and	r4,r4,r5
	brne	r4,0,.Lfound0
	breq	r2,r3,.Lwordloop
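/* A difference was found before any NUL byte.  On little-endian, the
   first string byte to differ is the least significant differing byte,
   so it is isolated with bit tricks; in C terms, roughly (names made
   up for this comment):

	diff = x ^ y;			// set bits where the words differ
	bit  = diff & -diff;		// least significant differing bit
	mask = 0x80808080 ^ (0x80808080 - bit);
					// bits of that byte from bit up to 0x80

   Masking both words with mask keeps only the deciding bits of the
   first differing byte, so the unsigned compare below orders the
   strings like a byte-wise strcmp.  On big-endian the first differing
   byte is the most significant one, so the plain word compare is
   already correct and no masking is needed.  */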
#ifdef	__LITTLE_ENDIAN__
	xor	r0,r2,r3	; mask for difference
	sub_s	r1,r0,1
	bic_s	r0,r0,r1	; mask for least significant difference bit
	sub	r1,r5,r0
	xor	r0,r5,r1	; mask for least significant difference byte
	and_s	r2,r2,r0
	and_s	r3,r3,r0
#endif /* LITTLE ENDIAN */
	cmp_s	r2,r3
	mov_s	r0,1		; assume r2 > r3: return positive
	j_s.d	[blink]
	bset.lo	r0,r0,31	; r2 < r3 unsigned: make the result negative

	.balign	4
#ifdef __LITTLE_ENDIAN__
.Lfound0:
	xor	r0,r2,r3	; mask for difference
	or	r0,r0,r4	; or in zero indicator
	sub_s	r1,r0,1
	bic_s	r0,r0,r1	; mask for least significant difference bit
	sub	r1,r5,r0
	xor	r0,r5,r1	; mask for least significant difference byte
	and_s	r2,r2,r0
	and_s	r3,r3,r0
	sub.f	r0,r2,r3
	mov.hi	r0,1
	j_s.d	[blink]
	bset.lo	r0,r0,31
#else /* BIG ENDIAN */
	/* The zero-detection above can mis-detect 0x01 bytes as zeroes
	   because of carry propagation from a less significant zero byte.
	   We can compensate for this by checking that bit 0 is zero.
	   This compensation is not necessary in the step where we
	   get a low estimate for r2, because in any affected bytes
	   we already have 0x00 or 0x01, which will remain unchanged
	   when bit 7 is cleared.  */
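	/* For example (two-byte slice for brevity): big-endian bytes
	   0x01,0x00 give 0x0100 - 0x0101 = 0xffff, so the borrow out of
	   the true zero byte also sets the 0x80 flag of the 0x01 byte in
	   r4; the bit-0 test below cancels exactly those spurious flags,
	   since a genuine zero byte has bit 0 clear.  The low / high
	   estimates then force bit 7 apart in the byte right after the
	   first NUL, so both compares are decided by string content and
	   equal strings return 0.  */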
	.balign	4
.Lfound0:
	lsr	r0,r4,8
	lsr_s	r1,r2
	bic_s	r2,r2,r0	; get low estimate for r2 and get ...
	bic_s	r0,r0,r1	; <this is the adjusted mask for zeros>
	or_s	r3,r3,r0	; ... high estimate r3 so that r2 > r3 will ...
	cmp_s	r3,r2		; ... be independent of trailing garbage
	or_s	r2,r2,r0	; likewise for r3 > r2
	bic_s	r3,r3,r0
	rlc	r0,0		; r0 := r2 > r3 ? 1 : 0
	cmp_s	r2,r3
	j_s.d	[blink]
	bset.lo	r0,r0,31
#endif /* ENDIAN */

	.balign	4
.Lcharloop:
	ldb.ab	r2,[r0,1]	; load byte, post-increment pointer
	ldb.ab	r3,[r1,1]
	nop_s
	breq	r2,0,.Lcmpend	; end of string 1
	breq	r2,r3,.Lcharloop
.Lcmpend:
	j_s.d	[blink]
	sub	r0,r2,r3	; return the byte difference (delay slot)
END_CFI(strcmp)