/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by
 * Linaro, which can be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/*
 * Calculate the length of a NUL-terminated string.
 *
 * Parameters:
 *	x0 - const string pointer
 * Returns:
 *	x0 - number of bytes before the first NUL byte
 * Clobbers:
 *	x1-x12 and the condition flags (see the register aliases below).
 *
 * Memory is always read as whole 16-byte-aligned blocks (ldp from a
 * pointer with its low four bits cleared), so a load may touch bytes that
 * lie before the start of the string or after its terminator, but only
 * within the same aligned 16-byte block.
 *
 * CPU_BE()/CPU_LE() are defined outside this file; they are expected to
 * emit their argument only on big-endian / little-endian builds.
 */

/* Arguments and results. */
srcin		.req	x0
len		.req	x0

/* Locals and temporaries. */
src		.req	x1
data1		.req	x2
data2		.req	x3
data2a		.req	x4
has_nul1	.req	x5
has_nul2	.req	x6
tmp1		.req	x7
tmp2		.req	x8
tmp3		.req	x9
tmp4		.req	x10
zeroones	.req	x11
pos		.req	x12

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080

SYM_FUNC_START_WEAK_PI(strlen)
	mov	zeroones, #REP8_01
	bic	src, srcin, #15			/* src = srcin rounded down to 16. */
	ands	tmp1, srcin, #15		/* tmp1 = offset within the block. */
	b.ne	.Lmisaligned
	/*
	 * NUL detection works on the principle that (X - 1) & (~X) & 0x80
	 * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	 * can be done in parallel across the entire word.
	 */
	/*
	 * The inner loop deals with two Dwords at a time. This has a
	 * slightly higher start-up cost, but we should win quite quickly,
	 * especially on cores with a high number of issue slots per
	 * cycle, as we get much better parallelism out of the operations.
	 */
.Lloop:
	ldp	data1, data2, [src], #16	/* Load 16 bytes, then src += 16. */
.Lrealigned:
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, #REP8_7f
	bic	has_nul1, tmp1, tmp2
	bics	has_nul2, tmp3, tmp4		/* Z is set iff has_nul2 == 0. */
	/*
	 * If has_nul2 == 0, compare has_nul1 with 0; otherwise force
	 * NZCV = 0000 (i.e. "ne"). The loop therefore continues only while
	 * both syndromes are zero.
	 */
	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
	b.eq	.Lloop

	/* src already points past both Dwords; the -8 steps below undo that. */
	sub	len, src, srcin
	cbz	has_nul1, .Lnul_in_data2
CPU_BE(	mov	data2, data1 )	/* prepare data to re-calculate the syndrome */
	sub	len, len, #8			/* NUL is in the first Dword. */
	mov	has_nul2, has_nul1
.Lnul_in_data2:
	/*
	 * For big-endian, carry propagation (if the final byte in the
	 * string is 0x01) means we cannot use has_nul directly. The
	 * easiest way to get the correct byte is to byte-swap the data
	 * and calculate the syndrome a second time.
	 */
CPU_BE( rev	data2, data2 )
CPU_BE( sub	tmp1, data2, zeroones )
CPU_BE( orr	tmp2, data2, #REP8_7f )
CPU_BE( bic	has_nul2, tmp1, tmp2 )

	sub	len, len, #8
	/*
	 * Byte-reverse the syndrome so that counting leading zero bits
	 * yields eight times the index of the first marked byte.
	 */
	rev	has_nul2, has_nul2
	clz	pos, has_nul2
	add	len, len, pos, lsr #3		/* Bits to bytes.  */
	ret

.Lmisaligned:
	/*
	 * srcin is not 16-byte aligned; tmp1 holds its offset (1..15) in the
	 * aligned block. Load the whole block and force every byte that
	 * precedes the string to 0xff so it cannot be mistaken for a NUL,
	 * then join the main loop body.
	 *
	 * The flags set by this cmp are consumed by the csinv/csel at the
	 * end; none of the instructions in between update the flags.
	 */
	cmp	tmp1, #8
	neg	tmp1, tmp1
	ldp	data1, data2, [src], #16
	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
	mov	tmp2, #~0
	/* Big-endian.  Early bytes are at MSB.  */
CPU_BE( lsl	tmp2, tmp2, tmp1 )	/* Shift (tmp1 & 63).  */
	/* Little-endian.  Early bytes are at LSB.  */
CPU_LE( lsr	tmp2, tmp2, tmp1 )	/* Shift (tmp1 & 63).  */

	/*
	 * offset <= 8: the string starts in data1 (or exactly at data2), so
	 *              keep data1 | mask and leave data2 untouched.
	 * offset >  8: all of data1 precedes the string, so replace it with
	 *              all-ones (~xzr) and apply the mask to data2 instead.
	 */
	orr	data1, data1, tmp2
	orr	data2a, data2, tmp2
	csinv	data1, data1, xzr, le
	csel	data2, data2, data2a, le
	b	.Lrealigned
SYM_FUNC_END_PI(strlen)
EXPORT_SYMBOL_NOKASAN(strlen)