/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by
 * Linaro, which can be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/*
 * Determine the length of a fixed-size string.
 *
 * Parameters:
 *	x0 - const string pointer
 *	x1 - maximal string length
 * Returns:
 *	x0 - the offset of the first NUL byte from the string pointer, or
 *	     the maximal string length if no NUL byte is found before it
 *	     (i.e. MIN(len, limit)); 0 when the limit is 0.
 *
 * Clobbers: x2-x14 and the condition flags. x1 is left unmodified.
 *
 * The string is always read in whole, 16-byte aligned blocks, so bytes
 * that lie outside [x0, x0 + x1) but inside the first or last such block
 * may be loaded. They never influence the result: leading bytes are
 * forced to 0xff and the final length is clamped to the limit.
 */

/* Arguments and results. Note that len and srcin share x0. */
srcin		.req	x0
len		.req	x0
limit		.req	x1

/* Locals and temporaries. */
src		.req	x2
data1		.req	x3
data2		.req	x4
data2a		.req	x5
has_nul1	.req	x6
has_nul2	.req	x7
tmp1		.req	x8
tmp2		.req	x9
tmp3		.req	x10
tmp4		.req	x11
zeroones	.req	x12
pos		.req	x13
limit_wd	.req	x14

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080

SYM_FUNC_START_WEAK_PI(strnlen)
	cbz	limit, .Lhit_limit
	mov	zeroones, #REP8_01
	bic	src, srcin, #15		/* src = srcin rounded down to 16 bytes. */
	ands	tmp1, srcin, #15	/* tmp1 = byte offset within the Qword. */
	b.ne	.Lmisaligned
	/* Calculate the number of full and partial words -1. */
	sub	limit_wd, limit, #1	/* Limit != 0, so no underflow. */
	lsr	limit_wd, limit_wd, #4	/* Convert to Qwords. */

	/*
	 * NUL detection works on the principle that (X - 1) & (~X) & 0x80
	 * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	 * can be done in parallel across the entire word.
	 */
	/*
	 * The inner loop deals with two Dwords at a time. This has a
	 * slightly higher start-up cost, but we should win quite quickly,
	 * especially on cores with a high number of issue slots per
	 * cycle, as we get much better parallelism out of the operations.
	 */
.Lloop:
	ldp	data1, data2, [src], #16
.Lrealigned:
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, #REP8_7f
	bic	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	subs	limit_wd, limit_wd, #1
	orr	tmp1, has_nul1, has_nul2
	/*
	 * Keep looping only while Qwords remain (limit_wd did not go
	 * negative) and no NUL was seen; once the count is exhausted the
	 * flags are forced to "not equal" so the loop exits.
	 */
	ccmp	tmp1, #0, #0, pl	/* NZCV = 0000 */
	b.eq	.Lloop

	cbz	tmp1, .Lhit_limit	/* No null in final Qword. */

	/*
	 * We know there's a null in the final Qword. The easiest thing
	 * to do now is work out the length of the string and return
	 * MIN (len, limit).
	 */
	sub	len, src, srcin		/* Bytes from srcin to end of Qword. */
	cbz	has_nul1, .Lnul_in_data2
CPU_BE(	mov	data2, data1 )		/* Prepare data to re-calculate the syndrome. */

	sub	len, len, #8		/* NUL is in the first Dword. */
	mov	has_nul2, has_nul1
.Lnul_in_data2:
	/*
	 * For big-endian, carry propagation (if the final byte in the
	 * string is 0x01) means we cannot use has_nul directly. The
	 * easiest way to get the correct byte is to byte-swap the data
	 * and calculate the syndrome a second time.
	 */
CPU_BE(	rev	data2, data2 )
CPU_BE(	sub	tmp1, data2, zeroones )
CPU_BE(	orr	tmp2, data2, #REP8_7f )
CPU_BE(	bic	has_nul2, tmp1, tmp2 )

	sub	len, len, #8		/* len = offset of the Dword holding the NUL. */
	rev	has_nul2, has_nul2	/* Put the earliest string byte at the MSB. */
	clz	pos, has_nul2
	add	len, len, pos, lsr #3	/* Bits to bytes. */
	cmp	len, limit
	csel	len, len, limit, ls	/* Return the lower value. */
	ret

.Lmisaligned:
	/*
	 * Deal with a partial first word.
	 * We're doing two things in parallel here;
	 * 1) Calculate the number of words (but avoiding overflow if
	 * limit is near ULONG_MAX) - to do this we need to work out
	 * limit + tmp1 - 1 as a 65-bit value before shifting it;
	 * 2) Load and mask the initial data words - we force the bytes
	 * before the ones we are interested in to 0xff - this ensures
	 * early bytes will not hit any zero detection.
	 */
	ldp	data1, data2, [src], #16

	sub	limit_wd, limit, #1
	and	tmp3, limit_wd, #15
	lsr	limit_wd, limit_wd, #4

	add	tmp3, tmp3, tmp1
	add	limit_wd, limit_wd, tmp3, lsr #4

	neg	tmp4, tmp1
	lsl	tmp4, tmp4, #3		/* Bytes beyond alignment -> bits. */

	mov	tmp2, #~0
	/* Big-endian. Early bytes are at MSB. */
CPU_BE(	lsl	tmp2, tmp2, tmp4 )	/* Shift by (tmp4 & 63). */
	/* Little-endian. Early bytes are at LSB. */
CPU_LE(	lsr	tmp2, tmp2, tmp4 )	/* Shift by (tmp4 & 63). */

	cmp	tmp1, #8

	orr	data1, data1, tmp2
	orr	data2a, data2, tmp2

	/*
	 * Offset <= 8: the mask applies to data1 only, data2 is untouched.
	 * Offset  > 8: data1 is entirely before the string (all ones) and
	 * the mask applies to data2.
	 */
	csinv	data1, data1, xzr, le
	csel	data2, data2, data2a, le
	b	.Lrealigned

.Lhit_limit:
	mov	len, limit
	ret
SYM_FUNC_END_PI(strnlen)
EXPORT_SYMBOL_NOKASAN(strnlen)