1 /* SPDX-License-Identifier: BSD-3-Clause */ 2 /* 3 * Copyright (c) 1994-2009 Red Hat, Inc. 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright notice, 10 * this list of conditions and the following disclaimer. 11 * 12 * 2. Redistributions in binary form must reproduce the above copyright notice, 13 * this list of conditions and the following disclaimer in the documentation 14 * and/or other materials provided with the distribution. 15 * 16 * 3. Neither the name of the copyright holder nor the names of its 17 * contributors may be used to endorse or promote products derived from this 18 * software without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 24 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 * POSSIBILITY OF SUCH DAMAGE. 31 */ 32 33 /* Byte-wise substring search, using the Two-Way algorithm. 34 * Copyright (C) 2008, 2010 Eric Blake 35 * Permission to use, copy, modify, and distribute this software 36 * is freely granted, provided that this notice is preserved. 37 */ 38 39 40 /* Before including this file, you need to include <string.h>, and define: 41 RESULT_TYPE A macro that expands to the return type. 42 AVAILABLE(h, h_l, j, n_l) A macro that returns nonzero if there are 43 at least N_L bytes left starting at 44 H[J]. H is 'unsigned char *', H_L, J, 45 and N_L are 'size_t'; H_L is an 46 lvalue. For NUL-terminated searches, 47 H_L can be modified each iteration to 48 avoid having to compute the end of H 49 up front. 50 51 For case-insensitivity, you may optionally define: 52 CMP_FUNC(p1, p2, l) A macro that returns 0 iff the first L 53 characters of P1 and P2 are equal. 54 CANON_ELEMENT(c) A macro that canonicalizes an element 55 right after it has been fetched from 56 one of the two strings. The argument 57 is an 'unsigned char'; the result must 58 be an 'unsigned char' as well. 59 60 This file undefines the macros documented above, and defines 61 LONG_NEEDLE_THRESHOLD. 62 */ 63 64 #include <limits.h> 65 #include <stdint.h> 66 67 /* We use the Two-Way string matching algorithm, which guarantees 68 linear complexity with constant space. Additionally, for long 69 needles, we also use a bad character shift table similar to the 70 Boyer-Moore algorithm to achieve improved (potentially sub-linear) 71 performance. 72 73 See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 74 and http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm 75 */ 76 77 /* Point at which computing a bad-byte shift table is likely to be 78 worthwhile. Small needles should not compute a table, since it 79 adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a 80 speedup no greater than a factor of NEEDLE_LEN. The larger the 81 needle, the better the potential performance gain. On the other 82 hand, on non-POSIX systems with CHAR_BIT larger than eight, the 83 memory required for the table is prohibitive. */ 84 #if CHAR_BIT < 10 85 # define LONG_NEEDLE_THRESHOLD 32U 86 #else 87 # define LONG_NEEDLE_THRESHOLD SIZE_MAX 88 #endif 89 90 #define MAX(a, b) ((a < b) ? (b) : (a)) 91 92 #ifndef CANON_ELEMENT 93 # define CANON_ELEMENT(c) c 94 #endif 95 #ifndef CMP_FUNC 96 # define CMP_FUNC memcmp 97 #endif 98 99 /* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN. 100 Return the index of the first byte in the right half, and set 101 *PERIOD to the global period of the right half. 102 103 The global period of a string is the smallest index (possibly its 104 length) at which all remaining bytes in the string are repetitions 105 of the prefix (the last repetition may be a subset of the prefix). 106 107 When NEEDLE is factored into two halves, a local period is the 108 length of the smallest word that shares a suffix with the left half 109 and shares a prefix with the right half. All factorizations of a 110 non-empty NEEDLE have a local period of at least 1 and no greater 111 than NEEDLE_LEN. 112 113 A critical factorization has the property that the local period 114 equals the global period. All strings have at least one critical 115 factorization with the left half smaller than the global period. 116 117 Given an ordered alphabet, a critical factorization can be computed 118 in linear time, with 2 * NEEDLE_LEN comparisons, by computing the 119 larger of two ordered maximal suffixes. The ordered maximal 120 suffixes are determined by lexicographic comparison of 121 periodicity. */ 122 static size_t 123 critical_factorization (const unsigned char *needle, size_t needle_len, 124 size_t *period) 125 { 126 /* Index of last byte of left half, or SIZE_MAX. */ 127 size_t max_suffix, max_suffix_rev; 128 size_t j; /* Index into NEEDLE for current candidate suffix. */ 129 size_t k; /* Offset into current period. */ 130 size_t p; /* Intermediate period. */ 131 unsigned char a, b; /* Current comparison bytes. */ 132 133 /* Invariants: 134 0 <= j < NEEDLE_LEN - 1 135 -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed) 136 min(max_suffix, max_suffix_rev) < global period of NEEDLE 137 1 <= p <= global period of NEEDLE 138 p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j] 139 1 <= k <= p 140 */ 141 142 /* Perform lexicographic search. */ 143 max_suffix = SIZE_MAX; 144 j = 0; 145 k = p = 1; 146 while (j + k < needle_len) 147 { 148 a = CANON_ELEMENT (needle[j + k]); 149 b = CANON_ELEMENT (needle[(size_t)(max_suffix + k)]); 150 if (a < b) 151 { 152 /* Suffix is smaller, period is entire prefix so far. */ 153 j += k; 154 k = 1; 155 p = j - max_suffix; 156 } 157 else if (a == b) 158 { 159 /* Advance through repetition of the current period. */ 160 if (k != p) 161 ++k; 162 else 163 { 164 j += p; 165 k = 1; 166 } 167 } 168 else /* b < a */ 169 { 170 /* Suffix is larger, start over from current location. */ 171 max_suffix = j++; 172 k = p = 1; 173 } 174 } 175 *period = p; 176 177 /* Perform reverse lexicographic search. */ 178 max_suffix_rev = SIZE_MAX; 179 j = 0; 180 k = p = 1; 181 while (j + k < needle_len) 182 { 183 a = CANON_ELEMENT (needle[j + k]); 184 b = CANON_ELEMENT (needle[max_suffix_rev + k]); 185 if (b < a) 186 { 187 /* Suffix is smaller, period is entire prefix so far. */ 188 j += k; 189 k = 1; 190 p = j - max_suffix_rev; 191 } 192 else if (a == b) 193 { 194 /* Advance through repetition of the current period. */ 195 if (k != p) 196 ++k; 197 else 198 { 199 j += p; 200 k = 1; 201 } 202 } 203 else /* a < b */ 204 { 205 /* Suffix is larger, start over from current location. */ 206 max_suffix_rev = j++; 207 k = p = 1; 208 } 209 } 210 211 /* Choose the longer suffix. Return the first byte of the right 212 half, rather than the last byte of the left half. */ 213 if (max_suffix_rev + 1 < max_suffix + 1) 214 return max_suffix + 1; 215 *period = p; 216 return max_suffix_rev + 1; 217 } 218 219 /* Return the first location of non-empty NEEDLE within HAYSTACK, or 220 NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This 221 method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD. 222 Performance is guaranteed to be linear, with an initialization cost 223 of 2 * NEEDLE_LEN comparisons. 224 225 If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at 226 most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. 227 If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * 228 HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. */ 229 static RETURN_TYPE 230 two_way_short_needle (const unsigned char *haystack, size_t haystack_len, 231 const unsigned char *needle, size_t needle_len) 232 { 233 size_t i; /* Index into current byte of NEEDLE. */ 234 size_t j; /* Index into current window of HAYSTACK. */ 235 size_t period; /* The period of the right half of needle. */ 236 size_t suffix; /* The index of the right half of needle. */ 237 238 /* Factor the needle into two halves, such that the left half is 239 smaller than the global period, and the right half is 240 periodic (with a period as large as NEEDLE_LEN - suffix). */ 241 suffix = critical_factorization (needle, needle_len, &period); 242 243 /* Perform the search. Each iteration compares the right half 244 first. */ 245 if (CMP_FUNC (needle, needle + period, suffix) == 0) 246 { 247 /* Entire needle is periodic; a mismatch can only advance by the 248 period, so use memory to avoid rescanning known occurrences 249 of the period. */ 250 size_t memory = 0; 251 j = 0; 252 while (AVAILABLE (haystack, haystack_len, j, needle_len)) 253 { 254 /* Scan for matches in right half. */ 255 i = MAX (suffix, memory); 256 while (i < needle_len && (CANON_ELEMENT (needle[i]) 257 == CANON_ELEMENT (haystack[i + j]))) 258 ++i; 259 if (needle_len <= i) 260 { 261 /* Scan for matches in left half. */ 262 i = suffix - 1; 263 while (memory < i + 1 && (CANON_ELEMENT (needle[i]) 264 == CANON_ELEMENT (haystack[i + j]))) 265 --i; 266 if (i + 1 < memory + 1) 267 return (RETURN_TYPE) (haystack + j); 268 /* No match, so remember how many repetitions of period 269 on the right half were scanned. */ 270 j += period; 271 memory = needle_len - period; 272 } 273 else 274 { 275 j += i - suffix + 1; 276 memory = 0; 277 } 278 } 279 } 280 else 281 { 282 /* The two halves of needle are distinct; no extra memory is 283 required, and any mismatch results in a maximal shift. */ 284 period = MAX (suffix, needle_len - suffix) + 1; 285 j = 0; 286 while (AVAILABLE (haystack, haystack_len, j, needle_len)) 287 { 288 /* Scan for matches in right half. */ 289 i = suffix; 290 while (i < needle_len && (CANON_ELEMENT (needle[i]) 291 == CANON_ELEMENT (haystack[i + j]))) 292 ++i; 293 if (needle_len <= i) 294 { 295 /* Scan for matches in left half. */ 296 i = suffix - 1; 297 while (i != SIZE_MAX && (CANON_ELEMENT (needle[i]) 298 == CANON_ELEMENT (haystack[i + j]))) 299 --i; 300 if (i == SIZE_MAX) 301 return (RETURN_TYPE) (haystack + j); 302 j += period; 303 } 304 else 305 j += i - suffix + 1; 306 } 307 } 308 return NULL; 309 } 310 311 /* Return the first location of non-empty NEEDLE within HAYSTACK, or 312 NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This 313 method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN. 314 Performance is guaranteed to be linear, with an initialization cost 315 of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations. 316 317 If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at 318 most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, 319 and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible. 320 If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * 321 HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and 322 sublinear performance is not possible. */ 323 static RETURN_TYPE 324 two_way_long_needle (const unsigned char *haystack, size_t haystack_len, 325 const unsigned char *needle, size_t needle_len) 326 { 327 size_t i; /* Index into current byte of NEEDLE. */ 328 size_t j; /* Index into current window of HAYSTACK. */ 329 size_t period; /* The period of the right half of needle. */ 330 size_t suffix; /* The index of the right half of needle. */ 331 size_t shift_table[1U << CHAR_BIT]; /* See below. */ 332 333 /* Factor the needle into two halves, such that the left half is 334 smaller than the global period, and the right half is 335 periodic (with a period as large as NEEDLE_LEN - suffix). */ 336 suffix = critical_factorization (needle, needle_len, &period); 337 338 /* Populate shift_table. For each possible byte value c, 339 shift_table[c] is the distance from the last occurrence of c to 340 the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE. 341 shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0. */ 342 for (i = 0; i < 1U << CHAR_BIT; i++) 343 shift_table[i] = needle_len; 344 for (i = 0; i < needle_len; i++) 345 shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1; 346 347 /* Perform the search. Each iteration compares the right half 348 first. */ 349 if (CMP_FUNC (needle, needle + period, suffix) == 0) 350 { 351 /* Entire needle is periodic; a mismatch can only advance by the 352 period, so use memory to avoid rescanning known occurrences 353 of the period. */ 354 size_t memory = 0; 355 size_t shift; 356 j = 0; 357 while (AVAILABLE (haystack, haystack_len, j, needle_len)) 358 { 359 /* Check the last byte first; if it does not match, then 360 shift to the next possible match location. */ 361 shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])]; 362 if (0 < shift) 363 { 364 if (memory && shift < period) 365 { 366 /* Since needle is periodic, but the last period has 367 a byte out of place, there can be no match until 368 after the mismatch. */ 369 shift = needle_len - period; 370 } 371 memory = 0; 372 j += shift; 373 continue; 374 } 375 /* Scan for matches in right half. The last byte has 376 already been matched, by virtue of the shift table. */ 377 i = MAX (suffix, memory); 378 while (i < needle_len - 1 && (CANON_ELEMENT (needle[i]) 379 == CANON_ELEMENT (haystack[i + j]))) 380 ++i; 381 if (needle_len - 1 <= i) 382 { 383 /* Scan for matches in left half. */ 384 i = suffix - 1; 385 while (memory < i + 1 && (CANON_ELEMENT (needle[i]) 386 == CANON_ELEMENT (haystack[i + j]))) 387 --i; 388 if (i + 1 < memory + 1) 389 return (RETURN_TYPE) (haystack + j); 390 /* No match, so remember how many repetitions of period 391 on the right half were scanned. */ 392 j += period; 393 memory = needle_len - period; 394 } 395 else 396 { 397 j += i - suffix + 1; 398 memory = 0; 399 } 400 } 401 } 402 else 403 { 404 /* The two halves of needle are distinct; no extra memory is 405 required, and any mismatch results in a maximal shift. */ 406 size_t shift; 407 period = MAX (suffix, needle_len - suffix) + 1; 408 j = 0; 409 while (AVAILABLE (haystack, haystack_len, j, needle_len)) 410 { 411 /* Check the last byte first; if it does not match, then 412 shift to the next possible match location. */ 413 shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])]; 414 if (0 < shift) 415 { 416 j += shift; 417 continue; 418 } 419 /* Scan for matches in right half. The last byte has 420 already been matched, by virtue of the shift table. */ 421 i = suffix; 422 while (i < needle_len - 1 && (CANON_ELEMENT (needle[i]) 423 == CANON_ELEMENT (haystack[i + j]))) 424 ++i; 425 if (needle_len - 1 <= i) 426 { 427 /* Scan for matches in left half. */ 428 i = suffix - 1; 429 while (i != SIZE_MAX && (CANON_ELEMENT (needle[i]) 430 == CANON_ELEMENT (haystack[i + j]))) 431 --i; 432 if (i == SIZE_MAX) 433 return (RETURN_TYPE) (haystack + j); 434 j += period; 435 } 436 else 437 j += i - suffix + 1; 438 } 439 } 440 return NULL; 441 } 442 443 #undef AVAILABLE 444 #undef CANON_ELEMENT 445 #undef CMP_FUNC 446 #undef MAX 447 #undef RETURN_TYPE 448