1 #include <assert.h>
2
3 #include <libnu/defines.h>
4 #include <libnu/ducet.h>
5 #include <libnu/strcoll.h>
6 #include <libnu/strcoll_internal.h>
7
8 #if (defined NU_WITH_Z_COLLATION) || (defined NU_WITH_N_COLLATION)
9
_compound_weight(int32_t w,const char ** encoded,const char * limit,nu_read_iterator_t read,nu_compound_read_t com,const char ** tail,nu_codepoint_weight_t weight,void * context)10 int32_t _compound_weight(int32_t w,
11 const char **encoded, const char *limit,
12 nu_read_iterator_t read, nu_compound_read_t com,
13 const char **tail,
14 nu_codepoint_weight_t weight, void *context) {
15
16 const char *tailp = *tail;
17
18 const char *p = *encoded;
19 int32_t new_w = w;
20 int32_t consumed = 1; /* one codepoint was consumed at the top of the stack (_nu_strcoll) */
21
22 while (p < limit) {
23 uint32_t u = 0;
24
25 const char *np = com(p, limit, read, &u, &tailp);
26 new_w = weight(u, &w, context);
27
28 /* after this point, w might hold rollback value
29 * and new_w holds actual weight */
30
31 ++consumed;
32
33 if (new_w >= 0) {
34 /* if w == 0 or w == 1, then *p or *np is already pointing
35 * to needed place, otherwise re-read encoded in the forward
36 * direction preserving correctness of tail pointer */
37 if (w != 0 && w != 1) {
38 assert(consumed + w > 1);
39
40 np = *encoded;
41 tailp = *tail;
42
43 for (int32_t i = 0; i < consumed - w; ++i) {
44 np = com(np, limit, read, 0, &tailp);
45 }
46
47 w = 0;
48 }
49
50 *encoded = (w == 0 ? np : p);
51 *tail = tailp;
52
53 break;
54 }
55
56 p = np;
57 w = new_w;
58 }
59
60 if (new_w < 0) {
61 new_w = weight(0, &w, context);
62 }
63
64 assert(new_w >= 0);
65
66 return new_w;
67 }
68
69 inline
_nu_strcoll(const char * lhs,const char * lhs_limit,const char * rhs,const char * rhs_limit,nu_read_iterator_t it1,nu_read_iterator_t it2,nu_compound_read_t com1,nu_compound_read_t com2,nu_codepoint_weight_t weight,void * context,ssize_t * collated_left,ssize_t * collated_right)70 int _nu_strcoll(const char *lhs, const char *lhs_limit,
71 const char *rhs, const char *rhs_limit,
72 nu_read_iterator_t it1, nu_read_iterator_t it2,
73 nu_compound_read_t com1, nu_compound_read_t com2,
74 nu_codepoint_weight_t weight, void *context,
75 ssize_t *collated_left, ssize_t *collated_right) {
76
77 int cmp = 0;
78
79 const char *lp = lhs, *rp = rhs;
80 const char *ltailp = 0, *rtailp = 0;
81
82 uint32_t u1 = 0, u2 = 0;
83
84 while ((lp < lhs_limit && rp < rhs_limit)
85 || (ltailp != 0 && rp < rhs_limit)
86 || (rtailp != 0 && lp < lhs_limit)) {
87
88 lp = com1(lp, lhs_limit, it1, &u1, <ailp);
89 rp = com2(rp, rhs_limit, it2, &u2, &rtailp);
90
91 #ifdef NU_DISABLE_CONTRACTIONS
92 /* if contractions are disabled, then same codepoints
93 * will produce same weights and there is no need
94 * to weight each, i.e. weight(u1) == weight(u2) and
95 * collation may proceed to next codepoints */
96 if (u1 != u2) {
97 #endif
98 int32_t w1 = weight(u1, 0, context);
99 int32_t w2 = weight(u2, 0, context);
100
101 if (w1 < 0) {
102 w1 = _compound_weight(w1, &lp, lhs_limit,
103 it1, com1, <ailp,
104 weight, context);
105 }
106
107 if (w2 < 0) {
108 w2 = _compound_weight(w2, &rp, rhs_limit,
109 it2, com2, &rtailp,
110 weight, context);
111 }
112
113 assert(w1 >= 0);
114 assert(w2 >= 0);
115
116 if (w1 < w2) {
117 cmp = -1;
118 break;
119 }
120 else if (w1 > w2) {
121 cmp = 1;
122 break;
123 }
124
125 #ifdef NU_DISABLE_CONTRACTIONS
126 }
127 #endif
128
129 if (u1 == 0 || u2 == 0) {
130 break;
131 }
132 }
133
134 /* collated_left and collated_right should count
135 * number of successfully collated bytes, not taking
136 * into account limits. therefore if cmp != 0,
137 * number of collated bytes is decreased by (at least) 1
138 * and cmp is limits-fixed afterwards */
139
140 if (collated_left != 0) {
141 *collated_left = (lp - lhs) - (cmp == 0 ? 0 : 1);
142 }
143
144 if (collated_right != 0) {
145 *collated_right = (rp - rhs) - (cmp == 0 ? 0 : 1);
146 }
147
148 if (cmp == 0) {
149 if (rp < rhs_limit && lp >= lhs_limit) {
150 cmp = -1;
151 }
152 else if (lp < lhs_limit && rp >= rhs_limit) {
153 cmp = 1;
154 }
155 }
156
157 return cmp;
158 }
159
160 inline
_nu_strchr(const char * lhs,const char * lhs_limit,uint32_t c,nu_read_iterator_t read,nu_compound_read_t com,nu_casemapping_t casemap,nu_read_iterator_t casemap_read)161 const char* _nu_strchr(const char *lhs, const char *lhs_limit,
162 uint32_t c, nu_read_iterator_t read,
163 nu_compound_read_t com,
164 nu_casemapping_t casemap, nu_read_iterator_t casemap_read) {
165
166 const char *p = lhs;
167 const char *tail = 0;
168 uint32_t u = 0;
169
170 const char *rhs = 0;
171
172 if (casemap != 0) {
173 rhs = casemap(c);
174 if (rhs != 0) {
175 rhs = casemap_read(rhs, &c); /* read new lead codepoint */
176 }
177 }
178
179 while (p < lhs_limit) {
180 const char *np = com(p, lhs_limit, read, &u, &tail);
181
182 if (u == 0) {
183 break;
184 }
185
186 if (u == c) {
187 if (rhs == 0) {
188 return p;
189 }
190
191 /* rhs != 0 */
192
193 const char *rp = rhs;
194 uint32_t u2 = 0;
195
196 do {
197 rp = casemap_read(rp, &u2);
198
199 if (u2 == 0) {
200 return p; /* succ exit point */
201 }
202
203 if (np >= lhs_limit) {
204 return 0;
205 }
206
207 np = com(np, lhs_limit, read, &u, &tail);
208
209 if (u == 0) {
210 return 0;
211 }
212
213 if (u != u2) {
214 break;
215 }
216 }
217 while (u2 != 0);
218 }
219
220 p = np;
221 }
222
223 return 0;
224 }
225
226 inline
_nu_strrchr(const char * encoded,const char * limit,uint32_t c,nu_read_iterator_t read,nu_compound_read_t com,nu_casemapping_t casemap,nu_read_iterator_t casemap_read)227 const char* _nu_strrchr(const char *encoded, const char *limit,
228 uint32_t c, nu_read_iterator_t read,
229 nu_compound_read_t com,
230 nu_casemapping_t casemap, nu_read_iterator_t casemap_read) {
231
232 /* there is probably not much sense in finding string end by decoding it
233 * and then reverse read string again to find last codepoint, therefore
234 * this is a sequence of _nu_strchr() in forward direction
235 *
236 * please let me know if i'm wrong */
237
238 const char *p = encoded;
239 const char *last = 0;
240
241 while (p < limit) {
242 p = _nu_strchr(p, limit, c, read, com, casemap, casemap_read);
243
244 if (p == 0) {
245 return last;
246 }
247
248 last = p;
249 p = read(p, 0); /* skip one codepoint and continue */
250 }
251
252 return last;
253 }
254
255 inline
_nu_strstr(const char * haystack,const char * haystack_limit,const char * needle,const char * needle_limit,nu_read_iterator_t it1,nu_read_iterator_t it2,nu_compound_read_t com1,nu_compound_read_t com2,nu_casemapping_t casemap,nu_read_iterator_t casemap_read,nu_codepoint_weight_t weight,void * context)256 const char* _nu_strstr(const char *haystack, const char *haystack_limit,
257 const char *needle, const char *needle_limit,
258 nu_read_iterator_t it1, nu_read_iterator_t it2,
259 nu_compound_read_t com1, nu_compound_read_t com2,
260 nu_casemapping_t casemap, nu_read_iterator_t casemap_read,
261 nu_codepoint_weight_t weight, void *context) {
262
263 uint32_t n0 = 0;
264 if (needle_limit != needle) {
265 it2(needle, &n0);
266 }
267
268 if (needle_limit == needle || n0 == 0) {
269 return haystack;
270 }
271
272 ssize_t needle_len = (needle_limit != NU_UNLIMITED
273 ? (needle_limit - needle)
274 : nu_strbytelen(needle, it2));
275
276 const char *h0 = haystack;
277 do {
278 h0 = _nu_strchr(h0, haystack_limit,
279 n0, it1,
280 com1,
281 casemap, casemap_read);
282
283 if (h0 == 0) {
284 break;
285 }
286
287 ssize_t collated_left = 0, collated_right = 0;
288 _nu_strcoll(h0, haystack_limit, needle, needle_limit,
289 it1, it2,
290 com1, com2,
291 weight, context,
292 &collated_left, &collated_right);
293
294 /* it doesn't matter what collate result is
295 * if whole needle was successfully collated */
296 if (collated_right >= needle_len) {
297 return h0;
298 }
299
300 /* skip one codepoint in haystack */
301 if (h0 < haystack_limit) {
302 h0 = it1(h0, 0);
303 }
304 }
305 while (h0 != 0 && h0 < haystack_limit);
306
307 return 0;
308 }
309
310 #ifdef NU_WITH_Z_COLLATION
311
nu_strchr(const char * encoded,uint32_t c,nu_read_iterator_t read)312 const char* nu_strchr(const char *encoded, uint32_t c, nu_read_iterator_t read) {
313 return _nu_strchr(encoded, NU_UNLIMITED,
314 c, read,
315 nu_default_compound_read,
316 0, 0);
317 }
318
nu_strcasechr(const char * encoded,uint32_t c,nu_read_iterator_t read)319 const char* nu_strcasechr(const char *encoded, uint32_t c, nu_read_iterator_t read) {
320 return _nu_strchr(encoded, NU_UNLIMITED,
321 c, read,
322 nu_nocase_compound_read,
323 NU_FOLDING_FUNCTION, nu_casemap_read);
324 }
325
nu_strrchr(const char * encoded,uint32_t c,nu_read_iterator_t read)326 const char* nu_strrchr(const char *encoded, uint32_t c, nu_read_iterator_t read) {
327 return _nu_strrchr(encoded, NU_UNLIMITED,
328 c, read,
329 nu_default_compound_read,
330 0, 0);
331 }
332
nu_strrcasechr(const char * encoded,uint32_t c,nu_read_iterator_t read)333 const char* nu_strrcasechr(const char *encoded, uint32_t c, nu_read_iterator_t read) {
334 return _nu_strrchr(encoded, NU_UNLIMITED, c, read,
335 nu_nocase_compound_read,
336 NU_FOLDING_FUNCTION, nu_casemap_read);
337 }
338
nu_strcoll(const char * s1,const char * s2,nu_read_iterator_t s1_read,nu_read_iterator_t s2_read)339 int nu_strcoll(const char *s1, const char *s2,
340 nu_read_iterator_t s1_read, nu_read_iterator_t s2_read) {
341 return _nu_strcoll(s1, NU_UNLIMITED, s2, NU_UNLIMITED,
342 s1_read, s2_read,
343 nu_default_compound_read, nu_default_compound_read,
344 nu_ducet_weight, 0,
345 0, 0);
346 }
347
nu_strcasecoll(const char * s1,const char * s2,nu_read_iterator_t s1_read,nu_read_iterator_t s2_read)348 int nu_strcasecoll(const char *s1, const char *s2,
349 nu_read_iterator_t s1_read, nu_read_iterator_t s2_read) {
350 return _nu_strcoll(s1, NU_UNLIMITED, s2, NU_UNLIMITED,
351 s1_read, s2_read,
352 nu_nocase_compound_read, nu_nocase_compound_read,
353 nu_ducet_weight, 0,
354 0, 0);
355 }
356
nu_strstr(const char * haystack,const char * needle,nu_read_iterator_t haystack_read,nu_read_iterator_t needle_read)357 const char* nu_strstr(const char *haystack, const char *needle,
358 nu_read_iterator_t haystack_read, nu_read_iterator_t needle_read) {
359 return _nu_strstr(haystack, NU_UNLIMITED, needle, NU_UNLIMITED,
360 haystack_read, needle_read,
361 nu_default_compound_read, nu_default_compound_read,
362 0, 0,
363 nu_ducet_weight, 0);
364 }
365
nu_strcasestr(const char * haystack,const char * needle,nu_read_iterator_t haystack_read,nu_read_iterator_t needle_read)366 const char* nu_strcasestr(const char *haystack, const char *needle,
367 nu_read_iterator_t haystack_read, nu_read_iterator_t needle_read) {
368 return _nu_strstr(haystack, NU_UNLIMITED, needle, NU_UNLIMITED,
369 haystack_read, needle_read,
370 nu_nocase_compound_read, nu_nocase_compound_read,
371 NU_FOLDING_FUNCTION, nu_casemap_read,
372 nu_ducet_weight, 0);
373 }
374
375 #endif /* NU_WITH_Z_COLLATION */
376
377 #ifdef NU_WITH_N_COLLATION
378
nu_strnchr(const char * encoded,size_t max_len,uint32_t c,nu_read_iterator_t read)379 const char* nu_strnchr(const char *encoded, size_t max_len, uint32_t c, nu_read_iterator_t read) {
380 return _nu_strchr(encoded, encoded + max_len,
381 c, read,
382 nu_default_compound_read,
383 0, 0);
384 }
385
nu_strcasenchr(const char * encoded,size_t max_len,uint32_t c,nu_read_iterator_t read)386 const char* nu_strcasenchr(const char *encoded, size_t max_len, uint32_t c, nu_read_iterator_t read) {
387 return _nu_strchr(encoded, encoded + max_len,
388 c, read,
389 nu_nocase_compound_read,
390 NU_FOLDING_FUNCTION, nu_casemap_read);
391 }
392
nu_strrnchr(const char * encoded,size_t max_len,uint32_t c,nu_read_iterator_t read)393 const char* nu_strrnchr(const char *encoded, size_t max_len, uint32_t c, nu_read_iterator_t read) {
394 return _nu_strrchr(encoded, encoded + max_len,
395 c, read,
396 nu_default_compound_read,
397 0, 0);
398 }
399
nu_strrcasenchr(const char * encoded,size_t max_len,uint32_t c,nu_read_iterator_t read)400 const char* nu_strrcasenchr(const char *encoded, size_t max_len, uint32_t c,
401 nu_read_iterator_t read) {
402 return _nu_strrchr(encoded, encoded + max_len,
403 c, read,
404 nu_nocase_compound_read,
405 NU_FOLDING_FUNCTION, nu_casemap_read);
406 }
407
nu_strncoll(const char * s1,size_t s1_max_len,const char * s2,size_t s2_max_len,nu_read_iterator_t s1_read,nu_read_iterator_t s2_read)408 int nu_strncoll(const char *s1, size_t s1_max_len,
409 const char *s2, size_t s2_max_len,
410 nu_read_iterator_t s1_read, nu_read_iterator_t s2_read) {
411 return _nu_strcoll(s1, s1 + s1_max_len, s2, s2 + s2_max_len,
412 s1_read, s2_read,
413 nu_default_compound_read, nu_default_compound_read,
414 nu_ducet_weight, 0,
415 0, 0);
416 }
417
nu_strcasencoll(const char * s1,size_t s1_max_len,const char * s2,size_t s2_max_len,nu_read_iterator_t s1_read,nu_read_iterator_t s2_read)418 int nu_strcasencoll(const char *s1, size_t s1_max_len,
419 const char *s2, size_t s2_max_len,
420 nu_read_iterator_t s1_read, nu_read_iterator_t s2_read) {
421 return _nu_strcoll(s1, s1 + s1_max_len, s2, s2 + s2_max_len,
422 s1_read, s2_read,
423 nu_nocase_compound_read, nu_nocase_compound_read,
424 nu_ducet_weight, 0,
425 0, 0);
426 }
427
nu_strnstr(const char * haystack,size_t haystack_max_len,const char * needle,size_t needle_max_len,nu_read_iterator_t haystack_read,nu_read_iterator_t needle_read)428 const char* nu_strnstr(const char *haystack, size_t haystack_max_len,
429 const char *needle, size_t needle_max_len,
430 nu_read_iterator_t haystack_read, nu_read_iterator_t needle_read) {
431 return _nu_strstr(haystack, haystack + haystack_max_len,
432 needle, needle + needle_max_len,
433 haystack_read, needle_read,
434 nu_default_compound_read, nu_default_compound_read,
435 0, 0,
436 nu_ducet_weight, 0);
437 }
438
nu_strcasenstr(const char * haystack,size_t haystack_max_len,const char * needle,size_t needle_max_len,nu_read_iterator_t haystack_read,nu_read_iterator_t needle_read)439 const char* nu_strcasenstr(const char *haystack, size_t haystack_max_len,
440 const char *needle, size_t needle_max_len,
441 nu_read_iterator_t haystack_read, nu_read_iterator_t needle_read) {
442 return _nu_strstr(haystack, haystack + haystack_max_len,
443 needle, needle + needle_max_len,
444 haystack_read, needle_read,
445 nu_nocase_compound_read, nu_nocase_compound_read,
446 NU_FOLDING_FUNCTION, nu_casemap_read,
447 nu_ducet_weight, 0);
448 }
449
450 #endif /* NU_WITH_N_COLLATION */
451
452 #endif /* NU_WITH_Z_COLLATION || NU_WITH_N_COLLATION */
453