1 #include <assert.h>
2 
3 #include <libnu/defines.h>
4 #include <libnu/ducet.h>
5 #include <libnu/strcoll.h>
6 #include <libnu/strcoll_internal.h>
7 
8 #if (defined NU_WITH_Z_COLLATION) || (defined NU_WITH_N_COLLATION)
9 
_compound_weight(int32_t w,const char ** encoded,const char * limit,nu_read_iterator_t read,nu_compound_read_t com,const char ** tail,nu_codepoint_weight_t weight,void * context)10 int32_t _compound_weight(int32_t w,
11 	const char **encoded, const char *limit,
12 	nu_read_iterator_t read, nu_compound_read_t com,
13 	const char **tail,
14 	nu_codepoint_weight_t weight, void *context) {
15 
16 	const char *tailp = *tail;
17 
18 	const char *p = *encoded;
19 	int32_t new_w = w;
20 	int32_t consumed = 1; /* one codepoint was consumed at the top of the stack (_nu_strcoll) */
21 
22 	while (p < limit) {
23 		uint32_t u = 0;
24 
25 		const char *np = com(p, limit, read, &u, &tailp);
26 		new_w = weight(u, &w, context);
27 
28 		/* after this point, w might hold rollback value
29 		 * and new_w holds actual weight */
30 
31 		++consumed;
32 
33 		if (new_w >= 0) {
34 			/* if w == 0 or w == 1, then *p or *np is already pointing
35 			 * to needed place, otherwise re-read encoded in the forward
36 			 * direction preserving correctness of tail pointer */
37 			if (w != 0 && w != 1) {
38 				assert(consumed + w > 1);
39 
40 				np = *encoded;
41 				tailp = *tail;
42 
43 				for (int32_t i = 0; i < consumed - w; ++i) {
44 					np = com(np, limit, read, 0, &tailp);
45 				}
46 
47 				w = 0;
48 			}
49 
50 			*encoded = (w == 0 ? np : p);
51 			*tail = tailp;
52 
53 			break;
54 		}
55 
56 		p = np;
57 		w = new_w;
58 	}
59 
60 	if (new_w < 0) {
61 		new_w = weight(0, &w, context);
62 	}
63 
64 	assert(new_w >= 0);
65 
66 	return new_w;
67 }
68 
69 inline
_nu_strcoll(const char * lhs,const char * lhs_limit,const char * rhs,const char * rhs_limit,nu_read_iterator_t it1,nu_read_iterator_t it2,nu_compound_read_t com1,nu_compound_read_t com2,nu_codepoint_weight_t weight,void * context,ssize_t * collated_left,ssize_t * collated_right)70 int _nu_strcoll(const char *lhs, const char *lhs_limit,
71 	const char *rhs, const char *rhs_limit,
72 	nu_read_iterator_t it1, nu_read_iterator_t it2,
73 	nu_compound_read_t com1, nu_compound_read_t com2,
74 	nu_codepoint_weight_t weight, void *context,
75 	ssize_t *collated_left, ssize_t *collated_right) {
76 
77 	int cmp = 0;
78 
79 	const char *lp = lhs, *rp = rhs;
80 	const char *ltailp = 0, *rtailp = 0;
81 
82 	uint32_t u1 = 0, u2 = 0;
83 
84 	while ((lp < lhs_limit && rp < rhs_limit)
85 		|| (ltailp != 0 && rp < rhs_limit)
86 		|| (rtailp != 0 && lp < lhs_limit)) {
87 
88 		lp = com1(lp, lhs_limit, it1, &u1, &ltailp);
89 		rp = com2(rp, rhs_limit, it2, &u2, &rtailp);
90 
91 #ifdef NU_DISABLE_CONTRACTIONS
92 		/* if contractions are disabled, then same codepoints
93 		 * will produce same weights and there is no need
94 		 * to weight each, i.e. weight(u1) == weight(u2) and
95 		 * collation may proceed to next codepoints */
96 		if (u1 != u2) {
97 #endif
98 			int32_t w1 = weight(u1, 0, context);
99 			int32_t w2 = weight(u2, 0, context);
100 
101 			if (w1 < 0) {
102 				w1 = _compound_weight(w1, &lp, lhs_limit,
103 					it1, com1, &ltailp,
104 					weight, context);
105 			}
106 
107 			if (w2 < 0) {
108 				w2 = _compound_weight(w2, &rp, rhs_limit,
109 					it2, com2, &rtailp,
110 					weight, context);
111 			}
112 
113 			assert(w1 >= 0);
114 			assert(w2 >= 0);
115 
116 			if (w1 < w2) {
117 				cmp = -1;
118 				break;
119 			}
120 			else if (w1 > w2) {
121 				cmp = 1;
122 				break;
123 			}
124 
125 #ifdef NU_DISABLE_CONTRACTIONS
126 		}
127 #endif
128 
129 		if (u1 == 0 || u2 == 0) {
130 			break;
131 		}
132 	}
133 
134 	/* collated_left and collated_right should count
135 	 * number of successfully collated bytes, not taking
136 	 * into account limits. therefore if cmp != 0,
137 	 * number of collated bytes is decreased by (at least) 1
138 	 * and cmp is limits-fixed afterwards */
139 
140 	if (collated_left != 0) {
141 		*collated_left = (lp - lhs) - (cmp == 0 ? 0 : 1);
142 	}
143 
144 	if (collated_right != 0) {
145 		*collated_right = (rp - rhs) - (cmp == 0 ? 0 : 1);
146 	}
147 
148 	if (cmp == 0) {
149 		if (rp < rhs_limit && lp >= lhs_limit) {
150 			cmp = -1;
151 		}
152 		else if (lp < lhs_limit && rp >= rhs_limit) {
153 			cmp = 1;
154 		}
155 	}
156 
157 	return cmp;
158 }
159 
160 inline
_nu_strchr(const char * lhs,const char * lhs_limit,uint32_t c,nu_read_iterator_t read,nu_compound_read_t com,nu_casemapping_t casemap,nu_read_iterator_t casemap_read)161 const char* _nu_strchr(const char *lhs, const char *lhs_limit,
162 	uint32_t c, nu_read_iterator_t read,
163 	nu_compound_read_t com,
164 	nu_casemapping_t casemap, nu_read_iterator_t casemap_read) {
165 
166 	const char *p = lhs;
167 	const char *tail = 0;
168 	uint32_t u = 0;
169 
170 	const char *rhs = 0;
171 
172 	if (casemap != 0) {
173 		rhs = casemap(c);
174 		if (rhs != 0) {
175 			rhs = casemap_read(rhs, &c); /* read new lead codepoint */
176 		}
177 	}
178 
179 	while (p < lhs_limit) {
180 		const char *np = com(p, lhs_limit, read, &u, &tail);
181 
182 		if (u == 0) {
183 			break;
184 		}
185 
186 		if (u == c) {
187 			if (rhs == 0) {
188 				return p;
189 			}
190 
191 			/* rhs != 0 */
192 
193 			const char *rp = rhs;
194 			uint32_t u2 = 0;
195 
196 			do {
197 				rp = casemap_read(rp, &u2);
198 
199 				if (u2 == 0) {
200 					return p; /* succ exit point */
201 				}
202 
203 				if (np >= lhs_limit) {
204 					return 0;
205 				}
206 
207 				np = com(np, lhs_limit, read, &u, &tail);
208 
209 				if (u == 0) {
210 					return 0;
211 				}
212 
213 				if (u != u2) {
214 					break;
215 				}
216 			}
217 			while (u2 != 0);
218 		}
219 
220 		p = np;
221 	}
222 
223 	return 0;
224 }
225 
226 inline
_nu_strrchr(const char * encoded,const char * limit,uint32_t c,nu_read_iterator_t read,nu_compound_read_t com,nu_casemapping_t casemap,nu_read_iterator_t casemap_read)227 const char* _nu_strrchr(const char *encoded, const char *limit,
228 	uint32_t c, nu_read_iterator_t read,
229 	nu_compound_read_t com,
230 	nu_casemapping_t casemap, nu_read_iterator_t casemap_read) {
231 
232 	/* there is probably not much sense in finding string end by decoding it
233 	 * and then reverse read string again to find last codepoint, therefore
234 	 * this is a sequence of _nu_strchr() in forward direction
235 	 *
236 	 * please let me know if i'm wrong */
237 
238 	const char *p = encoded;
239 	const char *last = 0;
240 
241 	while (p < limit) {
242 		p = _nu_strchr(p, limit, c, read, com, casemap, casemap_read);
243 
244 		if (p == 0) {
245 			return last;
246 		}
247 
248 		last = p;
249 		p = read(p, 0); /* skip one codepoint and continue */
250 	}
251 
252 	return last;
253 }
254 
255 inline
_nu_strstr(const char * haystack,const char * haystack_limit,const char * needle,const char * needle_limit,nu_read_iterator_t it1,nu_read_iterator_t it2,nu_compound_read_t com1,nu_compound_read_t com2,nu_casemapping_t casemap,nu_read_iterator_t casemap_read,nu_codepoint_weight_t weight,void * context)256 const char* _nu_strstr(const char *haystack, const char *haystack_limit,
257 	const char *needle, const char *needle_limit,
258 	nu_read_iterator_t it1, nu_read_iterator_t it2,
259 	nu_compound_read_t com1, nu_compound_read_t com2,
260 	nu_casemapping_t casemap, nu_read_iterator_t casemap_read,
261 	nu_codepoint_weight_t weight, void *context) {
262 
263 	uint32_t n0 = 0;
264 	if (needle_limit != needle) {
265 		it2(needle, &n0);
266 	}
267 
268 	if (needle_limit == needle || n0 == 0) {
269 		return haystack;
270 	}
271 
272 	ssize_t needle_len = (needle_limit != NU_UNLIMITED
273 		? (needle_limit - needle)
274 		: nu_strbytelen(needle, it2));
275 
276 	const char *h0 = haystack;
277 	do {
278 		h0 = _nu_strchr(h0, haystack_limit,
279 			n0, it1,
280 			com1,
281 			casemap, casemap_read);
282 
283 		if (h0 == 0) {
284 			break;
285 		}
286 
287 		ssize_t collated_left = 0, collated_right = 0;
288 		_nu_strcoll(h0, haystack_limit, needle, needle_limit,
289 			it1, it2,
290 			com1, com2,
291 			weight, context,
292 			&collated_left, &collated_right);
293 
294 		/* it doesn't matter what collate result is
295 		 * if whole needle was successfully collated */
296 		if (collated_right >= needle_len) {
297 			return h0;
298 		}
299 
300 		/* skip one codepoint in haystack */
301 		if (h0 < haystack_limit) {
302 			h0 = it1(h0, 0);
303 		}
304 	}
305 	while (h0 != 0 && h0 < haystack_limit);
306 
307 	return 0;
308 }
309 
310 #ifdef NU_WITH_Z_COLLATION
311 
nu_strchr(const char * encoded,uint32_t c,nu_read_iterator_t read)312 const char* nu_strchr(const char *encoded, uint32_t c, nu_read_iterator_t read) {
313 	return _nu_strchr(encoded, NU_UNLIMITED,
314 		c, read,
315 		nu_default_compound_read,
316 		0, 0);
317 }
318 
nu_strcasechr(const char * encoded,uint32_t c,nu_read_iterator_t read)319 const char* nu_strcasechr(const char *encoded, uint32_t c, nu_read_iterator_t read) {
320 	return _nu_strchr(encoded, NU_UNLIMITED,
321 		c, read,
322 		nu_nocase_compound_read,
323 		NU_FOLDING_FUNCTION, nu_casemap_read);
324 }
325 
nu_strrchr(const char * encoded,uint32_t c,nu_read_iterator_t read)326 const char* nu_strrchr(const char *encoded, uint32_t c, nu_read_iterator_t read) {
327 	return _nu_strrchr(encoded, NU_UNLIMITED,
328 		c, read,
329 		nu_default_compound_read,
330 		0, 0);
331 }
332 
nu_strrcasechr(const char * encoded,uint32_t c,nu_read_iterator_t read)333 const char* nu_strrcasechr(const char *encoded, uint32_t c, nu_read_iterator_t read) {
334 	return _nu_strrchr(encoded, NU_UNLIMITED, c, read,
335 		nu_nocase_compound_read,
336 		NU_FOLDING_FUNCTION, nu_casemap_read);
337 }
338 
nu_strcoll(const char * s1,const char * s2,nu_read_iterator_t s1_read,nu_read_iterator_t s2_read)339 int nu_strcoll(const char *s1, const char *s2,
340 	nu_read_iterator_t s1_read, nu_read_iterator_t s2_read) {
341 	return _nu_strcoll(s1, NU_UNLIMITED, s2, NU_UNLIMITED,
342 		s1_read, s2_read,
343 		nu_default_compound_read, nu_default_compound_read,
344 		nu_ducet_weight, 0,
345 		0, 0);
346 }
347 
nu_strcasecoll(const char * s1,const char * s2,nu_read_iterator_t s1_read,nu_read_iterator_t s2_read)348 int nu_strcasecoll(const char *s1, const char *s2,
349 	nu_read_iterator_t s1_read, nu_read_iterator_t s2_read) {
350 	return _nu_strcoll(s1, NU_UNLIMITED, s2, NU_UNLIMITED,
351 		s1_read, s2_read,
352 		nu_nocase_compound_read, nu_nocase_compound_read,
353 		nu_ducet_weight, 0,
354 		0, 0);
355 }
356 
nu_strstr(const char * haystack,const char * needle,nu_read_iterator_t haystack_read,nu_read_iterator_t needle_read)357 const char* nu_strstr(const char *haystack, const char *needle,
358 	nu_read_iterator_t haystack_read, nu_read_iterator_t needle_read) {
359 	return _nu_strstr(haystack, NU_UNLIMITED, needle, NU_UNLIMITED,
360 		haystack_read, needle_read,
361 		nu_default_compound_read, nu_default_compound_read,
362 		0, 0,
363 		nu_ducet_weight, 0);
364 }
365 
nu_strcasestr(const char * haystack,const char * needle,nu_read_iterator_t haystack_read,nu_read_iterator_t needle_read)366 const char* nu_strcasestr(const char *haystack, const char *needle,
367 	nu_read_iterator_t haystack_read, nu_read_iterator_t needle_read) {
368 	return _nu_strstr(haystack, NU_UNLIMITED, needle, NU_UNLIMITED,
369 		haystack_read, needle_read,
370 		nu_nocase_compound_read, nu_nocase_compound_read,
371 		NU_FOLDING_FUNCTION, nu_casemap_read,
372 		nu_ducet_weight, 0);
373 }
374 
375 #endif /* NU_WITH_Z_COLLATION */
376 
377 #ifdef NU_WITH_N_COLLATION
378 
nu_strnchr(const char * encoded,size_t max_len,uint32_t c,nu_read_iterator_t read)379 const char* nu_strnchr(const char *encoded, size_t max_len, uint32_t c, nu_read_iterator_t read) {
380 	return _nu_strchr(encoded, encoded + max_len,
381 		c, read,
382 		nu_default_compound_read,
383 		0, 0);
384 }
385 
nu_strcasenchr(const char * encoded,size_t max_len,uint32_t c,nu_read_iterator_t read)386 const char* nu_strcasenchr(const char *encoded, size_t max_len, uint32_t c, nu_read_iterator_t read) {
387 	return _nu_strchr(encoded, encoded + max_len,
388 		c, read,
389 		nu_nocase_compound_read,
390 		NU_FOLDING_FUNCTION, nu_casemap_read);
391 }
392 
nu_strrnchr(const char * encoded,size_t max_len,uint32_t c,nu_read_iterator_t read)393 const char* nu_strrnchr(const char *encoded, size_t max_len, uint32_t c, nu_read_iterator_t read) {
394 	return _nu_strrchr(encoded, encoded + max_len,
395 		c, read,
396 		nu_default_compound_read,
397 		0, 0);
398 }
399 
nu_strrcasenchr(const char * encoded,size_t max_len,uint32_t c,nu_read_iterator_t read)400 const char* nu_strrcasenchr(const char *encoded, size_t max_len, uint32_t c,
401 	nu_read_iterator_t read) {
402 	return _nu_strrchr(encoded, encoded + max_len,
403 		c, read,
404 		nu_nocase_compound_read,
405 		NU_FOLDING_FUNCTION, nu_casemap_read);
406 }
407 
nu_strncoll(const char * s1,size_t s1_max_len,const char * s2,size_t s2_max_len,nu_read_iterator_t s1_read,nu_read_iterator_t s2_read)408 int nu_strncoll(const char *s1, size_t s1_max_len,
409 	const char *s2, size_t s2_max_len,
410 	nu_read_iterator_t s1_read, nu_read_iterator_t s2_read) {
411 	return _nu_strcoll(s1, s1 + s1_max_len, s2, s2 + s2_max_len,
412 		s1_read, s2_read,
413 		nu_default_compound_read, nu_default_compound_read,
414 		nu_ducet_weight, 0,
415 		0, 0);
416 }
417 
nu_strcasencoll(const char * s1,size_t s1_max_len,const char * s2,size_t s2_max_len,nu_read_iterator_t s1_read,nu_read_iterator_t s2_read)418 int nu_strcasencoll(const char *s1, size_t s1_max_len,
419 	const char *s2, size_t s2_max_len,
420 	nu_read_iterator_t s1_read, nu_read_iterator_t s2_read) {
421 	return _nu_strcoll(s1, s1 + s1_max_len, s2, s2 + s2_max_len,
422 		s1_read, s2_read,
423 		nu_nocase_compound_read, nu_nocase_compound_read,
424 		nu_ducet_weight, 0,
425 		0, 0);
426 }
427 
nu_strnstr(const char * haystack,size_t haystack_max_len,const char * needle,size_t needle_max_len,nu_read_iterator_t haystack_read,nu_read_iterator_t needle_read)428 const char* nu_strnstr(const char *haystack, size_t haystack_max_len,
429 	const char *needle, size_t needle_max_len,
430 	nu_read_iterator_t haystack_read, nu_read_iterator_t needle_read) {
431 	return _nu_strstr(haystack,  haystack + haystack_max_len,
432 		needle, needle + needle_max_len,
433 		haystack_read, needle_read,
434 		nu_default_compound_read, nu_default_compound_read,
435 		0, 0,
436 		nu_ducet_weight, 0);
437 }
438 
nu_strcasenstr(const char * haystack,size_t haystack_max_len,const char * needle,size_t needle_max_len,nu_read_iterator_t haystack_read,nu_read_iterator_t needle_read)439 const char* nu_strcasenstr(const char *haystack, size_t haystack_max_len,
440 	const char *needle, size_t needle_max_len,
441 	nu_read_iterator_t haystack_read, nu_read_iterator_t needle_read) {
442 	return _nu_strstr(haystack,  haystack + haystack_max_len,
443 		needle, needle + needle_max_len,
444 		haystack_read, needle_read,
445 		nu_nocase_compound_read, nu_nocase_compound_read,
446 		NU_FOLDING_FUNCTION, nu_casemap_read,
447 		nu_ducet_weight, 0);
448 }
449 
450 #endif /* NU_WITH_N_COLLATION */
451 
452 #endif /* NU_WITH_Z_COLLATION || NU_WITH_N_COLLATION */
453