1 /*
2 * Copyright (C) 2009 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <assert.h>
18 #include <math.h>
19 #include <stdio.h>
20 #include <string.h>
21 #include "../include/lpicache.h"
22 #include "../include/matrixsearch.h"
23 #include "../include/mystdlib.h"
24 #include "../include/ngram.h"
25 #include "../include/userdict.h"
26
27 namespace ime_pinyin {
28
29 #define PRUMING_SCORE 8000.0
30
MatrixSearch()31 MatrixSearch::MatrixSearch() {
32 inited_ = false;
33 spl_trie_ = SpellingTrie::get_cpinstance();
34
35 reset_pointers_to_null();
36
37 pys_decoded_len_ = 0;
38 mtrx_nd_pool_used_ = 0;
39 dmi_pool_used_ = 0;
40 xi_an_enabled_ = false;
41 dmi_c_phrase_ = false;
42
43 assert(kMaxSearchSteps > 0);
44 max_sps_len_ = kMaxSearchSteps - 1;
45 max_hzs_len_ = kMaxSearchSteps;
46 }
47
~MatrixSearch()48 MatrixSearch::~MatrixSearch() {
49 free_resource();
50 }
51
reset_pointers_to_null()52 void MatrixSearch::reset_pointers_to_null() {
53 dict_trie_ = NULL;
54 user_dict_ = NULL;
55 spl_parser_ = NULL;
56
57 share_buf_ = NULL;
58
59 // The following four buffers are used for decoding, and they are based on
60 // share_buf_, no need to delete them.
61 mtrx_nd_pool_ = NULL;
62 dmi_pool_ = NULL;
63 matrix_ = NULL;
64 dep_ = NULL;
65
66 // Based on share_buf_, no need to delete them.
67 npre_items_ = NULL;
68 }
69
alloc_resource()70 bool MatrixSearch::alloc_resource() {
71 free_resource();
72
73 dict_trie_ = new DictTrie();
74 user_dict_ = static_cast<AtomDictBase*>(new UserDict());
75 spl_parser_ = new SpellingParser();
76
77 size_t mtrx_nd_size = sizeof(MatrixNode) * kMtrxNdPoolSize;
78 mtrx_nd_size = align_to_size_t(mtrx_nd_size) / sizeof(size_t);
79 size_t dmi_size = sizeof(DictMatchInfo) * kDmiPoolSize;
80 dmi_size = align_to_size_t(dmi_size) / sizeof(size_t);
81 size_t matrix_size = sizeof(MatrixRow) * kMaxRowNum;
82 matrix_size = align_to_size_t(matrix_size) / sizeof(size_t);
83 size_t dep_size = sizeof(DictExtPara);
84 dep_size = align_to_size_t(dep_size) / sizeof(size_t);
85
86 // share_buf's size is determined by the buffers for search.
87 share_buf_ = new size_t[mtrx_nd_size + dmi_size + matrix_size + dep_size];
88
89 if (NULL == dict_trie_ || NULL == user_dict_ || NULL == spl_parser_ ||
90 NULL == share_buf_)
91 return false;
92
93 // The buffers for search are based on the share buffer
94 mtrx_nd_pool_ = reinterpret_cast<MatrixNode*>(share_buf_);
95 dmi_pool_ = reinterpret_cast<DictMatchInfo*>(share_buf_ + mtrx_nd_size);
96 matrix_ = reinterpret_cast<MatrixRow*>(share_buf_ + mtrx_nd_size + dmi_size);
97 dep_ = reinterpret_cast<DictExtPara*>
98 (share_buf_ + mtrx_nd_size + dmi_size + matrix_size);
99
100 // The prediction buffer is also based on the share buffer.
101 npre_items_ = reinterpret_cast<NPredictItem*>(share_buf_);
102 npre_items_len_ = (mtrx_nd_size + dmi_size + matrix_size + dep_size) *
103 sizeof(size_t) / sizeof(NPredictItem);
104 return true;
105 }
106
free_resource()107 void MatrixSearch::free_resource() {
108 if (NULL != dict_trie_)
109 delete dict_trie_;
110
111 if (NULL != user_dict_)
112 delete user_dict_;
113
114 if (NULL != spl_parser_)
115 delete spl_parser_;
116
117 if (NULL != share_buf_)
118 delete [] share_buf_;
119
120 reset_pointers_to_null();
121 }
122
init(const char * fn_sys_dict,const char * fn_usr_dict)123 bool MatrixSearch::init(const char *fn_sys_dict, const char *fn_usr_dict) {
124 if (NULL == fn_sys_dict || NULL == fn_usr_dict)
125 return false;
126
127 if (!alloc_resource())
128 return false;
129
130 if (!dict_trie_->load_dict(fn_sys_dict, 1, kSysDictIdEnd))
131 return false;
132
133 // If engine fails to load the user dictionary, reset the user dictionary
134 // to NULL.
135 if (!user_dict_->load_dict(fn_usr_dict, kUserDictIdStart, kUserDictIdEnd)) {
136 delete user_dict_;
137 user_dict_ = NULL;
138 } else{
139 user_dict_->set_total_lemma_count_of_others(NGram::kSysDictTotalFreq);
140 }
141
142 reset_search0();
143
144 inited_ = true;
145 return true;
146 }
147
init_fd(int sys_fd,long start_offset,long length,const char * fn_usr_dict)148 bool MatrixSearch::init_fd(int sys_fd, long start_offset, long length,
149 const char *fn_usr_dict) {
150 if (NULL == fn_usr_dict)
151 return false;
152
153 if (!alloc_resource())
154 return false;
155
156 if (!dict_trie_->load_dict_fd(sys_fd, start_offset, length, 1, kSysDictIdEnd))
157 return false;
158
159 if (!user_dict_->load_dict(fn_usr_dict, kUserDictIdStart, kUserDictIdEnd)) {
160 delete user_dict_;
161 user_dict_ = NULL;
162 } else {
163 user_dict_->set_total_lemma_count_of_others(NGram::kSysDictTotalFreq);
164 }
165
166 reset_search0();
167
168 inited_ = true;
169 return true;
170 }
171
init_user_dictionary(const char * fn_usr_dict)172 void MatrixSearch::init_user_dictionary(const char *fn_usr_dict) {
173 assert(inited_);
174
175 if (NULL != user_dict_) {
176 delete user_dict_;
177 user_dict_ = NULL;
178 }
179
180 if (NULL != fn_usr_dict) {
181 user_dict_ = static_cast<AtomDictBase*>(new UserDict());
182 if (!user_dict_->load_dict(fn_usr_dict, kUserDictIdStart, kUserDictIdEnd)) {
183 delete user_dict_;
184 user_dict_ = NULL;
185 }
186 }
187
188 reset_search0();
189 }
190
is_user_dictionary_enabled() const191 bool MatrixSearch::is_user_dictionary_enabled() const {
192 return NULL != user_dict_;
193 }
194
set_max_lens(size_t max_sps_len,size_t max_hzs_len)195 void MatrixSearch::set_max_lens(size_t max_sps_len, size_t max_hzs_len) {
196 if (0 != max_sps_len)
197 max_sps_len_ = max_sps_len;
198 if (0 != max_hzs_len)
199 max_hzs_len_ = max_hzs_len;
200 }
201
close()202 void MatrixSearch::close() {
203 flush_cache();
204 free_resource();
205 inited_ = false;
206 }
207
flush_cache()208 void MatrixSearch::flush_cache() {
209 if (NULL != user_dict_)
210 user_dict_->flush_cache();
211 }
212
set_xi_an_switch(bool xi_an_enabled)213 void MatrixSearch::set_xi_an_switch(bool xi_an_enabled) {
214 xi_an_enabled_ = xi_an_enabled;
215 }
216
get_xi_an_switch()217 bool MatrixSearch::get_xi_an_switch() {
218 return xi_an_enabled_;
219 }
220
reset_search()221 bool MatrixSearch::reset_search() {
222 if (!inited_)
223 return false;
224 return reset_search0();
225 }
226
reset_search0()227 bool MatrixSearch::reset_search0() {
228 if (!inited_)
229 return false;
230
231 pys_decoded_len_ = 0;
232 mtrx_nd_pool_used_ = 0;
233 dmi_pool_used_ = 0;
234
235 // Get a MatrixNode from the pool
236 matrix_[0].mtrx_nd_pos = mtrx_nd_pool_used_;
237 matrix_[0].mtrx_nd_num = 1;
238 mtrx_nd_pool_used_ += 1;
239
240 // Update the node, and make it to be a starting node
241 MatrixNode *node = mtrx_nd_pool_ + matrix_[0].mtrx_nd_pos;
242 node->id = 0;
243 node->score = 0;
244 node->from = NULL;
245 node->step = 0;
246 node->dmi_fr = (PoolPosType)-1;
247
248 matrix_[0].dmi_pos = 0;
249 matrix_[0].dmi_num = 0;
250 matrix_[0].dmi_has_full_id = 1;
251 matrix_[0].mtrx_nd_fixed = node;
252
253 lma_start_[0] = 0;
254 fixed_lmas_ = 0;
255 spl_start_[0] = 0;
256 fixed_hzs_ = 0;
257
258 dict_trie_->reset_milestones(0, 0);
259 if (NULL != user_dict_)
260 user_dict_->reset_milestones(0, 0);
261
262 return true;
263 }
264
reset_search(size_t ch_pos,bool clear_fixed_this_step,bool clear_dmi_this_step,bool clear_mtrx_this_step)265 bool MatrixSearch::reset_search(size_t ch_pos, bool clear_fixed_this_step,
266 bool clear_dmi_this_step,
267 bool clear_mtrx_this_step) {
268 if (!inited_ || ch_pos > pys_decoded_len_ || ch_pos >= kMaxRowNum)
269 return false;
270
271 if (0 == ch_pos) {
272 reset_search0();
273 } else {
274 // Prepare mile stones of this step to clear.
275 MileStoneHandle *dict_handles_to_clear = NULL;
276 if (clear_dmi_this_step && matrix_[ch_pos].dmi_num > 0) {
277 dict_handles_to_clear = dmi_pool_[matrix_[ch_pos].dmi_pos].dict_handles;
278 }
279
280 // If there are more steps, and this step is not allowed to clear, find
281 // milestones of next step.
282 if (pys_decoded_len_ > ch_pos && !clear_dmi_this_step) {
283 dict_handles_to_clear = NULL;
284 if (matrix_[ch_pos + 1].dmi_num > 0) {
285 dict_handles_to_clear =
286 dmi_pool_[matrix_[ch_pos + 1].dmi_pos].dict_handles;
287 }
288 }
289
290 if (NULL != dict_handles_to_clear) {
291 dict_trie_->reset_milestones(ch_pos, dict_handles_to_clear[0]);
292 if (NULL != user_dict_)
293 user_dict_->reset_milestones(ch_pos, dict_handles_to_clear[1]);
294 }
295
296 pys_decoded_len_ = ch_pos;
297
298 if (clear_dmi_this_step) {
299 dmi_pool_used_ = matrix_[ch_pos - 1].dmi_pos
300 + matrix_[ch_pos - 1].dmi_num;
301 matrix_[ch_pos].dmi_num = 0;
302 } else {
303 dmi_pool_used_ = matrix_[ch_pos].dmi_pos + matrix_[ch_pos].dmi_num;
304 }
305
306 if (clear_mtrx_this_step) {
307 mtrx_nd_pool_used_ = matrix_[ch_pos - 1].mtrx_nd_pos
308 + matrix_[ch_pos - 1].mtrx_nd_num;
309 matrix_[ch_pos].mtrx_nd_num = 0;
310 } else {
311 mtrx_nd_pool_used_ = matrix_[ch_pos].mtrx_nd_pos
312 + matrix_[ch_pos].mtrx_nd_num;
313 }
314
315 // Modify fixed_hzs_
316 if (fixed_hzs_ > 0 &&
317 ((kLemmaIdComposing != lma_id_[0]) ||
318 (kLemmaIdComposing == lma_id_[0] &&
319 spl_start_[c_phrase_.length] <= ch_pos))) {
320 size_t fixed_ch_pos = ch_pos;
321 if (clear_fixed_this_step)
322 fixed_ch_pos = fixed_ch_pos > 0 ? fixed_ch_pos - 1 : 0;
323 while (NULL == matrix_[fixed_ch_pos].mtrx_nd_fixed && fixed_ch_pos > 0)
324 fixed_ch_pos--;
325
326 fixed_lmas_ = 0;
327 fixed_hzs_ = 0;
328 if (fixed_ch_pos > 0) {
329 while (spl_start_[fixed_hzs_] < fixed_ch_pos)
330 fixed_hzs_++;
331 assert(spl_start_[fixed_hzs_] == fixed_ch_pos);
332
333 while (lma_start_[fixed_lmas_] < fixed_hzs_)
334 fixed_lmas_++;
335 assert(lma_start_[fixed_lmas_] == fixed_hzs_);
336 }
337
338 // Re-search the Pinyin string for the unlocked lemma
339 // which was previously fixed.
340 //
341 // Prepare mile stones of this step to clear.
342 MileStoneHandle *dict_handles_to_clear = NULL;
343 if (clear_dmi_this_step && ch_pos == fixed_ch_pos &&
344 matrix_[fixed_ch_pos].dmi_num > 0) {
345 dict_handles_to_clear = dmi_pool_[matrix_[fixed_ch_pos].dmi_pos].dict_handles;
346 }
347
348 // If there are more steps, and this step is not allowed to clear, find
349 // milestones of next step.
350 if (pys_decoded_len_ > fixed_ch_pos && !clear_dmi_this_step) {
351 dict_handles_to_clear = NULL;
352 if (matrix_[fixed_ch_pos + 1].dmi_num > 0) {
353 dict_handles_to_clear =
354 dmi_pool_[matrix_[fixed_ch_pos + 1].dmi_pos].dict_handles;
355 }
356 }
357
358 if (NULL != dict_handles_to_clear) {
359 dict_trie_->reset_milestones(fixed_ch_pos, dict_handles_to_clear[0]);
360 if (NULL != user_dict_)
361 user_dict_->reset_milestones(fixed_ch_pos, dict_handles_to_clear[1]);
362 }
363
364
365 pys_decoded_len_ = fixed_ch_pos;
366
367 if (clear_dmi_this_step && ch_pos == fixed_ch_pos) {
368 dmi_pool_used_ = matrix_[fixed_ch_pos - 1].dmi_pos
369 + matrix_[fixed_ch_pos - 1].dmi_num;
370 matrix_[fixed_ch_pos].dmi_num = 0;
371 } else {
372 dmi_pool_used_ = matrix_[fixed_ch_pos].dmi_pos +
373 matrix_[fixed_ch_pos].dmi_num;
374 }
375
376 if (clear_mtrx_this_step && ch_pos == fixed_ch_pos) {
377 mtrx_nd_pool_used_ = matrix_[fixed_ch_pos - 1].mtrx_nd_pos
378 + matrix_[fixed_ch_pos - 1].mtrx_nd_num;
379 matrix_[fixed_ch_pos].mtrx_nd_num = 0;
380 } else {
381 mtrx_nd_pool_used_ = matrix_[fixed_ch_pos].mtrx_nd_pos
382 + matrix_[fixed_ch_pos].mtrx_nd_num;
383 }
384
385 for (uint16 re_pos = fixed_ch_pos; re_pos < ch_pos; re_pos++) {
386 add_char(pys_[re_pos]);
387 }
388 } else if (fixed_hzs_ > 0 && kLemmaIdComposing == lma_id_[0]) {
389 for (uint16 subpos = 0; subpos < c_phrase_.sublma_num; subpos++) {
390 uint16 splpos_begin = c_phrase_.sublma_start[subpos];
391 uint16 splpos_end = c_phrase_.sublma_start[subpos + 1];
392 for (uint16 splpos = splpos_begin; splpos < splpos_end; splpos++) {
393 // If ch_pos is in this spelling
394 uint16 spl_start = c_phrase_.spl_start[splpos];
395 uint16 spl_end = c_phrase_.spl_start[splpos + 1];
396 if (ch_pos >= spl_start && ch_pos < spl_end) {
397 // Clear everything after this position
398 c_phrase_.chn_str[splpos] = static_cast<char16>('\0');
399 c_phrase_.sublma_start[subpos + 1] = splpos;
400 c_phrase_.sublma_num = subpos + 1;
401 c_phrase_.length = splpos;
402
403 if (splpos == splpos_begin) {
404 c_phrase_.sublma_num = subpos;
405 }
406 }
407 }
408 }
409
410 // Extend the composing phrase.
411 reset_search0();
412 dmi_c_phrase_ = true;
413 uint16 c_py_pos = 0;
414 while (c_py_pos < spl_start_[c_phrase_.length]) {
415 bool b_ac_tmp = add_char(pys_[c_py_pos]);
416 assert(b_ac_tmp);
417 c_py_pos++;
418 }
419 dmi_c_phrase_ = false;
420
421 lma_id_num_ = 1;
422 fixed_lmas_ = 1;
423 fixed_lmas_no1_[0] = 0; // A composing string is always modified.
424 fixed_hzs_ = c_phrase_.length;
425 lma_start_[1] = fixed_hzs_;
426 lma_id_[0] = kLemmaIdComposing;
427 matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ +
428 matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos;
429 }
430 }
431
432 return true;
433 }
434
del_in_pys(size_t start,size_t len)435 void MatrixSearch::del_in_pys(size_t start, size_t len) {
436 while (start < kMaxRowNum - len && '\0' != pys_[start]) {
437 pys_[start] = pys_[start + len];
438 start++;
439 }
440 }
441
search(const char * py,size_t py_len)442 size_t MatrixSearch::search(const char *py, size_t py_len) {
443 if (!inited_ || NULL == py)
444 return 0;
445
446 // If the search Pinyin string is too long, it will be truncated.
447 if (py_len > kMaxRowNum - 1)
448 py_len = kMaxRowNum - 1;
449
450 // Compare the new string with the previous one. Find their prefix to
451 // increase search efficiency.
452 size_t ch_pos = 0;
453 for (ch_pos = 0; ch_pos < pys_decoded_len_; ch_pos++) {
454 if ('\0' == py[ch_pos] || py[ch_pos] != pys_[ch_pos])
455 break;
456 }
457
458 bool clear_fix = true;
459 if (ch_pos == pys_decoded_len_)
460 clear_fix = false;
461
462 reset_search(ch_pos, clear_fix, false, false);
463
464 memcpy(pys_ + ch_pos, py + ch_pos, py_len - ch_pos);
465 pys_[py_len] = '\0';
466
467 while ('\0' != pys_[ch_pos]) {
468 if (!add_char(py[ch_pos])) {
469 pys_decoded_len_ = ch_pos;
470 break;
471 }
472 ch_pos++;
473 }
474
475 // Get spelling ids and starting positions.
476 get_spl_start_id();
477
478 // If there are too many spellings, remove the last letter until the spelling
479 // number is acceptable.
480 while (spl_id_num_ > 9) {
481 py_len--;
482 reset_search(py_len, false, false, false);
483 pys_[py_len] = '\0';
484 get_spl_start_id();
485 }
486
487 prepare_candidates();
488
489 if (kPrintDebug0) {
490 printf("--Matrix Node Pool Used: %d\n", mtrx_nd_pool_used_);
491 printf("--DMI Pool Used: %d\n", dmi_pool_used_);
492
493 if (kPrintDebug1) {
494 for (PoolPosType pos = 0; pos < dmi_pool_used_; pos++) {
495 debug_print_dmi(pos, 1);
496 }
497 }
498 }
499
500 return ch_pos;
501 }
502
delsearch(size_t pos,bool is_pos_in_splid,bool clear_fixed_this_step)503 size_t MatrixSearch::delsearch(size_t pos, bool is_pos_in_splid,
504 bool clear_fixed_this_step) {
505 if (!inited_)
506 return 0;
507
508 size_t reset_pos = pos;
509
510 // Out of range for both Pinyin mode and Spelling id mode.
511 if (pys_decoded_len_ <= pos) {
512 del_in_pys(pos, 1);
513
514 reset_pos = pys_decoded_len_;
515 // Decode the string after the un-decoded position
516 while ('\0' != pys_[reset_pos]) {
517 if (!add_char(pys_[reset_pos])) {
518 pys_decoded_len_ = reset_pos;
519 break;
520 }
521 reset_pos++;
522 }
523 get_spl_start_id();
524 prepare_candidates();
525 return pys_decoded_len_;
526 }
527
528 // Spelling id mode, but out of range.
529 if (is_pos_in_splid && spl_id_num_ <= pos)
530 return pys_decoded_len_;
531
532 // Begin to handle two modes respectively.
533 // Pinyin mode by default
534 size_t c_py_len = 0; // The length of composing phrase's Pinyin
535 size_t del_py_len = 1;
536 if (!is_pos_in_splid) {
537 // Pinyin mode is only allowed to delete beyond the fixed lemmas.
538 if (fixed_lmas_ > 0 && pos < spl_start_[lma_start_[fixed_lmas_]])
539 return pys_decoded_len_;
540
541 del_in_pys(pos, 1);
542
543 // If the deleted character is just the one after the last fixed lemma
544 if (pos == spl_start_[lma_start_[fixed_lmas_]]) {
545 // If all fixed lemmas have been merged, and the caller of the function
546 // request to unlock the last fixed lemma.
547 if (kLemmaIdComposing == lma_id_[0] && clear_fixed_this_step) {
548 // Unlock the last sub lemma in the composing phrase. Because it is not
549 // easy to unlock it directly. Instead, we re-decode the modified
550 // composing phrase.
551 c_phrase_.sublma_num--;
552 c_phrase_.length = c_phrase_.sublma_start[c_phrase_.sublma_num];
553 reset_pos = spl_start_[c_phrase_.length];
554 c_py_len = reset_pos;
555 }
556 }
557 } else {
558 del_py_len = spl_start_[pos + 1] - spl_start_[pos];
559
560 del_in_pys(spl_start_[pos], del_py_len);
561
562 if (pos >= lma_start_[fixed_lmas_]) {
563 c_py_len = 0;
564 reset_pos = spl_start_[pos + 1] - del_py_len;
565 } else {
566 c_py_len = spl_start_[lma_start_[fixed_lmas_]] - del_py_len;
567 reset_pos = c_py_len;
568 if (c_py_len > 0)
569 merge_fixed_lmas(pos);
570 }
571 }
572
573 if (c_py_len > 0) {
574 assert(c_phrase_.length > 0 && c_py_len ==
575 c_phrase_.spl_start[c_phrase_.sublma_start[c_phrase_.sublma_num]]);
576 // The composing phrase is valid, reset all search space,
577 // and begin a new search which will only extend the composing
578 // phrase.
579 reset_search0();
580
581 dmi_c_phrase_ = true;
582 // Extend the composing phrase.
583 uint16 c_py_pos = 0;
584 while (c_py_pos < c_py_len) {
585 bool b_ac_tmp = add_char(pys_[c_py_pos]);
586 assert(b_ac_tmp);
587 c_py_pos++;
588 }
589 dmi_c_phrase_ = false;
590
591 // Fixd the composing phrase as the first choice.
592 lma_id_num_ = 1;
593 fixed_lmas_ = 1;
594 fixed_lmas_no1_[0] = 0; // A composing string is always modified.
595 fixed_hzs_ = c_phrase_.length;
596 lma_start_[1] = fixed_hzs_;
597 lma_id_[0] = kLemmaIdComposing;
598 matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ +
599 matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos;
600 } else {
601 // Reseting search only clear pys_decoded_len_, but the string is kept.
602 reset_search(reset_pos, clear_fixed_this_step, false, false);
603 }
604
605 // Decode the string after the delete position.
606 while ('\0' != pys_[reset_pos]) {
607 if (!add_char(pys_[reset_pos])) {
608 pys_decoded_len_ = reset_pos;
609 break;
610 }
611 reset_pos++;
612 }
613
614 get_spl_start_id();
615 prepare_candidates();
616 return pys_decoded_len_;
617 }
618
get_candidate_num()619 size_t MatrixSearch::get_candidate_num() {
620 if (!inited_ || 0 == pys_decoded_len_ ||
621 0 == matrix_[pys_decoded_len_].mtrx_nd_num)
622 return 0;
623
624 return 1 + lpi_total_;
625 }
626
get_candidate(size_t cand_id,char16 * cand_str,size_t max_len)627 char16* MatrixSearch::get_candidate(size_t cand_id, char16 *cand_str,
628 size_t max_len) {
629 if (!inited_ || 0 == pys_decoded_len_ || NULL == cand_str)
630 return NULL;
631
632 if (0 == cand_id) {
633 return get_candidate0(cand_str, max_len, NULL, false);
634 } else {
635 cand_id--;
636 }
637
638 // For this case: the current sentence is a word only, and the user fixed it,
639 // so the result will be fixed to the sentence space, and
640 // lpi_total_ will be set to 0.
641 if (0 == lpi_total_) {
642 return get_candidate0(cand_str, max_len, NULL, false);
643 }
644
645 LemmaIdType id = lpi_items_[cand_id].id;
646 char16 s[kMaxLemmaSize + 1];
647
648 uint16 s_len = lpi_items_[cand_id].lma_len;
649 if (s_len > 1) {
650 s_len = get_lemma_str(id, s, kMaxLemmaSize + 1);
651 } else {
652 // For a single character, Hanzi is ready.
653 s[0] = lpi_items_[cand_id].hanzi;
654 s[1] = static_cast<char16>(0);
655 }
656
657 if (s_len > 0 && max_len > s_len) {
658 utf16_strncpy(cand_str, s, s_len);
659 cand_str[s_len] = (char16)'\0';
660 return cand_str;
661 }
662
663 return NULL;
664 }
665
update_dict_freq()666 void MatrixSearch::update_dict_freq() {
667 if (NULL != user_dict_) {
668 // Update the total frequency of all lemmas, including system lemmas and
669 // user dictionary lemmas.
670 size_t total_freq = user_dict_->get_total_lemma_count();
671 dict_trie_->set_total_lemma_count_of_others(total_freq);
672 }
673 }
674
add_lma_to_userdict(uint16 lma_fr,uint16 lma_to,float score)675 bool MatrixSearch::add_lma_to_userdict(uint16 lma_fr, uint16 lma_to,
676 float score) {
677 if (lma_to - lma_fr <= 1 || NULL == user_dict_)
678 return false;
679
680 char16 word_str[kMaxLemmaSize + 1];
681 uint16 spl_ids[kMaxLemmaSize];
682
683 uint16 spl_id_fr = 0;
684
685 for (uint16 pos = lma_fr; pos < lma_to; pos++) {
686 LemmaIdType lma_id = lma_id_[pos];
687 if (is_user_lemma(lma_id)) {
688 user_dict_->update_lemma(lma_id, 1, true);
689 }
690 uint16 lma_len = lma_start_[pos + 1] - lma_start_[pos];
691 utf16_strncpy(spl_ids + spl_id_fr, spl_id_ + lma_start_[pos], lma_len);
692
693 uint16 tmp = get_lemma_str(lma_id, word_str + spl_id_fr,
694 kMaxLemmaSize + 1 - spl_id_fr);
695 assert(tmp == lma_len);
696
697 tmp = get_lemma_splids(lma_id, spl_ids + spl_id_fr, lma_len, true);
698 if (tmp != lma_len) {
699 return false;
700 }
701
702 spl_id_fr += lma_len;
703 }
704
705 assert(spl_id_fr <= kMaxLemmaSize);
706
707 return user_dict_->put_lemma(static_cast<char16*>(word_str), spl_ids,
708 spl_id_fr, 1);
709 }
710
debug_print_dmi(PoolPosType dmi_pos,uint16 nest_level)711 void MatrixSearch::debug_print_dmi(PoolPosType dmi_pos, uint16 nest_level) {
712 if (dmi_pos >= dmi_pool_used_) return;
713
714 DictMatchInfo *dmi = dmi_pool_ + dmi_pos;
715
716 if (1 == nest_level) {
717 printf("-----------------%d\'th DMI node begin----------->\n", dmi_pos);
718 }
719 if (dmi->dict_level > 1) {
720 debug_print_dmi(dmi->dmi_fr, nest_level + 1);
721 }
722 printf("---%d\n", dmi->dict_level);
723 printf(" MileStone: %x, %x\n", dmi->dict_handles[0], dmi->dict_handles[1]);
724 printf(" Spelling : %s, %d\n", SpellingTrie::get_instance().
725 get_spelling_str(dmi->spl_id), dmi->spl_id);
726 printf(" Total Pinyin Len: %d\n", dmi->splstr_len);
727 if (1 == nest_level) {
728 printf("<----------------%d\'th DMI node end--------------\n\n", dmi_pos);
729 }
730 }
731
try_add_cand0_to_userdict()732 bool MatrixSearch::try_add_cand0_to_userdict() {
733 size_t new_cand_num = get_candidate_num();
734 if (fixed_hzs_ > 0 && 1 == new_cand_num) {
735 float score_from = 0;
736 uint16 lma_id_from = 0;
737 uint16 pos = 0;
738 bool modified = false;
739 while (pos < fixed_lmas_) {
740 if (lma_start_[pos + 1] - lma_start_[lma_id_from] >
741 static_cast<uint16>(kMaxLemmaSize)) {
742 float score_to_add =
743 mtrx_nd_pool_[matrix_[spl_start_[lma_start_[pos]]]
744 .mtrx_nd_pos].score - score_from;
745 if (modified) {
746 score_to_add += 1.0;
747 if (score_to_add > NGram::kMaxScore) {
748 score_to_add = NGram::kMaxScore;
749 }
750 add_lma_to_userdict(lma_id_from, pos, score_to_add);
751 }
752 lma_id_from = pos;
753 score_from += score_to_add;
754
755 // Clear the flag for next user lemma.
756 modified = false;
757 }
758
759 if (0 == fixed_lmas_no1_[pos]) {
760 modified = true;
761 }
762 pos++;
763 }
764
765 // Single-char word is not allowed to add to userdict.
766 if (lma_start_[pos] - lma_start_[lma_id_from] > 1) {
767 float score_to_add =
768 mtrx_nd_pool_[matrix_[spl_start_[lma_start_[pos]]]
769 .mtrx_nd_pos].score - score_from;
770 if (modified) {
771 score_to_add += 1.0;
772 if (score_to_add > NGram::kMaxScore) {
773 score_to_add = NGram::kMaxScore;
774 }
775 add_lma_to_userdict(lma_id_from, pos, score_to_add);
776 }
777 }
778 }
779 return true;
780 }
781
782 // Choose a candidate, and give new candidates for next step.
783 // If user finishes selection, we will try to communicate with user dictionary
784 // to add new items or update score of some existing items.
785 //
786 // Basic rule:
787 // 1. If user selects the first choice:
788 // 1.1. If the first choice is not a sentence, instead, it is a lemma:
789 // 1.1.1. If the first choice is a user lemma, notify the user
790 // dictionary that a user lemma is hit, and add occuring count
791 // by 1.
792 // 1.1.2. If the first choice is a system lemma, do nothing.
793 // 1.2. If the first choice is a sentence containing more than one lemma:
794 // 1.2.1. The whole sentence will be added as a user lemma. If the
795 // sentence contains user lemmas, -> hit, and add occuring count
796 // by 1.
choose(size_t cand_id)797 size_t MatrixSearch::choose(size_t cand_id) {
798 if (!inited_ || 0 == pys_decoded_len_)
799 return 0;
800
801 if (0 == cand_id) {
802 fixed_hzs_ = spl_id_num_;
803 matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ +
804 matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos;
805 for (size_t pos = fixed_lmas_; pos < lma_id_num_; pos++) {
806 fixed_lmas_no1_[pos] = 1;
807 }
808 fixed_lmas_ = lma_id_num_;
809 lpi_total_ = 0; // Clean all other candidates.
810
811 // 1. It is the first choice
812 if (1 == lma_id_num_) {
813 // 1.1. The first choice is not a sentence but a lemma
814 if (is_user_lemma(lma_id_[0])) {
815 // 1.1.1. The first choice is a user lemma, notify the user dictionary
816 // that it is hit.
817 if (NULL != user_dict_)
818 user_dict_->update_lemma(lma_id_[0], 1, true);
819 } else {
820 // 1.1.2. do thing for a system lemma.
821 }
822 } else {
823 // 1.2. The first choice is a sentence.
824 // 1.2.1 Try to add the whole sentence to user dictionary, the whole
825 // sentence may be splitted into many items.
826 if (NULL != user_dict_) {
827 try_add_cand0_to_userdict();
828 }
829 }
830 update_dict_freq();
831 return 1;
832 } else {
833 cand_id--;
834 }
835
836 // 2. It is not the full sentence candidate.
837 // Find the length of the candidate.
838 LemmaIdType id_chosen = lpi_items_[cand_id].id;
839 LmaScoreType score_chosen = lpi_items_[cand_id].psb;
840 size_t cand_len = lpi_items_[cand_id].lma_len;
841
842 assert(cand_len > 0);
843
844 // Notify the atom dictionary that this item is hit.
845 if (is_user_lemma(id_chosen)) {
846 if (NULL != user_dict_) {
847 user_dict_->update_lemma(id_chosen, 1, true);
848 }
849 update_dict_freq();
850 }
851
852 // 3. Fixed the chosen item.
853 // 3.1 Get the steps number.
854 size_t step_fr = spl_start_[fixed_hzs_];
855 size_t step_to = spl_start_[fixed_hzs_ + cand_len];
856
857 // 3.2 Save the length of the original string.
858 size_t pys_decoded_len = pys_decoded_len_;
859
860 // 3.2 Reset the space of the fixed part.
861 reset_search(step_to, false, false, true);
862
863 // 3.3 For the last character of the fixed part, the previous DMI
864 // information will be kept, while the MTRX information will be re-extended,
865 // and only one node will be extended.
866 matrix_[step_to].mtrx_nd_num = 0;
867
868 LmaPsbItem lpi_item;
869 lpi_item.psb = score_chosen;
870 lpi_item.id = id_chosen;
871
872 PoolPosType step_to_dmi_fr = match_dmi(step_to,
873 spl_id_ + fixed_hzs_, cand_len);
874 //assert(step_to_dmi_fr != static_cast<PoolPosType>(-1));
875
876 extend_mtrx_nd(matrix_[step_fr].mtrx_nd_fixed, &lpi_item, 1,
877 step_to_dmi_fr, step_to);
878
879 matrix_[step_to].mtrx_nd_fixed = mtrx_nd_pool_ + matrix_[step_to].mtrx_nd_pos;
880 mtrx_nd_pool_used_ = matrix_[step_to].mtrx_nd_pos +
881 matrix_[step_to].mtrx_nd_num;
882
883 if (id_chosen == lma_id_[fixed_lmas_])
884 fixed_lmas_no1_[fixed_lmas_] = 1;
885 else
886 fixed_lmas_no1_[fixed_lmas_] = 0;
887 lma_id_[fixed_lmas_] = id_chosen;
888 lma_start_[fixed_lmas_ + 1] = lma_start_[fixed_lmas_] + cand_len;
889 fixed_lmas_++;
890 fixed_hzs_ = fixed_hzs_ + cand_len;
891
892 while (step_to != pys_decoded_len) {
893 bool b = add_char(pys_[step_to]);
894 assert(b);
895 step_to++;
896 }
897
898 if (fixed_hzs_ < spl_id_num_) {
899 prepare_candidates();
900 } else {
901 lpi_total_ = 0;
902 if (NULL != user_dict_) {
903 try_add_cand0_to_userdict();
904 }
905 }
906
907 return get_candidate_num();
908 }
909
cancel_last_choice()910 size_t MatrixSearch::cancel_last_choice() {
911 if (!inited_ || 0 == pys_decoded_len_)
912 return 0;
913
914 size_t step_start = 0;
915 if (fixed_hzs_ > 0) {
916 size_t step_end = spl_start_[fixed_hzs_];
917 MatrixNode *end_node = matrix_[step_end].mtrx_nd_fixed;
918 assert(NULL != end_node);
919
920 step_start = end_node->from->step;
921
922 if (step_start > 0) {
923 DictMatchInfo *dmi = dmi_pool_ + end_node->dmi_fr;
924 fixed_hzs_ -= dmi->dict_level;
925 } else {
926 fixed_hzs_ = 0;
927 }
928
929 reset_search(step_start, false, false, false);
930
931 while (pys_[step_start] != '\0') {
932 bool b = add_char(pys_[step_start]);
933 assert(b);
934 step_start++;
935 }
936
937 prepare_candidates();
938 }
939 return get_candidate_num();
940 }
941
get_fixedlen()942 size_t MatrixSearch::get_fixedlen() {
943 if (!inited_ || 0 == pys_decoded_len_)
944 return 0;
945 return fixed_hzs_;
946 }
947
prepare_add_char(char ch)948 bool MatrixSearch::prepare_add_char(char ch) {
949 if (pys_decoded_len_ >= kMaxRowNum - 1 ||
950 (!spl_parser_->is_valid_to_parse(ch) && ch != '\''))
951 return false;
952
953 if (dmi_pool_used_ >= kDmiPoolSize) return false;
954
955 pys_[pys_decoded_len_] = ch;
956 pys_decoded_len_++;
957
958 MatrixRow *mtrx_this_row = matrix_ + pys_decoded_len_;
959 mtrx_this_row->mtrx_nd_pos = mtrx_nd_pool_used_;
960 mtrx_this_row->mtrx_nd_num = 0;
961 mtrx_this_row->dmi_pos = dmi_pool_used_;
962 mtrx_this_row->dmi_num = 0;
963 mtrx_this_row->dmi_has_full_id = 0;
964
965 return true;
966 }
967
is_split_at(uint16 pos)968 bool MatrixSearch::is_split_at(uint16 pos) {
969 return !spl_parser_->is_valid_to_parse(pys_[pos - 1]);
970 }
971
fill_dmi(DictMatchInfo * dmi,MileStoneHandle * handles,PoolPosType dmi_fr,uint16 spl_id,uint16 node_num,unsigned char dict_level,bool splid_end_split,unsigned char splstr_len,unsigned char all_full_id)972 void MatrixSearch::fill_dmi(DictMatchInfo *dmi, MileStoneHandle *handles,
973 PoolPosType dmi_fr, uint16 spl_id,
974 uint16 node_num, unsigned char dict_level,
975 bool splid_end_split, unsigned char splstr_len,
976 unsigned char all_full_id) {
977 dmi->dict_handles[0] = handles[0];
978 dmi->dict_handles[1] = handles[1];
979 dmi->dmi_fr = dmi_fr;
980 dmi->spl_id = spl_id;
981 dmi->dict_level = dict_level;
982 dmi->splid_end_split = splid_end_split ? 1 : 0;
983 dmi->splstr_len = splstr_len;
984 dmi->all_full_id = all_full_id;
985 dmi->c_phrase = 0;
986 }
987
add_char(char ch)988 bool MatrixSearch::add_char(char ch) {
989 if (!prepare_add_char(ch))
990 return false;
991 return add_char_qwerty();
992 }
993
add_char_qwerty()994 bool MatrixSearch::add_char_qwerty() {
995 matrix_[pys_decoded_len_].mtrx_nd_num = 0;
996
997 bool spl_matched = false;
998 uint16 longest_ext = 0;
999 // Extend the search matrix, from the oldest unfixed row. ext_len means
1000 // extending length.
1001 for (uint16 ext_len = kMaxPinyinSize + 1; ext_len > 0; ext_len--) {
1002 if (ext_len > pys_decoded_len_ - spl_start_[fixed_hzs_])
1003 continue;
1004
1005 // Refer to the declaration of the variable dmi_has_full_id for the
1006 // explanation of this piece of code. In one word, it is used to prevent
1007 // from the unwise extending of "shoud ou" but allow the reasonable
1008 // extending of "heng ao", "lang a", etc.
1009 if (ext_len > 1 && 0 != longest_ext &&
1010 0 == matrix_[pys_decoded_len_ - ext_len].dmi_has_full_id) {
1011 if (xi_an_enabled_)
1012 continue;
1013 else
1014 break;
1015 }
1016
1017 uint16 oldrow = pys_decoded_len_ - ext_len;
1018
1019 // 0. If that row is before the last fixed step, ignore.
1020 if (spl_start_[fixed_hzs_] > oldrow)
1021 continue;
1022
1023 // 1. Check if that old row has valid MatrixNode. If no, means that row is
1024 // not a boundary, either a word boundary or a spelling boundary.
1025 // If it is for extending composing phrase, it's OK to ignore the 0.
1026 if (0 == matrix_[oldrow].mtrx_nd_num && !dmi_c_phrase_)
1027 continue;
1028
1029 // 2. Get spelling id(s) for the last ext_len chars.
1030 uint16 spl_idx;
1031 bool is_pre = false;
1032 spl_idx = spl_parser_->get_splid_by_str(pys_ + oldrow,
1033 ext_len, &is_pre);
1034 if (is_pre)
1035 spl_matched = true;
1036
1037 if (0 == spl_idx)
1038 continue;
1039
1040 bool splid_end_split = is_split_at(oldrow + ext_len);
1041
1042 // 3. Extend the DMI nodes of that old row
1043 // + 1 is to extend an extra node from the root
1044 for (PoolPosType dmi_pos = matrix_[oldrow].dmi_pos;
1045 dmi_pos < matrix_[oldrow].dmi_pos + matrix_[oldrow].dmi_num + 1;
1046 dmi_pos++) {
1047 DictMatchInfo *dmi = dmi_pool_ + dmi_pos;
1048 if (dmi_pos == matrix_[oldrow].dmi_pos + matrix_[oldrow].dmi_num) {
1049 dmi = NULL; // The last one, NULL means extending from the root.
1050 } else {
1051 // If the dmi is covered by the fixed arrange, ignore it.
1052 if (fixed_hzs_ > 0 &&
1053 pys_decoded_len_ - ext_len - dmi->splstr_len <
1054 spl_start_[fixed_hzs_]) {
1055 continue;
1056 }
1057 // If it is not in mode for composing phrase, and the source DMI node
1058 // is marked for composing phrase, ignore this node.
1059 if (dmi->c_phrase != 0 && !dmi_c_phrase_) {
1060 continue;
1061 }
1062 }
1063
1064 // For example, if "gao" is extended, "g ao" is not allowed.
1065 // or "zh" has been passed, "z h" is not allowed.
1066 // Both word and word-connection will be prevented.
1067 if (longest_ext > ext_len) {
1068 if (NULL == dmi && 0 == matrix_[oldrow].dmi_has_full_id) {
1069 continue;
1070 }
1071
1072 // "z h" is not allowed.
1073 if (NULL != dmi && spl_trie_->is_half_id(dmi->spl_id)) {
1074 continue;
1075 }
1076 }
1077
1078 dep_->splids_extended = 0;
1079 if (NULL != dmi) {
1080 uint16 prev_ids_num = dmi->dict_level;
1081 if ((!dmi_c_phrase_ && prev_ids_num >= kMaxLemmaSize) ||
1082 (dmi_c_phrase_ && prev_ids_num >= kMaxRowNum)) {
1083 continue;
1084 }
1085
1086 DictMatchInfo *d = dmi;
1087 while (d) {
1088 dep_->splids[--prev_ids_num] = d->spl_id;
1089 if ((PoolPosType)-1 == d->dmi_fr)
1090 break;
1091 d = dmi_pool_ + d->dmi_fr;
1092 }
1093 assert(0 == prev_ids_num);
1094 dep_->splids_extended = dmi->dict_level;
1095 }
1096 dep_->splids[dep_->splids_extended] = spl_idx;
1097 dep_->ext_len = ext_len;
1098 dep_->splid_end_split = splid_end_split;
1099
1100 dep_->id_num = 1;
1101 dep_->id_start = spl_idx;
1102 if (spl_trie_->is_half_id(spl_idx)) {
1103 // Get the full id list
1104 dep_->id_num = spl_trie_->half_to_full(spl_idx, &(dep_->id_start));
1105 assert(dep_->id_num > 0);
1106 }
1107
1108 uint16 new_dmi_num;
1109
1110 new_dmi_num = extend_dmi(dep_, dmi);
1111
1112 if (new_dmi_num > 0) {
1113 if (dmi_c_phrase_) {
1114 dmi_pool_[dmi_pool_used_].c_phrase = 1;
1115 }
1116 matrix_[pys_decoded_len_].dmi_num += new_dmi_num;
1117 dmi_pool_used_ += new_dmi_num;
1118
1119 if (!spl_trie_->is_half_id(spl_idx))
1120 matrix_[pys_decoded_len_].dmi_has_full_id = 1;
1121 }
1122
1123 // If get candiate lemmas, try to extend the path
1124 if (lpi_total_ > 0) {
1125 uint16 fr_row;
1126 if (NULL == dmi) {
1127 fr_row = oldrow;
1128 } else {
1129 assert(oldrow >= dmi->splstr_len);
1130 fr_row = oldrow - dmi->splstr_len;
1131 }
1132 for (PoolPosType mtrx_nd_pos = matrix_[fr_row].mtrx_nd_pos;
1133 mtrx_nd_pos < matrix_[fr_row].mtrx_nd_pos +
1134 matrix_[fr_row].mtrx_nd_num;
1135 mtrx_nd_pos++) {
1136 MatrixNode *mtrx_nd = mtrx_nd_pool_ + mtrx_nd_pos;
1137
1138 extend_mtrx_nd(mtrx_nd, lpi_items_, lpi_total_,
1139 dmi_pool_used_ - new_dmi_num, pys_decoded_len_);
1140 if (longest_ext == 0)
1141 longest_ext = ext_len;
1142 }
1143 }
1144 } // for dmi_pos
1145 } // for ext_len
1146 mtrx_nd_pool_used_ += matrix_[pys_decoded_len_].mtrx_nd_num;
1147
1148 if (dmi_c_phrase_)
1149 return true;
1150
1151 return (matrix_[pys_decoded_len_].mtrx_nd_num != 0 || spl_matched);
1152 }
1153
prepare_candidates()1154 void MatrixSearch::prepare_candidates() {
1155 // Get candiates from the first un-fixed step.
1156 uint16 lma_size_max = kMaxLemmaSize;
1157 if (lma_size_max > spl_id_num_ - fixed_hzs_)
1158 lma_size_max = spl_id_num_ - fixed_hzs_;
1159
1160 uint16 lma_size = lma_size_max;
1161
1162 // If the full sentense candidate's unfixed part may be the same with a normal
1163 // lemma. Remove the lemma candidate in this case.
1164 char16 fullsent[kMaxLemmaSize + 1];
1165 char16 *pfullsent = NULL;
1166 uint16 sent_len;
1167 pfullsent = get_candidate0(fullsent, kMaxLemmaSize + 1, &sent_len, true);
1168
1169 // If the unfixed part contains more than one ids, it is not necessary to
1170 // check whether a lemma's string is the same to the unfixed part of the full
1171 // sentence candidate, so, set it to NULL;
1172 if (sent_len > kMaxLemmaSize)
1173 pfullsent = NULL;
1174
1175 lpi_total_ = 0;
1176 size_t lpi_num_full_match = 0; // Number of items which are fully-matched.
1177 while (lma_size > 0) {
1178 size_t lma_num;
1179 lma_num = get_lpis(spl_id_ + fixed_hzs_, lma_size,
1180 lpi_items_ + lpi_total_,
1181 size_t(kMaxLmaPsbItems - lpi_total_),
1182 pfullsent, lma_size == lma_size_max);
1183
1184 if (lma_num > 0) {
1185 lpi_total_ += lma_num;
1186 // For next lemma candidates which are not the longest, it is not
1187 // necessary to compare with the full sentence candiate.
1188 pfullsent = NULL;
1189 }
1190 if (lma_size == lma_size_max) {
1191 lpi_num_full_match = lpi_total_;
1192 }
1193 lma_size--;
1194 }
1195
1196 // Sort those partially-matched items by their unified scores.
1197 myqsort(lpi_items_ + lpi_num_full_match, lpi_total_ - lpi_num_full_match,
1198 sizeof(LmaPsbItem), cmp_lpi_with_unified_psb);
1199
1200 if (kPrintDebug0) {
1201 printf("-----Prepare candidates, score:\n");
1202 for (size_t a = 0; a < lpi_total_; a++) {
1203 printf("[%03d]%d ", a, lpi_items_[a].psb);
1204 if ((a + 1) % 6 == 0) printf("\n");
1205 }
1206 printf("\n");
1207 }
1208
1209 if (kPrintDebug0) {
1210 printf("--- lpi_total_ = %d\n", lpi_total_);
1211 }
1212 }
1213
get_pystr(size_t * decoded_len)1214 const char* MatrixSearch::get_pystr(size_t *decoded_len) {
1215 if (!inited_ || NULL == decoded_len)
1216 return NULL;
1217
1218 *decoded_len = pys_decoded_len_;
1219 return pys_;
1220 }
1221
merge_fixed_lmas(size_t del_spl_pos)1222 void MatrixSearch::merge_fixed_lmas(size_t del_spl_pos) {
1223 if (fixed_lmas_ == 0)
1224 return;
1225 // Update spelling segmentation information first.
1226 spl_id_num_ -= 1;
1227 uint16 del_py_len = spl_start_[del_spl_pos + 1] - spl_start_[del_spl_pos];
1228 for (size_t pos = del_spl_pos; pos <= spl_id_num_; pos++) {
1229 spl_start_[pos] = spl_start_[pos + 1] - del_py_len;
1230 if (pos == spl_id_num_)
1231 break;
1232 spl_id_[pos] = spl_id_[pos + 1];
1233 }
1234
1235 // Begin to merge.
1236 uint16 phrase_len = 0;
1237
1238 // Update the spelling ids to the composing phrase.
1239 // We need to convert these ids into full id in the future.
1240 memcpy(c_phrase_.spl_ids, spl_id_, spl_id_num_ * sizeof(uint16));
1241 memcpy(c_phrase_.spl_start, spl_start_, (spl_id_num_ + 1) * sizeof(uint16));
1242
1243 // If composing phrase has not been created, first merge all fixed
1244 // lemmas into a composing phrase without deletion.
1245 if (fixed_lmas_ > 1 || kLemmaIdComposing != lma_id_[0]) {
1246 uint16 bp = 1; // Begin position of real fixed lemmas.
1247 // There is no existing composing phrase.
1248 if (kLemmaIdComposing != lma_id_[0]) {
1249 c_phrase_.sublma_num = 0;
1250 bp = 0;
1251 }
1252
1253 uint16 sub_num = c_phrase_.sublma_num;
1254 for (uint16 pos = bp; pos <= fixed_lmas_; pos++) {
1255 c_phrase_.sublma_start[sub_num + pos - bp] = lma_start_[pos];
1256 if (lma_start_[pos] > del_spl_pos) {
1257 c_phrase_.sublma_start[sub_num + pos - bp] -= 1;
1258 }
1259
1260 if (pos == fixed_lmas_)
1261 break;
1262
1263 uint16 lma_len;
1264 char16 *lma_str = c_phrase_.chn_str +
1265 c_phrase_.sublma_start[sub_num] + phrase_len;
1266
1267 lma_len = get_lemma_str(lma_id_[pos], lma_str, kMaxRowNum - phrase_len);
1268 assert(lma_len == lma_start_[pos + 1] - lma_start_[pos]);
1269 phrase_len += lma_len;
1270 }
1271 assert(phrase_len == lma_start_[fixed_lmas_]);
1272 c_phrase_.length = phrase_len; // will be deleted by 1
1273 c_phrase_.sublma_num += fixed_lmas_ - bp;
1274 } else {
1275 for (uint16 pos = 0; pos <= c_phrase_.sublma_num; pos++) {
1276 if (c_phrase_.sublma_start[pos] > del_spl_pos) {
1277 c_phrase_.sublma_start[pos] -= 1;
1278 }
1279 }
1280 phrase_len = c_phrase_.length;
1281 }
1282
1283 assert(phrase_len > 0);
1284 if (1 == phrase_len) {
1285 // After the only one is deleted, nothing will be left.
1286 fixed_lmas_ = 0;
1287 return;
1288 }
1289
1290 // Delete the Chinese character in the merged phrase.
1291 // The corresponding elements in spl_ids and spl_start of the
1292 // phrase have been deleted.
1293 char16 *chn_str = c_phrase_.chn_str + del_spl_pos;
1294 for (uint16 pos = 0;
1295 pos < c_phrase_.sublma_start[c_phrase_.sublma_num] - del_spl_pos;
1296 pos++) {
1297 chn_str[pos] = chn_str[pos + 1];
1298 }
1299 c_phrase_.length -= 1;
1300
1301 // If the deleted spelling id is in a sub lemma which contains more than
1302 // one id, del_a_sub will be false; but if the deleted id is in a sub lemma
1303 // which only contains 1 id, the whole sub lemma needs to be deleted, so
1304 // del_a_sub will be true.
1305 bool del_a_sub = false;
1306 for (uint16 pos = 1; pos <= c_phrase_.sublma_num; pos++) {
1307 if (c_phrase_.sublma_start[pos - 1] ==
1308 c_phrase_.sublma_start[pos]) {
1309 del_a_sub = true;
1310 }
1311 if (del_a_sub) {
1312 c_phrase_.sublma_start[pos - 1] =
1313 c_phrase_.sublma_start[pos];
1314 }
1315 }
1316 if (del_a_sub)
1317 c_phrase_.sublma_num -= 1;
1318
1319 return;
1320 }
1321
get_spl_start_id()1322 void MatrixSearch::get_spl_start_id() {
1323 lma_id_num_ = 0;
1324 lma_start_[0] = 0;
1325
1326 spl_id_num_ = 0;
1327 spl_start_[0] = 0;
1328 if (!inited_ || 0 == pys_decoded_len_ ||
1329 0 == matrix_[pys_decoded_len_].mtrx_nd_num)
1330 return;
1331
1332 // Calculate number of lemmas and spellings
1333 // Only scan those part which is not fixed.
1334 lma_id_num_ = fixed_lmas_;
1335 spl_id_num_ = fixed_hzs_;
1336
1337 MatrixNode *mtrx_nd = mtrx_nd_pool_ + matrix_[pys_decoded_len_].mtrx_nd_pos;
1338 while (mtrx_nd != mtrx_nd_pool_) {
1339 if (fixed_hzs_ > 0) {
1340 if (mtrx_nd->step <= spl_start_[fixed_hzs_])
1341 break;
1342 }
1343
1344 // Update the spelling segamentation information
1345 unsigned char word_splstr_len = 0;
1346 PoolPosType dmi_fr = mtrx_nd->dmi_fr;
1347 if ((PoolPosType)-1 != dmi_fr)
1348 word_splstr_len = dmi_pool_[dmi_fr].splstr_len;
1349
1350 while ((PoolPosType)-1 != dmi_fr) {
1351 spl_start_[spl_id_num_ + 1] = mtrx_nd->step -
1352 (word_splstr_len - dmi_pool_[dmi_fr].splstr_len);
1353 spl_id_[spl_id_num_] = dmi_pool_[dmi_fr].spl_id;
1354 spl_id_num_++;
1355 dmi_fr = dmi_pool_[dmi_fr].dmi_fr;
1356 }
1357
1358 // Update the lemma segmentation information
1359 lma_start_[lma_id_num_ + 1] = spl_id_num_;
1360 lma_id_[lma_id_num_] = mtrx_nd->id;
1361 lma_id_num_++;
1362
1363 mtrx_nd = mtrx_nd->from;
1364 }
1365
1366 // Reverse the result of spelling info
1367 for (size_t pos = fixed_hzs_;
1368 pos < fixed_hzs_ + (spl_id_num_ - fixed_hzs_ + 1) / 2; pos++) {
1369 if (spl_id_num_ + fixed_hzs_ - pos != pos + 1) {
1370 spl_start_[pos + 1] ^= spl_start_[spl_id_num_ - pos + fixed_hzs_];
1371 spl_start_[spl_id_num_ - pos + fixed_hzs_] ^= spl_start_[pos + 1];
1372 spl_start_[pos + 1] ^= spl_start_[spl_id_num_ - pos + fixed_hzs_];
1373
1374 spl_id_[pos] ^= spl_id_[spl_id_num_ + fixed_hzs_ - pos - 1];
1375 spl_id_[spl_id_num_ + fixed_hzs_- pos - 1] ^= spl_id_[pos];
1376 spl_id_[pos] ^= spl_id_[spl_id_num_ + fixed_hzs_- pos - 1];
1377 }
1378 }
1379
1380 // Reverse the result of lemma info
1381 for (size_t pos = fixed_lmas_;
1382 pos < fixed_lmas_ + (lma_id_num_ - fixed_lmas_ + 1) / 2; pos++) {
1383 assert(lma_id_num_ + fixed_lmas_ - pos - 1 >= pos);
1384
1385 if (lma_id_num_ + fixed_lmas_ - pos > pos + 1) {
1386 lma_start_[pos + 1] ^= lma_start_[lma_id_num_ - pos + fixed_lmas_];
1387 lma_start_[lma_id_num_ - pos + fixed_lmas_] ^= lma_start_[pos + 1];
1388 lma_start_[pos + 1] ^= lma_start_[lma_id_num_ - pos + fixed_lmas_];
1389
1390 lma_id_[pos] ^= lma_id_[lma_id_num_ - 1 - pos + fixed_lmas_];
1391 lma_id_[lma_id_num_ - 1 - pos + fixed_lmas_] ^= lma_id_[pos];
1392 lma_id_[pos] ^= lma_id_[lma_id_num_ - 1 - pos + fixed_lmas_];
1393 }
1394 }
1395
1396 for (size_t pos = fixed_lmas_ + 1; pos <= lma_id_num_; pos++) {
1397 if (pos < lma_id_num_)
1398 lma_start_[pos] = lma_start_[pos - 1] +
1399 (lma_start_[pos] - lma_start_[pos + 1]);
1400 else
1401 lma_start_[pos] = lma_start_[pos - 1] + lma_start_[pos] -
1402 lma_start_[fixed_lmas_];
1403 }
1404
1405 // Find the last fixed position
1406 fixed_hzs_ = 0;
1407 for (size_t pos = spl_id_num_; pos > 0; pos--) {
1408 if (NULL != matrix_[spl_start_[pos]].mtrx_nd_fixed) {
1409 fixed_hzs_ = pos;
1410 break;
1411 }
1412 }
1413
1414 return;
1415 }
1416
get_spl_start(const uint16 * & spl_start)1417 size_t MatrixSearch::get_spl_start(const uint16 *&spl_start) {
1418 get_spl_start_id();
1419 spl_start = spl_start_;
1420 return spl_id_num_;
1421 }
1422
extend_dmi(DictExtPara * dep,DictMatchInfo * dmi_s)1423 size_t MatrixSearch::extend_dmi(DictExtPara *dep, DictMatchInfo *dmi_s) {
1424 if (dmi_pool_used_ >= kDmiPoolSize) return 0;
1425
1426 if (dmi_c_phrase_)
1427 return extend_dmi_c(dep, dmi_s);
1428
1429 LpiCache& lpi_cache = LpiCache::get_instance();
1430 uint16 splid = dep->splids[dep->splids_extended];
1431
1432 bool cached = false;
1433 if (0 == dep->splids_extended)
1434 cached = lpi_cache.is_cached(splid);
1435
1436 // 1. If this is a half Id, get its corresponding full starting Id and
1437 // number of full Id.
1438 size_t ret_val = 0;
1439 PoolPosType mtrx_dmi_fr = (PoolPosType)-1; // From which dmi node
1440
1441 lpi_total_ = 0;
1442
1443 MileStoneHandle from_h[3];
1444 from_h[0] = 0;
1445 from_h[1] = 0;
1446
1447 if (0 != dep->splids_extended) {
1448 from_h[0] = dmi_s->dict_handles[0];
1449 from_h[1] = dmi_s->dict_handles[1];
1450 }
1451
1452 // 2. Begin exgtending in the system dictionary
1453 size_t lpi_num = 0;
1454 MileStoneHandle handles[2];
1455 handles[0] = handles[1] = 0;
1456 if (from_h[0] > 0 || NULL == dmi_s) {
1457 handles[0] = dict_trie_->extend_dict(from_h[0], dep, lpi_items_,
1458 kMaxLmaPsbItems, &lpi_num);
1459 }
1460 if (handles[0] > 0)
1461 lpi_total_ = lpi_num;
1462
1463 if (NULL == dmi_s) { // from root
1464 assert(0 != handles[0]);
1465 mtrx_dmi_fr = dmi_pool_used_;
1466 }
1467
1468 // 3. Begin extending in the user dictionary
1469 if (NULL != user_dict_ && (from_h[1] > 0 || NULL == dmi_s)) {
1470 handles[1] = user_dict_->extend_dict(from_h[1], dep,
1471 lpi_items_ + lpi_total_,
1472 kMaxLmaPsbItems - lpi_total_,
1473 &lpi_num);
1474 if (handles[1] > 0) {
1475 if (kPrintDebug0) {
1476 for (size_t t = 0; t < lpi_num; t++) {
1477 printf("--Extend in user dict: uid:%d uscore:%d\n", lpi_items_[lpi_total_ + t].id,
1478 lpi_items_[lpi_total_ + t].psb);
1479 }
1480 }
1481 lpi_total_ += lpi_num;
1482 }
1483 }
1484
1485 if (0 != handles[0] || 0 != handles[1]) {
1486 if (dmi_pool_used_ >= kDmiPoolSize) return 0;
1487
1488 DictMatchInfo *dmi_add = dmi_pool_ + dmi_pool_used_;
1489 if (NULL == dmi_s) {
1490 fill_dmi(dmi_add, handles,
1491 (PoolPosType)-1, splid,
1492 1, 1, dep->splid_end_split, dep->ext_len,
1493 spl_trie_->is_half_id(splid) ? 0 : 1);
1494 } else {
1495 fill_dmi(dmi_add, handles,
1496 dmi_s - dmi_pool_, splid, 1,
1497 dmi_s->dict_level + 1, dep->splid_end_split,
1498 dmi_s->splstr_len + dep->ext_len,
1499 spl_trie_->is_half_id(splid) ? 0 : dmi_s->all_full_id);
1500 }
1501
1502 ret_val = 1;
1503 }
1504
1505 if (!cached) {
1506 if (0 == lpi_total_)
1507 return ret_val;
1508
1509 if (kPrintDebug0) {
1510 printf("--- lpi_total_ = %d\n", lpi_total_);
1511 }
1512
1513 myqsort(lpi_items_, lpi_total_, sizeof(LmaPsbItem), cmp_lpi_with_psb);
1514 if (NULL == dmi_s && spl_trie_->is_half_id(splid))
1515 lpi_total_ = lpi_cache.put_cache(splid, lpi_items_, lpi_total_);
1516 } else {
1517 assert(spl_trie_->is_half_id(splid));
1518 lpi_total_ = lpi_cache.get_cache(splid, lpi_items_, kMaxLmaPsbItems);
1519 }
1520
1521 return ret_val;
1522 }
1523
extend_dmi_c(DictExtPara * dep,DictMatchInfo * dmi_s)1524 size_t MatrixSearch::extend_dmi_c(DictExtPara *dep, DictMatchInfo *dmi_s) {
1525 lpi_total_ = 0;
1526
1527 uint16 pos = dep->splids_extended;
1528 assert(dmi_c_phrase_);
1529 if (pos >= c_phrase_.length)
1530 return 0;
1531
1532 uint16 splid = dep->splids[pos];
1533 if (splid == c_phrase_.spl_ids[pos]) {
1534 DictMatchInfo *dmi_add = dmi_pool_ + dmi_pool_used_;
1535 MileStoneHandle handles[2]; // Actually never used.
1536 if (NULL == dmi_s)
1537 fill_dmi(dmi_add, handles,
1538 (PoolPosType)-1, splid,
1539 1, 1, dep->splid_end_split, dep->ext_len,
1540 spl_trie_->is_half_id(splid) ? 0 : 1);
1541 else
1542 fill_dmi(dmi_add, handles,
1543 dmi_s - dmi_pool_, splid, 1,
1544 dmi_s->dict_level + 1, dep->splid_end_split,
1545 dmi_s->splstr_len + dep->ext_len,
1546 spl_trie_->is_half_id(splid) ? 0 : dmi_s->all_full_id);
1547
1548 if (pos == c_phrase_.length - 1) {
1549 lpi_items_[0].id = kLemmaIdComposing;
1550 lpi_items_[0].psb = 0; // 0 is bigger than normal lemma score.
1551 lpi_total_ = 1;
1552 }
1553 return 1;
1554 }
1555 return 0;
1556 }
1557
extend_mtrx_nd(MatrixNode * mtrx_nd,LmaPsbItem lpi_items[],size_t lpi_num,PoolPosType dmi_fr,size_t res_row)1558 size_t MatrixSearch::extend_mtrx_nd(MatrixNode *mtrx_nd, LmaPsbItem lpi_items[],
1559 size_t lpi_num, PoolPosType dmi_fr,
1560 size_t res_row) {
1561 assert(NULL != mtrx_nd);
1562 matrix_[res_row].mtrx_nd_fixed = NULL;
1563
1564 if (mtrx_nd_pool_used_ >= kMtrxNdPoolSize - kMaxNodeARow)
1565 return 0;
1566
1567 if (0 == mtrx_nd->step) {
1568 // Because the list is sorted, if the source step is 0, it is only
1569 // necessary to pick up the first kMaxNodeARow items.
1570 if (lpi_num > kMaxNodeARow)
1571 lpi_num = kMaxNodeARow;
1572 }
1573
1574 MatrixNode *mtrx_nd_res_min = mtrx_nd_pool_ + matrix_[res_row].mtrx_nd_pos;
1575 for (size_t pos = 0; pos < lpi_num; pos++) {
1576 float score = mtrx_nd->score + lpi_items[pos].psb;
1577 if (pos > 0 && score - PRUMING_SCORE > mtrx_nd_res_min->score)
1578 break;
1579
1580 // Try to add a new node
1581 size_t mtrx_nd_num = matrix_[res_row].mtrx_nd_num;
1582 MatrixNode *mtrx_nd_res = mtrx_nd_res_min + mtrx_nd_num;
1583 bool replace = false;
1584 // Find its position
1585 while (mtrx_nd_res > mtrx_nd_res_min && score < (mtrx_nd_res - 1)->score) {
1586 if (static_cast<size_t>(mtrx_nd_res - mtrx_nd_res_min) < kMaxNodeARow)
1587 *mtrx_nd_res = *(mtrx_nd_res - 1);
1588 mtrx_nd_res--;
1589 replace = true;
1590 }
1591 if (replace || (mtrx_nd_num < kMaxNodeARow &&
1592 matrix_[res_row].mtrx_nd_pos + mtrx_nd_num < kMtrxNdPoolSize)) {
1593 mtrx_nd_res->id = lpi_items[pos].id;
1594 mtrx_nd_res->score = score;
1595 mtrx_nd_res->from = mtrx_nd;
1596 mtrx_nd_res->dmi_fr = dmi_fr;
1597 mtrx_nd_res->step = res_row;
1598 if (matrix_[res_row].mtrx_nd_num < kMaxNodeARow)
1599 matrix_[res_row].mtrx_nd_num++;
1600 }
1601 }
1602 return matrix_[res_row].mtrx_nd_num;
1603 }
1604
match_dmi(size_t step_to,uint16 spl_ids[],uint16 spl_id_num)1605 PoolPosType MatrixSearch::match_dmi(size_t step_to, uint16 spl_ids[],
1606 uint16 spl_id_num) {
1607 if (pys_decoded_len_ < step_to || 0 == matrix_[step_to].dmi_num) {
1608 return static_cast<PoolPosType>(-1);
1609 }
1610
1611 for (PoolPosType dmi_pos = 0; dmi_pos < matrix_[step_to].dmi_num; dmi_pos++) {
1612 DictMatchInfo *dmi = dmi_pool_ + matrix_[step_to].dmi_pos + dmi_pos;
1613
1614 if (dmi->dict_level != spl_id_num)
1615 continue;
1616
1617 bool matched = true;
1618 for (uint16 spl_pos = 0; spl_pos < spl_id_num; spl_pos++) {
1619 if (spl_ids[spl_id_num - spl_pos - 1] != dmi->spl_id) {
1620 matched = false;
1621 break;
1622 }
1623
1624 dmi = dmi_pool_ + dmi->dmi_fr;
1625 }
1626 if (matched) {
1627 return matrix_[step_to].dmi_pos + dmi_pos;
1628 }
1629 }
1630
1631 return static_cast<PoolPosType>(-1);
1632 }
1633
get_candidate0(char16 * cand_str,size_t max_len,uint16 * retstr_len,bool only_unfixed)1634 char16* MatrixSearch::get_candidate0(char16 *cand_str, size_t max_len,
1635 uint16 *retstr_len,
1636 bool only_unfixed) {
1637 if (pys_decoded_len_ == 0 ||
1638 matrix_[pys_decoded_len_].mtrx_nd_num == 0)
1639 return NULL;
1640
1641 LemmaIdType idxs[kMaxRowNum];
1642 size_t id_num = 0;
1643
1644 MatrixNode *mtrx_nd = mtrx_nd_pool_ + matrix_[pys_decoded_len_].mtrx_nd_pos;
1645
1646 if (kPrintDebug0) {
1647 printf("--- sentence score: %f\n", mtrx_nd->score);
1648 }
1649
1650 if (kPrintDebug1) {
1651 printf("==============Sentence DMI (reverse order) begin===========>>\n");
1652 }
1653
1654 while (mtrx_nd != NULL) {
1655 idxs[id_num] = mtrx_nd->id;
1656 id_num++;
1657
1658 if (kPrintDebug1) {
1659 printf("---MatrixNode [step: %d, lma_idx: %d, total score:%.5f]\n",
1660 mtrx_nd->step, mtrx_nd->id, mtrx_nd->score);
1661 debug_print_dmi(mtrx_nd->dmi_fr, 1);
1662 }
1663
1664 mtrx_nd = mtrx_nd->from;
1665 }
1666
1667 if (kPrintDebug1) {
1668 printf("<<==============Sentence DMI (reverse order) end=============\n");
1669 }
1670
1671 size_t ret_pos = 0;
1672 do {
1673 id_num--;
1674 if (0 == idxs[id_num])
1675 continue;
1676
1677 char16 str[kMaxLemmaSize + 1];
1678 uint16 str_len = get_lemma_str(idxs[id_num], str, kMaxLemmaSize + 1);
1679 if (str_len > 0 && ((!only_unfixed && max_len - ret_pos > str_len) ||
1680 (only_unfixed && max_len - ret_pos + fixed_hzs_ > str_len))) {
1681 if (!only_unfixed)
1682 utf16_strncpy(cand_str + ret_pos, str, str_len);
1683 else if (ret_pos >= fixed_hzs_)
1684 utf16_strncpy(cand_str + ret_pos - fixed_hzs_, str, str_len);
1685
1686 ret_pos += str_len;
1687 } else {
1688 return NULL;
1689 }
1690 } while (id_num != 0);
1691
1692 if (!only_unfixed) {
1693 if (NULL != retstr_len)
1694 *retstr_len = ret_pos;
1695 cand_str[ret_pos] = (char16)'\0';
1696 } else {
1697 if (NULL != retstr_len)
1698 *retstr_len = ret_pos - fixed_hzs_;
1699 cand_str[ret_pos - fixed_hzs_] = (char16)'\0';
1700 }
1701 return cand_str;
1702 }
1703
get_lpis(const uint16 * splid_str,size_t splid_str_len,LmaPsbItem * lma_buf,size_t max_lma_buf,const char16 * pfullsent,bool sort_by_psb)1704 size_t MatrixSearch::get_lpis(const uint16* splid_str, size_t splid_str_len,
1705 LmaPsbItem* lma_buf, size_t max_lma_buf,
1706 const char16 *pfullsent, bool sort_by_psb) {
1707 if (splid_str_len > kMaxLemmaSize)
1708 return 0;
1709
1710 size_t num1 = dict_trie_->get_lpis(splid_str, splid_str_len,
1711 lma_buf, max_lma_buf);
1712 size_t num2 = 0;
1713 if (NULL != user_dict_) {
1714 num2 = user_dict_->get_lpis(splid_str, splid_str_len,
1715 lma_buf + num1, max_lma_buf - num1);
1716 }
1717
1718 size_t num = num1 + num2;
1719
1720 if (0 == num)
1721 return 0;
1722
1723 // Remove repeated items.
1724 if (splid_str_len > 1) {
1725 LmaPsbStrItem *lpsis = reinterpret_cast<LmaPsbStrItem*>(lma_buf + num);
1726 size_t lpsi_num = (max_lma_buf - num) * sizeof(LmaPsbItem) /
1727 sizeof(LmaPsbStrItem);
1728 //assert(lpsi_num > num);
1729 if (num > lpsi_num) num = lpsi_num;
1730 lpsi_num = num;
1731
1732 for (size_t pos = 0; pos < lpsi_num; pos++) {
1733 lpsis[pos].lpi = lma_buf[pos];
1734 get_lemma_str(lma_buf[pos].id, lpsis[pos].str, kMaxLemmaSize + 1);
1735 }
1736
1737 myqsort(lpsis, lpsi_num, sizeof(LmaPsbStrItem), cmp_lpsi_with_str);
1738
1739 size_t remain_num = 0;
1740 for (size_t pos = 0; pos < lpsi_num; pos++) {
1741 if (pos > 0 && utf16_strcmp(lpsis[pos].str, lpsis[pos - 1].str) == 0) {
1742 if (lpsis[pos].lpi.psb < lpsis[pos - 1].lpi.psb) {
1743 assert(remain_num > 0);
1744 lma_buf[remain_num - 1] = lpsis[pos].lpi;
1745 }
1746 continue;
1747 }
1748 if (NULL != pfullsent && utf16_strcmp(lpsis[pos].str, pfullsent) == 0)
1749 continue;
1750
1751 lma_buf[remain_num] = lpsis[pos].lpi;
1752 remain_num++;
1753 }
1754
1755 // Update the result number
1756 num = remain_num;
1757 } else {
1758 // For single character, some characters have more than one spelling, for
1759 // example, "de" and "di" are all valid for a Chinese character, so when
1760 // the user input "d", repeated items are generated.
1761 // For single character lemmas, Hanzis will be gotten
1762 for (size_t pos = 0; pos < num; pos++) {
1763 char16 hanzis[2];
1764 get_lemma_str(lma_buf[pos].id, hanzis, 2);
1765 lma_buf[pos].hanzi = hanzis[0];
1766 }
1767
1768 myqsort(lma_buf, num, sizeof(LmaPsbItem), cmp_lpi_with_hanzi);
1769
1770 size_t remain_num = 0;
1771 for (size_t pos = 0; pos < num; pos++) {
1772 if (pos > 0 && lma_buf[pos].hanzi == lma_buf[pos - 1].hanzi) {
1773 if (NULL != pfullsent &&
1774 static_cast<char16>(0) == pfullsent[1] &&
1775 lma_buf[pos].hanzi == pfullsent[0])
1776 continue;
1777
1778 if (lma_buf[pos].psb < lma_buf[pos - 1].psb) {
1779 assert(remain_num > 0);
1780 assert(lma_buf[remain_num - 1].hanzi == lma_buf[pos].hanzi);
1781 lma_buf[remain_num - 1] = lma_buf[pos];
1782 }
1783 continue;
1784 }
1785 if (NULL != pfullsent &&
1786 static_cast<char16>(0) == pfullsent[1] &&
1787 lma_buf[pos].hanzi == pfullsent[0])
1788 continue;
1789
1790 lma_buf[remain_num] = lma_buf[pos];
1791 remain_num++;
1792 }
1793
1794 num = remain_num;
1795 }
1796
1797 if (sort_by_psb) {
1798 myqsort(lma_buf, num, sizeof(LmaPsbItem), cmp_lpi_with_psb);
1799 }
1800 return num;
1801 }
1802
get_lemma_str(LemmaIdType id_lemma,char16 * str_buf,uint16 str_max)1803 uint16 MatrixSearch::get_lemma_str(LemmaIdType id_lemma, char16 *str_buf,
1804 uint16 str_max) {
1805 uint16 str_len = 0;
1806
1807 if (is_system_lemma(id_lemma)) {
1808 str_len = dict_trie_->get_lemma_str(id_lemma, str_buf, str_max);
1809 } else if (is_user_lemma(id_lemma)) {
1810 if (NULL != user_dict_) {
1811 str_len = user_dict_->get_lemma_str(id_lemma, str_buf, str_max);
1812 } else {
1813 str_len = 0;
1814 str_buf[0] = static_cast<char16>('\0');
1815 }
1816 } else if (is_composing_lemma(id_lemma)) {
1817 if (str_max <= 1)
1818 return 0;
1819 str_len = c_phrase_.sublma_start[c_phrase_.sublma_num];
1820 if (str_len > str_max - 1)
1821 str_len = str_max - 1;
1822 utf16_strncpy(str_buf, c_phrase_.chn_str, str_len);
1823 str_buf[str_len] = (char16)'\0';
1824 return str_len;
1825 }
1826
1827 return str_len;
1828 }
1829
get_lemma_splids(LemmaIdType id_lemma,uint16 * splids,uint16 splids_max,bool arg_valid)1830 uint16 MatrixSearch::get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
1831 uint16 splids_max, bool arg_valid) {
1832 uint16 splid_num = 0;
1833
1834 if (arg_valid) {
1835 for (splid_num = 0; splid_num < splids_max; splid_num++) {
1836 if (spl_trie_->is_half_id(splids[splid_num]))
1837 break;
1838 }
1839 if (splid_num == splids_max)
1840 return splid_num;
1841 }
1842
1843 if (is_system_lemma(id_lemma)) {
1844 splid_num = dict_trie_->get_lemma_splids(id_lemma, splids, splids_max,
1845 arg_valid);
1846 } else if (is_user_lemma(id_lemma)) {
1847 if (NULL != user_dict_) {
1848 splid_num = user_dict_->get_lemma_splids(id_lemma, splids, splids_max,
1849 arg_valid);
1850 } else {
1851 splid_num = 0;
1852 }
1853 } else if (is_composing_lemma(id_lemma)) {
1854 if (c_phrase_.length > splids_max) {
1855 return 0;
1856 }
1857 for (uint16 pos = 0; pos < c_phrase_.length; pos++) {
1858 splids[pos] = c_phrase_.spl_ids[pos];
1859 if (spl_trie_->is_half_id(splids[pos])) {
1860 return 0;
1861 }
1862 }
1863 }
1864 return splid_num;
1865 }
1866
inner_predict(const char16 * fixed_buf,uint16 fixed_len,char16 predict_buf[][kMaxPredictSize+1],size_t buf_len)1867 size_t MatrixSearch::inner_predict(const char16 *fixed_buf, uint16 fixed_len,
1868 char16 predict_buf[][kMaxPredictSize + 1],
1869 size_t buf_len) {
1870 size_t res_total = 0;
1871 memset(npre_items_, 0, sizeof(NPredictItem) * npre_items_len_);
1872 // In order to shorten the comments, j-character candidates predicted by
1873 // i-character prefix are called P(i,j). All candiates predicted by
1874 // i-character prefix are called P(i,*)
1875 // Step 1. Get P(kMaxPredictSize, *) and sort them, here
1876 // P(kMaxPredictSize, *) == P(kMaxPredictSize, 1)
1877 for (size_t len = fixed_len; len >0; len--) {
1878 // How many blank items are available
1879 size_t this_max = npre_items_len_ - res_total;
1880 size_t res_this;
1881 // If the history is longer than 1, and we can not get prediction from
1882 // lemmas longer than 2, in this case, we will add lemmas with
1883 // highest scores as the prediction result.
1884 if (fixed_len > 1 && 1 == len && 0 == res_total) {
1885 // Try to find if recent n (n>1) characters can be a valid lemma in system
1886 // dictionary.
1887 bool nearest_n_word = false;
1888 for (size_t nlen = 2; nlen <= fixed_len; nlen++) {
1889 if (dict_trie_->get_lemma_id(fixed_buf + fixed_len - nlen, nlen) > 0) {
1890 nearest_n_word = true;
1891 break;
1892 }
1893 }
1894 res_this = dict_trie_->predict_top_lmas(nearest_n_word ? len : 0,
1895 npre_items_ + res_total,
1896 this_max, res_total);
1897 res_total += res_this;
1898 }
1899
1900 // How many blank items are available
1901 this_max = npre_items_len_ - res_total;
1902 res_this = 0;
1903 if (!kOnlyUserDictPredict) {
1904 res_this =
1905 dict_trie_->predict(fixed_buf + fixed_len - len, len,
1906 npre_items_ + res_total, this_max,
1907 res_total);
1908 }
1909
1910 if (NULL != user_dict_) {
1911 res_this = res_this +
1912 user_dict_->predict(fixed_buf + fixed_len - len, len,
1913 npre_items_ + res_total + res_this,
1914 this_max - res_this, res_total + res_this);
1915 }
1916
1917 if (kPredictLimitGt1) {
1918 myqsort(npre_items_ + res_total, res_this, sizeof(NPredictItem),
1919 cmp_npre_by_score);
1920
1921 if (len > 3) {
1922 if (res_this > kMaxPredictNumByGt3)
1923 res_this = kMaxPredictNumByGt3;
1924 } else if (3 == len) {
1925 if (res_this > kMaxPredictNumBy3)
1926 res_this = kMaxPredictNumBy3;
1927 } else if (2 == len) {
1928 if (res_this > kMaxPredictNumBy2)
1929 res_this = kMaxPredictNumBy2;
1930 }
1931 }
1932
1933 res_total += res_this;
1934 }
1935
1936 res_total = remove_duplicate_npre(npre_items_, res_total);
1937
1938 if (kPreferLongHistoryPredict) {
1939 myqsort(npre_items_, res_total, sizeof(NPredictItem),
1940 cmp_npre_by_hislen_score);
1941 } else {
1942 myqsort(npre_items_, res_total, sizeof(NPredictItem),
1943 cmp_npre_by_score);
1944 }
1945
1946 if (buf_len < res_total) {
1947 res_total = buf_len;
1948 }
1949
1950 if (kPrintDebug2) {
1951 printf("/////////////////Predicted Items Begin////////////////////>>\n");
1952 for (size_t i = 0; i < res_total; i++) {
1953 printf("---");
1954 for (size_t j = 0; j < kMaxPredictSize; j++) {
1955 printf("%d ", npre_items_[i].pre_hzs[j]);
1956 }
1957 printf("\n");
1958 }
1959 printf("<<///////////////Predicted Items End////////////////////////\n");
1960 }
1961
1962 for (size_t i = 0; i < res_total; i++) {
1963 utf16_strncpy(predict_buf[i], npre_items_[i].pre_hzs,
1964 kMaxPredictSize);
1965 predict_buf[i][kMaxPredictSize] = '\0';
1966 }
1967
1968 return res_total;
1969 }
1970
get_predicts(const char16 fixed_buf[],char16 predict_buf[][kMaxPredictSize+1],size_t buf_len)1971 size_t MatrixSearch::get_predicts(const char16 fixed_buf[],
1972 char16 predict_buf[][kMaxPredictSize + 1],
1973 size_t buf_len) {
1974 size_t fixed_len = utf16_strlen(fixed_buf);
1975 if (0 ==fixed_len || fixed_len > kMaxPredictSize || 0 == buf_len)
1976 return 0;
1977
1978 return inner_predict(fixed_buf, fixed_len, predict_buf, buf_len);
1979 }
1980
1981 } // namespace ime_pinyin
1982