xref: /OK3568_Linux_fs/app/forlinx/flapp/src/keyboard/pinyin/share/matrixsearch.cpp (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1 /*
2  * Copyright (C) 2009 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <assert.h>
18 #include <math.h>
19 #include <stdio.h>
20 #include <string.h>
21 #include "../include/lpicache.h"
22 #include "../include/matrixsearch.h"
23 #include "../include/mystdlib.h"
24 #include "../include/ngram.h"
25 #include "../include/userdict.h"
26 
27 namespace ime_pinyin {
28 
29 #define PRUMING_SCORE 8000.0
30 
MatrixSearch()31 MatrixSearch::MatrixSearch() {
32   inited_ = false;
33   spl_trie_ = SpellingTrie::get_cpinstance();
34 
35   reset_pointers_to_null();
36 
37   pys_decoded_len_ = 0;
38   mtrx_nd_pool_used_ = 0;
39   dmi_pool_used_ = 0;
40   xi_an_enabled_ = false;
41   dmi_c_phrase_ = false;
42 
43   assert(kMaxSearchSteps > 0);
44   max_sps_len_ = kMaxSearchSteps - 1;
45   max_hzs_len_ = kMaxSearchSteps;
46 }
47 
~MatrixSearch()48 MatrixSearch::~MatrixSearch() {
49   free_resource();
50 }
51 
reset_pointers_to_null()52 void MatrixSearch::reset_pointers_to_null() {
53   dict_trie_ = NULL;
54   user_dict_ = NULL;
55   spl_parser_ = NULL;
56 
57   share_buf_ = NULL;
58 
59   // The following four buffers are used for decoding, and they are based on
60   // share_buf_, no need to delete them.
61   mtrx_nd_pool_ = NULL;
62   dmi_pool_ = NULL;
63   matrix_ = NULL;
64   dep_ = NULL;
65 
66   // Based on share_buf_, no need to delete them.
67   npre_items_ = NULL;
68 }
69 
alloc_resource()70 bool MatrixSearch::alloc_resource() {
71   free_resource();
72 
73   dict_trie_ = new DictTrie();
74   user_dict_ = static_cast<AtomDictBase*>(new UserDict());
75   spl_parser_ = new SpellingParser();
76 
77   size_t mtrx_nd_size = sizeof(MatrixNode) * kMtrxNdPoolSize;
78   mtrx_nd_size = align_to_size_t(mtrx_nd_size) / sizeof(size_t);
79   size_t dmi_size = sizeof(DictMatchInfo) * kDmiPoolSize;
80   dmi_size = align_to_size_t(dmi_size) / sizeof(size_t);
81   size_t matrix_size = sizeof(MatrixRow) * kMaxRowNum;
82   matrix_size = align_to_size_t(matrix_size) / sizeof(size_t);
83   size_t dep_size = sizeof(DictExtPara);
84   dep_size = align_to_size_t(dep_size) / sizeof(size_t);
85 
86   // share_buf's size is determined by the buffers for search.
87   share_buf_ = new size_t[mtrx_nd_size + dmi_size + matrix_size + dep_size];
88 
89   if (NULL == dict_trie_ || NULL == user_dict_ || NULL == spl_parser_ ||
90       NULL == share_buf_)
91     return false;
92 
93   // The buffers for search are based on the share buffer
94   mtrx_nd_pool_ = reinterpret_cast<MatrixNode*>(share_buf_);
95   dmi_pool_ = reinterpret_cast<DictMatchInfo*>(share_buf_ + mtrx_nd_size);
96   matrix_ = reinterpret_cast<MatrixRow*>(share_buf_ + mtrx_nd_size + dmi_size);
97   dep_ = reinterpret_cast<DictExtPara*>
98       (share_buf_ + mtrx_nd_size + dmi_size + matrix_size);
99 
100   // The prediction buffer is also based on the share buffer.
101   npre_items_ = reinterpret_cast<NPredictItem*>(share_buf_);
102   npre_items_len_ = (mtrx_nd_size + dmi_size + matrix_size + dep_size) *
103       sizeof(size_t) / sizeof(NPredictItem);
104   return true;
105 }
106 
free_resource()107 void MatrixSearch::free_resource() {
108   if (NULL != dict_trie_)
109     delete dict_trie_;
110 
111   if (NULL != user_dict_)
112     delete user_dict_;
113 
114   if (NULL != spl_parser_)
115     delete spl_parser_;
116 
117   if (NULL != share_buf_)
118     delete [] share_buf_;
119 
120   reset_pointers_to_null();
121 }
122 
init(const char * fn_sys_dict,const char * fn_usr_dict)123 bool MatrixSearch::init(const char *fn_sys_dict, const char *fn_usr_dict) {
124   if (NULL == fn_sys_dict || NULL == fn_usr_dict)
125     return false;
126 
127   if (!alloc_resource())
128     return false;
129 
130   if (!dict_trie_->load_dict(fn_sys_dict, 1, kSysDictIdEnd))
131     return false;
132 
133   // If engine fails to load the user dictionary, reset the user dictionary
134   // to NULL.
135   if (!user_dict_->load_dict(fn_usr_dict, kUserDictIdStart, kUserDictIdEnd)) {
136     delete user_dict_;
137     user_dict_ = NULL;
138   } else{
139     user_dict_->set_total_lemma_count_of_others(NGram::kSysDictTotalFreq);
140   }
141 
142   reset_search0();
143 
144   inited_ = true;
145   return true;
146 }
147 
init_fd(int sys_fd,long start_offset,long length,const char * fn_usr_dict)148 bool MatrixSearch::init_fd(int sys_fd, long start_offset, long length,
149                            const char *fn_usr_dict) {
150   if (NULL == fn_usr_dict)
151     return false;
152 
153   if (!alloc_resource())
154     return false;
155 
156   if (!dict_trie_->load_dict_fd(sys_fd, start_offset, length, 1, kSysDictIdEnd))
157     return false;
158 
159   if (!user_dict_->load_dict(fn_usr_dict, kUserDictIdStart, kUserDictIdEnd)) {
160     delete user_dict_;
161     user_dict_ = NULL;
162   } else {
163     user_dict_->set_total_lemma_count_of_others(NGram::kSysDictTotalFreq);
164   }
165 
166   reset_search0();
167 
168   inited_ = true;
169   return true;
170 }
171 
init_user_dictionary(const char * fn_usr_dict)172 void MatrixSearch::init_user_dictionary(const char *fn_usr_dict) {
173   assert(inited_);
174 
175   if (NULL != user_dict_) {
176     delete user_dict_;
177     user_dict_ = NULL;
178   }
179 
180   if (NULL != fn_usr_dict) {
181     user_dict_ = static_cast<AtomDictBase*>(new UserDict());
182     if (!user_dict_->load_dict(fn_usr_dict, kUserDictIdStart, kUserDictIdEnd)) {
183       delete user_dict_;
184       user_dict_ = NULL;
185     }
186   }
187 
188   reset_search0();
189 }
190 
is_user_dictionary_enabled() const191 bool MatrixSearch::is_user_dictionary_enabled() const {
192   return NULL != user_dict_;
193 }
194 
set_max_lens(size_t max_sps_len,size_t max_hzs_len)195 void MatrixSearch::set_max_lens(size_t max_sps_len, size_t max_hzs_len) {
196   if (0 != max_sps_len)
197     max_sps_len_ = max_sps_len;
198   if (0 != max_hzs_len)
199     max_hzs_len_ = max_hzs_len;
200 }
201 
close()202 void MatrixSearch::close() {
203   flush_cache();
204   free_resource();
205   inited_ = false;
206 }
207 
flush_cache()208 void MatrixSearch::flush_cache() {
209   if (NULL != user_dict_)
210     user_dict_->flush_cache();
211 }
212 
set_xi_an_switch(bool xi_an_enabled)213 void MatrixSearch::set_xi_an_switch(bool xi_an_enabled) {
214   xi_an_enabled_ = xi_an_enabled;
215 }
216 
get_xi_an_switch()217 bool MatrixSearch::get_xi_an_switch() {
218   return xi_an_enabled_;
219 }
220 
reset_search()221 bool MatrixSearch::reset_search() {
222   if (!inited_)
223     return false;
224   return reset_search0();
225 }
226 
reset_search0()227 bool MatrixSearch::reset_search0() {
228     if (!inited_)
229         return false;
230 
231     pys_decoded_len_ = 0;
232     mtrx_nd_pool_used_ = 0;
233     dmi_pool_used_ = 0;
234 
235     // Get a MatrixNode from the pool
236     matrix_[0].mtrx_nd_pos = mtrx_nd_pool_used_;
237     matrix_[0].mtrx_nd_num = 1;
238     mtrx_nd_pool_used_ += 1;
239 
240     // Update the node, and make it to be a starting node
241     MatrixNode *node = mtrx_nd_pool_ + matrix_[0].mtrx_nd_pos;
242     node->id = 0;
243     node->score = 0;
244     node->from = NULL;
245     node->step = 0;
246     node->dmi_fr = (PoolPosType)-1;
247 
248     matrix_[0].dmi_pos = 0;
249     matrix_[0].dmi_num = 0;
250     matrix_[0].dmi_has_full_id = 1;
251     matrix_[0].mtrx_nd_fixed = node;
252 
253     lma_start_[0] = 0;
254     fixed_lmas_ = 0;
255     spl_start_[0] = 0;
256     fixed_hzs_ = 0;
257 
258     dict_trie_->reset_milestones(0, 0);
259     if (NULL != user_dict_)
260       user_dict_->reset_milestones(0, 0);
261 
262     return true;
263 }
264 
reset_search(size_t ch_pos,bool clear_fixed_this_step,bool clear_dmi_this_step,bool clear_mtrx_this_step)265 bool MatrixSearch::reset_search(size_t ch_pos, bool clear_fixed_this_step,
266                                 bool clear_dmi_this_step,
267                                 bool clear_mtrx_this_step) {
268   if (!inited_ || ch_pos > pys_decoded_len_ || ch_pos >= kMaxRowNum)
269     return false;
270 
271   if (0 == ch_pos) {
272     reset_search0();
273   } else {
274     // Prepare mile stones of this step to clear.
275     MileStoneHandle *dict_handles_to_clear = NULL;
276     if (clear_dmi_this_step && matrix_[ch_pos].dmi_num > 0) {
277       dict_handles_to_clear = dmi_pool_[matrix_[ch_pos].dmi_pos].dict_handles;
278     }
279 
280     // If there are more steps, and this step is not allowed to clear, find
281     // milestones of next step.
282     if (pys_decoded_len_ > ch_pos && !clear_dmi_this_step) {
283       dict_handles_to_clear = NULL;
284       if (matrix_[ch_pos + 1].dmi_num > 0) {
285         dict_handles_to_clear =
286             dmi_pool_[matrix_[ch_pos + 1].dmi_pos].dict_handles;
287       }
288     }
289 
290     if (NULL != dict_handles_to_clear) {
291       dict_trie_->reset_milestones(ch_pos, dict_handles_to_clear[0]);
292       if (NULL != user_dict_)
293         user_dict_->reset_milestones(ch_pos, dict_handles_to_clear[1]);
294     }
295 
296     pys_decoded_len_ = ch_pos;
297 
298     if (clear_dmi_this_step) {
299       dmi_pool_used_ = matrix_[ch_pos - 1].dmi_pos
300                        + matrix_[ch_pos - 1].dmi_num;
301       matrix_[ch_pos].dmi_num = 0;
302     } else {
303       dmi_pool_used_ = matrix_[ch_pos].dmi_pos + matrix_[ch_pos].dmi_num;
304     }
305 
306     if (clear_mtrx_this_step) {
307       mtrx_nd_pool_used_ = matrix_[ch_pos - 1].mtrx_nd_pos
308                            + matrix_[ch_pos - 1].mtrx_nd_num;
309       matrix_[ch_pos].mtrx_nd_num = 0;
310     } else {
311       mtrx_nd_pool_used_ = matrix_[ch_pos].mtrx_nd_pos
312                            + matrix_[ch_pos].mtrx_nd_num;
313     }
314 
315     // Modify fixed_hzs_
316     if (fixed_hzs_ > 0 &&
317         ((kLemmaIdComposing != lma_id_[0]) ||
318          (kLemmaIdComposing == lma_id_[0] &&
319           spl_start_[c_phrase_.length] <= ch_pos))) {
320       size_t fixed_ch_pos = ch_pos;
321       if (clear_fixed_this_step)
322         fixed_ch_pos = fixed_ch_pos > 0 ? fixed_ch_pos - 1 : 0;
323       while (NULL == matrix_[fixed_ch_pos].mtrx_nd_fixed && fixed_ch_pos > 0)
324         fixed_ch_pos--;
325 
326       fixed_lmas_ = 0;
327       fixed_hzs_ = 0;
328       if (fixed_ch_pos > 0) {
329         while (spl_start_[fixed_hzs_] < fixed_ch_pos)
330           fixed_hzs_++;
331         assert(spl_start_[fixed_hzs_] == fixed_ch_pos);
332 
333         while (lma_start_[fixed_lmas_] < fixed_hzs_)
334           fixed_lmas_++;
335         assert(lma_start_[fixed_lmas_] == fixed_hzs_);
336       }
337 
338       // Re-search the Pinyin string for the unlocked lemma
339       // which was previously fixed.
340       //
341       // Prepare mile stones of this step to clear.
342       MileStoneHandle *dict_handles_to_clear = NULL;
343       if (clear_dmi_this_step && ch_pos == fixed_ch_pos &&
344           matrix_[fixed_ch_pos].dmi_num > 0) {
345         dict_handles_to_clear = dmi_pool_[matrix_[fixed_ch_pos].dmi_pos].dict_handles;
346       }
347 
348       // If there are more steps, and this step is not allowed to clear, find
349       // milestones of next step.
350       if (pys_decoded_len_ > fixed_ch_pos && !clear_dmi_this_step) {
351         dict_handles_to_clear = NULL;
352         if (matrix_[fixed_ch_pos + 1].dmi_num > 0) {
353           dict_handles_to_clear =
354               dmi_pool_[matrix_[fixed_ch_pos + 1].dmi_pos].dict_handles;
355         }
356       }
357 
358       if (NULL != dict_handles_to_clear) {
359         dict_trie_->reset_milestones(fixed_ch_pos, dict_handles_to_clear[0]);
360         if (NULL != user_dict_)
361           user_dict_->reset_milestones(fixed_ch_pos, dict_handles_to_clear[1]);
362       }
363 
364 
365       pys_decoded_len_ = fixed_ch_pos;
366 
367       if (clear_dmi_this_step && ch_pos == fixed_ch_pos) {
368         dmi_pool_used_ = matrix_[fixed_ch_pos - 1].dmi_pos
369                          + matrix_[fixed_ch_pos - 1].dmi_num;
370         matrix_[fixed_ch_pos].dmi_num = 0;
371       } else {
372         dmi_pool_used_ = matrix_[fixed_ch_pos].dmi_pos +
373             matrix_[fixed_ch_pos].dmi_num;
374       }
375 
376       if (clear_mtrx_this_step && ch_pos == fixed_ch_pos) {
377         mtrx_nd_pool_used_ = matrix_[fixed_ch_pos - 1].mtrx_nd_pos
378                              + matrix_[fixed_ch_pos - 1].mtrx_nd_num;
379         matrix_[fixed_ch_pos].mtrx_nd_num = 0;
380       } else {
381         mtrx_nd_pool_used_ = matrix_[fixed_ch_pos].mtrx_nd_pos
382                              + matrix_[fixed_ch_pos].mtrx_nd_num;
383       }
384 
385       for (uint16 re_pos = fixed_ch_pos; re_pos < ch_pos; re_pos++) {
386         add_char(pys_[re_pos]);
387       }
388     } else if (fixed_hzs_ > 0 && kLemmaIdComposing == lma_id_[0]) {
389       for (uint16 subpos = 0; subpos < c_phrase_.sublma_num; subpos++) {
390         uint16 splpos_begin = c_phrase_.sublma_start[subpos];
391         uint16 splpos_end = c_phrase_.sublma_start[subpos + 1];
392         for (uint16 splpos = splpos_begin; splpos < splpos_end; splpos++) {
393           // If ch_pos is in this spelling
394           uint16 spl_start = c_phrase_.spl_start[splpos];
395           uint16 spl_end = c_phrase_.spl_start[splpos + 1];
396           if (ch_pos >= spl_start && ch_pos < spl_end) {
397             // Clear everything after this position
398             c_phrase_.chn_str[splpos] = static_cast<char16>('\0');
399             c_phrase_.sublma_start[subpos + 1] = splpos;
400             c_phrase_.sublma_num = subpos + 1;
401             c_phrase_.length = splpos;
402 
403             if (splpos == splpos_begin) {
404               c_phrase_.sublma_num = subpos;
405             }
406           }
407         }
408       }
409 
410       // Extend the composing phrase.
411       reset_search0();
412       dmi_c_phrase_ = true;
413       uint16 c_py_pos = 0;
414       while (c_py_pos < spl_start_[c_phrase_.length]) {
415         bool b_ac_tmp = add_char(pys_[c_py_pos]);
416         assert(b_ac_tmp);
417         c_py_pos++;
418       }
419       dmi_c_phrase_ = false;
420 
421       lma_id_num_ = 1;
422       fixed_lmas_ = 1;
423       fixed_lmas_no1_[0] = 0;  // A composing string is always modified.
424       fixed_hzs_ = c_phrase_.length;
425       lma_start_[1] = fixed_hzs_;
426       lma_id_[0] = kLemmaIdComposing;
427       matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ +
428           matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos;
429     }
430   }
431 
432   return true;
433 }
434 
del_in_pys(size_t start,size_t len)435 void MatrixSearch::del_in_pys(size_t start, size_t len) {
436   while (start < kMaxRowNum - len && '\0' != pys_[start]) {
437     pys_[start] = pys_[start + len];
438     start++;
439   }
440 }
441 
search(const char * py,size_t py_len)442 size_t MatrixSearch::search(const char *py, size_t py_len) {
443   if (!inited_ || NULL == py)
444     return 0;
445 
446   // If the search Pinyin string is too long, it will be truncated.
447   if (py_len > kMaxRowNum - 1)
448     py_len = kMaxRowNum - 1;
449 
450   // Compare the new string with the previous one. Find their prefix to
451   // increase search efficiency.
452   size_t ch_pos = 0;
453   for (ch_pos = 0; ch_pos < pys_decoded_len_; ch_pos++) {
454     if ('\0' == py[ch_pos] || py[ch_pos] != pys_[ch_pos])
455       break;
456   }
457 
458   bool clear_fix = true;
459   if (ch_pos == pys_decoded_len_)
460     clear_fix = false;
461 
462   reset_search(ch_pos, clear_fix, false, false);
463 
464   memcpy(pys_ + ch_pos, py + ch_pos, py_len - ch_pos);
465   pys_[py_len] = '\0';
466 
467   while ('\0' != pys_[ch_pos]) {
468     if (!add_char(py[ch_pos])) {
469       pys_decoded_len_ = ch_pos;
470       break;
471     }
472     ch_pos++;
473   }
474 
475   // Get spelling ids and starting positions.
476   get_spl_start_id();
477 
478   // If there are too many spellings, remove the last letter until the spelling
479   // number is acceptable.
480   while (spl_id_num_ > 9) {
481     py_len--;
482     reset_search(py_len, false, false, false);
483     pys_[py_len] = '\0';
484     get_spl_start_id();
485   }
486 
487   prepare_candidates();
488 
489   if (kPrintDebug0) {
490     printf("--Matrix Node Pool Used: %d\n", mtrx_nd_pool_used_);
491     printf("--DMI Pool Used: %d\n", dmi_pool_used_);
492 
493     if (kPrintDebug1) {
494       for (PoolPosType pos = 0; pos < dmi_pool_used_; pos++) {
495         debug_print_dmi(pos, 1);
496       }
497     }
498   }
499 
500   return ch_pos;
501 }
502 
delsearch(size_t pos,bool is_pos_in_splid,bool clear_fixed_this_step)503 size_t MatrixSearch::delsearch(size_t pos, bool is_pos_in_splid,
504                                bool clear_fixed_this_step) {
505   if (!inited_)
506     return 0;
507 
508   size_t reset_pos = pos;
509 
510   // Out of range for both Pinyin mode and Spelling id mode.
511   if (pys_decoded_len_ <= pos) {
512     del_in_pys(pos, 1);
513 
514     reset_pos = pys_decoded_len_;
515     // Decode the string after the un-decoded position
516     while ('\0' != pys_[reset_pos]) {
517       if (!add_char(pys_[reset_pos])) {
518         pys_decoded_len_ = reset_pos;
519         break;
520       }
521       reset_pos++;
522     }
523     get_spl_start_id();
524     prepare_candidates();
525     return pys_decoded_len_;
526   }
527 
528   // Spelling id mode, but out of range.
529   if (is_pos_in_splid && spl_id_num_ <= pos)
530     return pys_decoded_len_;
531 
532   // Begin to handle two modes respectively.
533   // Pinyin mode by default
534   size_t c_py_len = 0;  // The length of composing phrase's Pinyin
535   size_t del_py_len = 1;
536   if (!is_pos_in_splid) {
537     // Pinyin mode is only allowed to delete beyond the fixed lemmas.
538     if (fixed_lmas_ > 0 && pos < spl_start_[lma_start_[fixed_lmas_]])
539       return pys_decoded_len_;
540 
541     del_in_pys(pos, 1);
542 
543     // If the deleted character is just the one after the last fixed lemma
544     if (pos == spl_start_[lma_start_[fixed_lmas_]]) {
545       // If all fixed lemmas have been merged, and the caller of the function
546       // request to unlock the last fixed lemma.
547       if (kLemmaIdComposing == lma_id_[0] && clear_fixed_this_step) {
548         // Unlock the last sub lemma in the composing phrase. Because it is not
549         // easy to unlock it directly. Instead, we re-decode the modified
550         // composing phrase.
551         c_phrase_.sublma_num--;
552         c_phrase_.length = c_phrase_.sublma_start[c_phrase_.sublma_num];
553         reset_pos = spl_start_[c_phrase_.length];
554         c_py_len = reset_pos;
555       }
556     }
557   } else {
558     del_py_len = spl_start_[pos + 1] - spl_start_[pos];
559 
560     del_in_pys(spl_start_[pos], del_py_len);
561 
562     if (pos >= lma_start_[fixed_lmas_]) {
563       c_py_len = 0;
564       reset_pos = spl_start_[pos + 1] - del_py_len;
565     } else {
566       c_py_len = spl_start_[lma_start_[fixed_lmas_]] - del_py_len;
567       reset_pos = c_py_len;
568       if (c_py_len > 0)
569         merge_fixed_lmas(pos);
570     }
571   }
572 
573   if (c_py_len > 0) {
574     assert(c_phrase_.length > 0 && c_py_len ==
575         c_phrase_.spl_start[c_phrase_.sublma_start[c_phrase_.sublma_num]]);
576     // The composing phrase is valid, reset all search space,
577     // and begin a new search which will only extend the composing
578     // phrase.
579     reset_search0();
580 
581     dmi_c_phrase_ = true;
582     // Extend the composing phrase.
583     uint16 c_py_pos = 0;
584     while (c_py_pos < c_py_len) {
585       bool b_ac_tmp = add_char(pys_[c_py_pos]);
586       assert(b_ac_tmp);
587       c_py_pos++;
588     }
589     dmi_c_phrase_ = false;
590 
591     // Fixd the composing phrase as the first choice.
592     lma_id_num_ = 1;
593     fixed_lmas_ = 1;
594     fixed_lmas_no1_[0] = 0;  // A composing string is always modified.
595     fixed_hzs_ = c_phrase_.length;
596     lma_start_[1] = fixed_hzs_;
597     lma_id_[0] = kLemmaIdComposing;
598     matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ +
599         matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos;
600   } else {
601     // Reseting search only clear pys_decoded_len_, but the string is kept.
602     reset_search(reset_pos, clear_fixed_this_step, false, false);
603   }
604 
605   // Decode the string after the delete position.
606   while ('\0' != pys_[reset_pos]) {
607     if (!add_char(pys_[reset_pos])) {
608       pys_decoded_len_ = reset_pos;
609       break;
610     }
611     reset_pos++;
612   }
613 
614   get_spl_start_id();
615   prepare_candidates();
616   return pys_decoded_len_;
617 }
618 
get_candidate_num()619 size_t MatrixSearch::get_candidate_num() {
620   if (!inited_ || 0 == pys_decoded_len_ ||
621       0 == matrix_[pys_decoded_len_].mtrx_nd_num)
622     return 0;
623 
624   return 1 + lpi_total_;
625 }
626 
get_candidate(size_t cand_id,char16 * cand_str,size_t max_len)627 char16* MatrixSearch::get_candidate(size_t cand_id, char16 *cand_str,
628                                     size_t max_len) {
629   if (!inited_ || 0 == pys_decoded_len_ || NULL == cand_str)
630     return NULL;
631 
632   if (0 == cand_id) {
633     return get_candidate0(cand_str, max_len, NULL, false);
634   } else {
635     cand_id--;
636   }
637 
638   // For this case: the current sentence is a word only, and the user fixed it,
639   // so the result will be fixed to the sentence space, and
640   // lpi_total_ will be set to 0.
641   if (0 == lpi_total_) {
642     return get_candidate0(cand_str, max_len, NULL, false);
643   }
644 
645   LemmaIdType id = lpi_items_[cand_id].id;
646   char16 s[kMaxLemmaSize + 1];
647 
648   uint16 s_len = lpi_items_[cand_id].lma_len;
649   if (s_len > 1) {
650     s_len = get_lemma_str(id, s, kMaxLemmaSize + 1);
651   } else {
652     // For a single character, Hanzi is ready.
653     s[0] = lpi_items_[cand_id].hanzi;
654     s[1] = static_cast<char16>(0);
655   }
656 
657   if (s_len > 0 &&  max_len > s_len) {
658     utf16_strncpy(cand_str, s, s_len);
659     cand_str[s_len] = (char16)'\0';
660     return cand_str;
661   }
662 
663   return NULL;
664 }
665 
update_dict_freq()666 void MatrixSearch::update_dict_freq() {
667   if (NULL != user_dict_) {
668     // Update the total frequency of all lemmas, including system lemmas and
669     // user dictionary lemmas.
670     size_t total_freq = user_dict_->get_total_lemma_count();
671     dict_trie_->set_total_lemma_count_of_others(total_freq);
672   }
673 }
674 
add_lma_to_userdict(uint16 lma_fr,uint16 lma_to,float score)675 bool MatrixSearch::add_lma_to_userdict(uint16 lma_fr, uint16 lma_to,
676                                        float score) {
677   if (lma_to - lma_fr <= 1 || NULL == user_dict_)
678     return false;
679 
680   char16 word_str[kMaxLemmaSize + 1];
681   uint16 spl_ids[kMaxLemmaSize];
682 
683   uint16 spl_id_fr = 0;
684 
685   for (uint16 pos = lma_fr; pos < lma_to; pos++) {
686     LemmaIdType lma_id = lma_id_[pos];
687     if (is_user_lemma(lma_id)) {
688       user_dict_->update_lemma(lma_id, 1, true);
689     }
690     uint16 lma_len = lma_start_[pos + 1] - lma_start_[pos];
691     utf16_strncpy(spl_ids + spl_id_fr, spl_id_ + lma_start_[pos], lma_len);
692 
693     uint16 tmp = get_lemma_str(lma_id, word_str + spl_id_fr,
694                                kMaxLemmaSize + 1 - spl_id_fr);
695     assert(tmp == lma_len);
696 
697     tmp = get_lemma_splids(lma_id, spl_ids + spl_id_fr, lma_len, true);
698     if (tmp != lma_len) {
699       return false;
700     }
701 
702     spl_id_fr += lma_len;
703   }
704 
705   assert(spl_id_fr <= kMaxLemmaSize);
706 
707   return user_dict_->put_lemma(static_cast<char16*>(word_str), spl_ids,
708                                  spl_id_fr, 1);
709 }
710 
debug_print_dmi(PoolPosType dmi_pos,uint16 nest_level)711 void MatrixSearch::debug_print_dmi(PoolPosType dmi_pos, uint16 nest_level) {
712   if (dmi_pos >= dmi_pool_used_) return;
713 
714   DictMatchInfo *dmi = dmi_pool_ + dmi_pos;
715 
716   if (1 == nest_level) {
717     printf("-----------------%d\'th DMI node begin----------->\n", dmi_pos);
718   }
719   if (dmi->dict_level > 1) {
720     debug_print_dmi(dmi->dmi_fr, nest_level + 1);
721   }
722   printf("---%d\n", dmi->dict_level);
723   printf(" MileStone: %x, %x\n", dmi->dict_handles[0], dmi->dict_handles[1]);
724   printf(" Spelling : %s, %d\n", SpellingTrie::get_instance().
725          get_spelling_str(dmi->spl_id), dmi->spl_id);
726   printf(" Total Pinyin Len: %d\n", dmi->splstr_len);
727   if (1 == nest_level) {
728     printf("<----------------%d\'th DMI node end--------------\n\n", dmi_pos);
729   }
730 }
731 
try_add_cand0_to_userdict()732 bool MatrixSearch::try_add_cand0_to_userdict() {
733   size_t new_cand_num = get_candidate_num();
734   if (fixed_hzs_ > 0 && 1 == new_cand_num) {
735     float score_from = 0;
736     uint16 lma_id_from = 0;
737     uint16 pos = 0;
738     bool modified = false;
739     while (pos < fixed_lmas_) {
740       if (lma_start_[pos + 1] - lma_start_[lma_id_from] >
741           static_cast<uint16>(kMaxLemmaSize)) {
742         float score_to_add =
743             mtrx_nd_pool_[matrix_[spl_start_[lma_start_[pos]]]
744             .mtrx_nd_pos].score - score_from;
745         if (modified) {
746           score_to_add += 1.0;
747           if (score_to_add > NGram::kMaxScore) {
748             score_to_add = NGram::kMaxScore;
749           }
750           add_lma_to_userdict(lma_id_from, pos, score_to_add);
751         }
752         lma_id_from = pos;
753         score_from += score_to_add;
754 
755         // Clear the flag for next user lemma.
756         modified = false;
757       }
758 
759       if (0 == fixed_lmas_no1_[pos]) {
760         modified = true;
761       }
762       pos++;
763     }
764 
765     // Single-char word is not allowed to add to userdict.
766     if (lma_start_[pos] - lma_start_[lma_id_from] > 1) {
767       float score_to_add =
768           mtrx_nd_pool_[matrix_[spl_start_[lma_start_[pos]]]
769           .mtrx_nd_pos].score - score_from;
770       if (modified) {
771         score_to_add += 1.0;
772         if (score_to_add > NGram::kMaxScore) {
773           score_to_add = NGram::kMaxScore;
774         }
775         add_lma_to_userdict(lma_id_from, pos, score_to_add);
776       }
777     }
778   }
779   return true;
780 }
781 
782 // Choose a candidate, and give new candidates for next step.
783 // If user finishes selection, we will try to communicate with user dictionary
784 // to add new items or update score of some existing items.
785 //
786 // Basic rule:
787 // 1. If user selects the first choice:
788 //    1.1. If the first choice is not a sentence, instead, it is a lemma:
789 //         1.1.1. If the first choice is a user lemma, notify the user
790 //                dictionary that a user lemma is hit, and add occuring count
791 //                by 1.
792 //         1.1.2. If the first choice is a system lemma, do nothing.
793 //    1.2. If the first choice is a sentence containing more than one lemma:
794 //         1.2.1. The whole sentence will be added as a user lemma. If the
795 //                sentence contains user lemmas, -> hit, and add occuring count
796 //                by 1.
choose(size_t cand_id)797 size_t MatrixSearch::choose(size_t cand_id) {
798   if (!inited_ || 0 == pys_decoded_len_)
799     return 0;
800 
801   if (0 == cand_id) {
802     fixed_hzs_ = spl_id_num_;
803     matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ +
804         matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos;
805     for (size_t pos = fixed_lmas_; pos < lma_id_num_; pos++) {
806       fixed_lmas_no1_[pos] = 1;
807     }
808     fixed_lmas_ = lma_id_num_;
809     lpi_total_ = 0;  // Clean all other candidates.
810 
811     // 1. It is the first choice
812     if (1 == lma_id_num_) {
813       // 1.1. The first choice is not a sentence but a lemma
814       if (is_user_lemma(lma_id_[0])) {
815         // 1.1.1. The first choice is a user lemma, notify the user dictionary
816         // that it is hit.
817         if (NULL != user_dict_)
818           user_dict_->update_lemma(lma_id_[0], 1, true);
819       } else {
820         // 1.1.2. do thing for a system lemma.
821       }
822     } else {
823       // 1.2. The first choice is a sentence.
824       // 1.2.1 Try to add the whole sentence to user dictionary, the whole
825       // sentence may be splitted into many items.
826       if (NULL != user_dict_) {
827         try_add_cand0_to_userdict();
828       }
829     }
830     update_dict_freq();
831     return 1;
832   } else {
833     cand_id--;
834   }
835 
836   // 2. It is not the full sentence candidate.
837   // Find the length of the candidate.
838   LemmaIdType id_chosen = lpi_items_[cand_id].id;
839   LmaScoreType score_chosen = lpi_items_[cand_id].psb;
840   size_t cand_len = lpi_items_[cand_id].lma_len;
841 
842   assert(cand_len > 0);
843 
844   // Notify the atom dictionary that this item is hit.
845   if (is_user_lemma(id_chosen)) {
846     if (NULL != user_dict_) {
847       user_dict_->update_lemma(id_chosen, 1, true);
848     }
849     update_dict_freq();
850   }
851 
852   // 3. Fixed the chosen item.
853   // 3.1 Get the steps number.
854   size_t step_fr = spl_start_[fixed_hzs_];
855   size_t step_to = spl_start_[fixed_hzs_ + cand_len];
856 
857   // 3.2 Save the length of the original string.
858   size_t pys_decoded_len = pys_decoded_len_;
859 
860   // 3.2 Reset the space of the fixed part.
861   reset_search(step_to, false, false, true);
862 
863   // 3.3 For the last character of the fixed part, the previous DMI
864   // information will be kept, while the MTRX information will be re-extended,
865   // and only one node will be extended.
866   matrix_[step_to].mtrx_nd_num = 0;
867 
868   LmaPsbItem lpi_item;
869   lpi_item.psb = score_chosen;
870   lpi_item.id = id_chosen;
871 
872   PoolPosType step_to_dmi_fr = match_dmi(step_to,
873                                          spl_id_ + fixed_hzs_, cand_len);
874   //assert(step_to_dmi_fr != static_cast<PoolPosType>(-1));
875 
876   extend_mtrx_nd(matrix_[step_fr].mtrx_nd_fixed, &lpi_item, 1,
877                  step_to_dmi_fr, step_to);
878 
879   matrix_[step_to].mtrx_nd_fixed = mtrx_nd_pool_ + matrix_[step_to].mtrx_nd_pos;
880   mtrx_nd_pool_used_ = matrix_[step_to].mtrx_nd_pos +
881                        matrix_[step_to].mtrx_nd_num;
882 
883   if (id_chosen == lma_id_[fixed_lmas_])
884     fixed_lmas_no1_[fixed_lmas_] = 1;
885   else
886     fixed_lmas_no1_[fixed_lmas_] = 0;
887   lma_id_[fixed_lmas_] = id_chosen;
888   lma_start_[fixed_lmas_ + 1] = lma_start_[fixed_lmas_] + cand_len;
889   fixed_lmas_++;
890   fixed_hzs_ = fixed_hzs_ + cand_len;
891 
892   while (step_to != pys_decoded_len) {
893     bool b = add_char(pys_[step_to]);
894     assert(b);
895     step_to++;
896   }
897 
898   if (fixed_hzs_ < spl_id_num_) {
899     prepare_candidates();
900   } else {
901     lpi_total_ = 0;
902     if (NULL != user_dict_) {
903       try_add_cand0_to_userdict();
904     }
905   }
906 
907   return get_candidate_num();
908 }
909 
cancel_last_choice()910 size_t MatrixSearch::cancel_last_choice() {
911   if (!inited_ || 0 == pys_decoded_len_)
912     return 0;
913 
914   size_t step_start = 0;
915   if (fixed_hzs_ > 0) {
916     size_t step_end = spl_start_[fixed_hzs_];
917     MatrixNode *end_node = matrix_[step_end].mtrx_nd_fixed;
918     assert(NULL != end_node);
919 
920     step_start = end_node->from->step;
921 
922     if (step_start > 0) {
923       DictMatchInfo *dmi = dmi_pool_ + end_node->dmi_fr;
924       fixed_hzs_ -= dmi->dict_level;
925     } else {
926       fixed_hzs_ = 0;
927     }
928 
929     reset_search(step_start, false, false, false);
930 
931     while (pys_[step_start] != '\0') {
932       bool b = add_char(pys_[step_start]);
933       assert(b);
934       step_start++;
935     }
936 
937     prepare_candidates();
938   }
939   return get_candidate_num();
940 }
941 
get_fixedlen()942 size_t MatrixSearch::get_fixedlen() {
943   if (!inited_ || 0 == pys_decoded_len_)
944     return 0;
945   return fixed_hzs_;
946 }
947 
prepare_add_char(char ch)948 bool MatrixSearch::prepare_add_char(char ch) {
949   if (pys_decoded_len_ >= kMaxRowNum - 1 ||
950       (!spl_parser_->is_valid_to_parse(ch) && ch != '\''))
951     return false;
952 
953   if (dmi_pool_used_ >= kDmiPoolSize) return false;
954 
955   pys_[pys_decoded_len_] = ch;
956   pys_decoded_len_++;
957 
958   MatrixRow *mtrx_this_row = matrix_ + pys_decoded_len_;
959   mtrx_this_row->mtrx_nd_pos = mtrx_nd_pool_used_;
960   mtrx_this_row->mtrx_nd_num = 0;
961   mtrx_this_row->dmi_pos = dmi_pool_used_;
962   mtrx_this_row->dmi_num = 0;
963   mtrx_this_row->dmi_has_full_id = 0;
964 
965   return true;
966 }
967 
is_split_at(uint16 pos)968 bool MatrixSearch::is_split_at(uint16 pos) {
969   return !spl_parser_->is_valid_to_parse(pys_[pos - 1]);
970 }
971 
fill_dmi(DictMatchInfo * dmi,MileStoneHandle * handles,PoolPosType dmi_fr,uint16 spl_id,uint16 node_num,unsigned char dict_level,bool splid_end_split,unsigned char splstr_len,unsigned char all_full_id)972 void MatrixSearch::fill_dmi(DictMatchInfo *dmi, MileStoneHandle *handles,
973                             PoolPosType dmi_fr, uint16 spl_id,
974                             uint16 node_num, unsigned char dict_level,
975                             bool splid_end_split, unsigned char splstr_len,
976                             unsigned char all_full_id) {
977   dmi->dict_handles[0] = handles[0];
978   dmi->dict_handles[1] = handles[1];
979   dmi->dmi_fr = dmi_fr;
980   dmi->spl_id = spl_id;
981   dmi->dict_level = dict_level;
982   dmi->splid_end_split = splid_end_split ? 1 : 0;
983   dmi->splstr_len = splstr_len;
984   dmi->all_full_id = all_full_id;
985   dmi->c_phrase = 0;
986 }
987 
add_char(char ch)988 bool MatrixSearch::add_char(char ch) {
989   if (!prepare_add_char(ch))
990     return false;
991   return add_char_qwerty();
992 }
993 
add_char_qwerty()994 bool MatrixSearch::add_char_qwerty() {
995   matrix_[pys_decoded_len_].mtrx_nd_num = 0;
996 
997   bool spl_matched = false;
998   uint16 longest_ext = 0;
999   // Extend the search matrix, from the oldest unfixed row. ext_len means
1000   // extending length.
1001   for (uint16 ext_len = kMaxPinyinSize + 1; ext_len > 0; ext_len--) {
1002     if (ext_len > pys_decoded_len_ - spl_start_[fixed_hzs_])
1003       continue;
1004 
1005     // Refer to the declaration of the variable dmi_has_full_id for the
1006     // explanation of this piece of code. In one word, it is used to prevent
1007     // from the unwise extending of "shoud ou" but allow the reasonable
1008     // extending of "heng ao", "lang a", etc.
1009     if (ext_len > 1 && 0 != longest_ext &&
1010         0 == matrix_[pys_decoded_len_ - ext_len].dmi_has_full_id) {
1011       if (xi_an_enabled_)
1012         continue;
1013       else
1014         break;
1015     }
1016 
1017     uint16 oldrow = pys_decoded_len_ - ext_len;
1018 
1019     // 0. If that row is before the last fixed step, ignore.
1020     if (spl_start_[fixed_hzs_] > oldrow)
1021       continue;
1022 
1023     // 1. Check if that old row has valid MatrixNode. If no, means that row is
1024     // not a boundary, either a word boundary or a spelling boundary.
1025     // If it is for extending composing phrase, it's OK to ignore the 0.
1026     if (0 == matrix_[oldrow].mtrx_nd_num && !dmi_c_phrase_)
1027       continue;
1028 
1029     // 2. Get spelling id(s) for the last ext_len chars.
1030     uint16 spl_idx;
1031     bool is_pre = false;
1032     spl_idx = spl_parser_->get_splid_by_str(pys_ + oldrow,
1033                                             ext_len, &is_pre);
1034     if (is_pre)
1035       spl_matched = true;
1036 
1037     if (0 == spl_idx)
1038       continue;
1039 
1040     bool splid_end_split = is_split_at(oldrow + ext_len);
1041 
1042     // 3. Extend the DMI nodes of that old row
1043     // + 1 is to extend an extra node from the root
1044     for (PoolPosType dmi_pos = matrix_[oldrow].dmi_pos;
1045          dmi_pos < matrix_[oldrow].dmi_pos + matrix_[oldrow].dmi_num + 1;
1046          dmi_pos++) {
1047       DictMatchInfo *dmi = dmi_pool_ + dmi_pos;
1048       if (dmi_pos == matrix_[oldrow].dmi_pos + matrix_[oldrow].dmi_num) {
1049         dmi = NULL;  // The last one, NULL means extending from the root.
1050       } else {
1051         // If the dmi is covered by the fixed arrange, ignore it.
1052         if (fixed_hzs_ > 0 &&
1053             pys_decoded_len_ - ext_len - dmi->splstr_len <
1054             spl_start_[fixed_hzs_]) {
1055           continue;
1056         }
1057         // If it is not in mode for composing phrase, and the source DMI node
1058         // is marked for composing phrase, ignore this node.
1059         if (dmi->c_phrase != 0 && !dmi_c_phrase_) {
1060           continue;
1061         }
1062       }
1063 
1064       // For example, if "gao" is extended, "g ao" is not allowed.
1065       // or "zh" has been passed, "z h" is not allowed.
1066       // Both word and word-connection will be prevented.
1067       if (longest_ext > ext_len) {
1068         if (NULL == dmi && 0 == matrix_[oldrow].dmi_has_full_id) {
1069           continue;
1070         }
1071 
1072         // "z h" is not allowed.
1073         if (NULL != dmi && spl_trie_->is_half_id(dmi->spl_id)) {
1074           continue;
1075         }
1076       }
1077 
1078       dep_->splids_extended = 0;
1079       if (NULL != dmi) {
1080         uint16 prev_ids_num = dmi->dict_level;
1081         if ((!dmi_c_phrase_ && prev_ids_num >= kMaxLemmaSize) ||
1082             (dmi_c_phrase_ && prev_ids_num >=  kMaxRowNum)) {
1083           continue;
1084         }
1085 
1086         DictMatchInfo *d = dmi;
1087         while (d) {
1088           dep_->splids[--prev_ids_num] = d->spl_id;
1089           if ((PoolPosType)-1 == d->dmi_fr)
1090             break;
1091           d = dmi_pool_ + d->dmi_fr;
1092         }
1093         assert(0 == prev_ids_num);
1094         dep_->splids_extended = dmi->dict_level;
1095       }
1096       dep_->splids[dep_->splids_extended] = spl_idx;
1097       dep_->ext_len = ext_len;
1098       dep_->splid_end_split = splid_end_split;
1099 
1100       dep_->id_num = 1;
1101       dep_->id_start = spl_idx;
1102       if (spl_trie_->is_half_id(spl_idx)) {
1103         // Get the full id list
1104         dep_->id_num = spl_trie_->half_to_full(spl_idx, &(dep_->id_start));
1105         assert(dep_->id_num > 0);
1106       }
1107 
1108       uint16 new_dmi_num;
1109 
1110       new_dmi_num = extend_dmi(dep_, dmi);
1111 
1112       if (new_dmi_num > 0) {
1113         if (dmi_c_phrase_) {
1114           dmi_pool_[dmi_pool_used_].c_phrase = 1;
1115         }
1116         matrix_[pys_decoded_len_].dmi_num += new_dmi_num;
1117         dmi_pool_used_ += new_dmi_num;
1118 
1119         if (!spl_trie_->is_half_id(spl_idx))
1120           matrix_[pys_decoded_len_].dmi_has_full_id = 1;
1121       }
1122 
1123       // If get candiate lemmas, try to extend the path
1124       if (lpi_total_ > 0) {
1125         uint16 fr_row;
1126         if (NULL == dmi) {
1127           fr_row = oldrow;
1128         } else {
1129           assert(oldrow >= dmi->splstr_len);
1130           fr_row = oldrow - dmi->splstr_len;
1131         }
1132         for (PoolPosType mtrx_nd_pos = matrix_[fr_row].mtrx_nd_pos;
1133              mtrx_nd_pos < matrix_[fr_row].mtrx_nd_pos +
1134              matrix_[fr_row].mtrx_nd_num;
1135              mtrx_nd_pos++) {
1136           MatrixNode *mtrx_nd = mtrx_nd_pool_ + mtrx_nd_pos;
1137 
1138           extend_mtrx_nd(mtrx_nd, lpi_items_, lpi_total_,
1139                          dmi_pool_used_ - new_dmi_num, pys_decoded_len_);
1140           if (longest_ext == 0)
1141             longest_ext = ext_len;
1142         }
1143       }
1144     }  // for dmi_pos
1145   }  // for ext_len
1146   mtrx_nd_pool_used_ += matrix_[pys_decoded_len_].mtrx_nd_num;
1147 
1148   if (dmi_c_phrase_)
1149     return true;
1150 
1151   return (matrix_[pys_decoded_len_].mtrx_nd_num != 0 || spl_matched);
1152 }
1153 
prepare_candidates()1154 void MatrixSearch::prepare_candidates() {
1155   // Get candiates from the first un-fixed step.
1156   uint16 lma_size_max = kMaxLemmaSize;
1157   if (lma_size_max > spl_id_num_ - fixed_hzs_)
1158     lma_size_max = spl_id_num_ - fixed_hzs_;
1159 
1160   uint16 lma_size = lma_size_max;
1161 
1162   // If the full sentense candidate's unfixed part may be the same with a normal
1163   // lemma. Remove the lemma candidate in this case.
1164   char16 fullsent[kMaxLemmaSize + 1];
1165   char16 *pfullsent = NULL;
1166   uint16 sent_len;
1167   pfullsent = get_candidate0(fullsent, kMaxLemmaSize + 1, &sent_len, true);
1168 
1169   // If the unfixed part contains more than one ids, it is not necessary to
1170   // check whether a lemma's string is the same to the unfixed part of the full
1171   // sentence candidate, so, set it to NULL;
1172   if (sent_len > kMaxLemmaSize)
1173     pfullsent = NULL;
1174 
1175   lpi_total_ = 0;
1176   size_t lpi_num_full_match = 0;  // Number of items which are fully-matched.
1177   while (lma_size > 0) {
1178     size_t lma_num;
1179     lma_num = get_lpis(spl_id_ + fixed_hzs_, lma_size,
1180                        lpi_items_ + lpi_total_,
1181                        size_t(kMaxLmaPsbItems - lpi_total_),
1182                        pfullsent, lma_size == lma_size_max);
1183 
1184     if (lma_num > 0) {
1185       lpi_total_ += lma_num;
1186       // For next lemma candidates which are not the longest, it is not
1187       // necessary to compare with the full sentence candiate.
1188       pfullsent = NULL;
1189     }
1190     if (lma_size == lma_size_max) {
1191       lpi_num_full_match = lpi_total_;
1192     }
1193     lma_size--;
1194   }
1195 
1196   // Sort those partially-matched items by their unified scores.
1197   myqsort(lpi_items_ + lpi_num_full_match, lpi_total_ - lpi_num_full_match,
1198           sizeof(LmaPsbItem), cmp_lpi_with_unified_psb);
1199 
1200   if (kPrintDebug0) {
1201     printf("-----Prepare candidates, score:\n");
1202     for (size_t a = 0; a < lpi_total_; a++) {
1203       printf("[%03d]%d    ", a, lpi_items_[a].psb);
1204       if ((a + 1) % 6 == 0) printf("\n");
1205     }
1206     printf("\n");
1207   }
1208 
1209   if (kPrintDebug0) {
1210     printf("--- lpi_total_ = %d\n", lpi_total_);
1211   }
1212 }
1213 
get_pystr(size_t * decoded_len)1214 const char* MatrixSearch::get_pystr(size_t *decoded_len) {
1215   if (!inited_ || NULL == decoded_len)
1216     return NULL;
1217 
1218   *decoded_len = pys_decoded_len_;
1219   return pys_;
1220 }
1221 
merge_fixed_lmas(size_t del_spl_pos)1222 void MatrixSearch::merge_fixed_lmas(size_t del_spl_pos) {
1223   if (fixed_lmas_ == 0)
1224     return;
1225   // Update spelling segmentation information first.
1226   spl_id_num_ -= 1;
1227   uint16 del_py_len = spl_start_[del_spl_pos + 1] - spl_start_[del_spl_pos];
1228   for (size_t pos = del_spl_pos; pos <= spl_id_num_; pos++) {
1229     spl_start_[pos] = spl_start_[pos + 1] - del_py_len;
1230     if (pos == spl_id_num_)
1231       break;
1232     spl_id_[pos] = spl_id_[pos + 1];
1233   }
1234 
1235   // Begin to merge.
1236   uint16 phrase_len = 0;
1237 
1238   // Update the spelling ids to the composing phrase.
1239   // We need to convert these ids into full id in the future.
1240   memcpy(c_phrase_.spl_ids, spl_id_, spl_id_num_ * sizeof(uint16));
1241   memcpy(c_phrase_.spl_start, spl_start_, (spl_id_num_ + 1) * sizeof(uint16));
1242 
1243   // If composing phrase has not been created, first merge all fixed
1244   //  lemmas into a composing phrase without deletion.
1245   if (fixed_lmas_ > 1 || kLemmaIdComposing != lma_id_[0]) {
1246     uint16 bp = 1;  // Begin position of real fixed lemmas.
1247     // There is no existing composing phrase.
1248     if (kLemmaIdComposing != lma_id_[0]) {
1249       c_phrase_.sublma_num = 0;
1250       bp = 0;
1251     }
1252 
1253     uint16 sub_num = c_phrase_.sublma_num;
1254     for (uint16 pos = bp; pos <= fixed_lmas_; pos++) {
1255       c_phrase_.sublma_start[sub_num + pos - bp] = lma_start_[pos];
1256       if (lma_start_[pos] > del_spl_pos) {
1257         c_phrase_.sublma_start[sub_num + pos - bp] -= 1;
1258       }
1259 
1260       if (pos == fixed_lmas_)
1261         break;
1262 
1263       uint16 lma_len;
1264       char16 *lma_str = c_phrase_.chn_str +
1265           c_phrase_.sublma_start[sub_num] + phrase_len;
1266 
1267       lma_len = get_lemma_str(lma_id_[pos], lma_str, kMaxRowNum - phrase_len);
1268       assert(lma_len == lma_start_[pos + 1] - lma_start_[pos]);
1269       phrase_len += lma_len;
1270     }
1271     assert(phrase_len == lma_start_[fixed_lmas_]);
1272     c_phrase_.length = phrase_len;  // will be deleted by 1
1273     c_phrase_.sublma_num += fixed_lmas_ - bp;
1274   } else {
1275     for (uint16 pos = 0; pos <= c_phrase_.sublma_num; pos++) {
1276       if (c_phrase_.sublma_start[pos] > del_spl_pos) {
1277         c_phrase_.sublma_start[pos] -= 1;
1278       }
1279     }
1280     phrase_len = c_phrase_.length;
1281   }
1282 
1283   assert(phrase_len > 0);
1284   if (1 == phrase_len) {
1285     // After the only one is deleted, nothing will be left.
1286     fixed_lmas_ = 0;
1287     return;
1288   }
1289 
1290   // Delete the Chinese character in the merged phrase.
1291   // The corresponding elements in spl_ids and spl_start of the
1292   // phrase have been deleted.
1293   char16 *chn_str = c_phrase_.chn_str + del_spl_pos;
1294   for (uint16 pos = 0;
1295       pos < c_phrase_.sublma_start[c_phrase_.sublma_num] - del_spl_pos;
1296       pos++) {
1297     chn_str[pos] = chn_str[pos + 1];
1298   }
1299   c_phrase_.length -= 1;
1300 
1301   // If the deleted spelling id is in a sub lemma which contains more than
1302   // one id, del_a_sub will be false; but if the deleted id is in a sub lemma
1303   // which only contains 1 id, the whole sub lemma needs to be deleted, so
1304   // del_a_sub will be true.
1305   bool del_a_sub = false;
1306   for (uint16 pos = 1; pos <= c_phrase_.sublma_num; pos++) {
1307     if (c_phrase_.sublma_start[pos - 1] ==
1308         c_phrase_.sublma_start[pos]) {
1309       del_a_sub = true;
1310     }
1311     if (del_a_sub) {
1312       c_phrase_.sublma_start[pos - 1] =
1313           c_phrase_.sublma_start[pos];
1314     }
1315   }
1316   if (del_a_sub)
1317     c_phrase_.sublma_num -= 1;
1318 
1319   return;
1320 }
1321 
get_spl_start_id()1322 void MatrixSearch::get_spl_start_id() {
1323   lma_id_num_ = 0;
1324   lma_start_[0] = 0;
1325 
1326   spl_id_num_ = 0;
1327   spl_start_[0] = 0;
1328   if (!inited_ || 0 == pys_decoded_len_ ||
1329       0 == matrix_[pys_decoded_len_].mtrx_nd_num)
1330     return;
1331 
1332   // Calculate number of lemmas and spellings
1333   // Only scan those part which is not fixed.
1334   lma_id_num_ = fixed_lmas_;
1335   spl_id_num_ = fixed_hzs_;
1336 
1337   MatrixNode *mtrx_nd = mtrx_nd_pool_ + matrix_[pys_decoded_len_].mtrx_nd_pos;
1338   while (mtrx_nd != mtrx_nd_pool_) {
1339     if (fixed_hzs_ > 0) {
1340       if (mtrx_nd->step <= spl_start_[fixed_hzs_])
1341         break;
1342     }
1343 
1344     // Update the spelling segamentation information
1345     unsigned char word_splstr_len = 0;
1346     PoolPosType dmi_fr = mtrx_nd->dmi_fr;
1347     if ((PoolPosType)-1 != dmi_fr)
1348       word_splstr_len = dmi_pool_[dmi_fr].splstr_len;
1349 
1350     while ((PoolPosType)-1 != dmi_fr) {
1351       spl_start_[spl_id_num_ + 1] = mtrx_nd->step -
1352           (word_splstr_len - dmi_pool_[dmi_fr].splstr_len);
1353       spl_id_[spl_id_num_] = dmi_pool_[dmi_fr].spl_id;
1354       spl_id_num_++;
1355       dmi_fr = dmi_pool_[dmi_fr].dmi_fr;
1356     }
1357 
1358     // Update the lemma segmentation information
1359     lma_start_[lma_id_num_ + 1] = spl_id_num_;
1360     lma_id_[lma_id_num_] = mtrx_nd->id;
1361     lma_id_num_++;
1362 
1363     mtrx_nd = mtrx_nd->from;
1364   }
1365 
1366   // Reverse the result of spelling info
1367   for (size_t pos = fixed_hzs_;
1368        pos < fixed_hzs_ + (spl_id_num_ - fixed_hzs_ + 1) / 2; pos++) {
1369     if (spl_id_num_ + fixed_hzs_ - pos != pos + 1) {
1370       spl_start_[pos + 1] ^= spl_start_[spl_id_num_ - pos + fixed_hzs_];
1371       spl_start_[spl_id_num_ - pos + fixed_hzs_] ^= spl_start_[pos + 1];
1372       spl_start_[pos + 1] ^= spl_start_[spl_id_num_ - pos + fixed_hzs_];
1373 
1374       spl_id_[pos] ^= spl_id_[spl_id_num_ + fixed_hzs_ - pos - 1];
1375       spl_id_[spl_id_num_ + fixed_hzs_- pos - 1] ^= spl_id_[pos];
1376       spl_id_[pos] ^= spl_id_[spl_id_num_ + fixed_hzs_- pos - 1];
1377     }
1378   }
1379 
1380   // Reverse the result of lemma info
1381   for (size_t pos = fixed_lmas_;
1382        pos < fixed_lmas_ + (lma_id_num_ - fixed_lmas_ + 1) / 2; pos++) {
1383     assert(lma_id_num_ + fixed_lmas_ - pos - 1 >= pos);
1384 
1385     if (lma_id_num_ + fixed_lmas_ - pos > pos + 1) {
1386       lma_start_[pos + 1] ^= lma_start_[lma_id_num_ - pos + fixed_lmas_];
1387       lma_start_[lma_id_num_ - pos + fixed_lmas_] ^= lma_start_[pos + 1];
1388       lma_start_[pos + 1] ^= lma_start_[lma_id_num_ - pos + fixed_lmas_];
1389 
1390       lma_id_[pos] ^= lma_id_[lma_id_num_ - 1 - pos + fixed_lmas_];
1391       lma_id_[lma_id_num_ - 1 - pos + fixed_lmas_] ^= lma_id_[pos];
1392       lma_id_[pos] ^= lma_id_[lma_id_num_ - 1 - pos + fixed_lmas_];
1393     }
1394   }
1395 
1396   for (size_t pos = fixed_lmas_ + 1; pos <= lma_id_num_; pos++) {
1397     if (pos < lma_id_num_)
1398       lma_start_[pos] = lma_start_[pos - 1] +
1399           (lma_start_[pos] - lma_start_[pos + 1]);
1400     else
1401       lma_start_[pos] = lma_start_[pos - 1] + lma_start_[pos] -
1402           lma_start_[fixed_lmas_];
1403   }
1404 
1405   // Find the last fixed position
1406   fixed_hzs_ = 0;
1407   for (size_t pos = spl_id_num_; pos > 0; pos--) {
1408     if (NULL != matrix_[spl_start_[pos]].mtrx_nd_fixed) {
1409       fixed_hzs_ = pos;
1410       break;
1411     }
1412   }
1413 
1414   return;
1415 }
1416 
get_spl_start(const uint16 * & spl_start)1417 size_t MatrixSearch::get_spl_start(const uint16 *&spl_start) {
1418   get_spl_start_id();
1419   spl_start = spl_start_;
1420   return spl_id_num_;
1421 }
1422 
extend_dmi(DictExtPara * dep,DictMatchInfo * dmi_s)1423 size_t MatrixSearch::extend_dmi(DictExtPara *dep, DictMatchInfo *dmi_s) {
1424   if (dmi_pool_used_ >= kDmiPoolSize) return 0;
1425 
1426   if (dmi_c_phrase_)
1427     return extend_dmi_c(dep, dmi_s);
1428 
1429   LpiCache& lpi_cache = LpiCache::get_instance();
1430   uint16 splid = dep->splids[dep->splids_extended];
1431 
1432   bool cached = false;
1433   if (0 == dep->splids_extended)
1434     cached = lpi_cache.is_cached(splid);
1435 
1436   // 1. If this is a half Id, get its corresponding full starting Id and
1437   // number of full Id.
1438   size_t ret_val = 0;
1439   PoolPosType mtrx_dmi_fr = (PoolPosType)-1;  // From which dmi node
1440 
1441   lpi_total_ = 0;
1442 
1443   MileStoneHandle from_h[3];
1444   from_h[0] = 0;
1445   from_h[1] = 0;
1446 
1447   if (0 != dep->splids_extended) {
1448     from_h[0] = dmi_s->dict_handles[0];
1449     from_h[1] = dmi_s->dict_handles[1];
1450   }
1451 
1452   // 2. Begin exgtending in the system dictionary
1453   size_t lpi_num = 0;
1454   MileStoneHandle handles[2];
1455   handles[0] = handles[1] = 0;
1456   if (from_h[0] > 0 || NULL == dmi_s) {
1457     handles[0] = dict_trie_->extend_dict(from_h[0], dep, lpi_items_,
1458                                          kMaxLmaPsbItems, &lpi_num);
1459   }
1460   if (handles[0] > 0)
1461     lpi_total_ = lpi_num;
1462 
1463   if (NULL == dmi_s) {  // from root
1464     assert(0 != handles[0]);
1465     mtrx_dmi_fr = dmi_pool_used_;
1466   }
1467 
1468   // 3. Begin extending in the user dictionary
1469   if (NULL != user_dict_ && (from_h[1] > 0 || NULL == dmi_s)) {
1470     handles[1] = user_dict_->extend_dict(from_h[1], dep,
1471                                          lpi_items_ + lpi_total_,
1472                                          kMaxLmaPsbItems - lpi_total_,
1473                                          &lpi_num);
1474     if (handles[1] > 0) {
1475       if (kPrintDebug0) {
1476         for (size_t t = 0; t < lpi_num; t++) {
1477           printf("--Extend in user dict: uid:%d uscore:%d\n", lpi_items_[lpi_total_ + t].id,
1478                  lpi_items_[lpi_total_ + t].psb);
1479         }
1480       }
1481       lpi_total_ += lpi_num;
1482     }
1483   }
1484 
1485   if (0 != handles[0] || 0 != handles[1]) {
1486     if (dmi_pool_used_ >= kDmiPoolSize) return 0;
1487 
1488     DictMatchInfo *dmi_add = dmi_pool_ + dmi_pool_used_;
1489     if (NULL == dmi_s) {
1490       fill_dmi(dmi_add, handles,
1491                (PoolPosType)-1, splid,
1492                1, 1, dep->splid_end_split, dep->ext_len,
1493                spl_trie_->is_half_id(splid) ? 0 : 1);
1494     } else {
1495       fill_dmi(dmi_add, handles,
1496                dmi_s - dmi_pool_, splid, 1,
1497                dmi_s->dict_level + 1, dep->splid_end_split,
1498                dmi_s->splstr_len + dep->ext_len,
1499                spl_trie_->is_half_id(splid) ? 0 : dmi_s->all_full_id);
1500     }
1501 
1502     ret_val = 1;
1503   }
1504 
1505   if (!cached) {
1506     if (0 == lpi_total_)
1507       return ret_val;
1508 
1509     if (kPrintDebug0) {
1510       printf("--- lpi_total_ = %d\n", lpi_total_);
1511     }
1512 
1513     myqsort(lpi_items_, lpi_total_, sizeof(LmaPsbItem), cmp_lpi_with_psb);
1514     if (NULL == dmi_s && spl_trie_->is_half_id(splid))
1515       lpi_total_ = lpi_cache.put_cache(splid, lpi_items_, lpi_total_);
1516   } else {
1517     assert(spl_trie_->is_half_id(splid));
1518     lpi_total_ = lpi_cache.get_cache(splid, lpi_items_, kMaxLmaPsbItems);
1519   }
1520 
1521   return ret_val;
1522 }
1523 
extend_dmi_c(DictExtPara * dep,DictMatchInfo * dmi_s)1524 size_t MatrixSearch::extend_dmi_c(DictExtPara *dep, DictMatchInfo *dmi_s) {
1525   lpi_total_ = 0;
1526 
1527   uint16 pos = dep->splids_extended;
1528   assert(dmi_c_phrase_);
1529   if (pos >= c_phrase_.length)
1530     return 0;
1531 
1532   uint16 splid = dep->splids[pos];
1533   if (splid == c_phrase_.spl_ids[pos]) {
1534     DictMatchInfo *dmi_add = dmi_pool_ + dmi_pool_used_;
1535     MileStoneHandle handles[2];  // Actually never used.
1536     if (NULL == dmi_s)
1537       fill_dmi(dmi_add, handles,
1538                (PoolPosType)-1, splid,
1539                1, 1, dep->splid_end_split, dep->ext_len,
1540                spl_trie_->is_half_id(splid) ? 0 : 1);
1541     else
1542       fill_dmi(dmi_add, handles,
1543                dmi_s - dmi_pool_, splid, 1,
1544                dmi_s->dict_level + 1, dep->splid_end_split,
1545                dmi_s->splstr_len + dep->ext_len,
1546                spl_trie_->is_half_id(splid) ? 0 : dmi_s->all_full_id);
1547 
1548     if (pos == c_phrase_.length - 1) {
1549       lpi_items_[0].id = kLemmaIdComposing;
1550       lpi_items_[0].psb = 0;  // 0 is bigger than normal lemma score.
1551       lpi_total_ = 1;
1552     }
1553     return 1;
1554   }
1555   return 0;
1556 }
1557 
extend_mtrx_nd(MatrixNode * mtrx_nd,LmaPsbItem lpi_items[],size_t lpi_num,PoolPosType dmi_fr,size_t res_row)1558 size_t MatrixSearch::extend_mtrx_nd(MatrixNode *mtrx_nd, LmaPsbItem lpi_items[],
1559                                     size_t lpi_num, PoolPosType dmi_fr,
1560                                     size_t res_row) {
1561   assert(NULL != mtrx_nd);
1562   matrix_[res_row].mtrx_nd_fixed = NULL;
1563 
1564   if (mtrx_nd_pool_used_ >= kMtrxNdPoolSize - kMaxNodeARow)
1565     return 0;
1566 
1567   if (0 == mtrx_nd->step) {
1568     // Because the list is sorted, if the source step is 0, it is only
1569     // necessary to pick up the first kMaxNodeARow items.
1570     if (lpi_num > kMaxNodeARow)
1571       lpi_num = kMaxNodeARow;
1572   }
1573 
1574   MatrixNode *mtrx_nd_res_min = mtrx_nd_pool_ + matrix_[res_row].mtrx_nd_pos;
1575   for (size_t pos = 0; pos < lpi_num; pos++) {
1576     float score = mtrx_nd->score + lpi_items[pos].psb;
1577     if (pos > 0 && score - PRUMING_SCORE > mtrx_nd_res_min->score)
1578       break;
1579 
1580     // Try to add a new node
1581     size_t mtrx_nd_num = matrix_[res_row].mtrx_nd_num;
1582     MatrixNode *mtrx_nd_res = mtrx_nd_res_min + mtrx_nd_num;
1583     bool replace = false;
1584     // Find its position
1585     while (mtrx_nd_res > mtrx_nd_res_min && score < (mtrx_nd_res - 1)->score) {
1586       if (static_cast<size_t>(mtrx_nd_res - mtrx_nd_res_min) < kMaxNodeARow)
1587         *mtrx_nd_res = *(mtrx_nd_res - 1);
1588       mtrx_nd_res--;
1589       replace = true;
1590     }
1591     if (replace || (mtrx_nd_num < kMaxNodeARow &&
1592         matrix_[res_row].mtrx_nd_pos + mtrx_nd_num < kMtrxNdPoolSize)) {
1593       mtrx_nd_res->id = lpi_items[pos].id;
1594       mtrx_nd_res->score = score;
1595       mtrx_nd_res->from = mtrx_nd;
1596       mtrx_nd_res->dmi_fr = dmi_fr;
1597       mtrx_nd_res->step = res_row;
1598       if (matrix_[res_row].mtrx_nd_num < kMaxNodeARow)
1599         matrix_[res_row].mtrx_nd_num++;
1600     }
1601   }
1602   return matrix_[res_row].mtrx_nd_num;
1603 }
1604 
match_dmi(size_t step_to,uint16 spl_ids[],uint16 spl_id_num)1605 PoolPosType MatrixSearch::match_dmi(size_t step_to, uint16 spl_ids[],
1606                                     uint16 spl_id_num) {
1607   if (pys_decoded_len_ < step_to || 0 == matrix_[step_to].dmi_num) {
1608     return static_cast<PoolPosType>(-1);
1609   }
1610 
1611   for (PoolPosType dmi_pos = 0; dmi_pos < matrix_[step_to].dmi_num; dmi_pos++) {
1612     DictMatchInfo *dmi = dmi_pool_ + matrix_[step_to].dmi_pos + dmi_pos;
1613 
1614     if (dmi->dict_level != spl_id_num)
1615       continue;
1616 
1617     bool matched = true;
1618     for (uint16 spl_pos = 0; spl_pos < spl_id_num; spl_pos++) {
1619       if (spl_ids[spl_id_num - spl_pos - 1] != dmi->spl_id) {
1620         matched = false;
1621         break;
1622       }
1623 
1624       dmi = dmi_pool_ + dmi->dmi_fr;
1625     }
1626     if (matched) {
1627       return matrix_[step_to].dmi_pos + dmi_pos;
1628     }
1629   }
1630 
1631   return static_cast<PoolPosType>(-1);
1632 }
1633 
get_candidate0(char16 * cand_str,size_t max_len,uint16 * retstr_len,bool only_unfixed)1634 char16* MatrixSearch::get_candidate0(char16 *cand_str, size_t max_len,
1635                                      uint16 *retstr_len,
1636                                      bool only_unfixed) {
1637   if (pys_decoded_len_ == 0 ||
1638       matrix_[pys_decoded_len_].mtrx_nd_num == 0)
1639     return NULL;
1640 
1641   LemmaIdType idxs[kMaxRowNum];
1642   size_t id_num = 0;
1643 
1644   MatrixNode *mtrx_nd = mtrx_nd_pool_ + matrix_[pys_decoded_len_].mtrx_nd_pos;
1645 
1646   if (kPrintDebug0) {
1647     printf("--- sentence score: %f\n", mtrx_nd->score);
1648   }
1649 
1650   if (kPrintDebug1) {
1651     printf("==============Sentence DMI (reverse order) begin===========>>\n");
1652   }
1653 
1654   while (mtrx_nd != NULL) {
1655     idxs[id_num] = mtrx_nd->id;
1656     id_num++;
1657 
1658     if (kPrintDebug1) {
1659        printf("---MatrixNode [step: %d, lma_idx: %d, total score:%.5f]\n",
1660               mtrx_nd->step, mtrx_nd->id, mtrx_nd->score);
1661        debug_print_dmi(mtrx_nd->dmi_fr, 1);
1662     }
1663 
1664     mtrx_nd = mtrx_nd->from;
1665   }
1666 
1667   if (kPrintDebug1) {
1668     printf("<<==============Sentence DMI (reverse order) end=============\n");
1669   }
1670 
1671   size_t ret_pos = 0;
1672   do {
1673     id_num--;
1674     if (0 == idxs[id_num])
1675       continue;
1676 
1677     char16 str[kMaxLemmaSize + 1];
1678     uint16 str_len = get_lemma_str(idxs[id_num], str, kMaxLemmaSize + 1);
1679     if (str_len > 0 && ((!only_unfixed && max_len - ret_pos > str_len) ||
1680         (only_unfixed && max_len - ret_pos + fixed_hzs_ > str_len))) {
1681       if (!only_unfixed)
1682         utf16_strncpy(cand_str + ret_pos, str, str_len);
1683       else if (ret_pos >= fixed_hzs_)
1684         utf16_strncpy(cand_str + ret_pos - fixed_hzs_, str, str_len);
1685 
1686       ret_pos += str_len;
1687     } else {
1688       return NULL;
1689     }
1690   } while (id_num != 0);
1691 
1692   if (!only_unfixed) {
1693     if (NULL != retstr_len)
1694       *retstr_len = ret_pos;
1695     cand_str[ret_pos] = (char16)'\0';
1696   } else {
1697     if (NULL != retstr_len)
1698       *retstr_len = ret_pos - fixed_hzs_;
1699     cand_str[ret_pos - fixed_hzs_] = (char16)'\0';
1700   }
1701   return cand_str;
1702 }
1703 
get_lpis(const uint16 * splid_str,size_t splid_str_len,LmaPsbItem * lma_buf,size_t max_lma_buf,const char16 * pfullsent,bool sort_by_psb)1704 size_t MatrixSearch::get_lpis(const uint16* splid_str, size_t splid_str_len,
1705                               LmaPsbItem* lma_buf, size_t max_lma_buf,
1706                               const char16 *pfullsent, bool sort_by_psb) {
1707   if (splid_str_len > kMaxLemmaSize)
1708     return 0;
1709 
1710   size_t num1 = dict_trie_->get_lpis(splid_str, splid_str_len,
1711                                      lma_buf, max_lma_buf);
1712   size_t num2 = 0;
1713   if (NULL != user_dict_) {
1714     num2 = user_dict_->get_lpis(splid_str, splid_str_len,
1715                          lma_buf + num1, max_lma_buf - num1);
1716   }
1717 
1718   size_t num = num1 + num2;
1719 
1720   if (0 == num)
1721     return 0;
1722 
1723   // Remove repeated items.
1724   if (splid_str_len > 1) {
1725     LmaPsbStrItem *lpsis = reinterpret_cast<LmaPsbStrItem*>(lma_buf + num);
1726     size_t lpsi_num = (max_lma_buf - num) * sizeof(LmaPsbItem) /
1727         sizeof(LmaPsbStrItem);
1728     //assert(lpsi_num > num);
1729     if (num > lpsi_num) num = lpsi_num;
1730     lpsi_num = num;
1731 
1732     for (size_t pos = 0; pos < lpsi_num; pos++) {
1733       lpsis[pos].lpi = lma_buf[pos];
1734       get_lemma_str(lma_buf[pos].id, lpsis[pos].str, kMaxLemmaSize + 1);
1735     }
1736 
1737     myqsort(lpsis, lpsi_num, sizeof(LmaPsbStrItem), cmp_lpsi_with_str);
1738 
1739     size_t remain_num = 0;
1740     for (size_t pos = 0; pos < lpsi_num; pos++) {
1741       if (pos > 0 && utf16_strcmp(lpsis[pos].str, lpsis[pos - 1].str) == 0) {
1742         if (lpsis[pos].lpi.psb < lpsis[pos - 1].lpi.psb) {
1743           assert(remain_num > 0);
1744           lma_buf[remain_num - 1] = lpsis[pos].lpi;
1745         }
1746         continue;
1747       }
1748       if (NULL != pfullsent && utf16_strcmp(lpsis[pos].str, pfullsent) == 0)
1749         continue;
1750 
1751       lma_buf[remain_num] = lpsis[pos].lpi;
1752       remain_num++;
1753     }
1754 
1755     // Update the result number
1756     num = remain_num;
1757   } else {
1758     // For single character, some characters have more than one spelling, for
1759     // example, "de" and "di" are all valid for a Chinese character, so when
1760     // the user input  "d", repeated items are generated.
1761     // For single character lemmas, Hanzis will be gotten
1762     for (size_t pos = 0; pos < num; pos++) {
1763       char16 hanzis[2];
1764       get_lemma_str(lma_buf[pos].id, hanzis, 2);
1765       lma_buf[pos].hanzi = hanzis[0];
1766     }
1767 
1768     myqsort(lma_buf, num, sizeof(LmaPsbItem), cmp_lpi_with_hanzi);
1769 
1770     size_t remain_num = 0;
1771     for (size_t pos = 0; pos < num; pos++) {
1772       if (pos > 0 && lma_buf[pos].hanzi == lma_buf[pos - 1].hanzi) {
1773         if (NULL != pfullsent &&
1774             static_cast<char16>(0) == pfullsent[1] &&
1775             lma_buf[pos].hanzi == pfullsent[0])
1776           continue;
1777 
1778         if (lma_buf[pos].psb < lma_buf[pos - 1].psb) {
1779           assert(remain_num > 0);
1780           assert(lma_buf[remain_num - 1].hanzi == lma_buf[pos].hanzi);
1781           lma_buf[remain_num - 1] = lma_buf[pos];
1782         }
1783         continue;
1784       }
1785       if (NULL != pfullsent &&
1786           static_cast<char16>(0) == pfullsent[1] &&
1787           lma_buf[pos].hanzi == pfullsent[0])
1788           continue;
1789 
1790       lma_buf[remain_num] = lma_buf[pos];
1791       remain_num++;
1792     }
1793 
1794     num = remain_num;
1795   }
1796 
1797   if (sort_by_psb) {
1798     myqsort(lma_buf, num, sizeof(LmaPsbItem), cmp_lpi_with_psb);
1799   }
1800   return num;
1801 }
1802 
get_lemma_str(LemmaIdType id_lemma,char16 * str_buf,uint16 str_max)1803 uint16 MatrixSearch::get_lemma_str(LemmaIdType id_lemma, char16 *str_buf,
1804                                    uint16 str_max) {
1805   uint16 str_len = 0;
1806 
1807   if (is_system_lemma(id_lemma)) {
1808     str_len = dict_trie_->get_lemma_str(id_lemma, str_buf, str_max);
1809   } else if (is_user_lemma(id_lemma)) {
1810     if (NULL != user_dict_) {
1811       str_len = user_dict_->get_lemma_str(id_lemma, str_buf, str_max);
1812     } else {
1813       str_len = 0;
1814       str_buf[0] = static_cast<char16>('\0');
1815     }
1816   } else if (is_composing_lemma(id_lemma)) {
1817     if (str_max <= 1)
1818       return 0;
1819     str_len = c_phrase_.sublma_start[c_phrase_.sublma_num];
1820     if (str_len > str_max - 1)
1821       str_len = str_max - 1;
1822     utf16_strncpy(str_buf, c_phrase_.chn_str, str_len);
1823     str_buf[str_len] = (char16)'\0';
1824     return str_len;
1825   }
1826 
1827   return str_len;
1828 }
1829 
get_lemma_splids(LemmaIdType id_lemma,uint16 * splids,uint16 splids_max,bool arg_valid)1830 uint16 MatrixSearch::get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
1831                                       uint16 splids_max, bool arg_valid) {
1832   uint16 splid_num = 0;
1833 
1834   if (arg_valid) {
1835     for (splid_num = 0; splid_num < splids_max; splid_num++) {
1836       if (spl_trie_->is_half_id(splids[splid_num]))
1837         break;
1838     }
1839     if (splid_num == splids_max)
1840       return splid_num;
1841   }
1842 
1843   if (is_system_lemma(id_lemma)) {
1844     splid_num = dict_trie_->get_lemma_splids(id_lemma, splids, splids_max,
1845                                               arg_valid);
1846   } else if (is_user_lemma(id_lemma)) {
1847     if (NULL != user_dict_) {
1848       splid_num = user_dict_->get_lemma_splids(id_lemma, splids, splids_max,
1849                                                arg_valid);
1850     } else {
1851       splid_num = 0;
1852     }
1853   } else if (is_composing_lemma(id_lemma)) {
1854     if (c_phrase_.length > splids_max) {
1855       return 0;
1856     }
1857     for (uint16 pos = 0; pos < c_phrase_.length; pos++) {
1858       splids[pos] = c_phrase_.spl_ids[pos];
1859       if (spl_trie_->is_half_id(splids[pos])) {
1860         return 0;
1861       }
1862     }
1863   }
1864   return splid_num;
1865 }
1866 
inner_predict(const char16 * fixed_buf,uint16 fixed_len,char16 predict_buf[][kMaxPredictSize+1],size_t buf_len)1867 size_t MatrixSearch::inner_predict(const char16 *fixed_buf, uint16 fixed_len,
1868                                    char16 predict_buf[][kMaxPredictSize + 1],
1869                                    size_t buf_len) {
1870   size_t res_total = 0;
1871   memset(npre_items_, 0, sizeof(NPredictItem) * npre_items_len_);
1872   // In order to shorten the comments, j-character candidates predicted by
1873   // i-character prefix are called P(i,j). All candiates predicted by
1874   // i-character prefix are called P(i,*)
1875   // Step 1. Get P(kMaxPredictSize, *) and sort them, here
1876   // P(kMaxPredictSize, *) == P(kMaxPredictSize, 1)
1877   for (size_t len = fixed_len; len >0; len--) {
1878     // How many blank items are available
1879     size_t this_max = npre_items_len_ - res_total;
1880     size_t res_this;
1881     // If the history is longer than 1, and we can not get prediction from
1882     // lemmas longer than 2, in this case, we will add lemmas with
1883     // highest scores as the prediction result.
1884     if (fixed_len > 1 && 1 == len && 0 == res_total) {
1885       // Try to find if recent n (n>1) characters can be a valid lemma in system
1886       // dictionary.
1887       bool nearest_n_word = false;
1888       for (size_t nlen = 2; nlen <= fixed_len; nlen++) {
1889         if (dict_trie_->get_lemma_id(fixed_buf + fixed_len - nlen, nlen) > 0) {
1890           nearest_n_word = true;
1891           break;
1892         }
1893       }
1894       res_this = dict_trie_->predict_top_lmas(nearest_n_word ? len : 0,
1895                                               npre_items_ + res_total,
1896                                               this_max, res_total);
1897       res_total += res_this;
1898     }
1899 
1900     // How many blank items are available
1901     this_max = npre_items_len_ - res_total;
1902     res_this = 0;
1903     if (!kOnlyUserDictPredict) {
1904       res_this =
1905           dict_trie_->predict(fixed_buf + fixed_len - len, len,
1906                               npre_items_ + res_total, this_max,
1907                               res_total);
1908     }
1909 
1910     if (NULL != user_dict_) {
1911       res_this = res_this +
1912                  user_dict_->predict(fixed_buf + fixed_len - len, len,
1913                                      npre_items_ + res_total + res_this,
1914                                      this_max - res_this, res_total + res_this);
1915     }
1916 
1917     if (kPredictLimitGt1) {
1918       myqsort(npre_items_ + res_total, res_this, sizeof(NPredictItem),
1919               cmp_npre_by_score);
1920 
1921       if (len > 3) {
1922         if (res_this > kMaxPredictNumByGt3)
1923           res_this = kMaxPredictNumByGt3;
1924       } else if (3 == len) {
1925         if (res_this > kMaxPredictNumBy3)
1926           res_this = kMaxPredictNumBy3;
1927       } else if (2 == len) {
1928         if (res_this > kMaxPredictNumBy2)
1929           res_this = kMaxPredictNumBy2;
1930       }
1931     }
1932 
1933     res_total += res_this;
1934   }
1935 
1936   res_total = remove_duplicate_npre(npre_items_, res_total);
1937 
1938   if (kPreferLongHistoryPredict) {
1939     myqsort(npre_items_, res_total, sizeof(NPredictItem),
1940             cmp_npre_by_hislen_score);
1941   } else {
1942     myqsort(npre_items_, res_total, sizeof(NPredictItem),
1943             cmp_npre_by_score);
1944   }
1945 
1946   if (buf_len < res_total) {
1947     res_total = buf_len;
1948   }
1949 
1950   if (kPrintDebug2) {
1951     printf("/////////////////Predicted Items Begin////////////////////>>\n");
1952     for (size_t i = 0; i < res_total; i++) {
1953       printf("---");
1954       for (size_t j = 0; j < kMaxPredictSize; j++) {
1955         printf("%d  ", npre_items_[i].pre_hzs[j]);
1956       }
1957       printf("\n");
1958     }
1959     printf("<<///////////////Predicted Items End////////////////////////\n");
1960   }
1961 
1962   for (size_t i = 0; i < res_total; i++) {
1963     utf16_strncpy(predict_buf[i], npre_items_[i].pre_hzs,
1964                   kMaxPredictSize);
1965     predict_buf[i][kMaxPredictSize] = '\0';
1966   }
1967 
1968   return res_total;
1969 }
1970 
get_predicts(const char16 fixed_buf[],char16 predict_buf[][kMaxPredictSize+1],size_t buf_len)1971 size_t MatrixSearch::get_predicts(const char16 fixed_buf[],
1972                                   char16 predict_buf[][kMaxPredictSize + 1],
1973                                   size_t buf_len) {
1974   size_t fixed_len = utf16_strlen(fixed_buf);
1975   if (0 ==fixed_len || fixed_len > kMaxPredictSize || 0 == buf_len)
1976     return 0;
1977 
1978   return inner_predict(fixed_buf, fixed_len, predict_buf, buf_len);
1979 }
1980 
1981 }  // namespace ime_pinyin
1982