xref: /OK3568_Linux_fs/app/forlinx/flapp/src/keyboard/pinyin/share/dicttrie.cpp (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1 /*
2  * Copyright (C) 2009 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <assert.h>
18 #include <stdio.h>
19 #include <string.h>
20 #include "../include/dicttrie.h"
21 #include "../include/dictbuilder.h"
22 #include "../include/lpicache.h"
23 #include "../include/mystdlib.h"
24 #include "../include/ngram.h"
25 
26 namespace ime_pinyin {
27 
DictTrie()28 DictTrie::DictTrie() {
29   spl_trie_ = SpellingTrie::get_cpinstance();
30 
31   root_ = NULL;
32   splid_le0_index_ = NULL;
33   lma_node_num_le0_ = 0;
34   nodes_ge1_ = NULL;
35   lma_node_num_ge1_ = 0;
36   lma_idx_buf_ = NULL;
37   lma_idx_buf_len_ = 0;
38   total_lma_num_ = 0;
39   top_lmas_num_ = 0;
40   dict_list_ = NULL;
41 
42   parsing_marks_ = NULL;
43   mile_stones_ = NULL;
44   reset_milestones(0, kFirstValidMileStoneHandle);
45 }
46 
~DictTrie()47 DictTrie::~DictTrie() {
48   free_resource(true);
49 }
50 
free_resource(bool free_dict_list)51 void DictTrie::free_resource(bool free_dict_list) {
52   if (NULL != root_)
53     free(root_);
54   root_ = NULL;
55 
56   if (NULL != splid_le0_index_)
57     free(splid_le0_index_);
58   splid_le0_index_ = NULL;
59 
60   if (NULL != nodes_ge1_)
61     free(nodes_ge1_);
62   nodes_ge1_ = NULL;
63 
64   if (NULL != lma_idx_buf_)
65     free(lma_idx_buf_);
66   lma_idx_buf_ = NULL;
67 
68   if (free_dict_list) {
69     if (NULL != dict_list_) {
70       delete dict_list_;
71     }
72     dict_list_ = NULL;
73   }
74 
75   if (parsing_marks_)
76     delete [] parsing_marks_;
77   parsing_marks_ = NULL;
78 
79   if (mile_stones_)
80     delete [] mile_stones_;
81   mile_stones_ = NULL;
82 
83   reset_milestones(0, kFirstValidMileStoneHandle);
84 }
85 
get_son_offset(const LmaNodeGE1 * node)86 inline size_t DictTrie::get_son_offset(const LmaNodeGE1 *node) {
87   return ((size_t)node->son_1st_off_l + ((size_t)node->son_1st_off_h << 16));
88 }
89 
get_homo_idx_buf_offset(const LmaNodeGE1 * node)90 inline size_t DictTrie::get_homo_idx_buf_offset(const LmaNodeGE1 *node) {
91   return ((size_t)node->homo_idx_buf_off_l +
92           ((size_t)node->homo_idx_buf_off_h << 16));
93 }
94 
get_lemma_id(size_t id_offset)95 inline LemmaIdType DictTrie::get_lemma_id(size_t id_offset) {
96   LemmaIdType id = 0;
97   for (uint16 pos = kLemmaIdSize - 1; pos > 0; pos--)
98     id = (id << 8) + lma_idx_buf_[id_offset * kLemmaIdSize + pos];
99   id = (id << 8) + lma_idx_buf_[id_offset * kLemmaIdSize];
100   return id;
101 }
102 
103 #ifdef ___BUILD_MODEL___
build_dict(const char * fn_raw,const char * fn_validhzs)104 bool DictTrie::build_dict(const char* fn_raw, const char* fn_validhzs) {
105   DictBuilder* dict_builder = new DictBuilder();
106 
107   free_resource(true);
108 
109   return dict_builder->build_dict(fn_raw, fn_validhzs, this);
110 }
111 
save_dict(FILE * fp)112 bool DictTrie::save_dict(FILE *fp) {
113   if (NULL == fp)
114     return false;
115 
116   if (fwrite(&lma_node_num_le0_, sizeof(uint32), 1, fp) != 1)
117     return false;
118 
119   if (fwrite(&lma_node_num_ge1_, sizeof(uint32), 1, fp) != 1)
120     return false;
121 
122   if (fwrite(&lma_idx_buf_len_, sizeof(uint32), 1, fp) != 1)
123     return false;
124 
125   if (fwrite(&top_lmas_num_, sizeof(uint32), 1, fp) != 1)
126     return false;
127 
128   if (fwrite(root_, sizeof(LmaNodeLE0), lma_node_num_le0_, fp)
129       != lma_node_num_le0_)
130     return false;
131 
132   if (fwrite(nodes_ge1_, sizeof(LmaNodeGE1), lma_node_num_ge1_, fp)
133       != lma_node_num_ge1_)
134     return false;
135 
136   if (fwrite(lma_idx_buf_, sizeof(unsigned char), lma_idx_buf_len_, fp) !=
137       lma_idx_buf_len_)
138     return false;
139 
140   return true;
141 }
142 
save_dict(const char * filename)143 bool DictTrie::save_dict(const char *filename) {
144   if (NULL == filename)
145     return false;
146 
147   if (NULL == root_ || NULL == dict_list_)
148     return false;
149 
150   SpellingTrie &spl_trie = SpellingTrie::get_instance();
151   NGram &ngram = NGram::get_instance();
152 
153   FILE *fp = fopen(filename, "wb");
154   if (NULL == fp)
155     return false;
156 
157   if (!spl_trie.save_spl_trie(fp) || !dict_list_->save_list(fp) ||
158       !save_dict(fp) || !ngram.save_ngram(fp)) {
159     fclose(fp);
160     return false;
161   }
162 
163   fclose(fp);
164   return true;
165 }
166 #endif  // ___BUILD_MODEL___
167 
load_dict(FILE * fp)168 bool DictTrie::load_dict(FILE *fp) {
169   if (NULL == fp)
170     return false;
171   if (fread(&lma_node_num_le0_, sizeof(uint32), 1, fp) != 1)
172     return false;
173 
174   if (fread(&lma_node_num_ge1_, sizeof(uint32), 1, fp) != 1)
175     return false;
176 
177   if (fread(&lma_idx_buf_len_, sizeof(uint32), 1, fp) != 1)
178     return false;
179 
180   if (fread(&top_lmas_num_, sizeof(uint32), 1, fp) != 1 ||
181      top_lmas_num_ >= lma_idx_buf_len_)
182      return false;
183 
184   free_resource(false);
185 
186   root_ = static_cast<LmaNodeLE0*>
187           (malloc(lma_node_num_le0_ * sizeof(LmaNodeLE0)));
188   nodes_ge1_ = static_cast<LmaNodeGE1*>
189                (malloc(lma_node_num_ge1_ * sizeof(LmaNodeGE1)));
190   lma_idx_buf_ = (unsigned char*)malloc(lma_idx_buf_len_);
191   total_lma_num_ = lma_idx_buf_len_ / kLemmaIdSize;
192 
193   size_t buf_size = SpellingTrie::get_instance().get_spelling_num() + 1;
194   assert(lma_node_num_le0_ <= buf_size);
195   splid_le0_index_ = static_cast<uint16*>(malloc(buf_size * sizeof(uint16)));
196 
197   // Init the space for parsing.
198   parsing_marks_ = new ParsingMark[kMaxParsingMark];
199   mile_stones_ = new MileStone[kMaxMileStone];
200   reset_milestones(0, kFirstValidMileStoneHandle);
201 
202   if (NULL == root_ || NULL == nodes_ge1_ || NULL == lma_idx_buf_ ||
203       NULL == splid_le0_index_ || NULL == parsing_marks_ ||
204       NULL == mile_stones_) {
205     free_resource(false);
206     return false;
207   }
208 
209   if (fread(root_, sizeof(LmaNodeLE0), lma_node_num_le0_, fp)
210       != lma_node_num_le0_)
211     return false;
212 
213   if (fread(nodes_ge1_, sizeof(LmaNodeGE1), lma_node_num_ge1_, fp)
214       != lma_node_num_ge1_)
215     return false;
216 
217   if (fread(lma_idx_buf_, sizeof(unsigned char), lma_idx_buf_len_, fp) !=
218       lma_idx_buf_len_)
219     return false;
220 
221   // The quick index for the first level sons
222   uint16 last_splid = kFullSplIdStart;
223   size_t last_pos = 0;
224   for (size_t i = 1; i < lma_node_num_le0_; i++) {
225     for (uint16 splid = last_splid; splid < root_[i].spl_idx; splid++)
226       splid_le0_index_[splid - kFullSplIdStart] = last_pos;
227 
228     splid_le0_index_[root_[i].spl_idx - kFullSplIdStart] =
229         static_cast<uint16>(i);
230     last_splid = root_[i].spl_idx;
231     last_pos = i;
232   }
233 
234   for (uint16 splid = last_splid + 1;
235        splid < buf_size + kFullSplIdStart; splid++) {
236     assert(static_cast<size_t>(splid - kFullSplIdStart) < buf_size);
237     splid_le0_index_[splid - kFullSplIdStart] = last_pos + 1;
238   }
239 
240   return true;
241 }
242 
load_dict(const char * filename,LemmaIdType start_id,LemmaIdType end_id)243 bool DictTrie::load_dict(const char *filename, LemmaIdType start_id,
244                          LemmaIdType end_id) {
245   if (NULL == filename || end_id <= start_id)
246     return false;
247 
248   FILE *fp = fopen(filename, "rb");
249   if (NULL == fp)
250     return false;
251 
252   free_resource(true);
253 
254   dict_list_ = new DictList();
255   if (NULL == dict_list_) {
256     fclose(fp);
257     return false;
258   }
259 
260   SpellingTrie &spl_trie = SpellingTrie::get_instance();
261   NGram &ngram = NGram::get_instance();
262 
263   if (!spl_trie.load_spl_trie(fp) || !dict_list_->load_list(fp) ||
264       !load_dict(fp) || !ngram.load_ngram(fp) ||
265       total_lma_num_ > end_id - start_id + 1) {
266     free_resource(true);
267     fclose(fp);
268     return false;
269   }
270 
271   fclose(fp);
272   return true;
273 }
274 
load_dict_fd(int sys_fd,long start_offset,long length,LemmaIdType start_id,LemmaIdType end_id)275 bool DictTrie::load_dict_fd(int sys_fd, long start_offset,
276                             long length, LemmaIdType start_id,
277                             LemmaIdType end_id) {
278   if (start_offset < 0 || length <= 0 || end_id <= start_id)
279     return false;
280 
281   FILE *fp = fdopen(sys_fd, "rb");
282   if (NULL == fp)
283     return false;
284 
285   if (-1 == fseek(fp, start_offset, SEEK_SET)) {
286     fclose(fp);
287     return false;
288   }
289 
290   free_resource(true);
291 
292   dict_list_ = new DictList();
293   if (NULL == dict_list_) {
294     fclose(fp);
295     return false;
296   }
297 
298   SpellingTrie &spl_trie = SpellingTrie::get_instance();
299   NGram &ngram = NGram::get_instance();
300 
301   if (!spl_trie.load_spl_trie(fp) || !dict_list_->load_list(fp) ||
302       !load_dict(fp) || !ngram.load_ngram(fp) ||
303       ftell(fp) < start_offset + length ||
304       total_lma_num_ > end_id - start_id + 1) {
305     free_resource(true);
306     fclose(fp);
307     return false;
308   }
309 
310   fclose(fp);
311   return true;
312 }
313 
fill_lpi_buffer(LmaPsbItem lpi_items[],size_t lpi_max,LmaNodeLE0 * node)314 size_t DictTrie::fill_lpi_buffer(LmaPsbItem lpi_items[], size_t lpi_max,
315                                  LmaNodeLE0 *node) {
316   size_t lpi_num = 0;
317   NGram& ngram = NGram::get_instance();
318   for (size_t homo = 0; homo < (size_t)node->num_of_homo; homo++) {
319     lpi_items[lpi_num].id = get_lemma_id(node->homo_idx_buf_off +
320                                          homo);
321     lpi_items[lpi_num].lma_len = 1;
322     lpi_items[lpi_num].psb =
323         static_cast<LmaScoreType>(ngram.get_uni_psb(lpi_items[lpi_num].id));
324     lpi_num++;
325     if (lpi_num >= lpi_max)
326       break;
327   }
328 
329   return lpi_num;
330 }
331 
fill_lpi_buffer(LmaPsbItem lpi_items[],size_t lpi_max,size_t homo_buf_off,LmaNodeGE1 * node,uint16 lma_len)332 size_t DictTrie::fill_lpi_buffer(LmaPsbItem lpi_items[], size_t lpi_max,
333                                  size_t homo_buf_off, LmaNodeGE1 *node,
334                                  uint16 lma_len) {
335   size_t lpi_num = 0;
336   NGram& ngram = NGram::get_instance();
337   for (size_t homo = 0; homo < (size_t)node->num_of_homo; homo++) {
338     lpi_items[lpi_num].id = get_lemma_id(homo_buf_off + homo);
339     lpi_items[lpi_num].lma_len = lma_len;
340     lpi_items[lpi_num].psb =
341         static_cast<LmaScoreType>(ngram.get_uni_psb(lpi_items[lpi_num].id));
342     lpi_num++;
343     if (lpi_num >= lpi_max)
344       break;
345   }
346 
347   return lpi_num;
348 }
349 
reset_milestones(uint16 from_step,MileStoneHandle from_handle)350 void DictTrie::reset_milestones(uint16 from_step, MileStoneHandle from_handle) {
351   if (0 == from_step) {
352     parsing_marks_pos_ = 0;
353     mile_stones_pos_ = kFirstValidMileStoneHandle;
354   } else {
355     if (from_handle > 0 && from_handle < mile_stones_pos_) {
356       mile_stones_pos_ = from_handle;
357 
358       MileStone *mile_stone = mile_stones_ + from_handle;
359       parsing_marks_pos_ = mile_stone->mark_start;
360     }
361   }
362 }
363 
extend_dict(MileStoneHandle from_handle,const DictExtPara * dep,LmaPsbItem * lpi_items,size_t lpi_max,size_t * lpi_num)364 MileStoneHandle DictTrie::extend_dict(MileStoneHandle from_handle,
365                                       const DictExtPara *dep,
366                                       LmaPsbItem *lpi_items, size_t lpi_max,
367                                       size_t *lpi_num) {
368   if (NULL == dep)
369     return 0;
370 
371   // from LmaNodeLE0 (root) to LmaNodeLE0
372   if (0 == from_handle) {
373     assert(0 == dep->splids_extended);
374     return extend_dict0(from_handle, dep, lpi_items, lpi_max, lpi_num);
375   }
376 
377   // from LmaNodeLE0 to LmaNodeGE1
378   if (1 == dep->splids_extended)
379     return extend_dict1(from_handle, dep, lpi_items, lpi_max, lpi_num);
380 
381   // From LmaNodeGE1 to LmaNodeGE1
382   return extend_dict2(from_handle, dep, lpi_items, lpi_max, lpi_num);
383 }
384 
extend_dict0(MileStoneHandle from_handle,const DictExtPara * dep,LmaPsbItem * lpi_items,size_t lpi_max,size_t * lpi_num)385 MileStoneHandle DictTrie::extend_dict0(MileStoneHandle from_handle,
386                                        const DictExtPara *dep,
387                                        LmaPsbItem *lpi_items,
388                                        size_t lpi_max, size_t *lpi_num) {
389   assert(NULL != dep && 0 == from_handle);
390   *lpi_num = 0;
391   MileStoneHandle ret_handle = 0;
392 
393   uint16 splid = dep->splids[dep->splids_extended];
394   uint16 id_start = dep->id_start;
395   uint16 id_num = dep->id_num;
396 
397   LpiCache& lpi_cache = LpiCache::get_instance();
398   bool cached = lpi_cache.is_cached(splid);
399 
400   // 2. Begin exgtending
401   // 2.1 Get the LmaPsbItem list
402   LmaNodeLE0 *node = root_;
403   size_t son_start = splid_le0_index_[id_start - kFullSplIdStart];
404   size_t son_end = splid_le0_index_[id_start + id_num - kFullSplIdStart];
405   for (size_t son_pos = son_start; son_pos < son_end; son_pos++) {
406     assert(1 == node->son_1st_off);
407     LmaNodeLE0 *son = root_ + son_pos;
408     assert(son->spl_idx >= id_start && son->spl_idx < id_start + id_num);
409 
410     if (!cached && *lpi_num < lpi_max) {
411       bool need_lpi = true;
412       if (spl_trie_->is_half_id_yunmu(splid) && son_pos != son_start)
413         need_lpi = false;
414 
415       if (need_lpi)
416         *lpi_num += fill_lpi_buffer(lpi_items + (*lpi_num),
417                                     lpi_max - *lpi_num, son);
418     }
419 
420     // If necessary, fill in a new mile stone.
421     if (son->spl_idx == id_start) {
422       if (mile_stones_pos_ < kMaxMileStone &&
423           parsing_marks_pos_ < kMaxParsingMark) {
424         parsing_marks_[parsing_marks_pos_].node_offset = son_pos;
425         parsing_marks_[parsing_marks_pos_].node_num = id_num;
426         mile_stones_[mile_stones_pos_].mark_start = parsing_marks_pos_;
427         mile_stones_[mile_stones_pos_].mark_num = 1;
428         ret_handle = mile_stones_pos_;
429         parsing_marks_pos_++;
430         mile_stones_pos_++;
431       }
432     }
433 
434     if (son->spl_idx >= id_start + id_num -1)
435       break;
436   }
437 
438   //  printf("----- parsing marks: %d, mile stone: %d \n", parsing_marks_pos_,
439   //      mile_stones_pos_);
440   return ret_handle;
441 }
442 
extend_dict1(MileStoneHandle from_handle,const DictExtPara * dep,LmaPsbItem * lpi_items,size_t lpi_max,size_t * lpi_num)443 MileStoneHandle DictTrie::extend_dict1(MileStoneHandle from_handle,
444                                        const DictExtPara *dep,
445                                        LmaPsbItem *lpi_items,
446                                        size_t lpi_max, size_t *lpi_num) {
447   assert(NULL != dep && from_handle > 0 && from_handle < mile_stones_pos_);
448 
449   MileStoneHandle ret_handle = 0;
450 
451   // 1. If this is a half Id, get its corresponding full starting Id and
452   // number of full Id.
453   size_t ret_val = 0;
454 
455   uint16 id_start = dep->id_start;
456   uint16 id_num = dep->id_num;
457 
458   // 2. Begin extending.
459   MileStone *mile_stone = mile_stones_ + from_handle;
460 
461   for (uint16 h_pos = 0; h_pos < mile_stone->mark_num; h_pos++) {
462     ParsingMark p_mark = parsing_marks_[mile_stone->mark_start + h_pos];
463     uint16 ext_num = p_mark.node_num;
464     for (uint16 ext_pos = 0; ext_pos < ext_num; ext_pos++) {
465       LmaNodeLE0 *node = root_ + p_mark.node_offset + ext_pos;
466       size_t found_start = 0;
467       size_t found_num = 0;
468       for (size_t son_pos = 0; son_pos < (size_t)node->num_of_son; son_pos++) {
469         assert(node->son_1st_off <= lma_node_num_ge1_);
470         LmaNodeGE1 *son = nodes_ge1_ + node->son_1st_off + son_pos;
471         if (son->spl_idx >= id_start
472             && son->spl_idx < id_start + id_num) {
473           if (*lpi_num < lpi_max) {
474             size_t homo_buf_off = get_homo_idx_buf_offset(son);
475             *lpi_num += fill_lpi_buffer(lpi_items + (*lpi_num),
476                                         lpi_max - *lpi_num, homo_buf_off, son,
477                                         2);
478           }
479 
480           // If necessary, fill in the new DTMI
481           if (0 == found_num) {
482             found_start = son_pos;
483           }
484           found_num++;
485         }
486         if (son->spl_idx >= id_start + id_num - 1 || son_pos ==
487             (size_t)node->num_of_son - 1) {
488           if (found_num > 0) {
489             if (mile_stones_pos_ < kMaxMileStone &&
490                 parsing_marks_pos_ < kMaxParsingMark) {
491               parsing_marks_[parsing_marks_pos_].node_offset =
492                 node->son_1st_off + found_start;
493               parsing_marks_[parsing_marks_pos_].node_num = found_num;
494               if (0 == ret_val)
495                 mile_stones_[mile_stones_pos_].mark_start =
496                   parsing_marks_pos_;
497               parsing_marks_pos_++;
498             }
499 
500             ret_val++;
501           }
502           break;
503         }  // for son_pos
504       }  // for ext_pos
505     }  // for h_pos
506   }
507 
508   if (ret_val > 0) {
509     mile_stones_[mile_stones_pos_].mark_num = ret_val;
510     ret_handle = mile_stones_pos_;
511     mile_stones_pos_++;
512     ret_val = 1;
513   }
514 
515   //  printf("----- parsing marks: %d, mile stone: %d \n", parsing_marks_pos_,
516   //         mile_stones_pos_);
517   return ret_handle;
518 }
519 
extend_dict2(MileStoneHandle from_handle,const DictExtPara * dep,LmaPsbItem * lpi_items,size_t lpi_max,size_t * lpi_num)520 MileStoneHandle DictTrie::extend_dict2(MileStoneHandle from_handle,
521                                        const DictExtPara *dep,
522                                        LmaPsbItem *lpi_items,
523                                        size_t lpi_max, size_t *lpi_num) {
524   assert(NULL != dep && from_handle > 0 && from_handle < mile_stones_pos_);
525 
526   MileStoneHandle ret_handle = 0;
527 
528   // 1. If this is a half Id, get its corresponding full starting Id and
529   // number of full Id.
530   size_t ret_val = 0;
531 
532   uint16 id_start = dep->id_start;
533   uint16 id_num = dep->id_num;
534 
535   // 2. Begin extending.
536   MileStone *mile_stone = mile_stones_ + from_handle;
537 
538   for (uint16 h_pos = 0; h_pos < mile_stone->mark_num; h_pos++) {
539     ParsingMark p_mark = parsing_marks_[mile_stone->mark_start + h_pos];
540     uint16 ext_num = p_mark.node_num;
541     for (uint16 ext_pos = 0; ext_pos < ext_num; ext_pos++) {
542       LmaNodeGE1 *node = nodes_ge1_ + p_mark.node_offset + ext_pos;
543       size_t found_start = 0;
544       size_t found_num = 0;
545 
546       for (size_t son_pos = 0; son_pos < (size_t)node->num_of_son; son_pos++) {
547         assert(node->son_1st_off_l > 0 || node->son_1st_off_h > 0);
548         LmaNodeGE1 *son = nodes_ge1_ + get_son_offset(node) + son_pos;
549         if (son->spl_idx >= id_start
550             && son->spl_idx < id_start + id_num) {
551           if (*lpi_num < lpi_max) {
552             size_t homo_buf_off = get_homo_idx_buf_offset(son);
553             *lpi_num += fill_lpi_buffer(lpi_items + (*lpi_num),
554                                         lpi_max - *lpi_num, homo_buf_off, son,
555                                         dep->splids_extended + 1);
556           }
557 
558           // If necessary, fill in the new DTMI
559           if (0 == found_num) {
560             found_start = son_pos;
561           }
562           found_num++;
563         }
564         if (son->spl_idx >= id_start + id_num - 1 || son_pos ==
565             (size_t)node->num_of_son - 1) {
566           if (found_num > 0) {
567             if (mile_stones_pos_ < kMaxMileStone &&
568                 parsing_marks_pos_ < kMaxParsingMark) {
569               parsing_marks_[parsing_marks_pos_].node_offset =
570                 get_son_offset(node) + found_start;
571               parsing_marks_[parsing_marks_pos_].node_num = found_num;
572               if (0 == ret_val)
573                 mile_stones_[mile_stones_pos_].mark_start =
574                   parsing_marks_pos_;
575               parsing_marks_pos_++;
576             }
577 
578             ret_val++;
579           }
580           break;
581         }
582       }  // for son_pos
583     }  // for ext_pos
584   }  // for h_pos
585 
586   if (ret_val > 0) {
587     mile_stones_[mile_stones_pos_].mark_num = ret_val;
588     ret_handle = mile_stones_pos_;
589     mile_stones_pos_++;
590   }
591 
592   // printf("----- parsing marks: %d, mile stone: %d \n", parsing_marks_pos_,
593   //        mile_stones_pos_);
594   return ret_handle;
595 }
596 
try_extend(const uint16 * splids,uint16 splid_num,LemmaIdType id_lemma)597 bool DictTrie::try_extend(const uint16 *splids, uint16 splid_num,
598                           LemmaIdType id_lemma) {
599   if (0 == splid_num || NULL == splids)
600     return false;
601 
602   void *node = root_ + splid_le0_index_[splids[0] - kFullSplIdStart];
603 
604   for (uint16 pos = 1; pos < splid_num; pos++) {
605     if (1 == pos) {
606       LmaNodeLE0 *node_le0 = reinterpret_cast<LmaNodeLE0*>(node);
607       LmaNodeGE1 *node_son;
608       uint16 son_pos;
609       for (son_pos = 0; son_pos < static_cast<uint16>(node_le0->num_of_son);
610            son_pos++) {
611         assert(node_le0->son_1st_off <= lma_node_num_ge1_);
612         node_son = nodes_ge1_ + node_le0->son_1st_off
613             + son_pos;
614         if (node_son->spl_idx == splids[pos])
615           break;
616       }
617       if (son_pos < node_le0->num_of_son)
618         node = reinterpret_cast<void*>(node_son);
619       else
620         return false;
621     } else {
622       LmaNodeGE1 *node_ge1 = reinterpret_cast<LmaNodeGE1*>(node);
623       LmaNodeGE1 *node_son;
624       uint16 son_pos;
625       for (son_pos = 0; son_pos < static_cast<uint16>(node_ge1->num_of_son);
626            son_pos++) {
627         assert(node_ge1->son_1st_off_l > 0 || node_ge1->son_1st_off_h > 0);
628         node_son = nodes_ge1_ + get_son_offset(node_ge1) + son_pos;
629         if (node_son->spl_idx == splids[pos])
630           break;
631       }
632       if (son_pos < node_ge1->num_of_son)
633         node = reinterpret_cast<void*>(node_son);
634       else
635         return false;
636     }
637   }
638 
639   if (1 == splid_num) {
640     LmaNodeLE0* node_le0 = reinterpret_cast<LmaNodeLE0*>(node);
641     size_t num_of_homo = (size_t)node_le0->num_of_homo;
642     for (size_t homo_pos = 0; homo_pos < num_of_homo; homo_pos++) {
643       LemmaIdType id_this = get_lemma_id(node_le0->homo_idx_buf_off + homo_pos);
644       char16 str[2];
645       get_lemma_str(id_this, str, 2);
646       if (id_this == id_lemma)
647         return true;
648     }
649   } else {
650     LmaNodeGE1* node_ge1 = reinterpret_cast<LmaNodeGE1*>(node);
651     size_t num_of_homo = (size_t)node_ge1->num_of_homo;
652     for (size_t homo_pos = 0; homo_pos < num_of_homo; homo_pos++) {
653       size_t node_homo_off = get_homo_idx_buf_offset(node_ge1);
654       if (get_lemma_id(node_homo_off + homo_pos) == id_lemma)
655         return true;
656     }
657   }
658 
659   return false;
660 }
661 
get_lpis(const uint16 * splid_str,uint16 splid_str_len,LmaPsbItem * lma_buf,size_t max_lma_buf)662 size_t DictTrie::get_lpis(const uint16* splid_str, uint16 splid_str_len,
663                           LmaPsbItem* lma_buf, size_t max_lma_buf) {
664   if (splid_str_len > kMaxLemmaSize)
665     return 0;
666 
667 #define MAX_EXTENDBUF_LEN 200
668 
669   size_t* node_buf1[MAX_EXTENDBUF_LEN];  // use size_t for data alignment
670   size_t* node_buf2[MAX_EXTENDBUF_LEN];
671   LmaNodeLE0** node_fr_le0 =
672     reinterpret_cast<LmaNodeLE0**>(node_buf1);      // Nodes from.
673   LmaNodeLE0** node_to_le0 =
674     reinterpret_cast<LmaNodeLE0**>(node_buf2);      // Nodes to.
675   LmaNodeGE1** node_fr_ge1 = NULL;
676   LmaNodeGE1** node_to_ge1 = NULL;
677   size_t node_fr_num = 1;
678   size_t node_to_num = 0;
679   node_fr_le0[0] = root_;
680   if (NULL == node_fr_le0[0])
681     return 0;
682 
683   size_t spl_pos = 0;
684 
685   while (spl_pos < splid_str_len) {
686     uint16 id_num = 1;
687     uint16 id_start = splid_str[spl_pos];
688     // If it is a half id
689     if (spl_trie_->is_half_id(splid_str[spl_pos])) {
690       id_num = spl_trie_->half_to_full(splid_str[spl_pos], &id_start);
691       assert(id_num > 0);
692     }
693 
694     // Extend the nodes
695     if (0 == spl_pos) {  // From LmaNodeLE0 (root) to LmaNodeLE0 nodes
696       for (size_t node_fr_pos = 0; node_fr_pos < node_fr_num; node_fr_pos++) {
697         LmaNodeLE0 *node = node_fr_le0[node_fr_pos];
698         assert(node == root_ && 1 == node_fr_num);
699         size_t son_start = splid_le0_index_[id_start - kFullSplIdStart];
700         size_t son_end =
701             splid_le0_index_[id_start + id_num - kFullSplIdStart];
702         for (size_t son_pos = son_start; son_pos < son_end; son_pos++) {
703           assert(1 == node->son_1st_off);
704           LmaNodeLE0 *node_son = root_ + son_pos;
705           assert(node_son->spl_idx >= id_start
706                  && node_son->spl_idx < id_start + id_num);
707           if (node_to_num < MAX_EXTENDBUF_LEN) {
708             node_to_le0[node_to_num] = node_son;
709             node_to_num++;
710           }
711           // id_start + id_num - 1 is the last one, which has just been
712           // recorded.
713           if (node_son->spl_idx >= id_start + id_num - 1)
714             break;
715         }
716       }
717 
718       spl_pos++;
719       if (spl_pos >= splid_str_len || node_to_num == 0)
720         break;
721       // Prepare the nodes for next extending
722       // next time, from LmaNodeLE0 to LmaNodeGE1
723       LmaNodeLE0** node_tmp = node_fr_le0;
724       node_fr_le0 = node_to_le0;
725       node_to_le0 = NULL;
726       node_to_ge1 = reinterpret_cast<LmaNodeGE1**>(node_tmp);
727     } else if (1 == spl_pos) {  // From LmaNodeLE0 to LmaNodeGE1 nodes
728       for (size_t node_fr_pos = 0; node_fr_pos < node_fr_num; node_fr_pos++) {
729         LmaNodeLE0 *node = node_fr_le0[node_fr_pos];
730         for (size_t son_pos = 0; son_pos < (size_t)node->num_of_son;
731              son_pos++) {
732           assert(node->son_1st_off <= lma_node_num_ge1_);
733           LmaNodeGE1 *node_son = nodes_ge1_ + node->son_1st_off
734                                   + son_pos;
735           if (node_son->spl_idx >= id_start
736               && node_son->spl_idx < id_start + id_num) {
737             if (node_to_num < MAX_EXTENDBUF_LEN) {
738               node_to_ge1[node_to_num] = node_son;
739               node_to_num++;
740             }
741           }
742           // id_start + id_num - 1 is the last one, which has just been
743           // recorded.
744           if (node_son->spl_idx >= id_start + id_num - 1)
745             break;
746         }
747       }
748 
749       spl_pos++;
750       if (spl_pos >= splid_str_len || node_to_num == 0)
751         break;
752       // Prepare the nodes for next extending
753       // next time, from LmaNodeGE1 to LmaNodeGE1
754       node_fr_ge1 = node_to_ge1;
755       node_to_ge1 = reinterpret_cast<LmaNodeGE1**>(node_fr_le0);
756       node_fr_le0 = NULL;
757       node_to_le0 = NULL;
758     } else {  // From LmaNodeGE1 to LmaNodeGE1 nodes
759       for (size_t node_fr_pos = 0; node_fr_pos < node_fr_num; node_fr_pos++) {
760         LmaNodeGE1 *node = node_fr_ge1[node_fr_pos];
761         for (size_t son_pos = 0; son_pos < (size_t)node->num_of_son;
762              son_pos++) {
763           assert(node->son_1st_off_l > 0 || node->son_1st_off_h > 0);
764           LmaNodeGE1 *node_son = nodes_ge1_
765                                   + get_son_offset(node) + son_pos;
766           if (node_son->spl_idx >= id_start
767               && node_son->spl_idx < id_start + id_num) {
768             if (node_to_num < MAX_EXTENDBUF_LEN) {
769               node_to_ge1[node_to_num] = node_son;
770               node_to_num++;
771             }
772           }
773           // id_start + id_num - 1 is the last one, which has just been
774           // recorded.
775           if (node_son->spl_idx >= id_start + id_num - 1)
776             break;
777         }
778       }
779 
780       spl_pos++;
781       if (spl_pos >= splid_str_len || node_to_num == 0)
782         break;
783       // Prepare the nodes for next extending
784       // next time, from LmaNodeGE1 to LmaNodeGE1
785       LmaNodeGE1 **node_tmp = node_fr_ge1;
786       node_fr_ge1 = node_to_ge1;
787       node_to_ge1 = node_tmp;
788     }
789 
790     // The number of node for next extending
791     node_fr_num = node_to_num;
792     node_to_num = 0;
793   }  // while
794 
795   if (0 == node_to_num)
796     return 0;
797 
798   NGram &ngram = NGram::get_instance();
799   size_t lma_num = 0;
800 
801   // If the length is 1, and the splid is a one-char Yunmu like 'a', 'o', 'e',
802   // only those candidates for the full matched one-char id will be returned.
803   if (1 == splid_str_len && spl_trie_->is_half_id_yunmu(splid_str[0]))
804     node_to_num = node_to_num > 0 ? 1 : 0;
805 
806   for (size_t node_pos = 0; node_pos < node_to_num; node_pos++) {
807     size_t num_of_homo = 0;
808     if (spl_pos <= 1) {  // Get from LmaNodeLE0 nodes
809       LmaNodeLE0* node_le0 = node_to_le0[node_pos];
810       num_of_homo = (size_t)node_le0->num_of_homo;
811       for (size_t homo_pos = 0; homo_pos < num_of_homo; homo_pos++) {
812         size_t ch_pos = lma_num + homo_pos;
813         lma_buf[ch_pos].id =
814             get_lemma_id(node_le0->homo_idx_buf_off + homo_pos);
815         lma_buf[ch_pos].lma_len = 1;
816         lma_buf[ch_pos].psb =
817             static_cast<LmaScoreType>(ngram.get_uni_psb(lma_buf[ch_pos].id));
818 
819         if (lma_num + homo_pos >= max_lma_buf - 1)
820           break;
821       }
822     } else {  // Get from LmaNodeGE1 nodes
823       LmaNodeGE1* node_ge1 = node_to_ge1[node_pos];
824       num_of_homo = (size_t)node_ge1->num_of_homo;
825       for (size_t homo_pos = 0; homo_pos < num_of_homo; homo_pos++) {
826         size_t ch_pos = lma_num + homo_pos;
827         size_t node_homo_off = get_homo_idx_buf_offset(node_ge1);
828         lma_buf[ch_pos].id = get_lemma_id(node_homo_off + homo_pos);
829         lma_buf[ch_pos].lma_len = splid_str_len;
830         lma_buf[ch_pos].psb =
831             static_cast<LmaScoreType>(ngram.get_uni_psb(lma_buf[ch_pos].id));
832 
833         if (lma_num + homo_pos >= max_lma_buf - 1)
834           break;
835       }
836     }
837 
838     lma_num += num_of_homo;
839     if (lma_num >= max_lma_buf) {
840       lma_num = max_lma_buf;
841       break;
842     }
843   }
844   return lma_num;
845 }
846 
get_lemma_str(LemmaIdType id_lemma,char16 * str_buf,uint16 str_max)847 uint16 DictTrie::get_lemma_str(LemmaIdType id_lemma, char16 *str_buf,
848                                uint16 str_max) {
849   return dict_list_->get_lemma_str(id_lemma, str_buf, str_max);
850 }
851 
get_lemma_splids(LemmaIdType id_lemma,uint16 * splids,uint16 splids_max,bool arg_valid)852 uint16 DictTrie::get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
853                                   uint16 splids_max, bool arg_valid) {
854   char16 lma_str[kMaxLemmaSize + 1];
855   uint16 lma_len = get_lemma_str(id_lemma, lma_str, kMaxLemmaSize + 1);
856   assert((!arg_valid && splids_max >= lma_len) || lma_len == splids_max);
857 
858   uint16 spl_mtrx[kMaxLemmaSize * 5];
859   uint16 spl_start[kMaxLemmaSize + 1];
860   spl_start[0] = 0;
861   uint16 try_num = 1;
862 
863   for (uint16 pos = 0; pos < lma_len; pos++) {
864     uint16 cand_splids_this = 0;
865     if (arg_valid && spl_trie_->is_full_id(splids[pos])) {
866       spl_mtrx[spl_start[pos]] = splids[pos];
867       cand_splids_this = 1;
868     } else {
869       cand_splids_this = dict_list_->get_splids_for_hanzi(lma_str[pos],
870           arg_valid ? splids[pos] : 0, spl_mtrx + spl_start[pos],
871           kMaxLemmaSize * 5 - spl_start[pos]);
872       assert(cand_splids_this > 0);
873     }
874     spl_start[pos + 1] = spl_start[pos] + cand_splids_this;
875     try_num *= cand_splids_this;
876   }
877 
878   for (uint16 try_pos = 0; try_pos < try_num; try_pos++) {
879     uint16 mod = 1;
880     for (uint16 pos = 0; pos < lma_len; pos++) {
881       uint16 radix = spl_start[pos + 1] - spl_start[pos];
882       splids[pos] = spl_mtrx[ spl_start[pos] + try_pos / mod % radix];
883       mod *= radix;
884     }
885 
886     if (try_extend(splids, lma_len, id_lemma))
887       return lma_len;
888   }
889 
890   return 0;
891 }
892 
set_total_lemma_count_of_others(size_t count)893 void DictTrie::set_total_lemma_count_of_others(size_t count) {
894   NGram& ngram = NGram::get_instance();
895   ngram.set_total_freq_none_sys(count);
896 }
897 
convert_to_hanzis(char16 * str,uint16 str_len)898 void DictTrie::convert_to_hanzis(char16 *str, uint16 str_len) {
899   return dict_list_->convert_to_hanzis(str, str_len);
900 }
901 
convert_to_scis_ids(char16 * str,uint16 str_len)902 void DictTrie::convert_to_scis_ids(char16 *str, uint16 str_len) {
903   return dict_list_->convert_to_scis_ids(str, str_len);
904 }
905 
get_lemma_id(const char16 lemma_str[],uint16 lemma_len)906 LemmaIdType DictTrie::get_lemma_id(const char16 lemma_str[], uint16 lemma_len) {
907   if (NULL == lemma_str || lemma_len > kMaxLemmaSize)
908     return 0;
909 
910   return dict_list_->get_lemma_id(lemma_str, lemma_len);
911 }
912 
predict_top_lmas(size_t his_len,NPredictItem * npre_items,size_t npre_max,size_t b4_used)913 size_t DictTrie::predict_top_lmas(size_t his_len, NPredictItem *npre_items,
914                                   size_t npre_max, size_t b4_used) {
915   NGram &ngram = NGram::get_instance();
916 
917   size_t item_num = 0;
918   size_t top_lmas_id_offset = lma_idx_buf_len_ / kLemmaIdSize - top_lmas_num_;
919   size_t top_lmas_pos = 0;
920   while (item_num < npre_max && top_lmas_pos < top_lmas_num_) {
921     memset(npre_items + item_num, 0, sizeof(NPredictItem));
922     LemmaIdType top_lma_id = get_lemma_id(top_lmas_id_offset + top_lmas_pos);
923     top_lmas_pos += 1;
924     if (dict_list_->get_lemma_str(top_lma_id,
925                                   npre_items[item_num].pre_hzs,
926                                   kMaxLemmaSize - 1) == 0) {
927       continue;
928     }
929     npre_items[item_num].psb = ngram.get_uni_psb(top_lma_id);
930     npre_items[item_num].his_len = his_len;
931     item_num++;
932   }
933   return item_num;
934 }
935 
predict(const char16 * last_hzs,uint16 hzs_len,NPredictItem * npre_items,size_t npre_max,size_t b4_used)936 size_t DictTrie::predict(const char16 *last_hzs, uint16 hzs_len,
937                          NPredictItem *npre_items, size_t npre_max,
938                          size_t b4_used) {
939   return dict_list_->predict(last_hzs, hzs_len, npre_items, npre_max, b4_used);
940 }
941 }  // namespace ime_pinyin
942