1 /*
2 * Copyright (C) 2009 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <assert.h>
18 #include <stdio.h>
19 #include <string.h>
20 #include "../include/dicttrie.h"
21 #include "../include/dictbuilder.h"
22 #include "../include/lpicache.h"
23 #include "../include/mystdlib.h"
24 #include "../include/ngram.h"
25
26 namespace ime_pinyin {
27
DictTrie()28 DictTrie::DictTrie() {
29 spl_trie_ = SpellingTrie::get_cpinstance();
30
31 root_ = NULL;
32 splid_le0_index_ = NULL;
33 lma_node_num_le0_ = 0;
34 nodes_ge1_ = NULL;
35 lma_node_num_ge1_ = 0;
36 lma_idx_buf_ = NULL;
37 lma_idx_buf_len_ = 0;
38 total_lma_num_ = 0;
39 top_lmas_num_ = 0;
40 dict_list_ = NULL;
41
42 parsing_marks_ = NULL;
43 mile_stones_ = NULL;
44 reset_milestones(0, kFirstValidMileStoneHandle);
45 }
46
~DictTrie()47 DictTrie::~DictTrie() {
48 free_resource(true);
49 }
50
free_resource(bool free_dict_list)51 void DictTrie::free_resource(bool free_dict_list) {
52 if (NULL != root_)
53 free(root_);
54 root_ = NULL;
55
56 if (NULL != splid_le0_index_)
57 free(splid_le0_index_);
58 splid_le0_index_ = NULL;
59
60 if (NULL != nodes_ge1_)
61 free(nodes_ge1_);
62 nodes_ge1_ = NULL;
63
64 if (NULL != lma_idx_buf_)
65 free(lma_idx_buf_);
66 lma_idx_buf_ = NULL;
67
68 if (free_dict_list) {
69 if (NULL != dict_list_) {
70 delete dict_list_;
71 }
72 dict_list_ = NULL;
73 }
74
75 if (parsing_marks_)
76 delete [] parsing_marks_;
77 parsing_marks_ = NULL;
78
79 if (mile_stones_)
80 delete [] mile_stones_;
81 mile_stones_ = NULL;
82
83 reset_milestones(0, kFirstValidMileStoneHandle);
84 }
85
get_son_offset(const LmaNodeGE1 * node)86 inline size_t DictTrie::get_son_offset(const LmaNodeGE1 *node) {
87 return ((size_t)node->son_1st_off_l + ((size_t)node->son_1st_off_h << 16));
88 }
89
get_homo_idx_buf_offset(const LmaNodeGE1 * node)90 inline size_t DictTrie::get_homo_idx_buf_offset(const LmaNodeGE1 *node) {
91 return ((size_t)node->homo_idx_buf_off_l +
92 ((size_t)node->homo_idx_buf_off_h << 16));
93 }
94
get_lemma_id(size_t id_offset)95 inline LemmaIdType DictTrie::get_lemma_id(size_t id_offset) {
96 LemmaIdType id = 0;
97 for (uint16 pos = kLemmaIdSize - 1; pos > 0; pos--)
98 id = (id << 8) + lma_idx_buf_[id_offset * kLemmaIdSize + pos];
99 id = (id << 8) + lma_idx_buf_[id_offset * kLemmaIdSize];
100 return id;
101 }
102
103 #ifdef ___BUILD_MODEL___
build_dict(const char * fn_raw,const char * fn_validhzs)104 bool DictTrie::build_dict(const char* fn_raw, const char* fn_validhzs) {
105 DictBuilder* dict_builder = new DictBuilder();
106
107 free_resource(true);
108
109 return dict_builder->build_dict(fn_raw, fn_validhzs, this);
110 }
111
save_dict(FILE * fp)112 bool DictTrie::save_dict(FILE *fp) {
113 if (NULL == fp)
114 return false;
115
116 if (fwrite(&lma_node_num_le0_, sizeof(uint32), 1, fp) != 1)
117 return false;
118
119 if (fwrite(&lma_node_num_ge1_, sizeof(uint32), 1, fp) != 1)
120 return false;
121
122 if (fwrite(&lma_idx_buf_len_, sizeof(uint32), 1, fp) != 1)
123 return false;
124
125 if (fwrite(&top_lmas_num_, sizeof(uint32), 1, fp) != 1)
126 return false;
127
128 if (fwrite(root_, sizeof(LmaNodeLE0), lma_node_num_le0_, fp)
129 != lma_node_num_le0_)
130 return false;
131
132 if (fwrite(nodes_ge1_, sizeof(LmaNodeGE1), lma_node_num_ge1_, fp)
133 != lma_node_num_ge1_)
134 return false;
135
136 if (fwrite(lma_idx_buf_, sizeof(unsigned char), lma_idx_buf_len_, fp) !=
137 lma_idx_buf_len_)
138 return false;
139
140 return true;
141 }
142
save_dict(const char * filename)143 bool DictTrie::save_dict(const char *filename) {
144 if (NULL == filename)
145 return false;
146
147 if (NULL == root_ || NULL == dict_list_)
148 return false;
149
150 SpellingTrie &spl_trie = SpellingTrie::get_instance();
151 NGram &ngram = NGram::get_instance();
152
153 FILE *fp = fopen(filename, "wb");
154 if (NULL == fp)
155 return false;
156
157 if (!spl_trie.save_spl_trie(fp) || !dict_list_->save_list(fp) ||
158 !save_dict(fp) || !ngram.save_ngram(fp)) {
159 fclose(fp);
160 return false;
161 }
162
163 fclose(fp);
164 return true;
165 }
166 #endif // ___BUILD_MODEL___
167
load_dict(FILE * fp)168 bool DictTrie::load_dict(FILE *fp) {
169 if (NULL == fp)
170 return false;
171 if (fread(&lma_node_num_le0_, sizeof(uint32), 1, fp) != 1)
172 return false;
173
174 if (fread(&lma_node_num_ge1_, sizeof(uint32), 1, fp) != 1)
175 return false;
176
177 if (fread(&lma_idx_buf_len_, sizeof(uint32), 1, fp) != 1)
178 return false;
179
180 if (fread(&top_lmas_num_, sizeof(uint32), 1, fp) != 1 ||
181 top_lmas_num_ >= lma_idx_buf_len_)
182 return false;
183
184 free_resource(false);
185
186 root_ = static_cast<LmaNodeLE0*>
187 (malloc(lma_node_num_le0_ * sizeof(LmaNodeLE0)));
188 nodes_ge1_ = static_cast<LmaNodeGE1*>
189 (malloc(lma_node_num_ge1_ * sizeof(LmaNodeGE1)));
190 lma_idx_buf_ = (unsigned char*)malloc(lma_idx_buf_len_);
191 total_lma_num_ = lma_idx_buf_len_ / kLemmaIdSize;
192
193 size_t buf_size = SpellingTrie::get_instance().get_spelling_num() + 1;
194 assert(lma_node_num_le0_ <= buf_size);
195 splid_le0_index_ = static_cast<uint16*>(malloc(buf_size * sizeof(uint16)));
196
197 // Init the space for parsing.
198 parsing_marks_ = new ParsingMark[kMaxParsingMark];
199 mile_stones_ = new MileStone[kMaxMileStone];
200 reset_milestones(0, kFirstValidMileStoneHandle);
201
202 if (NULL == root_ || NULL == nodes_ge1_ || NULL == lma_idx_buf_ ||
203 NULL == splid_le0_index_ || NULL == parsing_marks_ ||
204 NULL == mile_stones_) {
205 free_resource(false);
206 return false;
207 }
208
209 if (fread(root_, sizeof(LmaNodeLE0), lma_node_num_le0_, fp)
210 != lma_node_num_le0_)
211 return false;
212
213 if (fread(nodes_ge1_, sizeof(LmaNodeGE1), lma_node_num_ge1_, fp)
214 != lma_node_num_ge1_)
215 return false;
216
217 if (fread(lma_idx_buf_, sizeof(unsigned char), lma_idx_buf_len_, fp) !=
218 lma_idx_buf_len_)
219 return false;
220
221 // The quick index for the first level sons
222 uint16 last_splid = kFullSplIdStart;
223 size_t last_pos = 0;
224 for (size_t i = 1; i < lma_node_num_le0_; i++) {
225 for (uint16 splid = last_splid; splid < root_[i].spl_idx; splid++)
226 splid_le0_index_[splid - kFullSplIdStart] = last_pos;
227
228 splid_le0_index_[root_[i].spl_idx - kFullSplIdStart] =
229 static_cast<uint16>(i);
230 last_splid = root_[i].spl_idx;
231 last_pos = i;
232 }
233
234 for (uint16 splid = last_splid + 1;
235 splid < buf_size + kFullSplIdStart; splid++) {
236 assert(static_cast<size_t>(splid - kFullSplIdStart) < buf_size);
237 splid_le0_index_[splid - kFullSplIdStart] = last_pos + 1;
238 }
239
240 return true;
241 }
242
load_dict(const char * filename,LemmaIdType start_id,LemmaIdType end_id)243 bool DictTrie::load_dict(const char *filename, LemmaIdType start_id,
244 LemmaIdType end_id) {
245 if (NULL == filename || end_id <= start_id)
246 return false;
247
248 FILE *fp = fopen(filename, "rb");
249 if (NULL == fp)
250 return false;
251
252 free_resource(true);
253
254 dict_list_ = new DictList();
255 if (NULL == dict_list_) {
256 fclose(fp);
257 return false;
258 }
259
260 SpellingTrie &spl_trie = SpellingTrie::get_instance();
261 NGram &ngram = NGram::get_instance();
262
263 if (!spl_trie.load_spl_trie(fp) || !dict_list_->load_list(fp) ||
264 !load_dict(fp) || !ngram.load_ngram(fp) ||
265 total_lma_num_ > end_id - start_id + 1) {
266 free_resource(true);
267 fclose(fp);
268 return false;
269 }
270
271 fclose(fp);
272 return true;
273 }
274
load_dict_fd(int sys_fd,long start_offset,long length,LemmaIdType start_id,LemmaIdType end_id)275 bool DictTrie::load_dict_fd(int sys_fd, long start_offset,
276 long length, LemmaIdType start_id,
277 LemmaIdType end_id) {
278 if (start_offset < 0 || length <= 0 || end_id <= start_id)
279 return false;
280
281 FILE *fp = fdopen(sys_fd, "rb");
282 if (NULL == fp)
283 return false;
284
285 if (-1 == fseek(fp, start_offset, SEEK_SET)) {
286 fclose(fp);
287 return false;
288 }
289
290 free_resource(true);
291
292 dict_list_ = new DictList();
293 if (NULL == dict_list_) {
294 fclose(fp);
295 return false;
296 }
297
298 SpellingTrie &spl_trie = SpellingTrie::get_instance();
299 NGram &ngram = NGram::get_instance();
300
301 if (!spl_trie.load_spl_trie(fp) || !dict_list_->load_list(fp) ||
302 !load_dict(fp) || !ngram.load_ngram(fp) ||
303 ftell(fp) < start_offset + length ||
304 total_lma_num_ > end_id - start_id + 1) {
305 free_resource(true);
306 fclose(fp);
307 return false;
308 }
309
310 fclose(fp);
311 return true;
312 }
313
fill_lpi_buffer(LmaPsbItem lpi_items[],size_t lpi_max,LmaNodeLE0 * node)314 size_t DictTrie::fill_lpi_buffer(LmaPsbItem lpi_items[], size_t lpi_max,
315 LmaNodeLE0 *node) {
316 size_t lpi_num = 0;
317 NGram& ngram = NGram::get_instance();
318 for (size_t homo = 0; homo < (size_t)node->num_of_homo; homo++) {
319 lpi_items[lpi_num].id = get_lemma_id(node->homo_idx_buf_off +
320 homo);
321 lpi_items[lpi_num].lma_len = 1;
322 lpi_items[lpi_num].psb =
323 static_cast<LmaScoreType>(ngram.get_uni_psb(lpi_items[lpi_num].id));
324 lpi_num++;
325 if (lpi_num >= lpi_max)
326 break;
327 }
328
329 return lpi_num;
330 }
331
fill_lpi_buffer(LmaPsbItem lpi_items[],size_t lpi_max,size_t homo_buf_off,LmaNodeGE1 * node,uint16 lma_len)332 size_t DictTrie::fill_lpi_buffer(LmaPsbItem lpi_items[], size_t lpi_max,
333 size_t homo_buf_off, LmaNodeGE1 *node,
334 uint16 lma_len) {
335 size_t lpi_num = 0;
336 NGram& ngram = NGram::get_instance();
337 for (size_t homo = 0; homo < (size_t)node->num_of_homo; homo++) {
338 lpi_items[lpi_num].id = get_lemma_id(homo_buf_off + homo);
339 lpi_items[lpi_num].lma_len = lma_len;
340 lpi_items[lpi_num].psb =
341 static_cast<LmaScoreType>(ngram.get_uni_psb(lpi_items[lpi_num].id));
342 lpi_num++;
343 if (lpi_num >= lpi_max)
344 break;
345 }
346
347 return lpi_num;
348 }
349
reset_milestones(uint16 from_step,MileStoneHandle from_handle)350 void DictTrie::reset_milestones(uint16 from_step, MileStoneHandle from_handle) {
351 if (0 == from_step) {
352 parsing_marks_pos_ = 0;
353 mile_stones_pos_ = kFirstValidMileStoneHandle;
354 } else {
355 if (from_handle > 0 && from_handle < mile_stones_pos_) {
356 mile_stones_pos_ = from_handle;
357
358 MileStone *mile_stone = mile_stones_ + from_handle;
359 parsing_marks_pos_ = mile_stone->mark_start;
360 }
361 }
362 }
363
extend_dict(MileStoneHandle from_handle,const DictExtPara * dep,LmaPsbItem * lpi_items,size_t lpi_max,size_t * lpi_num)364 MileStoneHandle DictTrie::extend_dict(MileStoneHandle from_handle,
365 const DictExtPara *dep,
366 LmaPsbItem *lpi_items, size_t lpi_max,
367 size_t *lpi_num) {
368 if (NULL == dep)
369 return 0;
370
371 // from LmaNodeLE0 (root) to LmaNodeLE0
372 if (0 == from_handle) {
373 assert(0 == dep->splids_extended);
374 return extend_dict0(from_handle, dep, lpi_items, lpi_max, lpi_num);
375 }
376
377 // from LmaNodeLE0 to LmaNodeGE1
378 if (1 == dep->splids_extended)
379 return extend_dict1(from_handle, dep, lpi_items, lpi_max, lpi_num);
380
381 // From LmaNodeGE1 to LmaNodeGE1
382 return extend_dict2(from_handle, dep, lpi_items, lpi_max, lpi_num);
383 }
384
extend_dict0(MileStoneHandle from_handle,const DictExtPara * dep,LmaPsbItem * lpi_items,size_t lpi_max,size_t * lpi_num)385 MileStoneHandle DictTrie::extend_dict0(MileStoneHandle from_handle,
386 const DictExtPara *dep,
387 LmaPsbItem *lpi_items,
388 size_t lpi_max, size_t *lpi_num) {
389 assert(NULL != dep && 0 == from_handle);
390 *lpi_num = 0;
391 MileStoneHandle ret_handle = 0;
392
393 uint16 splid = dep->splids[dep->splids_extended];
394 uint16 id_start = dep->id_start;
395 uint16 id_num = dep->id_num;
396
397 LpiCache& lpi_cache = LpiCache::get_instance();
398 bool cached = lpi_cache.is_cached(splid);
399
400 // 2. Begin exgtending
401 // 2.1 Get the LmaPsbItem list
402 LmaNodeLE0 *node = root_;
403 size_t son_start = splid_le0_index_[id_start - kFullSplIdStart];
404 size_t son_end = splid_le0_index_[id_start + id_num - kFullSplIdStart];
405 for (size_t son_pos = son_start; son_pos < son_end; son_pos++) {
406 assert(1 == node->son_1st_off);
407 LmaNodeLE0 *son = root_ + son_pos;
408 assert(son->spl_idx >= id_start && son->spl_idx < id_start + id_num);
409
410 if (!cached && *lpi_num < lpi_max) {
411 bool need_lpi = true;
412 if (spl_trie_->is_half_id_yunmu(splid) && son_pos != son_start)
413 need_lpi = false;
414
415 if (need_lpi)
416 *lpi_num += fill_lpi_buffer(lpi_items + (*lpi_num),
417 lpi_max - *lpi_num, son);
418 }
419
420 // If necessary, fill in a new mile stone.
421 if (son->spl_idx == id_start) {
422 if (mile_stones_pos_ < kMaxMileStone &&
423 parsing_marks_pos_ < kMaxParsingMark) {
424 parsing_marks_[parsing_marks_pos_].node_offset = son_pos;
425 parsing_marks_[parsing_marks_pos_].node_num = id_num;
426 mile_stones_[mile_stones_pos_].mark_start = parsing_marks_pos_;
427 mile_stones_[mile_stones_pos_].mark_num = 1;
428 ret_handle = mile_stones_pos_;
429 parsing_marks_pos_++;
430 mile_stones_pos_++;
431 }
432 }
433
434 if (son->spl_idx >= id_start + id_num -1)
435 break;
436 }
437
438 // printf("----- parsing marks: %d, mile stone: %d \n", parsing_marks_pos_,
439 // mile_stones_pos_);
440 return ret_handle;
441 }
442
extend_dict1(MileStoneHandle from_handle,const DictExtPara * dep,LmaPsbItem * lpi_items,size_t lpi_max,size_t * lpi_num)443 MileStoneHandle DictTrie::extend_dict1(MileStoneHandle from_handle,
444 const DictExtPara *dep,
445 LmaPsbItem *lpi_items,
446 size_t lpi_max, size_t *lpi_num) {
447 assert(NULL != dep && from_handle > 0 && from_handle < mile_stones_pos_);
448
449 MileStoneHandle ret_handle = 0;
450
451 // 1. If this is a half Id, get its corresponding full starting Id and
452 // number of full Id.
453 size_t ret_val = 0;
454
455 uint16 id_start = dep->id_start;
456 uint16 id_num = dep->id_num;
457
458 // 2. Begin extending.
459 MileStone *mile_stone = mile_stones_ + from_handle;
460
461 for (uint16 h_pos = 0; h_pos < mile_stone->mark_num; h_pos++) {
462 ParsingMark p_mark = parsing_marks_[mile_stone->mark_start + h_pos];
463 uint16 ext_num = p_mark.node_num;
464 for (uint16 ext_pos = 0; ext_pos < ext_num; ext_pos++) {
465 LmaNodeLE0 *node = root_ + p_mark.node_offset + ext_pos;
466 size_t found_start = 0;
467 size_t found_num = 0;
468 for (size_t son_pos = 0; son_pos < (size_t)node->num_of_son; son_pos++) {
469 assert(node->son_1st_off <= lma_node_num_ge1_);
470 LmaNodeGE1 *son = nodes_ge1_ + node->son_1st_off + son_pos;
471 if (son->spl_idx >= id_start
472 && son->spl_idx < id_start + id_num) {
473 if (*lpi_num < lpi_max) {
474 size_t homo_buf_off = get_homo_idx_buf_offset(son);
475 *lpi_num += fill_lpi_buffer(lpi_items + (*lpi_num),
476 lpi_max - *lpi_num, homo_buf_off, son,
477 2);
478 }
479
480 // If necessary, fill in the new DTMI
481 if (0 == found_num) {
482 found_start = son_pos;
483 }
484 found_num++;
485 }
486 if (son->spl_idx >= id_start + id_num - 1 || son_pos ==
487 (size_t)node->num_of_son - 1) {
488 if (found_num > 0) {
489 if (mile_stones_pos_ < kMaxMileStone &&
490 parsing_marks_pos_ < kMaxParsingMark) {
491 parsing_marks_[parsing_marks_pos_].node_offset =
492 node->son_1st_off + found_start;
493 parsing_marks_[parsing_marks_pos_].node_num = found_num;
494 if (0 == ret_val)
495 mile_stones_[mile_stones_pos_].mark_start =
496 parsing_marks_pos_;
497 parsing_marks_pos_++;
498 }
499
500 ret_val++;
501 }
502 break;
503 } // for son_pos
504 } // for ext_pos
505 } // for h_pos
506 }
507
508 if (ret_val > 0) {
509 mile_stones_[mile_stones_pos_].mark_num = ret_val;
510 ret_handle = mile_stones_pos_;
511 mile_stones_pos_++;
512 ret_val = 1;
513 }
514
515 // printf("----- parsing marks: %d, mile stone: %d \n", parsing_marks_pos_,
516 // mile_stones_pos_);
517 return ret_handle;
518 }
519
extend_dict2(MileStoneHandle from_handle,const DictExtPara * dep,LmaPsbItem * lpi_items,size_t lpi_max,size_t * lpi_num)520 MileStoneHandle DictTrie::extend_dict2(MileStoneHandle from_handle,
521 const DictExtPara *dep,
522 LmaPsbItem *lpi_items,
523 size_t lpi_max, size_t *lpi_num) {
524 assert(NULL != dep && from_handle > 0 && from_handle < mile_stones_pos_);
525
526 MileStoneHandle ret_handle = 0;
527
528 // 1. If this is a half Id, get its corresponding full starting Id and
529 // number of full Id.
530 size_t ret_val = 0;
531
532 uint16 id_start = dep->id_start;
533 uint16 id_num = dep->id_num;
534
535 // 2. Begin extending.
536 MileStone *mile_stone = mile_stones_ + from_handle;
537
538 for (uint16 h_pos = 0; h_pos < mile_stone->mark_num; h_pos++) {
539 ParsingMark p_mark = parsing_marks_[mile_stone->mark_start + h_pos];
540 uint16 ext_num = p_mark.node_num;
541 for (uint16 ext_pos = 0; ext_pos < ext_num; ext_pos++) {
542 LmaNodeGE1 *node = nodes_ge1_ + p_mark.node_offset + ext_pos;
543 size_t found_start = 0;
544 size_t found_num = 0;
545
546 for (size_t son_pos = 0; son_pos < (size_t)node->num_of_son; son_pos++) {
547 assert(node->son_1st_off_l > 0 || node->son_1st_off_h > 0);
548 LmaNodeGE1 *son = nodes_ge1_ + get_son_offset(node) + son_pos;
549 if (son->spl_idx >= id_start
550 && son->spl_idx < id_start + id_num) {
551 if (*lpi_num < lpi_max) {
552 size_t homo_buf_off = get_homo_idx_buf_offset(son);
553 *lpi_num += fill_lpi_buffer(lpi_items + (*lpi_num),
554 lpi_max - *lpi_num, homo_buf_off, son,
555 dep->splids_extended + 1);
556 }
557
558 // If necessary, fill in the new DTMI
559 if (0 == found_num) {
560 found_start = son_pos;
561 }
562 found_num++;
563 }
564 if (son->spl_idx >= id_start + id_num - 1 || son_pos ==
565 (size_t)node->num_of_son - 1) {
566 if (found_num > 0) {
567 if (mile_stones_pos_ < kMaxMileStone &&
568 parsing_marks_pos_ < kMaxParsingMark) {
569 parsing_marks_[parsing_marks_pos_].node_offset =
570 get_son_offset(node) + found_start;
571 parsing_marks_[parsing_marks_pos_].node_num = found_num;
572 if (0 == ret_val)
573 mile_stones_[mile_stones_pos_].mark_start =
574 parsing_marks_pos_;
575 parsing_marks_pos_++;
576 }
577
578 ret_val++;
579 }
580 break;
581 }
582 } // for son_pos
583 } // for ext_pos
584 } // for h_pos
585
586 if (ret_val > 0) {
587 mile_stones_[mile_stones_pos_].mark_num = ret_val;
588 ret_handle = mile_stones_pos_;
589 mile_stones_pos_++;
590 }
591
592 // printf("----- parsing marks: %d, mile stone: %d \n", parsing_marks_pos_,
593 // mile_stones_pos_);
594 return ret_handle;
595 }
596
try_extend(const uint16 * splids,uint16 splid_num,LemmaIdType id_lemma)597 bool DictTrie::try_extend(const uint16 *splids, uint16 splid_num,
598 LemmaIdType id_lemma) {
599 if (0 == splid_num || NULL == splids)
600 return false;
601
602 void *node = root_ + splid_le0_index_[splids[0] - kFullSplIdStart];
603
604 for (uint16 pos = 1; pos < splid_num; pos++) {
605 if (1 == pos) {
606 LmaNodeLE0 *node_le0 = reinterpret_cast<LmaNodeLE0*>(node);
607 LmaNodeGE1 *node_son;
608 uint16 son_pos;
609 for (son_pos = 0; son_pos < static_cast<uint16>(node_le0->num_of_son);
610 son_pos++) {
611 assert(node_le0->son_1st_off <= lma_node_num_ge1_);
612 node_son = nodes_ge1_ + node_le0->son_1st_off
613 + son_pos;
614 if (node_son->spl_idx == splids[pos])
615 break;
616 }
617 if (son_pos < node_le0->num_of_son)
618 node = reinterpret_cast<void*>(node_son);
619 else
620 return false;
621 } else {
622 LmaNodeGE1 *node_ge1 = reinterpret_cast<LmaNodeGE1*>(node);
623 LmaNodeGE1 *node_son;
624 uint16 son_pos;
625 for (son_pos = 0; son_pos < static_cast<uint16>(node_ge1->num_of_son);
626 son_pos++) {
627 assert(node_ge1->son_1st_off_l > 0 || node_ge1->son_1st_off_h > 0);
628 node_son = nodes_ge1_ + get_son_offset(node_ge1) + son_pos;
629 if (node_son->spl_idx == splids[pos])
630 break;
631 }
632 if (son_pos < node_ge1->num_of_son)
633 node = reinterpret_cast<void*>(node_son);
634 else
635 return false;
636 }
637 }
638
639 if (1 == splid_num) {
640 LmaNodeLE0* node_le0 = reinterpret_cast<LmaNodeLE0*>(node);
641 size_t num_of_homo = (size_t)node_le0->num_of_homo;
642 for (size_t homo_pos = 0; homo_pos < num_of_homo; homo_pos++) {
643 LemmaIdType id_this = get_lemma_id(node_le0->homo_idx_buf_off + homo_pos);
644 char16 str[2];
645 get_lemma_str(id_this, str, 2);
646 if (id_this == id_lemma)
647 return true;
648 }
649 } else {
650 LmaNodeGE1* node_ge1 = reinterpret_cast<LmaNodeGE1*>(node);
651 size_t num_of_homo = (size_t)node_ge1->num_of_homo;
652 for (size_t homo_pos = 0; homo_pos < num_of_homo; homo_pos++) {
653 size_t node_homo_off = get_homo_idx_buf_offset(node_ge1);
654 if (get_lemma_id(node_homo_off + homo_pos) == id_lemma)
655 return true;
656 }
657 }
658
659 return false;
660 }
661
get_lpis(const uint16 * splid_str,uint16 splid_str_len,LmaPsbItem * lma_buf,size_t max_lma_buf)662 size_t DictTrie::get_lpis(const uint16* splid_str, uint16 splid_str_len,
663 LmaPsbItem* lma_buf, size_t max_lma_buf) {
664 if (splid_str_len > kMaxLemmaSize)
665 return 0;
666
667 #define MAX_EXTENDBUF_LEN 200
668
669 size_t* node_buf1[MAX_EXTENDBUF_LEN]; // use size_t for data alignment
670 size_t* node_buf2[MAX_EXTENDBUF_LEN];
671 LmaNodeLE0** node_fr_le0 =
672 reinterpret_cast<LmaNodeLE0**>(node_buf1); // Nodes from.
673 LmaNodeLE0** node_to_le0 =
674 reinterpret_cast<LmaNodeLE0**>(node_buf2); // Nodes to.
675 LmaNodeGE1** node_fr_ge1 = NULL;
676 LmaNodeGE1** node_to_ge1 = NULL;
677 size_t node_fr_num = 1;
678 size_t node_to_num = 0;
679 node_fr_le0[0] = root_;
680 if (NULL == node_fr_le0[0])
681 return 0;
682
683 size_t spl_pos = 0;
684
685 while (spl_pos < splid_str_len) {
686 uint16 id_num = 1;
687 uint16 id_start = splid_str[spl_pos];
688 // If it is a half id
689 if (spl_trie_->is_half_id(splid_str[spl_pos])) {
690 id_num = spl_trie_->half_to_full(splid_str[spl_pos], &id_start);
691 assert(id_num > 0);
692 }
693
694 // Extend the nodes
695 if (0 == spl_pos) { // From LmaNodeLE0 (root) to LmaNodeLE0 nodes
696 for (size_t node_fr_pos = 0; node_fr_pos < node_fr_num; node_fr_pos++) {
697 LmaNodeLE0 *node = node_fr_le0[node_fr_pos];
698 assert(node == root_ && 1 == node_fr_num);
699 size_t son_start = splid_le0_index_[id_start - kFullSplIdStart];
700 size_t son_end =
701 splid_le0_index_[id_start + id_num - kFullSplIdStart];
702 for (size_t son_pos = son_start; son_pos < son_end; son_pos++) {
703 assert(1 == node->son_1st_off);
704 LmaNodeLE0 *node_son = root_ + son_pos;
705 assert(node_son->spl_idx >= id_start
706 && node_son->spl_idx < id_start + id_num);
707 if (node_to_num < MAX_EXTENDBUF_LEN) {
708 node_to_le0[node_to_num] = node_son;
709 node_to_num++;
710 }
711 // id_start + id_num - 1 is the last one, which has just been
712 // recorded.
713 if (node_son->spl_idx >= id_start + id_num - 1)
714 break;
715 }
716 }
717
718 spl_pos++;
719 if (spl_pos >= splid_str_len || node_to_num == 0)
720 break;
721 // Prepare the nodes for next extending
722 // next time, from LmaNodeLE0 to LmaNodeGE1
723 LmaNodeLE0** node_tmp = node_fr_le0;
724 node_fr_le0 = node_to_le0;
725 node_to_le0 = NULL;
726 node_to_ge1 = reinterpret_cast<LmaNodeGE1**>(node_tmp);
727 } else if (1 == spl_pos) { // From LmaNodeLE0 to LmaNodeGE1 nodes
728 for (size_t node_fr_pos = 0; node_fr_pos < node_fr_num; node_fr_pos++) {
729 LmaNodeLE0 *node = node_fr_le0[node_fr_pos];
730 for (size_t son_pos = 0; son_pos < (size_t)node->num_of_son;
731 son_pos++) {
732 assert(node->son_1st_off <= lma_node_num_ge1_);
733 LmaNodeGE1 *node_son = nodes_ge1_ + node->son_1st_off
734 + son_pos;
735 if (node_son->spl_idx >= id_start
736 && node_son->spl_idx < id_start + id_num) {
737 if (node_to_num < MAX_EXTENDBUF_LEN) {
738 node_to_ge1[node_to_num] = node_son;
739 node_to_num++;
740 }
741 }
742 // id_start + id_num - 1 is the last one, which has just been
743 // recorded.
744 if (node_son->spl_idx >= id_start + id_num - 1)
745 break;
746 }
747 }
748
749 spl_pos++;
750 if (spl_pos >= splid_str_len || node_to_num == 0)
751 break;
752 // Prepare the nodes for next extending
753 // next time, from LmaNodeGE1 to LmaNodeGE1
754 node_fr_ge1 = node_to_ge1;
755 node_to_ge1 = reinterpret_cast<LmaNodeGE1**>(node_fr_le0);
756 node_fr_le0 = NULL;
757 node_to_le0 = NULL;
758 } else { // From LmaNodeGE1 to LmaNodeGE1 nodes
759 for (size_t node_fr_pos = 0; node_fr_pos < node_fr_num; node_fr_pos++) {
760 LmaNodeGE1 *node = node_fr_ge1[node_fr_pos];
761 for (size_t son_pos = 0; son_pos < (size_t)node->num_of_son;
762 son_pos++) {
763 assert(node->son_1st_off_l > 0 || node->son_1st_off_h > 0);
764 LmaNodeGE1 *node_son = nodes_ge1_
765 + get_son_offset(node) + son_pos;
766 if (node_son->spl_idx >= id_start
767 && node_son->spl_idx < id_start + id_num) {
768 if (node_to_num < MAX_EXTENDBUF_LEN) {
769 node_to_ge1[node_to_num] = node_son;
770 node_to_num++;
771 }
772 }
773 // id_start + id_num - 1 is the last one, which has just been
774 // recorded.
775 if (node_son->spl_idx >= id_start + id_num - 1)
776 break;
777 }
778 }
779
780 spl_pos++;
781 if (spl_pos >= splid_str_len || node_to_num == 0)
782 break;
783 // Prepare the nodes for next extending
784 // next time, from LmaNodeGE1 to LmaNodeGE1
785 LmaNodeGE1 **node_tmp = node_fr_ge1;
786 node_fr_ge1 = node_to_ge1;
787 node_to_ge1 = node_tmp;
788 }
789
790 // The number of node for next extending
791 node_fr_num = node_to_num;
792 node_to_num = 0;
793 } // while
794
795 if (0 == node_to_num)
796 return 0;
797
798 NGram &ngram = NGram::get_instance();
799 size_t lma_num = 0;
800
801 // If the length is 1, and the splid is a one-char Yunmu like 'a', 'o', 'e',
802 // only those candidates for the full matched one-char id will be returned.
803 if (1 == splid_str_len && spl_trie_->is_half_id_yunmu(splid_str[0]))
804 node_to_num = node_to_num > 0 ? 1 : 0;
805
806 for (size_t node_pos = 0; node_pos < node_to_num; node_pos++) {
807 size_t num_of_homo = 0;
808 if (spl_pos <= 1) { // Get from LmaNodeLE0 nodes
809 LmaNodeLE0* node_le0 = node_to_le0[node_pos];
810 num_of_homo = (size_t)node_le0->num_of_homo;
811 for (size_t homo_pos = 0; homo_pos < num_of_homo; homo_pos++) {
812 size_t ch_pos = lma_num + homo_pos;
813 lma_buf[ch_pos].id =
814 get_lemma_id(node_le0->homo_idx_buf_off + homo_pos);
815 lma_buf[ch_pos].lma_len = 1;
816 lma_buf[ch_pos].psb =
817 static_cast<LmaScoreType>(ngram.get_uni_psb(lma_buf[ch_pos].id));
818
819 if (lma_num + homo_pos >= max_lma_buf - 1)
820 break;
821 }
822 } else { // Get from LmaNodeGE1 nodes
823 LmaNodeGE1* node_ge1 = node_to_ge1[node_pos];
824 num_of_homo = (size_t)node_ge1->num_of_homo;
825 for (size_t homo_pos = 0; homo_pos < num_of_homo; homo_pos++) {
826 size_t ch_pos = lma_num + homo_pos;
827 size_t node_homo_off = get_homo_idx_buf_offset(node_ge1);
828 lma_buf[ch_pos].id = get_lemma_id(node_homo_off + homo_pos);
829 lma_buf[ch_pos].lma_len = splid_str_len;
830 lma_buf[ch_pos].psb =
831 static_cast<LmaScoreType>(ngram.get_uni_psb(lma_buf[ch_pos].id));
832
833 if (lma_num + homo_pos >= max_lma_buf - 1)
834 break;
835 }
836 }
837
838 lma_num += num_of_homo;
839 if (lma_num >= max_lma_buf) {
840 lma_num = max_lma_buf;
841 break;
842 }
843 }
844 return lma_num;
845 }
846
get_lemma_str(LemmaIdType id_lemma,char16 * str_buf,uint16 str_max)847 uint16 DictTrie::get_lemma_str(LemmaIdType id_lemma, char16 *str_buf,
848 uint16 str_max) {
849 return dict_list_->get_lemma_str(id_lemma, str_buf, str_max);
850 }
851
get_lemma_splids(LemmaIdType id_lemma,uint16 * splids,uint16 splids_max,bool arg_valid)852 uint16 DictTrie::get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
853 uint16 splids_max, bool arg_valid) {
854 char16 lma_str[kMaxLemmaSize + 1];
855 uint16 lma_len = get_lemma_str(id_lemma, lma_str, kMaxLemmaSize + 1);
856 assert((!arg_valid && splids_max >= lma_len) || lma_len == splids_max);
857
858 uint16 spl_mtrx[kMaxLemmaSize * 5];
859 uint16 spl_start[kMaxLemmaSize + 1];
860 spl_start[0] = 0;
861 uint16 try_num = 1;
862
863 for (uint16 pos = 0; pos < lma_len; pos++) {
864 uint16 cand_splids_this = 0;
865 if (arg_valid && spl_trie_->is_full_id(splids[pos])) {
866 spl_mtrx[spl_start[pos]] = splids[pos];
867 cand_splids_this = 1;
868 } else {
869 cand_splids_this = dict_list_->get_splids_for_hanzi(lma_str[pos],
870 arg_valid ? splids[pos] : 0, spl_mtrx + spl_start[pos],
871 kMaxLemmaSize * 5 - spl_start[pos]);
872 assert(cand_splids_this > 0);
873 }
874 spl_start[pos + 1] = spl_start[pos] + cand_splids_this;
875 try_num *= cand_splids_this;
876 }
877
878 for (uint16 try_pos = 0; try_pos < try_num; try_pos++) {
879 uint16 mod = 1;
880 for (uint16 pos = 0; pos < lma_len; pos++) {
881 uint16 radix = spl_start[pos + 1] - spl_start[pos];
882 splids[pos] = spl_mtrx[ spl_start[pos] + try_pos / mod % radix];
883 mod *= radix;
884 }
885
886 if (try_extend(splids, lma_len, id_lemma))
887 return lma_len;
888 }
889
890 return 0;
891 }
892
set_total_lemma_count_of_others(size_t count)893 void DictTrie::set_total_lemma_count_of_others(size_t count) {
894 NGram& ngram = NGram::get_instance();
895 ngram.set_total_freq_none_sys(count);
896 }
897
convert_to_hanzis(char16 * str,uint16 str_len)898 void DictTrie::convert_to_hanzis(char16 *str, uint16 str_len) {
899 return dict_list_->convert_to_hanzis(str, str_len);
900 }
901
convert_to_scis_ids(char16 * str,uint16 str_len)902 void DictTrie::convert_to_scis_ids(char16 *str, uint16 str_len) {
903 return dict_list_->convert_to_scis_ids(str, str_len);
904 }
905
get_lemma_id(const char16 lemma_str[],uint16 lemma_len)906 LemmaIdType DictTrie::get_lemma_id(const char16 lemma_str[], uint16 lemma_len) {
907 if (NULL == lemma_str || lemma_len > kMaxLemmaSize)
908 return 0;
909
910 return dict_list_->get_lemma_id(lemma_str, lemma_len);
911 }
912
predict_top_lmas(size_t his_len,NPredictItem * npre_items,size_t npre_max,size_t b4_used)913 size_t DictTrie::predict_top_lmas(size_t his_len, NPredictItem *npre_items,
914 size_t npre_max, size_t b4_used) {
915 NGram &ngram = NGram::get_instance();
916
917 size_t item_num = 0;
918 size_t top_lmas_id_offset = lma_idx_buf_len_ / kLemmaIdSize - top_lmas_num_;
919 size_t top_lmas_pos = 0;
920 while (item_num < npre_max && top_lmas_pos < top_lmas_num_) {
921 memset(npre_items + item_num, 0, sizeof(NPredictItem));
922 LemmaIdType top_lma_id = get_lemma_id(top_lmas_id_offset + top_lmas_pos);
923 top_lmas_pos += 1;
924 if (dict_list_->get_lemma_str(top_lma_id,
925 npre_items[item_num].pre_hzs,
926 kMaxLemmaSize - 1) == 0) {
927 continue;
928 }
929 npre_items[item_num].psb = ngram.get_uni_psb(top_lma_id);
930 npre_items[item_num].his_len = his_len;
931 item_num++;
932 }
933 return item_num;
934 }
935
predict(const char16 * last_hzs,uint16 hzs_len,NPredictItem * npre_items,size_t npre_max,size_t b4_used)936 size_t DictTrie::predict(const char16 *last_hzs, uint16 hzs_len,
937 NPredictItem *npre_items, size_t npre_max,
938 size_t b4_used) {
939 return dict_list_->predict(last_hzs, hzs_len, npre_items, npre_max, b4_used);
940 }
941 } // namespace ime_pinyin
942