1 /*
2 * Copyright (C) 2009 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "../include/userdict.h"
18 #include "../include/splparser.h"
19 #include "../include/ngram.h"
20 #include <stdio.h>
21 #include <string.h>
22 #include <stdlib.h>
23 #ifdef ___DEBUG_PERF___
24 #include <cutils/log.h>
25 #endif
26 #ifdef _WIN32
27 #include <io.h>
28 #else
29 #include <unistd.h>
30 #endif
31 #include <fcntl.h>
32 #include <sys/stat.h>
33 #include <assert.h>
34 #include <ctype.h>
35 #include <sys/types.h>
36 #ifndef _WIN32
37 #include <sys/time.h>
38 #endif
39 #include <time.h>
40 #ifdef _WIN32
41 #undef max
42 #undef min
43 #include <QDateTime>
44 #include <QMutex>
45 #else
46 #include <pthread.h>
47 #endif
48 #include <math.h>
49
50 namespace ime_pinyin {
51
52 #ifdef _WIN32
gettimeofday(struct timeval * tp,void *)53 static int gettimeofday(struct timeval *tp, void *) {
54 const qint64 current_msecs_since_epoch = QDateTime::currentMSecsSinceEpoch();
55 tp->tv_sec = (long)(current_msecs_since_epoch / 1000);
56 tp->tv_usec = (long)((current_msecs_since_epoch % 1000) * 1000);
57 return 0;
58 }
59 #endif
60
61 #ifdef ___DEBUG_PERF___
62 static uint64 _ellapse_ = 0;
63 static struct timeval _tv_start_, _tv_end_;
64 #define DEBUG_PERF_BEGIN \
65 do { \
66 gettimeofday(&_tv_start_, NULL); \
67 } while (0)
68 #define DEBUG_PERF_END \
69 do { \
70 gettimeofday(&_tv_end_, NULL); \
71 _ellapse_ = (_tv_end_.tv_sec - _tv_start_.tv_sec) * 1000000 + \
72 (_tv_end_.tv_usec - _tv_start_.tv_usec); \
73 } while (0)
74 #define LOGD_PERF(message) \
75 ALOGD("PERFORMANCE[%s] %llu usec.", message, _ellapse_);
76 #else
77 #define DEBUG_PERF_BEGIN
78 #define DEBUG_PERF_END
79 #define LOGD_PERF(message)
80 #endif
81
82 // XXX File load and write are thread-safe by g_mutex_
83 #ifdef _WIN32
84 static QMutex g_mutex_;
85 #define pthread_mutex_lock(MUTEX) ((MUTEX)->lock())
86 #define pthread_mutex_unlock(MUTEX) ((MUTEX)->unlock())
87 #define pthread_mutex_trylock(MUTEX) (!(MUTEX)->tryLock(0))
88 #else
89 static pthread_mutex_t g_mutex_ = PTHREAD_MUTEX_INITIALIZER;
90 #endif
91 static struct timeval g_last_update_ = {0, 0};
92
get_dict_file_size(UserDictInfo * info)93 inline uint32 UserDict::get_dict_file_size(UserDictInfo * info) {
94 return (4 + info->lemma_size + (info->lemma_count << 3)
95 #ifdef ___PREDICT_ENABLED___
96 + (info->lemma_count << 2)
97 #endif
98 #ifdef ___SYNC_ENABLED___
99 + (info->sync_count << 2)
100 #endif
101 + sizeof(*info));
102 }
103
translate_score(int raw_score)104 inline LmaScoreType UserDict::translate_score(int raw_score) {
105 // 1) ori_freq: original user frequency
106 uint32 ori_freq = extract_score_freq(raw_score);
107 // 2) lmt_off: lmt index (week offset for example)
108 uint64 lmt_off = ((raw_score & 0xffff0000) >> 16);
109 if (kUserDictLMTBitWidth < 16) {
110 uint64 mask = ~(1 << kUserDictLMTBitWidth);
111 lmt_off &= mask;
112 }
113 // 3) now_off: current time index (current week offset for example)
114 // assuming load_time_ is around current time
115 uint64 now_off = load_time_.tv_sec;
116 now_off = (now_off - kUserDictLMTSince) / kUserDictLMTGranularity;
117 now_off = (now_off << (64 - kUserDictLMTBitWidth));
118 now_off = (now_off >> (64 - kUserDictLMTBitWidth));
119 // 4) factor: decide expand-factor
120 int delta = now_off - lmt_off;
121 if (delta > 4)
122 delta = 4;
123 int factor = 80 - (delta << 4);
124
125 double tf = (double)(dict_info_.total_nfreq + total_other_nfreq_);
126 return (LmaScoreType)(log((double)factor * (double)ori_freq / tf)
127 * NGram::kLogValueAmplifier);
128 }
129
extract_score_freq(int raw_score)130 inline int UserDict::extract_score_freq(int raw_score) {
131 // Frequence stored in lowest 16 bits
132 int freq = (raw_score & 0x0000ffff);
133 return freq;
134 }
135
extract_score_lmt(int raw_score)136 inline uint64 UserDict::extract_score_lmt(int raw_score) {
137 uint64 lmt = ((raw_score & 0xffff0000) >> 16);
138 if (kUserDictLMTBitWidth < 16) {
139 uint64 mask = ~(1 << kUserDictLMTBitWidth);
140 lmt &= mask;
141 }
142 lmt = lmt * kUserDictLMTGranularity + kUserDictLMTSince;
143 return lmt;
144 }
145
build_score(uint64 lmt,int freq)146 inline int UserDict::build_score(uint64 lmt, int freq) {
147 lmt = (lmt - kUserDictLMTSince) / kUserDictLMTGranularity;
148 lmt = (lmt << (64 - kUserDictLMTBitWidth));
149 lmt = (lmt >> (64 - kUserDictLMTBitWidth));
150 uint16 lmt16 = (uint16)lmt;
151 int s = freq;
152 s &= 0x0000ffff;
153 s = (lmt16 << 16) | s;
154 return s;
155 }
156
utf16le_atoll(uint16 * s,int len)157 inline int64 UserDict::utf16le_atoll(uint16 *s, int len) {
158 int64 ret = 0;
159 if (len <= 0)
160 return ret;
161
162 int flag = 1;
163 const uint16 * endp = s + len;
164 if (*s == '-') {
165 flag = -1;
166 s++;
167 } else if (*s == '+') {
168 s++;
169 }
170
171 while (*s >= '0' && *s <= '9' && s < endp) {
172 ret += ret * 10 + (*s) - '0';
173 s++;
174 }
175 return ret * flag;
176 }
177
utf16le_lltoa(int64 v,uint16 * s,int size)178 inline int UserDict::utf16le_lltoa(int64 v, uint16 *s, int size) {
179 if (!s || size <= 0)
180 return 0;
181 uint16 *endp = s + size;
182 int ret_len = 0;
183 if (v < 0) {
184 *(s++) = '-';
185 ++ret_len;
186 v *= -1;
187 }
188
189 uint16 *b = s;
190 while (s < endp && v != 0) {
191 *(s++) = '0' + (v % 10);
192 v = v / 10;
193 ++ret_len;
194 }
195
196 if (v != 0)
197 return 0;
198
199 --s;
200
201 while (b < s) {
202 *b = *s;
203 ++b, --s;
204 }
205
206 return ret_len;
207 }
208
set_lemma_flag(uint32 offset,uint8 flag)209 inline void UserDict::set_lemma_flag(uint32 offset, uint8 flag) {
210 offset &= kUserDictOffsetMask;
211 lemmas_[offset] |= flag;
212 }
213
get_lemma_flag(uint32 offset)214 inline char UserDict::get_lemma_flag(uint32 offset) {
215 offset &= kUserDictOffsetMask;
216 return (char)(lemmas_[offset]);
217 }
218
get_lemma_nchar(uint32 offset)219 inline char UserDict::get_lemma_nchar(uint32 offset) {
220 offset &= kUserDictOffsetMask;
221 return (char)(lemmas_[offset + 1]);
222 }
223
get_lemma_spell_ids(uint32 offset)224 inline uint16 * UserDict::get_lemma_spell_ids(uint32 offset) {
225 offset &= kUserDictOffsetMask;
226 return (uint16 *)(lemmas_ + offset + 2);
227 }
228
get_lemma_word(uint32 offset)229 inline uint16 * UserDict::get_lemma_word(uint32 offset) {
230 offset &= kUserDictOffsetMask;
231 uint8 nchar = get_lemma_nchar(offset);
232 return (uint16 *)(lemmas_ + offset + 2 + (nchar << 1));
233 }
234
get_max_lemma_id()235 inline LemmaIdType UserDict::get_max_lemma_id() {
236 // When a lemma is deleted, we don't not claim its id back for
237 // simplicity and performance
238 return start_id_ + dict_info_.lemma_count - 1;
239 }
240
is_valid_lemma_id(LemmaIdType id)241 inline bool UserDict::is_valid_lemma_id(LemmaIdType id) {
242 if (id >= start_id_ && id <= get_max_lemma_id())
243 return true;
244 return false;
245 }
246
is_valid_state()247 inline bool UserDict::is_valid_state() {
248 if (state_ == USER_DICT_NONE)
249 return false;
250 return true;
251 }
252
UserDict()253 UserDict::UserDict()
254 : start_id_(0),
255 version_(0),
256 lemmas_(NULL),
257 offsets_(NULL),
258 scores_(NULL),
259 ids_(NULL),
260 #ifdef ___PREDICT_ENABLED___
261 predicts_(NULL),
262 #endif
263 #ifdef ___SYNC_ENABLED___
264 syncs_(NULL),
265 sync_count_size_(0),
266 #endif
267 offsets_by_id_(NULL),
268 lemma_count_left_(0),
269 lemma_size_left_(0),
270 dict_file_(NULL),
271 state_(USER_DICT_NONE) {
272 memset(&dict_info_, 0, sizeof(dict_info_));
273 memset(&load_time_, 0, sizeof(load_time_));
274 #ifdef ___CACHE_ENABLED___
275 cache_init();
276 #endif
277 }
278
~UserDict()279 UserDict::~UserDict() {
280 close_dict();
281 }
282
load_dict(const char * file_name,LemmaIdType start_id,LemmaIdType end_id)283 bool UserDict::load_dict(const char *file_name, LemmaIdType start_id,
284 LemmaIdType end_id) {
285 #ifdef ___DEBUG_PERF___
286 DEBUG_PERF_BEGIN;
287 #endif
288 dict_file_ = strdup(file_name);
289 if (!dict_file_)
290 return false;
291
292 start_id_ = start_id;
293
294 if (false == validate(file_name) && false == reset(file_name)) {
295 goto error;
296 }
297 if (false == load(file_name, start_id)) {
298 goto error;
299 }
300
301 state_ = USER_DICT_SYNC;
302
303 gettimeofday(&load_time_, NULL);
304
305 #ifdef ___DEBUG_PERF___
306 DEBUG_PERF_END;
307 LOGD_PERF("load_dict");
308 #endif
309 return true;
310 error:
311 free((void*)dict_file_);
312 dict_file_ = NULL;
313 start_id_ = 0;
314 return false;
315 }
316
close_dict()317 bool UserDict::close_dict() {
318 if (state_ == USER_DICT_NONE)
319 return true;
320 if (state_ == USER_DICT_SYNC)
321 goto out;
322
323 // If dictionary is written back by others,
324 // we can not simply write back here
325 // To do a safe flush, we have to discard all newly added
326 // lemmas and try to reload dict file.
327 pthread_mutex_lock(&g_mutex_);
328 if (load_time_.tv_sec > g_last_update_.tv_sec ||
329 (load_time_.tv_sec == g_last_update_.tv_sec &&
330 load_time_.tv_usec > g_last_update_.tv_usec)) {
331 write_back();
332 gettimeofday(&g_last_update_, NULL);
333 }
334 pthread_mutex_unlock(&g_mutex_);
335
336 out:
337 free((void*)dict_file_);
338 free(lemmas_);
339 free(offsets_);
340 free(offsets_by_id_);
341 free(scores_);
342 free(ids_);
343 #ifdef ___PREDICT_ENABLED___
344 free(predicts_);
345 #endif
346
347 version_ = 0;
348 dict_file_ = NULL;
349 lemmas_ = NULL;
350 #ifdef ___SYNC_ENABLED___
351 syncs_ = NULL;
352 sync_count_size_ = 0;
353 #endif
354 offsets_ = NULL;
355 offsets_by_id_ = NULL;
356 scores_ = NULL;
357 ids_ = NULL;
358 #ifdef ___PREDICT_ENABLED___
359 predicts_ = NULL;
360 #endif
361
362 memset(&dict_info_, 0, sizeof(dict_info_));
363 lemma_count_left_ = 0;
364 lemma_size_left_ = 0;
365 state_ = USER_DICT_NONE;
366
367 return true;
368 }
369
number_of_lemmas()370 size_t UserDict::number_of_lemmas() {
371 return dict_info_.lemma_count;
372 }
373
reset_milestones(uint16 from_step,MileStoneHandle from_handle)374 void UserDict::reset_milestones(uint16 from_step, MileStoneHandle from_handle) {
375 return;
376 }
377
extend_dict(MileStoneHandle from_handle,const DictExtPara * dep,LmaPsbItem * lpi_items,size_t lpi_max,size_t * lpi_num)378 MileStoneHandle UserDict::extend_dict(MileStoneHandle from_handle,
379 const DictExtPara *dep,
380 LmaPsbItem *lpi_items,
381 size_t lpi_max, size_t *lpi_num) {
382 if (is_valid_state() == false)
383 return 0;
384
385 bool need_extend = false;
386
387 #ifdef ___DEBUG_PERF___
388 DEBUG_PERF_BEGIN;
389 #endif
390 *lpi_num = _get_lpis(dep->splids, dep->splids_extended + 1,
391 lpi_items, lpi_max, &need_extend);
392 #ifdef ___DEBUG_PERF___
393 DEBUG_PERF_END;
394 LOGD_PERF("extend_dict");
395 #endif
396 return ((*lpi_num > 0 || need_extend) ? 1 : 0);
397 }
398
is_fuzzy_prefix_spell_id(const uint16 * id1,uint16 len1,const UserDictSearchable * searchable)399 int UserDict::is_fuzzy_prefix_spell_id(
400 const uint16 * id1, uint16 len1, const UserDictSearchable *searchable) {
401 if (len1 < searchable->splids_len)
402 return 0;
403
404 SpellingTrie &spl_trie = SpellingTrie::get_instance();
405 uint32 i = 0;
406 for (i = 0; i < searchable->splids_len; i++) {
407 const char py1 = *spl_trie.get_spelling_str(id1[i]);
408 uint16 off = 8 * (i % 4);
409 const char py2 = ((searchable->signature[i/4] & (0xff << off)) >> off);
410 if (py1 == py2)
411 continue;
412 return 0;
413 }
414 return 1;
415 }
416
fuzzy_compare_spell_id(const uint16 * id1,uint16 len1,const UserDictSearchable * searchable)417 int UserDict::fuzzy_compare_spell_id(
418 const uint16 * id1, uint16 len1, const UserDictSearchable *searchable) {
419 if (len1 < searchable->splids_len)
420 return -1;
421 if (len1 > searchable->splids_len)
422 return 1;
423
424 SpellingTrie &spl_trie = SpellingTrie::get_instance();
425 uint32 i = 0;
426 for (i = 0; i < len1; i++) {
427 const char py1 = *spl_trie.get_spelling_str(id1[i]);
428 uint16 off = 8 * (i % 4);
429 const char py2 = ((searchable->signature[i/4] & (0xff << off)) >> off);
430 if (py1 == py2)
431 continue;
432 if (py1 > py2)
433 return 1;
434 return -1;
435 }
436 return 0;
437 }
438
is_prefix_spell_id(const uint16 * fullids,uint16 fulllen,const UserDictSearchable * searchable)439 bool UserDict::is_prefix_spell_id(
440 const uint16 * fullids, uint16 fulllen,
441 const UserDictSearchable *searchable) {
442 if (fulllen < searchable->splids_len)
443 return false;
444
445 uint32 i = 0;
446 for (; i < searchable->splids_len; i++) {
447 uint16 start_id = searchable->splid_start[i];
448 uint16 count = searchable->splid_count[i];
449 if (fullids[i] >= start_id && fullids[i] < start_id + count)
450 continue;
451 else
452 return false;
453 }
454 return true;
455 }
456
equal_spell_id(const uint16 * fullids,uint16 fulllen,const UserDictSearchable * searchable)457 bool UserDict::equal_spell_id(
458 const uint16 * fullids, uint16 fulllen,
459 const UserDictSearchable *searchable) {
460 if (fulllen != searchable->splids_len)
461 return false;
462
463 uint32 i = 0;
464 for (; i < fulllen; i++) {
465 uint16 start_id = searchable->splid_start[i];
466 uint16 count = searchable->splid_count[i];
467 if (fullids[i] >= start_id && fullids[i] < start_id + count)
468 continue;
469 else
470 return false;
471 }
472 return true;
473 }
474
locate_first_in_offsets(const UserDictSearchable * searchable)475 int32 UserDict::locate_first_in_offsets(const UserDictSearchable * searchable) {
476 int32 begin = 0;
477 int32 end = dict_info_.lemma_count - 1;
478 int32 middle = -1;
479
480 int32 first_prefix = middle;
481 int32 last_matched = middle;
482
483 while (begin <= end) {
484 middle = (begin + end) >> 1;
485 uint32 offset = offsets_[middle];
486 uint8 nchar = get_lemma_nchar(offset);
487 const uint16 * splids = get_lemma_spell_ids(offset);
488 int cmp = fuzzy_compare_spell_id(splids, nchar, searchable);
489 int pre = is_fuzzy_prefix_spell_id(splids, nchar, searchable);
490
491 if (pre)
492 first_prefix = middle;
493
494 if (cmp < 0) {
495 begin = middle + 1;
496 } else if (cmp > 0) {
497 end = middle - 1;
498 } else {
499 end = middle - 1;
500 last_matched = middle;
501 }
502 }
503
504 return first_prefix;
505 }
506
prepare_locate(UserDictSearchable * searchable,const uint16 * splid_str,uint16 splid_str_len)507 void UserDict::prepare_locate(UserDictSearchable *searchable,
508 const uint16 *splid_str,
509 uint16 splid_str_len) {
510 searchable->splids_len = splid_str_len;
511 memset(searchable->signature, 0, sizeof(searchable->signature));
512
513 SpellingTrie &spl_trie = SpellingTrie::get_instance();
514 uint32 i = 0;
515 for (; i < splid_str_len; i++) {
516 if (spl_trie.is_half_id(splid_str[i])) {
517 searchable->splid_count[i] =
518 spl_trie.half_to_full(splid_str[i],
519 &(searchable->splid_start[i]));
520 } else {
521 searchable->splid_count[i] = 1;
522 searchable->splid_start[i] = splid_str[i];
523 }
524 const unsigned char py = *spl_trie.get_spelling_str(splid_str[i]);
525 searchable->signature[i>>2] |= (py << (8 * (i % 4)));
526 }
527 }
528
get_lpis(const uint16 * splid_str,uint16 splid_str_len,LmaPsbItem * lpi_items,size_t lpi_max)529 size_t UserDict::get_lpis(const uint16 *splid_str, uint16 splid_str_len,
530 LmaPsbItem *lpi_items, size_t lpi_max) {
531 return _get_lpis(splid_str, splid_str_len, lpi_items, lpi_max, NULL);
532 }
533
_get_lpis(const uint16 * splid_str,uint16 splid_str_len,LmaPsbItem * lpi_items,size_t lpi_max,bool * need_extend)534 size_t UserDict::_get_lpis(const uint16 *splid_str,
535 uint16 splid_str_len, LmaPsbItem *lpi_items,
536 size_t lpi_max, bool * need_extend) {
537 bool tmp_extend;
538 if (!need_extend)
539 need_extend = &tmp_extend;
540
541 *need_extend = false;
542
543 if (is_valid_state() == false)
544 return 0;
545 if (lpi_max <= 0)
546 return 0;
547
548 if (0 == pthread_mutex_trylock(&g_mutex_)) {
549 if (load_time_.tv_sec < g_last_update_.tv_sec ||
550 (load_time_.tv_sec == g_last_update_.tv_sec &&
551 load_time_.tv_usec < g_last_update_.tv_usec)) {
552 // Others updated disk file, have to reload
553 pthread_mutex_unlock(&g_mutex_);
554 flush_cache();
555 } else {
556 pthread_mutex_unlock(&g_mutex_);
557 }
558 } else {
559 }
560
561 UserDictSearchable searchable;
562 prepare_locate(&searchable, splid_str, splid_str_len);
563
564 uint32 max_off = dict_info_.lemma_count;
565 #ifdef ___CACHE_ENABLED___
566 int32 middle;
567 uint32 start, count;
568 bool cached = cache_hit(&searchable, &start, &count);
569 if (cached) {
570 middle = start;
571 max_off = start + count;
572 } else {
573 middle = locate_first_in_offsets(&searchable);
574 start = middle;
575 }
576 #else
577 int32 middle = locate_first_in_offsets(&searchable);
578 #endif
579
580 if (middle == -1) {
581 #ifdef ___CACHE_ENABLED___
582 if (!cached)
583 cache_push(USER_DICT_MISS_CACHE, &searchable, 0, 0);
584 #endif
585 return 0;
586 }
587
588 size_t lpi_current = 0;
589
590 bool fuzzy_break = false;
591 bool prefix_break = false;
592 while ((size_t)middle < max_off && !fuzzy_break && !prefix_break) {
593 if (lpi_current >= lpi_max)
594 break;
595 uint32 offset = offsets_[middle];
596 // Ignore deleted lemmas
597 if (offset & kUserDictOffsetFlagRemove) {
598 middle++;
599 continue;
600 }
601 uint8 nchar = get_lemma_nchar(offset);
602 uint16 * splids = get_lemma_spell_ids(offset);
603 #ifdef ___CACHE_ENABLED___
604 if (!cached && 0 != fuzzy_compare_spell_id(splids, nchar, &searchable)) {
605 #else
606 if (0 != fuzzy_compare_spell_id(splids, nchar, &searchable)) {
607 #endif
608 fuzzy_break = true;
609 }
610
611 if (prefix_break == false) {
612 if (is_fuzzy_prefix_spell_id(splids, nchar, &searchable)) {
613 if (*need_extend == false &&
614 is_prefix_spell_id(splids, nchar, &searchable)) {
615 *need_extend = true;
616 }
617 } else {
618 prefix_break = true;
619 }
620 }
621
622 if (equal_spell_id(splids, nchar, &searchable) == true) {
623 lpi_items[lpi_current].psb = translate_score(scores_[middle]);
624 lpi_items[lpi_current].id = ids_[middle];
625 lpi_items[lpi_current].lma_len = nchar;
626 lpi_current++;
627 }
628 middle++;
629 }
630
631 #ifdef ___CACHE_ENABLED___
632 if (!cached) {
633 count = middle - start;
634 cache_push(USER_DICT_CACHE, &searchable, start, count);
635 }
636 #endif
637
638 return lpi_current;
639 }
640
641 uint16 UserDict::get_lemma_str(LemmaIdType id_lemma, char16* str_buf,
642 uint16 str_max) {
643 if (is_valid_state() == false)
644 return 0;
645 if (is_valid_lemma_id(id_lemma) == false)
646 return 0;
647 uint32 offset = offsets_by_id_[id_lemma - start_id_];
648 uint8 nchar = get_lemma_nchar(offset);
649 char16 * str = get_lemma_word(offset);
650 uint16 m = nchar < str_max -1 ? nchar : str_max - 1;
651 int i = 0;
652 for (; i < m; i++) {
653 str_buf[i] = str[i];
654 }
655 str_buf[i] = 0;
656 return m;
657 }
658
659 uint16 UserDict::get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
660 uint16 splids_max, bool arg_valid) {
661 if (is_valid_lemma_id(id_lemma) == false)
662 return 0;
663 uint32 offset = offsets_by_id_[id_lemma - start_id_];
664 uint8 nchar = get_lemma_nchar(offset);
665 const uint16 * ids = get_lemma_spell_ids(offset);
666 int i = 0;
667 for (; i < nchar && i < splids_max; i++)
668 splids[i] = ids[i];
669 return i;
670 }
671
672 size_t UserDict::predict(const char16 last_hzs[], uint16 hzs_len,
673 NPredictItem *npre_items, size_t npre_max,
674 size_t b4_used) {
675 uint32 new_added = 0;
676 #ifdef ___PREDICT_ENABLED___
677 int32 end = dict_info_.lemma_count - 1;
678 int j = locate_first_in_predicts((const uint16*)last_hzs, hzs_len);
679 if (j == -1)
680 return 0;
681
682 while (j <= end) {
683 uint32 offset = predicts_[j];
684 // Ignore deleted lemmas
685 if (offset & kUserDictOffsetFlagRemove) {
686 j++;
687 continue;
688 }
689 uint32 nchar = get_lemma_nchar(offset);
690 uint16 * words = get_lemma_word(offset);
691 uint16 * splids = get_lemma_spell_ids(offset);
692
693 if (nchar <= hzs_len) {
694 j++;
695 continue;
696 }
697
698 if (memcmp(words, last_hzs, hzs_len << 1) == 0) {
699 if (new_added >= npre_max) {
700 return new_added;
701 }
702 uint32 cpy_len =
703 (nchar < kMaxPredictSize ? (nchar << 1) : (kMaxPredictSize << 1))
704 - (hzs_len << 1);
705 npre_items[new_added].his_len = hzs_len;
706 npre_items[new_added].psb = get_lemma_score(words, splids, nchar);
707 memcpy(npre_items[new_added].pre_hzs, words + hzs_len, cpy_len);
708 if ((cpy_len >> 1) < kMaxPredictSize) {
709 npre_items[new_added].pre_hzs[cpy_len >> 1] = 0;
710 }
711 new_added++;
712 } else {
713 break;
714 }
715
716 j++;
717 }
718 #endif
719 return new_added;
720 }
721
722 int32 UserDict::locate_in_offsets(char16 lemma_str[], uint16 splid_str[],
723 uint16 lemma_len) {
724 int32 max_off = dict_info_.lemma_count;
725
726 UserDictSearchable searchable;
727 prepare_locate(&searchable, splid_str, lemma_len);
728 #ifdef ___CACHE_ENABLED___
729 int32 off;
730 uint32 start, count;
731 bool cached = load_cache(&searchable, &start, &count);
732 if (cached) {
733 off = start;
734 max_off = start + count;
735 } else {
736 off = locate_first_in_offsets(&searchable);
737 start = off;
738 }
739 #else
740 int32 off = locate_first_in_offsets(&searchable);
741 #endif
742
743 if (off == -1) {
744 return off;
745 }
746
747 while (off < max_off) {
748 uint32 offset = offsets_[off];
749 if (offset & kUserDictOffsetFlagRemove) {
750 off++;
751 continue;
752 }
753 uint16 * splids = get_lemma_spell_ids(offset);
754 #ifdef ___CACHE_ENABLED___
755 if (!cached && 0 != fuzzy_compare_spell_id(splids, lemma_len, &searchable))
756 break;
757 #else
758 if (0 != fuzzy_compare_spell_id(splids, lemma_len, &searchable))
759 break;
760 #endif
761 if (equal_spell_id(splids, lemma_len, &searchable) == true) {
762 uint16 * str = get_lemma_word(offset);
763 uint32 i = 0;
764 for (i = 0; i < lemma_len; i++) {
765 if (str[i] == lemma_str[i])
766 continue;
767 break;
768 }
769 if (i < lemma_len) {
770 off++;
771 continue;
772 }
773 #ifdef ___CACHE_ENABLED___
774 // No need to save_cache here, since current function is invoked by
775 // put_lemma. It's rarely possible for a user input same lemma twice.
776 // That means first time user type a new lemma, it is newly added into
777 // user dictionary, then it's possible that user type the same lemma
778 // again.
779 // Another reason save_cache can not be invoked here is this function
780 // aborts when lemma is found, and it never knows the count.
781 #endif
782 return off;
783 }
784 off++;
785 }
786
787 return -1;
788 }
789
790 #ifdef ___PREDICT_ENABLED___
791 uint32 UserDict::locate_where_to_insert_in_predicts(
792 const uint16 * words, int lemma_len) {
793 int32 begin = 0;
794 int32 end = dict_info_.lemma_count - 1;
795 int32 middle = end;
796
797 uint32 last_matched = middle;
798
799 while (begin <= end) {
800 middle = (begin + end) >> 1;
801 uint32 offset = offsets_[middle];
802 uint8 nchar = get_lemma_nchar(offset);
803 const uint16 * ws = get_lemma_word(offset);
804
805 uint32 minl = nchar < lemma_len ? nchar : lemma_len;
806 uint32 k = 0;
807 int cmp = 0;
808
809 for (; k < minl; k++) {
810 if (ws[k] < words[k]) {
811 cmp = -1;
812 break;
813 } else if (ws[k] > words[k]) {
814 cmp = 1;
815 break;
816 }
817 }
818 if (cmp == 0) {
819 if (nchar < lemma_len)
820 cmp = -1;
821 else if (nchar > lemma_len)
822 cmp = 1;
823 }
824
825 if (cmp < 0) {
826 begin = middle + 1;
827 last_matched = middle;
828 } else if (cmp > 0) {
829 end = middle - 1;
830 } else {
831 end = middle - 1;
832 last_matched = middle;
833 }
834 }
835
836 return last_matched;
837 }
838
839 int32 UserDict::locate_first_in_predicts(const uint16 * words, int lemma_len) {
840 int32 begin = 0;
841 int32 end = dict_info_.lemma_count - 1;
842 int32 middle = -1;
843
844 int32 last_matched = middle;
845
846 while (begin <= end) {
847 middle = (begin + end) >> 1;
848 uint32 offset = offsets_[middle];
849 uint8 nchar = get_lemma_nchar(offset);
850 const uint16 * ws = get_lemma_word(offset);
851
852 uint32 minl = nchar < lemma_len ? nchar : lemma_len;
853 uint32 k = 0;
854 int cmp = 0;
855
856 for (; k < minl; k++) {
857 if (ws[k] < words[k]) {
858 cmp = -1;
859 break;
860 } else if (ws[k] > words[k]) {
861 cmp = 1;
862 break;
863 }
864 }
865 if (cmp == 0) {
866 if (nchar >= lemma_len)
867 last_matched = middle;
868 if (nchar < lemma_len)
869 cmp = -1;
870 else if (nchar > lemma_len)
871 cmp = 1;
872 }
873
874 if (cmp < 0) {
875 begin = middle + 1;
876 } else if (cmp > 0) {
877 end = middle - 1;
878 } else {
879 end = middle - 1;
880 }
881 }
882
883 return last_matched;
884 }
885
886 #endif
887
888 LemmaIdType UserDict::get_lemma_id(char16 lemma_str[], uint16 splids[],
889 uint16 lemma_len) {
890 int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
891 if (off == -1) {
892 return 0;
893 }
894
895 return ids_[off];
896 }
897
898 LmaScoreType UserDict::get_lemma_score(LemmaIdType lemma_id) {
899 if (is_valid_state() == false)
900 return 0;
901 if (is_valid_lemma_id(lemma_id) == false)
902 return 0;
903
904 return translate_score(_get_lemma_score(lemma_id));
905 }
906
907 LmaScoreType UserDict::get_lemma_score(char16 lemma_str[], uint16 splids[],
908 uint16 lemma_len) {
909 if (is_valid_state() == false)
910 return 0;
911 return translate_score(_get_lemma_score(lemma_str, splids, lemma_len));
912 }
913
914 int UserDict::_get_lemma_score(LemmaIdType lemma_id) {
915 if (is_valid_state() == false)
916 return 0;
917 if (is_valid_lemma_id(lemma_id) == false)
918 return 0;
919
920 uint32 offset = offsets_by_id_[lemma_id - start_id_];
921
922 uint32 nchar = get_lemma_nchar(offset);
923 uint16 * spl = get_lemma_spell_ids(offset);
924 uint16 * wrd = get_lemma_word(offset);
925
926 int32 off = locate_in_offsets(wrd, spl, nchar);
927 if (off == -1) {
928 return 0;
929 }
930
931 return scores_[off];
932 }
933
934 int UserDict::_get_lemma_score(char16 lemma_str[], uint16 splids[],
935 uint16 lemma_len) {
936 if (is_valid_state() == false)
937 return 0;
938
939 int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
940 if (off == -1) {
941 return 0;
942 }
943
944 return scores_[off];
945 }
946
947 #ifdef ___SYNC_ENABLED___
948 void UserDict::remove_lemma_from_sync_list(uint32 offset) {
949 offset &= kUserDictOffsetMask;
950 uint32 i = 0;
951 for (; i < dict_info_.sync_count; i++) {
952 unsigned int off = (syncs_[i] & kUserDictOffsetMask);
953 if (off == offset)
954 break;
955 }
956 if (i < dict_info_.sync_count) {
957 syncs_[i] = syncs_[dict_info_.sync_count - 1];
958 dict_info_.sync_count--;
959 }
960 }
961 #endif
962
963 #ifdef ___PREDICT_ENABLED___
964 void UserDict::remove_lemma_from_predict_list(uint32 offset) {
965 offset &= kUserDictOffsetMask;
966 uint32 i = 0;
967 for (; i < dict_info_.lemma_count; i++) {
968 unsigned int off = (predicts_[i] & kUserDictOffsetMask);
969 if (off == offset) {
970 predicts_[i] |= kUserDictOffsetFlagRemove;
971 break;
972 }
973 }
974 }
975 #endif
976
977 bool UserDict::remove_lemma_by_offset_index(int offset_index) {
978 if (is_valid_state() == false)
979 return 0;
980
981 int32 off = offset_index;
982 if (off == -1) {
983 return false;
984 }
985
986 uint32 offset = offsets_[off];
987 uint32 nchar = get_lemma_nchar(offset);
988
989 offsets_[off] |= kUserDictOffsetFlagRemove;
990
991 #ifdef ___SYNC_ENABLED___
992 // Remove corresponding sync item
993 remove_lemma_from_sync_list(offset);
994 #endif
995
996 #ifdef ___PREDICT_ENABLED___
997 remove_lemma_from_predict_list(offset);
998 #endif
999 dict_info_.free_count++;
1000 dict_info_.free_size += (2 + (nchar << 2));
1001
1002 if (state_ < USER_DICT_OFFSET_DIRTY)
1003 state_ = USER_DICT_OFFSET_DIRTY;
1004 return true;
1005 }
1006
1007 bool UserDict::remove_lemma(LemmaIdType lemma_id) {
1008 if (is_valid_state() == false)
1009 return 0;
1010 if (is_valid_lemma_id(lemma_id) == false)
1011 return false;
1012 uint32 offset = offsets_by_id_[lemma_id - start_id_];
1013
1014 uint32 nchar = get_lemma_nchar(offset);
1015 uint16 * spl = get_lemma_spell_ids(offset);
1016 uint16 * wrd = get_lemma_word(offset);
1017
1018 int32 off = locate_in_offsets(wrd, spl, nchar);
1019
1020 return remove_lemma_by_offset_index(off);
1021 }
1022
1023 void UserDict::flush_cache() {
1024 LemmaIdType start_id = start_id_;
1025 if (!dict_file_)
1026 return;
1027 const char * file = strdup(dict_file_);
1028 if (!file)
1029 return;
1030 close_dict();
1031 load_dict(file, start_id, kUserDictIdEnd);
1032 free((void*)file);
1033 #ifdef ___CACHE_ENABLED___
1034 cache_init();
1035 #endif
1036 return;
1037 }
1038
1039 bool UserDict::reset(const char *file) {
1040 FILE *fp = fopen(file, "w+");
1041 if (!fp) {
1042 return false;
1043 }
1044 uint32 version = kUserDictVersion;
1045 size_t wred = fwrite(&version, 1, 4, fp);
1046 UserDictInfo info;
1047 memset(&info, 0, sizeof(info));
1048 // By default, no limitation for lemma count and size
1049 // thereby, reclaim_ratio is never used
1050 wred += fwrite(&info, 1, sizeof(info), fp);
1051 if (wred != sizeof(info) + sizeof(version)) {
1052 fclose(fp);
1053 unlink(file);
1054 return false;
1055 }
1056 fclose(fp);
1057 return true;
1058 }
1059
1060 bool UserDict::validate(const char *file) {
1061 // b is ignored in POSIX compatible os including Linux
1062 // while b is important flag for Windows to specify binary mode
1063 FILE *fp = fopen(file, "rb");
1064 if (!fp) {
1065 return false;
1066 }
1067
1068 size_t size;
1069 size_t readed;
1070 uint32 version;
1071 UserDictInfo dict_info;
1072
1073 // validate
1074 int err = fseek(fp, 0, SEEK_END);
1075 if (err) {
1076 goto error;
1077 }
1078
1079 size = ftell(fp);
1080 if (size < 4 + sizeof(dict_info)) {
1081 goto error;
1082 }
1083
1084 err = fseek(fp, 0, SEEK_SET);
1085 if (err) {
1086 goto error;
1087 }
1088
1089 readed = fread(&version, 1, sizeof(version), fp);
1090 if (readed < sizeof(version)) {
1091 goto error;
1092 }
1093 if (version != kUserDictVersion) {
1094 goto error;
1095 }
1096
1097 err = fseek(fp, -1 * sizeof(dict_info), SEEK_END);
1098 if (err) {
1099 goto error;
1100 }
1101
1102 readed = fread(&dict_info, 1, sizeof(dict_info), fp);
1103 if (readed != sizeof(dict_info)) {
1104 goto error;
1105 }
1106
1107 if (size != get_dict_file_size(&dict_info)) {
1108 goto error;
1109 }
1110
1111 fclose(fp);
1112 return true;
1113
1114 error:
1115 fclose(fp);
1116 return false;
1117 }
1118
1119 bool UserDict::load(const char *file, LemmaIdType start_id) {
1120 if (0 != pthread_mutex_trylock(&g_mutex_)) {
1121 return false;
1122 }
1123 // b is ignored in POSIX compatible os including Linux
1124 // while b is important flag for Windows to specify binary mode
1125 FILE *fp = fopen(file, "rb");
1126 if (!fp) {
1127 pthread_mutex_unlock(&g_mutex_);
1128 return false;
1129 }
1130
1131 size_t readed, toread;
1132 UserDictInfo dict_info;
1133 uint8 *lemmas = NULL;
1134 uint32 *offsets = NULL;
1135 #ifdef ___SYNC_ENABLED___
1136 uint32 *syncs = NULL;
1137 #endif
1138 uint32 *scores = NULL;
1139 uint32 *ids = NULL;
1140 uint32 *offsets_by_id = NULL;
1141 #ifdef ___PREDICT_ENABLED___
1142 uint32 *predicts = NULL;
1143 #endif
1144 size_t i;
1145 int err;
1146
1147 err = fseek(fp, -1 * sizeof(dict_info), SEEK_END);
1148 if (err) goto error;
1149
1150 readed = fread(&dict_info, 1, sizeof(dict_info), fp);
1151 if (readed != sizeof(dict_info)) goto error;
1152
1153 lemmas = (uint8 *)malloc(
1154 dict_info.lemma_size +
1155 (kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2))));
1156
1157 if (!lemmas) goto error;
1158
1159 offsets = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
1160 if (!offsets) goto error;
1161
1162 #ifdef ___PREDICT_ENABLED___
1163 predicts = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
1164 if (!predicts) goto error;
1165 #endif
1166
1167 #ifdef ___SYNC_ENABLED___
1168 syncs = (uint32 *)malloc((dict_info.sync_count + kUserDictPreAlloc) << 2);
1169 if (!syncs) goto error;
1170 #endif
1171
1172 scores = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
1173 if (!scores) goto error;
1174
1175 ids = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
1176 if (!ids) goto error;
1177
1178 offsets_by_id = (uint32 *)malloc(
1179 (dict_info.lemma_count + kUserDictPreAlloc) << 2);
1180 if (!offsets_by_id) goto error;
1181
1182 err = fseek(fp, 4, SEEK_SET);
1183 if (err) goto error;
1184
1185 readed = 0;
1186 while (readed < dict_info.lemma_size && !ferror(fp) && !feof(fp)) {
1187 readed += fread(lemmas + readed, 1, dict_info.lemma_size - readed, fp);
1188 }
1189 if (readed < dict_info.lemma_size)
1190 goto error;
1191
1192 toread = (dict_info.lemma_count << 2);
1193 readed = 0;
1194 while (readed < toread && !ferror(fp) && !feof(fp)) {
1195 readed += fread((((uint8*)offsets) + readed), 1, toread - readed, fp);
1196 }
1197 if (readed < toread)
1198 goto error;
1199
1200 #ifdef ___PREDICT_ENABLED___
1201 toread = (dict_info.lemma_count << 2);
1202 readed = 0;
1203 while (readed < toread && !ferror(fp) && !feof(fp)) {
1204 readed += fread((((uint8*)predicts) + readed), 1, toread - readed, fp);
1205 }
1206 if (readed < toread)
1207 goto error;
1208 #endif
1209
1210 readed = 0;
1211 while (readed < toread && !ferror(fp) && !feof(fp)) {
1212 readed += fread((((uint8*)scores) + readed), 1, toread - readed, fp);
1213 }
1214 if (readed < toread)
1215 goto error;
1216
1217 #ifdef ___SYNC_ENABLED___
1218 toread = (dict_info.sync_count << 2);
1219 readed = 0;
1220 while (readed < toread && !ferror(fp) && !feof(fp)) {
1221 readed += fread((((uint8*)syncs) + readed), 1, toread - readed, fp);
1222 }
1223 if (readed < toread)
1224 goto error;
1225 #endif
1226
1227 for (i = 0; i < dict_info.lemma_count; i++) {
1228 ids[i] = start_id + i;
1229 offsets_by_id[i] = offsets[i];
1230 }
1231
1232 lemmas_ = lemmas;
1233 offsets_ = offsets;
1234 #ifdef ___SYNC_ENABLED___
1235 syncs_ = syncs;
1236 sync_count_size_ = dict_info.sync_count + kUserDictPreAlloc;
1237 #endif
1238 offsets_by_id_ = offsets_by_id;
1239 scores_ = scores;
1240 ids_ = ids;
1241 #ifdef ___PREDICT_ENABLED___
1242 predicts_ = predicts;
1243 #endif
1244 lemma_count_left_ = kUserDictPreAlloc;
1245 lemma_size_left_ = kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2));
1246 memcpy(&dict_info_, &dict_info, sizeof(dict_info));
1247 state_ = USER_DICT_SYNC;
1248
1249 fclose(fp);
1250
1251 pthread_mutex_unlock(&g_mutex_);
1252 return true;
1253
1254 error:
1255 if (lemmas) free(lemmas);
1256 if (offsets) free(offsets);
1257 #ifdef ___SYNC_ENABLED___
1258 if (syncs) free(syncs);
1259 #endif
1260 if (scores) free(scores);
1261 if (ids) free(ids);
1262 if (offsets_by_id) free(offsets_by_id);
1263 #ifdef ___PREDICT_ENABLED___
1264 if (predicts) free(predicts);
1265 #endif
1266 fclose(fp);
1267 pthread_mutex_unlock(&g_mutex_);
1268 return false;
1269 }
1270
1271 void UserDict::write_back() {
1272 // XXX write back is only allowed from close_dict due to thread-safe sake
1273 if (state_ == USER_DICT_NONE || state_ == USER_DICT_SYNC)
1274 return;
1275 int fd = open(dict_file_, O_WRONLY);
1276 if (fd == -1)
1277 return;
1278 switch (state_) {
1279 case USER_DICT_DEFRAGMENTED:
1280 write_back_all(fd);
1281 break;
1282 case USER_DICT_LEMMA_DIRTY:
1283 write_back_lemma(fd);
1284 break;
1285 case USER_DICT_OFFSET_DIRTY:
1286 write_back_offset(fd);
1287 break;
1288 case USER_DICT_SCORE_DIRTY:
1289 write_back_score(fd);
1290 break;
1291 #ifdef ___SYNC_ENABLED___
1292 case USER_DICT_SYNC_DIRTY:
1293 write_back_sync(fd);
1294 break;
1295 #endif
1296 default:
1297 break;
1298 }
1299 // It seems truncate is not need on Linux, Windows except Mac
1300 // I am doing it here anyway for safety.
1301 off_t cur = lseek(fd, 0, SEEK_CUR);
1302 #ifndef _WIN32
1303 ftruncate(fd, cur);
1304 #endif
1305 close(fd);
1306 state_ = USER_DICT_SYNC;
1307 }
1308
1309 #ifdef ___SYNC_ENABLED___
1310 void UserDict::write_back_sync(int fd) {
1311 int err = lseek(fd, 4 + dict_info_.lemma_size
1312 + (dict_info_.lemma_count << 3)
1313 #ifdef ___PREDICT_ENABLED___
1314 + (dict_info_.lemma_count << 2)
1315 #endif
1316 , SEEK_SET);
1317 if (err == -1)
1318 return;
1319 write(fd, syncs_, dict_info_.sync_count << 2);
1320 write(fd, &dict_info_, sizeof(dict_info_));
1321 }
1322 #endif
1323
1324 void UserDict::write_back_offset(int fd) {
1325 int err = lseek(fd, 4 + dict_info_.lemma_size, SEEK_SET);
1326 if (err == -1)
1327 return;
1328 write(fd, offsets_, dict_info_.lemma_count << 2);
1329 #ifdef ___PREDICT_ENABLED___
1330 write(fd, predicts_, dict_info_.lemma_count << 2);
1331 #endif
1332 write(fd, scores_, dict_info_.lemma_count << 2);
1333 #ifdef ___SYNC_ENABLED___
1334 write(fd, syncs_, dict_info_.sync_count << 2);
1335 #endif
1336 write(fd, &dict_info_, sizeof(dict_info_));
1337 }
1338
1339 void UserDict::write_back_score(int fd) {
1340 int err = lseek(fd, 4 + dict_info_.lemma_size
1341 + (dict_info_.lemma_count << 2)
1342 #ifdef ___PREDICT_ENABLED___
1343 + (dict_info_.lemma_count << 2)
1344 #endif
1345 , SEEK_SET);
1346 if (err == -1)
1347 return;
1348 write(fd, scores_, dict_info_.lemma_count << 2);
1349 #ifdef ___SYNC_ENABLED___
1350 write(fd, syncs_, dict_info_.sync_count << 2);
1351 #endif
1352 write(fd, &dict_info_, sizeof(dict_info_));
1353 }
1354
1355 void UserDict::write_back_lemma(int fd) {
1356 int err = lseek(fd, 4, SEEK_SET);
1357 if (err == -1)
1358 return;
1359 // New lemmas are always appended, no need to write whole lemma block
1360 size_t need_write = kUserDictPreAlloc *
1361 (2 + (kUserDictAverageNchar << 2)) - lemma_size_left_;
1362 err = lseek(fd, dict_info_.lemma_size - need_write, SEEK_CUR);
1363 if (err == -1)
1364 return;
1365 write(fd, lemmas_ + dict_info_.lemma_size - need_write, need_write);
1366
1367 write(fd, offsets_, dict_info_.lemma_count << 2);
1368 #ifdef ___PREDICT_ENABLED___
1369 write(fd, predicts_, dict_info_.lemma_count << 2);
1370 #endif
1371 write(fd, scores_, dict_info_.lemma_count << 2);
1372 #ifdef ___SYNC_ENABLED___
1373 write(fd, syncs_, dict_info_.sync_count << 2);
1374 #endif
1375 write(fd, &dict_info_, sizeof(dict_info_));
1376 }
1377
1378 void UserDict::write_back_all(int fd) {
1379 // XXX lemma_size is handled differently in writeall
1380 // and writelemma. I update lemma_size and lemma_count in different
1381 // places for these two cases. Should fix it to make it consistent.
1382 int err = lseek(fd, 4, SEEK_SET);
1383 if (err == -1)
1384 return;
1385 write(fd, lemmas_, dict_info_.lemma_size);
1386 write(fd, offsets_, dict_info_.lemma_count << 2);
1387 #ifdef ___PREDICT_ENABLED___
1388 write(fd, predicts_, dict_info_.lemma_count << 2);
1389 #endif
1390 write(fd, scores_, dict_info_.lemma_count << 2);
1391 #ifdef ___SYNC_ENABLED___
1392 write(fd, syncs_, dict_info_.sync_count << 2);
1393 #endif
1394 write(fd, &dict_info_, sizeof(dict_info_));
1395 }
1396
1397 #ifdef ___CACHE_ENABLED___
1398 bool UserDict::load_cache(UserDictSearchable *searchable,
1399 uint32 *offset, uint32 *length) {
1400 UserDictCache *cache = &caches_[searchable->splids_len - 1];
1401 if (cache->head == cache->tail)
1402 return false;
1403
1404 uint16 j, sig_len = kMaxLemmaSize / 4;
1405 uint16 i = cache->head;
1406 while (1) {
1407 j = 0;
1408 for (; j < sig_len; j++) {
1409 if (cache->signatures[i][j] != searchable->signature[j])
1410 break;
1411 }
1412 if (j < sig_len) {
1413 i++;
1414 if (i >= kUserDictCacheSize)
1415 i -= kUserDictCacheSize;
1416 if (i == cache->tail)
1417 break;
1418 continue;
1419 }
1420 *offset = cache->offsets[i];
1421 *length = cache->lengths[i];
1422 return true;
1423 }
1424 return false;
1425 }
1426
1427 void UserDict::save_cache(UserDictSearchable *searchable,
1428 uint32 offset, uint32 length) {
1429 UserDictCache *cache = &caches_[searchable->splids_len - 1];
1430 uint16 next = cache->tail;
1431
1432 cache->offsets[next] = offset;
1433 cache->lengths[next] = length;
1434 uint16 sig_len = kMaxLemmaSize / 4;
1435 uint16 j = 0;
1436 for (; j < sig_len; j++) {
1437 cache->signatures[next][j] = searchable->signature[j];
1438 }
1439
1440 if (++next >= kUserDictCacheSize) {
1441 next -= kUserDictCacheSize;
1442 }
1443 if (next == cache->head) {
1444 cache->head++;
1445 if (cache->head >= kUserDictCacheSize) {
1446 cache->head -= kUserDictCacheSize;
1447 }
1448 }
1449 cache->tail = next;
1450 }
1451
1452 void UserDict::reset_cache() {
1453 memset(caches_, 0, sizeof(caches_));
1454 }
1455
1456 bool UserDict::load_miss_cache(UserDictSearchable *searchable) {
1457 UserDictMissCache *cache = &miss_caches_[searchable->splids_len - 1];
1458 if (cache->head == cache->tail)
1459 return false;
1460
1461 uint16 j, sig_len = kMaxLemmaSize / 4;
1462 uint16 i = cache->head;
1463 while (1) {
1464 j = 0;
1465 for (; j < sig_len; j++) {
1466 if (cache->signatures[i][j] != searchable->signature[j])
1467 break;
1468 }
1469 if (j < sig_len) {
1470 i++;
1471 if (i >= kUserDictMissCacheSize)
1472 i -= kUserDictMissCacheSize;
1473 if (i == cache->tail)
1474 break;
1475 continue;
1476 }
1477 return true;
1478 }
1479 return false;
1480 }
1481
1482 void UserDict::save_miss_cache(UserDictSearchable *searchable) {
1483 UserDictMissCache *cache = &miss_caches_[searchable->splids_len - 1];
1484 uint16 next = cache->tail;
1485
1486 uint16 sig_len = kMaxLemmaSize / 4;
1487 uint16 j = 0;
1488 for (; j < sig_len; j++) {
1489 cache->signatures[next][j] = searchable->signature[j];
1490 }
1491
1492 if (++next >= kUserDictMissCacheSize) {
1493 next -= kUserDictMissCacheSize;
1494 }
1495 if (next == cache->head) {
1496 cache->head++;
1497 if (cache->head >= kUserDictMissCacheSize) {
1498 cache->head -= kUserDictMissCacheSize;
1499 }
1500 }
1501 cache->tail = next;
1502 }
1503
1504 void UserDict::reset_miss_cache() {
1505 memset(miss_caches_, 0, sizeof(miss_caches_));
1506 }
1507
1508 void UserDict::cache_init() {
1509 reset_cache();
1510 reset_miss_cache();
1511 }
1512
1513 bool UserDict::cache_hit(UserDictSearchable *searchable,
1514 uint32 *offset, uint32 *length) {
1515 bool hit = load_miss_cache(searchable);
1516 if (hit) {
1517 *offset = 0;
1518 *length = 0;
1519 return true;
1520 }
1521 hit = load_cache(searchable, offset, length);
1522 if (hit) {
1523 return true;
1524 }
1525 return false;
1526 }
1527
1528 void UserDict::cache_push(UserDictCacheType type,
1529 UserDictSearchable *searchable,
1530 uint32 offset, uint32 length) {
1531 switch (type) {
1532 case USER_DICT_MISS_CACHE:
1533 save_miss_cache(searchable);
1534 break;
1535 case USER_DICT_CACHE:
1536 save_cache(searchable, offset, length);
1537 break;
1538 default:
1539 break;
1540 }
1541 }
1542
1543 #endif
1544
1545 void UserDict::defragment(void) {
1546 #ifdef ___DEBUG_PERF___
1547 DEBUG_PERF_BEGIN;
1548 #endif
1549 if (is_valid_state() == false)
1550 return;
1551 // Fixup offsets_, set REMOVE flag to lemma's flag if needed
1552 size_t first_freed = 0;
1553 size_t first_inuse = 0;
1554 while (first_freed < dict_info_.lemma_count) {
1555 // Find first freed offset
1556 while ((offsets_[first_freed] & kUserDictOffsetFlagRemove) == 0 &&
1557 first_freed < dict_info_.lemma_count) {
1558 first_freed++;
1559 }
1560 if (first_freed < dict_info_.lemma_count) {
1561 // Save REMOVE flag to lemma flag
1562 int off = offsets_[first_freed];
1563 set_lemma_flag(off, kUserDictLemmaFlagRemove);
1564 } else {
1565 break;
1566 }
1567 // Find first inuse offse after first_freed
1568 first_inuse = first_freed + 1;
1569 while ((offsets_[first_inuse] & kUserDictOffsetFlagRemove) &&
1570 (first_inuse < dict_info_.lemma_count)) {
1571 // Save REMOVE flag to lemma flag
1572 int off = offsets_[first_inuse];
1573 set_lemma_flag(off, kUserDictLemmaFlagRemove);
1574 first_inuse++;
1575 }
1576 if (first_inuse >= dict_info_.lemma_count) {
1577 break;
1578 }
1579 // Swap offsets_
1580 int tmp = offsets_[first_inuse];
1581 offsets_[first_inuse] = offsets_[first_freed];
1582 offsets_[first_freed] = tmp;
1583 // Move scores_, no need to swap
1584 tmp = scores_[first_inuse];
1585 scores_[first_inuse] = scores_[first_freed];
1586 scores_[first_freed] = tmp;
1587 // Swap ids_
1588 LemmaIdType tmpid = ids_[first_inuse];
1589 ids_[first_inuse] = ids_[first_freed];
1590 ids_[first_freed] = tmpid;
1591 // Go on
1592 first_freed++;
1593 }
1594 #ifdef ___PREDICT_ENABLED___
1595 // Fixup predicts_
1596 first_freed = 0;
1597 first_inuse = 0;
1598 while (first_freed < dict_info_.lemma_count) {
1599 // Find first freed offset
1600 while ((predicts_[first_freed] & kUserDictOffsetFlagRemove) == 0 &&
1601 first_freed < dict_info_.lemma_count) {
1602 first_freed++;
1603 }
1604 if (first_freed >= dict_info_.lemma_count)
1605 break;
1606 // Find first inuse offse after first_freed
1607 first_inuse = first_freed + 1;
1608 while ((predicts_[first_inuse] & kUserDictOffsetFlagRemove)
1609 && (first_inuse < dict_info_.lemma_count)) {
1610 first_inuse++;
1611 }
1612 if (first_inuse >= dict_info_.lemma_count) {
1613 break;
1614 }
1615 // Swap offsets_
1616 int tmp = predicts_[first_inuse];
1617 predicts_[first_inuse] = predicts_[first_freed];
1618 predicts_[first_freed] = tmp;
1619 // Go on
1620 first_freed++;
1621 }
1622 #endif
1623 dict_info_.lemma_count = first_freed;
1624 // Fixup lemmas_
1625 size_t begin = 0;
1626 size_t end = 0;
1627 size_t dst = 0;
1628 int total_size = dict_info_.lemma_size + lemma_size_left_;
1629 int total_count = dict_info_.lemma_count + lemma_count_left_;
1630 size_t real_size = total_size - lemma_size_left_;
1631 while (dst < real_size) {
1632 unsigned char flag = get_lemma_flag(dst);
1633 unsigned char nchr = get_lemma_nchar(dst);
1634 if ((flag & kUserDictLemmaFlagRemove) == 0) {
1635 dst += nchr * 4 + 2;
1636 continue;
1637 }
1638 break;
1639 }
1640 if (dst >= real_size)
1641 return;
1642
1643 end = dst;
1644 while (end < real_size) {
1645 begin = end + get_lemma_nchar(end) * 4 + 2;
1646 repeat:
1647 // not used any more
1648 if (begin >= real_size)
1649 break;
1650 unsigned char flag = get_lemma_flag(begin);
1651 unsigned char nchr = get_lemma_nchar(begin);
1652 if (flag & kUserDictLemmaFlagRemove) {
1653 begin += nchr * 4 + 2;
1654 goto repeat;
1655 }
1656 end = begin + nchr * 4 + 2;
1657 while (end < real_size) {
1658 unsigned char eflag = get_lemma_flag(end);
1659 unsigned char enchr = get_lemma_nchar(end);
1660 if ((eflag & kUserDictLemmaFlagRemove) == 0) {
1661 end += enchr * 4 + 2;
1662 continue;
1663 }
1664 break;
1665 }
1666 memmove(lemmas_ + dst, lemmas_ + begin, end - begin);
1667 for (size_t j = 0; j < dict_info_.lemma_count; j++) {
1668 if (offsets_[j] >= begin && offsets_[j] < end) {
1669 offsets_[j] -= (begin - dst);
1670 offsets_by_id_[ids_[j] - start_id_] = offsets_[j];
1671 }
1672 #ifdef ___PREDICT_ENABLED___
1673 if (predicts_[j] >= begin && predicts_[j] < end) {
1674 predicts_[j] -= (begin - dst);
1675 }
1676 #endif
1677 }
1678 #ifdef ___SYNC_ENABLED___
1679 for (size_t j = 0; j < dict_info_.sync_count; j++) {
1680 if (syncs_[j] >= begin && syncs_[j] < end) {
1681 syncs_[j] -= (begin - dst);
1682 }
1683 }
1684 #endif
1685 dst += (end - begin);
1686 }
1687
1688 dict_info_.free_count = 0;
1689 dict_info_.free_size = 0;
1690 dict_info_.lemma_size = dst;
1691 lemma_size_left_ = total_size - dict_info_.lemma_size;
1692 lemma_count_left_ = total_count - dict_info_.lemma_count;
1693
1694 // XXX Without following code,
1695 // offsets_by_id_ is not reordered.
1696 // That's to say, all removed lemmas' ids are not collected back.
1697 // There may not be room for addition of new lemmas due to
1698 // offsests_by_id_ reason, although lemma_size_left_ is fixed.
1699 // By default, we do want defrag as fast as possible, because
1700 // during defrag procedure, other peers can not write new lemmas
1701 // to user dictionary file.
1702 // XXX If write-back is invoked immediately after
1703 // this defragment, no need to fix up following in-mem data.
1704 for (uint32 i = 0; i < dict_info_.lemma_count; i++) {
1705 ids_[i] = start_id_ + i;
1706 offsets_by_id_[i] = offsets_[i];
1707 }
1708
1709 state_ = USER_DICT_DEFRAGMENTED;
1710
1711 #ifdef ___DEBUG_PERF___
1712 DEBUG_PERF_END;
1713 LOGD_PERF("defragment");
1714 #endif
1715 }
1716
1717 #ifdef ___SYNC_ENABLED___
1718 void UserDict::clear_sync_lemmas(unsigned int start, unsigned int end) {
1719 if (is_valid_state() == false)
1720 return;
1721 if (end > dict_info_.sync_count)
1722 end = dict_info_.sync_count;
1723 memmove(syncs_ + start, syncs_ + end, (dict_info_.sync_count - end) << 2);
1724 dict_info_.sync_count -= (end - start);
1725 if (state_ < USER_DICT_SYNC_DIRTY)
1726 state_ = USER_DICT_SYNC_DIRTY;
1727 }
1728
1729 int UserDict::get_sync_count() {
1730 if (is_valid_state() == false)
1731 return 0;
1732 return dict_info_.sync_count;
1733 }
1734
1735 LemmaIdType UserDict::put_lemma_no_sync(char16 lemma_str[], uint16 splids[],
1736 uint16 lemma_len, uint16 count, uint64 lmt) {
1737 int again = 0;
1738 begin:
1739 LemmaIdType id;
1740 uint32 * syncs_bak = syncs_;
1741 syncs_ = NULL;
1742 id = _put_lemma(lemma_str, splids, lemma_len, count, lmt);
1743 syncs_ = syncs_bak;
1744 if (id == 0 && again == 0) {
1745 if ((dict_info_.limit_lemma_count > 0 &&
1746 dict_info_.lemma_count >= dict_info_.limit_lemma_count)
1747 || (dict_info_.limit_lemma_size > 0 &&
1748 dict_info_.lemma_size + (2 + (lemma_len << 2))
1749 > dict_info_.limit_lemma_size)) {
1750 // XXX Always reclaim and defrag in sync code path
1751 // sync thread is background thread and ok with heavy work
1752 reclaim();
1753 defragment();
1754 flush_cache();
1755 again = 1;
1756 goto begin;
1757 }
1758 }
1759 return id;
1760 }
1761
1762 int UserDict::put_lemmas_no_sync_from_utf16le_string(char16 * lemmas, int len) {
1763 int newly_added = 0;
1764
1765 SpellingParser * spl_parser = new SpellingParser();
1766 if (!spl_parser) {
1767 return 0;
1768 }
1769 #ifdef ___DEBUG_PERF___
1770 DEBUG_PERF_BEGIN;
1771 #endif
1772 char16 *ptr = lemmas;
1773
1774 // Extract pinyin,words,frequence,last_mod_time
1775 char16 * p = ptr, * py16 = ptr;
1776 char16 * hz16 = NULL;
1777 int py16_len = 0;
1778 uint16 splid[kMaxLemmaSize];
1779 int splid_len = 0;
1780 int hz16_len = 0;
1781 char16 * fr16 = NULL;
1782 int fr16_len = 0;
1783
1784 while (p - ptr < len) {
1785 // Pinyin
1786 py16 = p;
1787 splid_len = 0;
1788 while (*p != 0x2c && (p - ptr) < len) {
1789 if (*p == 0x20)
1790 splid_len++;
1791 p++;
1792 }
1793 splid_len++;
1794 if (p - ptr == len)
1795 break;
1796 py16_len = p - py16;
1797 if (kMaxLemmaSize < splid_len) {
1798 break;
1799 }
1800 bool is_pre;
1801 int splidl = spl_parser->splstr16_to_idxs_f(
1802 py16, py16_len, splid, NULL, kMaxLemmaSize, is_pre);
1803 if (splidl != splid_len)
1804 break;
1805 // Phrase
1806 hz16 = ++p;
1807 while (*p != 0x2c && (p - ptr) < len) {
1808 p++;
1809 }
1810 hz16_len = p - hz16;
1811 if (hz16_len != splid_len)
1812 break;
1813 // Frequency
1814 fr16 = ++p;
1815 fr16_len = 0;
1816 while (*p != 0x2c && (p - ptr) < len) {
1817 p++;
1818 }
1819 fr16_len = p - fr16;
1820 uint32 intf = (uint32)utf16le_atoll(fr16, fr16_len);
1821 // Last modified time
1822 fr16 = ++p;
1823 fr16_len = 0;
1824 while (*p != 0x3b && (p - ptr) < len) {
1825 p++;
1826 }
1827 fr16_len = p - fr16;
1828 uint64 last_mod = utf16le_atoll(fr16, fr16_len);
1829
1830 put_lemma_no_sync(hz16, splid, splid_len, intf, last_mod);
1831 newly_added++;
1832
1833 p++;
1834 }
1835
1836 #ifdef ___DEBUG_PERF___
1837 DEBUG_PERF_END;
1838 LOGD_PERF("put_lemmas_no_sync_from_utf16le_string");
1839 #endif
1840 return newly_added;
1841 }
1842
1843 int UserDict::get_sync_lemmas_in_utf16le_string_from_beginning(
1844 char16 * str, int size, int * count) {
1845 int len = 0;
1846 *count = 0;
1847
1848 int left_len = size;
1849
1850 if (is_valid_state() == false)
1851 return len;
1852
1853 SpellingTrie * spl_trie = &SpellingTrie::get_instance();
1854 if (!spl_trie) {
1855 return 0;
1856 }
1857
1858 uint32 i;
1859 for (i = 0; i < dict_info_.sync_count; i++) {
1860 int offset = syncs_[i];
1861 uint32 nchar = get_lemma_nchar(offset);
1862 uint16 *spl = get_lemma_spell_ids(offset);
1863 uint16 *wrd = get_lemma_word(offset);
1864 int score = _get_lemma_score(wrd, spl, nchar);
1865
1866 static char score_temp[32], *pscore_temp = score_temp;
1867 static char16 temp[256], *ptemp = temp;
1868
1869 pscore_temp = score_temp;
1870 ptemp = temp;
1871
1872 uint32 j;
1873 // Add pinyin
1874 for (j = 0; j < nchar; j++) {
1875 int ret_len = spl_trie->get_spelling_str16(
1876 spl[j], ptemp, temp + sizeof(temp) - ptemp);
1877 if (ret_len <= 0)
1878 break;
1879 ptemp += ret_len;
1880 if (ptemp < temp + sizeof(temp) - 1) {
1881 *(ptemp++) = ' ';
1882 } else {
1883 j = 0;
1884 break;
1885 }
1886 }
1887 if (j < nchar) {
1888 continue;
1889 }
1890 ptemp--;
1891 if (ptemp < temp + sizeof(temp) - 1) {
1892 *(ptemp++) = ',';
1893 } else {
1894 continue;
1895 }
1896 // Add phrase
1897 for (j = 0; j < nchar; j++) {
1898 if (ptemp < temp + sizeof(temp) - 1) {
1899 *(ptemp++) = wrd[j];
1900 } else {
1901 break;
1902 }
1903 }
1904 if (j < nchar) {
1905 continue;
1906 }
1907 if (ptemp < temp + sizeof(temp) - 1) {
1908 *(ptemp++) = ',';
1909 } else {
1910 continue;
1911 }
1912 // Add frequency
1913 uint32 intf = extract_score_freq(score);
1914 int ret_len = utf16le_lltoa(intf, ptemp, temp + sizeof(temp) - ptemp);
1915 if (ret_len <= 0)
1916 continue;
1917 ptemp += ret_len;
1918 if (ptemp < temp + sizeof(temp) - 1) {
1919 *(ptemp++) = ',';
1920 } else {
1921 continue;
1922 }
1923 // Add last modified time
1924 uint64 last_mod = extract_score_lmt(score);
1925 ret_len = utf16le_lltoa(last_mod, ptemp, temp + sizeof(temp) - ptemp);
1926 if (ret_len <= 0)
1927 continue;
1928 ptemp += ret_len;
1929 if (ptemp < temp + sizeof(temp) - 1) {
1930 *(ptemp++) = ';';
1931 } else {
1932 continue;
1933 }
1934
1935 // Write to string
1936 int need_len = ptemp - temp;
1937 if (need_len > left_len)
1938 break;
1939 memcpy(str + len, temp, need_len * 2);
1940 left_len -= need_len;
1941
1942 len += need_len;
1943 (*count)++;
1944 }
1945
1946 if (len > 0) {
1947 if (state_ < USER_DICT_SYNC_DIRTY)
1948 state_ = USER_DICT_SYNC_DIRTY;
1949 }
1950 return len;
1951 }
1952
1953 #endif
1954
1955 bool UserDict::state(UserDictStat * stat) {
1956 if (is_valid_state() == false)
1957 return false;
1958 if (!stat)
1959 return false;
1960 stat->version = version_;
1961 stat->file_name = dict_file_;
1962 stat->load_time.tv_sec = load_time_.tv_sec;
1963 stat->load_time.tv_usec = load_time_.tv_usec;
1964 pthread_mutex_lock(&g_mutex_);
1965 stat->last_update.tv_sec = g_last_update_.tv_sec;
1966 stat->last_update.tv_usec = g_last_update_.tv_usec;
1967 pthread_mutex_unlock(&g_mutex_);
1968 stat->disk_size = get_dict_file_size(&dict_info_);
1969 stat->lemma_count = dict_info_.lemma_count;
1970 stat->lemma_size = dict_info_.lemma_size;
1971 stat->delete_count = dict_info_.free_count;
1972 stat->delete_size = dict_info_.free_size;
1973 #ifdef ___SYNC_ENABLED___
1974 stat->sync_count = dict_info_.sync_count;
1975 #endif
1976 stat->limit_lemma_count = dict_info_.limit_lemma_count;
1977 stat->limit_lemma_size = dict_info_.limit_lemma_size;
1978 stat->reclaim_ratio = dict_info_.reclaim_ratio;
1979 return true;
1980 }
1981
1982 void UserDict::set_limit(uint32 max_lemma_count,
1983 uint32 max_lemma_size, uint32 reclaim_ratio) {
1984 dict_info_.limit_lemma_count = max_lemma_count;
1985 dict_info_.limit_lemma_size = max_lemma_size;
1986 if (reclaim_ratio > 100)
1987 reclaim_ratio = 100;
1988 dict_info_.reclaim_ratio = reclaim_ratio;
1989 }
1990
1991 void UserDict::reclaim() {
1992 if (is_valid_state() == false)
1993 return;
1994
1995 switch (dict_info_.reclaim_ratio) {
1996 case 0:
1997 return;
1998 case 100:
1999 // TODO: CLEAR to be implemented
2000 assert(false);
2001 return;
2002 default:
2003 break;
2004 }
2005
2006 // XXX Reclaim is only based on count, not size
2007 uint32 count = dict_info_.lemma_count;
2008 int rc = count * dict_info_.reclaim_ratio / 100;
2009
2010 UserDictScoreOffsetPair * score_offset_pairs = NULL;
2011 score_offset_pairs = (UserDictScoreOffsetPair *)malloc(
2012 sizeof(UserDictScoreOffsetPair) * rc);
2013 if (score_offset_pairs == NULL) {
2014 return;
2015 }
2016
2017 for (int i = 0; i < rc; i++) {
2018 int s = scores_[i];
2019 score_offset_pairs[i].score = s;
2020 score_offset_pairs[i].offset_index = i;
2021 }
2022
2023 for (int i = (rc + 1) / 2; i >= 0; i--)
2024 shift_down(score_offset_pairs, i, rc);
2025
2026 for (uint32 i = rc; i < dict_info_.lemma_count; i++) {
2027 int s = scores_[i];
2028 if (s < score_offset_pairs[0].score) {
2029 score_offset_pairs[0].score = s;
2030 score_offset_pairs[0].offset_index = i;
2031 shift_down(score_offset_pairs, 0, rc);
2032 }
2033 }
2034
2035 for (int i = 0; i < rc; i++) {
2036 int off = score_offset_pairs[i].offset_index;
2037 remove_lemma_by_offset_index(off);
2038 }
2039 if (rc > 0) {
2040 if (state_ < USER_DICT_OFFSET_DIRTY)
2041 state_ = USER_DICT_OFFSET_DIRTY;
2042 }
2043
2044 free(score_offset_pairs);
2045 }
2046
2047 inline void UserDict::swap(UserDictScoreOffsetPair * sop, int i, int j) {
2048 int s = sop[i].score;
2049 int p = sop[i].offset_index;
2050 sop[i].score = sop[j].score;
2051 sop[i].offset_index = sop[j].offset_index;
2052 sop[j].score = s;
2053 sop[j].offset_index = p;
2054 }
2055
2056 void UserDict::shift_down(UserDictScoreOffsetPair * sop, int i, int n) {
2057 int par = i;
2058 while (par < n) {
2059 int left = par * 2 + 1;
2060 int right = left + 1;
2061 if (left >= n && right >= n)
2062 break;
2063 if (right >= n) {
2064 if (sop[left].score > sop[par].score) {
2065 swap(sop, left, par);
2066 par = left;
2067 continue;
2068 }
2069 } else if (sop[left].score > sop[right].score &&
2070 sop[left].score > sop[par].score) {
2071 swap(sop, left, par);
2072 par = left;
2073 continue;
2074 } else if (sop[right].score > sop[left].score &&
2075 sop[right].score > sop[par].score) {
2076 swap(sop, right, par);
2077 par = right;
2078 continue;
2079 }
2080 break;
2081 }
2082 }
2083
2084 LemmaIdType UserDict::put_lemma(char16 lemma_str[], uint16 splids[],
2085 uint16 lemma_len, uint16 count) {
2086 return _put_lemma(lemma_str, splids, lemma_len, count, time(NULL));
2087 }
2088
2089 LemmaIdType UserDict::_put_lemma(char16 lemma_str[], uint16 splids[],
2090 uint16 lemma_len, uint16 count, uint64 lmt) {
2091 #ifdef ___DEBUG_PERF___
2092 DEBUG_PERF_BEGIN;
2093 #endif
2094 if (is_valid_state() == false)
2095 return 0;
2096 int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
2097 if (off != -1) {
2098 int delta_score = count - scores_[off];
2099 dict_info_.total_nfreq += delta_score;
2100 scores_[off] = build_score(lmt, count);
2101 if (state_ < USER_DICT_SCORE_DIRTY)
2102 state_ = USER_DICT_SCORE_DIRTY;
2103 #ifdef ___DEBUG_PERF___
2104 DEBUG_PERF_END;
2105 LOGD_PERF("_put_lemma(update)");
2106 #endif
2107 return ids_[off];
2108 } else {
2109 if ((dict_info_.limit_lemma_count > 0 &&
2110 dict_info_.lemma_count >= dict_info_.limit_lemma_count)
2111 || (dict_info_.limit_lemma_size > 0 &&
2112 dict_info_.lemma_size + (2 + (lemma_len << 2))
2113 > dict_info_.limit_lemma_size)) {
2114 // XXX Don't defragment here, it's too time-consuming.
2115 return 0;
2116 }
2117 int flushed = 0;
2118 if (lemma_count_left_ == 0 ||
2119 lemma_size_left_ < (size_t)(2 + (lemma_len << 2))) {
2120
2121 // XXX When there is no space for new lemma, we flush to disk
2122 // flush_cache() may be called by upper user
2123 // and better place shoule be found instead of here
2124 flush_cache();
2125 flushed = 1;
2126 // Or simply return and do nothing
2127 // return 0;
2128 }
2129 #ifdef ___DEBUG_PERF___
2130 DEBUG_PERF_END;
2131 LOGD_PERF(flushed ? "_put_lemma(flush+add)" : "_put_lemma(add)");
2132 #endif
2133 LemmaIdType id = append_a_lemma(lemma_str, splids, lemma_len, count, lmt);
2134 #ifdef ___SYNC_ENABLED___
2135 if (syncs_ && id != 0) {
2136 queue_lemma_for_sync(id);
2137 }
2138 #endif
2139 return id;
2140 }
2141 return 0;
2142 }
2143
2144 #ifdef ___SYNC_ENABLED___
2145 void UserDict::queue_lemma_for_sync(LemmaIdType id) {
2146 if (dict_info_.sync_count < sync_count_size_) {
2147 syncs_[dict_info_.sync_count++] = offsets_by_id_[id - start_id_];
2148 } else {
2149 uint32 * syncs = (uint32*)realloc(
2150 syncs_, (sync_count_size_ + kUserDictPreAlloc) << 2);
2151 if (syncs) {
2152 sync_count_size_ += kUserDictPreAlloc;
2153 syncs_ = syncs;
2154 syncs_[dict_info_.sync_count++] = offsets_by_id_[id - start_id_];
2155 }
2156 }
2157 }
2158 #endif
2159
2160 LemmaIdType UserDict::update_lemma(LemmaIdType lemma_id, int16 delta_count,
2161 bool selected) {
2162 #ifdef ___DEBUG_PERF___
2163 DEBUG_PERF_BEGIN;
2164 #endif
2165 if (is_valid_state() == false)
2166 return 0;
2167 if (is_valid_lemma_id(lemma_id) == false)
2168 return 0;
2169 uint32 offset = offsets_by_id_[lemma_id - start_id_];
2170 uint8 lemma_len = get_lemma_nchar(offset);
2171 char16 * lemma_str = get_lemma_word(offset);
2172 uint16 * splids = get_lemma_spell_ids(offset);
2173
2174 int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
2175 if (off != -1) {
2176 int score = scores_[off];
2177 int count = extract_score_freq(score);
2178 uint64 lmt = extract_score_lmt(score);
2179 if (count + delta_count > kUserDictMaxFrequency ||
2180 count + delta_count < count) {
2181 delta_count = kUserDictMaxFrequency - count;
2182 }
2183 count += delta_count;
2184 dict_info_.total_nfreq += delta_count;
2185 if (selected) {
2186 lmt = time(NULL);
2187 }
2188 scores_[off] = build_score(lmt, count);
2189 if (state_ < USER_DICT_SCORE_DIRTY)
2190 state_ = USER_DICT_SCORE_DIRTY;
2191 #ifdef ___DEBUG_PERF___
2192 DEBUG_PERF_END;
2193 LOGD_PERF("update_lemma");
2194 #endif
2195 #ifdef ___SYNC_ENABLED___
2196 queue_lemma_for_sync(ids_[off]);
2197 #endif
2198 return ids_[off];
2199 }
2200 return 0;
2201 }
2202
2203 size_t UserDict::get_total_lemma_count() {
2204 return dict_info_.total_nfreq;
2205 }
2206
2207 void UserDict::set_total_lemma_count_of_others(size_t count) {
2208 total_other_nfreq_ = count;
2209 }
2210
2211 LemmaIdType UserDict::append_a_lemma(char16 lemma_str[], uint16 splids[],
2212 uint16 lemma_len, uint16 count, uint64 lmt) {
2213 LemmaIdType id = get_max_lemma_id() + 1;
2214 size_t offset = dict_info_.lemma_size;
2215 if (offset > kUserDictOffsetMask)
2216 return 0;
2217
2218 lemmas_[offset] = 0;
2219 lemmas_[offset + 1] = (uint8)lemma_len;
2220 for (size_t i = 0; i < lemma_len; i++) {
2221 *((uint16*)&lemmas_[offset + 2 + (i << 1)]) = splids[i];
2222 *((char16*)&lemmas_[offset + 2 + (lemma_len << 1) + (i << 1)])
2223 = lemma_str[i];
2224 }
2225 uint32 off = dict_info_.lemma_count;
2226 offsets_[off] = offset;
2227 scores_[off] = build_score(lmt, count);
2228 ids_[off] = id;
2229 #ifdef ___PREDICT_ENABLED___
2230 predicts_[off] = offset;
2231 #endif
2232
2233 offsets_by_id_[id - start_id_] = offset;
2234
2235 dict_info_.lemma_count++;
2236 dict_info_.lemma_size += (2 + (lemma_len << 2));
2237 lemma_count_left_--;
2238 lemma_size_left_ -= (2 + (lemma_len << 2));
2239
2240 // Sort
2241
2242 UserDictSearchable searchable;
2243 prepare_locate(&searchable, splids, lemma_len);
2244
2245 size_t i = 0;
2246 while (i < off) {
2247 offset = offsets_[i];
2248 uint32 nchar = get_lemma_nchar(offset);
2249 uint16 * spl = get_lemma_spell_ids(offset);
2250
2251 if (0 <= fuzzy_compare_spell_id(spl, nchar, &searchable))
2252 break;
2253 i++;
2254 }
2255 if (i != off) {
2256 uint32 temp = offsets_[off];
2257 memmove(offsets_ + i + 1, offsets_ + i, (off - i) << 2);
2258 offsets_[i] = temp;
2259
2260 temp = scores_[off];
2261 memmove(scores_ + i + 1, scores_ + i, (off - i) << 2);
2262 scores_[i] = temp;
2263
2264 temp = ids_[off];
2265 memmove(ids_ + i + 1, ids_ + i, (off - i) << 2);
2266 ids_[i] = temp;
2267 }
2268
2269 #ifdef ___PREDICT_ENABLED___
2270 uint32 j = 0;
2271 uint16 * words_new = get_lemma_word(predicts_[off]);
2272 j = locate_where_to_insert_in_predicts(words_new, lemma_len);
2273 if (j != off) {
2274 uint32 temp = predicts_[off];
2275 memmove(predicts_ + j + 1, predicts_ + j, (off - j) << 2);
2276 predicts_[j] = temp;
2277 }
2278 #endif
2279
2280 if (state_ < USER_DICT_LEMMA_DIRTY)
2281 state_ = USER_DICT_LEMMA_DIRTY;
2282
2283 #ifdef ___CACHE_ENABLED___
2284 cache_init();
2285 #endif
2286
2287 dict_info_.total_nfreq += count;
2288 return id;
2289 }
2290 }
2291