xref: /OK3568_Linux_fs/app/forlinx/flapp/src/keyboard/pinyin/share/userdict.cpp (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1 /*
2  * Copyright (C) 2009 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "../include/userdict.h"
18 #include "../include/splparser.h"
19 #include "../include/ngram.h"
20 #include <stdio.h>
21 #include <string.h>
22 #include <stdlib.h>
23 #ifdef ___DEBUG_PERF___
24 #include <cutils/log.h>
25 #endif
26 #ifdef _WIN32
27 #include <io.h>
28 #else
29 #include <unistd.h>
30 #endif
31 #include <fcntl.h>
32 #include <sys/stat.h>
33 #include <assert.h>
34 #include <ctype.h>
35 #include <sys/types.h>
36 #ifndef _WIN32
37 #include <sys/time.h>
38 #endif
39 #include <time.h>
40 #ifdef _WIN32
41 #undef max
42 #undef min
43 #include <QDateTime>
44 #include <QMutex>
45 #else
46 #include <pthread.h>
47 #endif
48 #include <math.h>
49 
50 namespace ime_pinyin {
51 
52 #ifdef _WIN32
gettimeofday(struct timeval * tp,void *)53 static int gettimeofday(struct timeval *tp, void *) {
54     const qint64 current_msecs_since_epoch = QDateTime::currentMSecsSinceEpoch();
55     tp->tv_sec = (long)(current_msecs_since_epoch / 1000);
56     tp->tv_usec = (long)((current_msecs_since_epoch % 1000) * 1000);
57     return 0;
58 }
59 #endif
60 
61 #ifdef ___DEBUG_PERF___
62 static uint64 _ellapse_ = 0;
63 static struct timeval _tv_start_, _tv_end_;
64 #define DEBUG_PERF_BEGIN \
65     do { \
66       gettimeofday(&_tv_start_, NULL); \
67     } while (0)
68 #define DEBUG_PERF_END \
69     do { \
70       gettimeofday(&_tv_end_, NULL); \
71       _ellapse_ = (_tv_end_.tv_sec - _tv_start_.tv_sec) * 1000000 + \
72                   (_tv_end_.tv_usec - _tv_start_.tv_usec); \
73     } while (0)
74 #define LOGD_PERF(message) \
75     ALOGD("PERFORMANCE[%s] %llu usec.", message, _ellapse_);
76 #else
77 #define DEBUG_PERF_BEGIN
78 #define DEBUG_PERF_END
79 #define LOGD_PERF(message)
80 #endif
81 
82 // XXX File load and write are thread-safe by g_mutex_
83 #ifdef _WIN32
84 static QMutex g_mutex_;
85 #define pthread_mutex_lock(MUTEX) ((MUTEX)->lock())
86 #define pthread_mutex_unlock(MUTEX) ((MUTEX)->unlock())
87 #define pthread_mutex_trylock(MUTEX) (!(MUTEX)->tryLock(0))
88 #else
89 static pthread_mutex_t g_mutex_ = PTHREAD_MUTEX_INITIALIZER;
90 #endif
91 static struct timeval g_last_update_ = {0, 0};
92 
get_dict_file_size(UserDictInfo * info)93 inline uint32 UserDict::get_dict_file_size(UserDictInfo * info) {
94   return (4 + info->lemma_size + (info->lemma_count << 3)
95 #ifdef ___PREDICT_ENABLED___
96           + (info->lemma_count << 2)
97 #endif
98 #ifdef ___SYNC_ENABLED___
99           + (info->sync_count << 2)
100 #endif
101           + sizeof(*info));
102 }
103 
translate_score(int raw_score)104 inline LmaScoreType UserDict::translate_score(int raw_score) {
105   // 1) ori_freq: original user frequency
106   uint32 ori_freq = extract_score_freq(raw_score);
107   // 2) lmt_off: lmt index (week offset for example)
108   uint64 lmt_off = ((raw_score & 0xffff0000) >> 16);
109   if (kUserDictLMTBitWidth < 16) {
110     uint64 mask = ~(1 << kUserDictLMTBitWidth);
111     lmt_off &= mask;
112   }
113   // 3) now_off: current time index (current week offset for example)
114   // assuming load_time_ is around current time
115   uint64 now_off = load_time_.tv_sec;
116   now_off = (now_off - kUserDictLMTSince) / kUserDictLMTGranularity;
117   now_off = (now_off << (64 - kUserDictLMTBitWidth));
118   now_off = (now_off >> (64 - kUserDictLMTBitWidth));
119   // 4) factor: decide expand-factor
120   int delta = now_off - lmt_off;
121   if (delta > 4)
122     delta = 4;
123   int factor = 80 - (delta << 4);
124 
125   double tf = (double)(dict_info_.total_nfreq + total_other_nfreq_);
126   return (LmaScoreType)(log((double)factor * (double)ori_freq / tf)
127                         * NGram::kLogValueAmplifier);
128 }
129 
extract_score_freq(int raw_score)130 inline int UserDict::extract_score_freq(int raw_score) {
131   // Frequence stored in lowest 16 bits
132   int freq = (raw_score & 0x0000ffff);
133   return freq;
134 }
135 
extract_score_lmt(int raw_score)136 inline uint64 UserDict::extract_score_lmt(int raw_score) {
137   uint64 lmt = ((raw_score & 0xffff0000) >> 16);
138   if (kUserDictLMTBitWidth < 16) {
139     uint64 mask = ~(1 << kUserDictLMTBitWidth);
140     lmt &= mask;
141   }
142   lmt = lmt * kUserDictLMTGranularity + kUserDictLMTSince;
143   return lmt;
144 }
145 
build_score(uint64 lmt,int freq)146 inline int UserDict::build_score(uint64 lmt, int freq) {
147   lmt = (lmt - kUserDictLMTSince) / kUserDictLMTGranularity;
148   lmt = (lmt << (64 - kUserDictLMTBitWidth));
149   lmt = (lmt >> (64 - kUserDictLMTBitWidth));
150   uint16 lmt16 = (uint16)lmt;
151   int s = freq;
152   s &= 0x0000ffff;
153   s = (lmt16 << 16) | s;
154   return s;
155 }
156 
utf16le_atoll(uint16 * s,int len)157 inline int64 UserDict::utf16le_atoll(uint16 *s, int len) {
158   int64 ret = 0;
159   if (len <= 0)
160     return ret;
161 
162   int flag = 1;
163   const uint16 * endp = s + len;
164   if (*s == '-') {
165     flag = -1;
166     s++;
167   } else if (*s == '+') {
168     s++;
169   }
170 
171   while (*s >= '0' && *s <= '9' && s < endp) {
172     ret += ret * 10 + (*s) - '0';
173     s++;
174   }
175   return ret * flag;
176 }
177 
utf16le_lltoa(int64 v,uint16 * s,int size)178 inline int UserDict::utf16le_lltoa(int64 v, uint16 *s, int size) {
179   if (!s || size <= 0)
180     return 0;
181   uint16 *endp = s + size;
182   int ret_len = 0;
183   if (v < 0) {
184     *(s++) = '-';
185     ++ret_len;
186     v *= -1;
187   }
188 
189   uint16 *b = s;
190   while (s < endp && v != 0) {
191     *(s++) = '0' + (v % 10);
192     v = v / 10;
193     ++ret_len;
194   }
195 
196   if (v != 0)
197     return 0;
198 
199   --s;
200 
201   while (b < s) {
202     *b = *s;
203     ++b, --s;
204   }
205 
206   return ret_len;
207 }
208 
set_lemma_flag(uint32 offset,uint8 flag)209 inline void UserDict::set_lemma_flag(uint32 offset, uint8 flag) {
210   offset &= kUserDictOffsetMask;
211   lemmas_[offset] |= flag;
212 }
213 
get_lemma_flag(uint32 offset)214 inline char UserDict::get_lemma_flag(uint32 offset) {
215   offset &= kUserDictOffsetMask;
216   return (char)(lemmas_[offset]);
217 }
218 
get_lemma_nchar(uint32 offset)219 inline char UserDict::get_lemma_nchar(uint32 offset) {
220   offset &= kUserDictOffsetMask;
221   return (char)(lemmas_[offset + 1]);
222 }
223 
get_lemma_spell_ids(uint32 offset)224 inline uint16 * UserDict::get_lemma_spell_ids(uint32 offset) {
225   offset &= kUserDictOffsetMask;
226   return (uint16 *)(lemmas_ + offset + 2);
227 }
228 
get_lemma_word(uint32 offset)229 inline uint16 * UserDict::get_lemma_word(uint32 offset) {
230   offset &= kUserDictOffsetMask;
231   uint8 nchar = get_lemma_nchar(offset);
232   return (uint16 *)(lemmas_ + offset + 2 + (nchar << 1));
233 }
234 
get_max_lemma_id()235 inline LemmaIdType UserDict::get_max_lemma_id() {
236   // When a lemma is deleted, we don't not claim its id back for
237   // simplicity and performance
238   return start_id_ + dict_info_.lemma_count - 1;
239 }
240 
is_valid_lemma_id(LemmaIdType id)241 inline bool UserDict::is_valid_lemma_id(LemmaIdType id) {
242   if (id >= start_id_ && id <= get_max_lemma_id())
243     return true;
244   return false;
245 }
246 
is_valid_state()247 inline bool UserDict::is_valid_state() {
248   if (state_ == USER_DICT_NONE)
249     return false;
250   return true;
251 }
252 
UserDict()253 UserDict::UserDict()
254     : start_id_(0),
255       version_(0),
256       lemmas_(NULL),
257       offsets_(NULL),
258       scores_(NULL),
259       ids_(NULL),
260 #ifdef ___PREDICT_ENABLED___
261       predicts_(NULL),
262 #endif
263 #ifdef ___SYNC_ENABLED___
264       syncs_(NULL),
265       sync_count_size_(0),
266 #endif
267       offsets_by_id_(NULL),
268       lemma_count_left_(0),
269       lemma_size_left_(0),
270       dict_file_(NULL),
271       state_(USER_DICT_NONE) {
272   memset(&dict_info_, 0, sizeof(dict_info_));
273   memset(&load_time_, 0, sizeof(load_time_));
274 #ifdef ___CACHE_ENABLED___
275   cache_init();
276 #endif
277 }
278 
~UserDict()279 UserDict::~UserDict() {
280   close_dict();
281 }
282 
load_dict(const char * file_name,LemmaIdType start_id,LemmaIdType end_id)283 bool UserDict::load_dict(const char *file_name, LemmaIdType start_id,
284                          LemmaIdType end_id) {
285 #ifdef ___DEBUG_PERF___
286   DEBUG_PERF_BEGIN;
287 #endif
288   dict_file_ = strdup(file_name);
289   if (!dict_file_)
290     return false;
291 
292   start_id_ = start_id;
293 
294   if (false == validate(file_name) && false == reset(file_name)) {
295     goto error;
296   }
297   if (false == load(file_name, start_id)) {
298     goto error;
299   }
300 
301   state_ = USER_DICT_SYNC;
302 
303   gettimeofday(&load_time_, NULL);
304 
305 #ifdef ___DEBUG_PERF___
306   DEBUG_PERF_END;
307   LOGD_PERF("load_dict");
308 #endif
309   return true;
310  error:
311   free((void*)dict_file_);
312   dict_file_ = NULL;
313   start_id_ = 0;
314   return false;
315 }
316 
close_dict()317 bool UserDict::close_dict() {
318   if (state_ == USER_DICT_NONE)
319     return true;
320   if (state_ == USER_DICT_SYNC)
321     goto out;
322 
323   // If dictionary is written back by others,
324   // we can not simply write back here
325   // To do a safe flush, we have to discard all newly added
326   // lemmas and try to reload dict file.
327   pthread_mutex_lock(&g_mutex_);
328   if (load_time_.tv_sec > g_last_update_.tv_sec ||
329     (load_time_.tv_sec == g_last_update_.tv_sec &&
330      load_time_.tv_usec > g_last_update_.tv_usec)) {
331     write_back();
332     gettimeofday(&g_last_update_, NULL);
333   }
334   pthread_mutex_unlock(&g_mutex_);
335 
336  out:
337   free((void*)dict_file_);
338   free(lemmas_);
339   free(offsets_);
340   free(offsets_by_id_);
341   free(scores_);
342   free(ids_);
343 #ifdef ___PREDICT_ENABLED___
344   free(predicts_);
345 #endif
346 
347   version_ = 0;
348   dict_file_ = NULL;
349   lemmas_ = NULL;
350 #ifdef ___SYNC_ENABLED___
351   syncs_ = NULL;
352   sync_count_size_ = 0;
353 #endif
354   offsets_ = NULL;
355   offsets_by_id_ = NULL;
356   scores_ = NULL;
357   ids_ = NULL;
358 #ifdef ___PREDICT_ENABLED___
359   predicts_ = NULL;
360 #endif
361 
362   memset(&dict_info_, 0, sizeof(dict_info_));
363   lemma_count_left_ = 0;
364   lemma_size_left_ = 0;
365   state_ = USER_DICT_NONE;
366 
367   return true;
368 }
369 
number_of_lemmas()370 size_t UserDict::number_of_lemmas() {
371   return dict_info_.lemma_count;
372 }
373 
reset_milestones(uint16 from_step,MileStoneHandle from_handle)374 void UserDict::reset_milestones(uint16 from_step, MileStoneHandle from_handle) {
375   return;
376 }
377 
extend_dict(MileStoneHandle from_handle,const DictExtPara * dep,LmaPsbItem * lpi_items,size_t lpi_max,size_t * lpi_num)378 MileStoneHandle UserDict::extend_dict(MileStoneHandle from_handle,
379                                       const DictExtPara *dep,
380                                       LmaPsbItem *lpi_items,
381                                       size_t lpi_max, size_t *lpi_num) {
382   if (is_valid_state() == false)
383     return 0;
384 
385   bool need_extend = false;
386 
387 #ifdef ___DEBUG_PERF___
388   DEBUG_PERF_BEGIN;
389 #endif
390   *lpi_num = _get_lpis(dep->splids, dep->splids_extended + 1,
391                        lpi_items, lpi_max, &need_extend);
392 #ifdef ___DEBUG_PERF___
393   DEBUG_PERF_END;
394   LOGD_PERF("extend_dict");
395 #endif
396   return ((*lpi_num > 0 || need_extend) ? 1 : 0);
397 }
398 
is_fuzzy_prefix_spell_id(const uint16 * id1,uint16 len1,const UserDictSearchable * searchable)399 int UserDict::is_fuzzy_prefix_spell_id(
400     const uint16 * id1, uint16 len1, const UserDictSearchable *searchable) {
401   if (len1 < searchable->splids_len)
402     return 0;
403 
404   SpellingTrie &spl_trie = SpellingTrie::get_instance();
405   uint32 i = 0;
406   for (i = 0; i < searchable->splids_len; i++) {
407     const char py1 = *spl_trie.get_spelling_str(id1[i]);
408     uint16 off = 8 * (i % 4);
409     const char py2 = ((searchable->signature[i/4] & (0xff << off)) >> off);
410     if (py1 == py2)
411       continue;
412     return 0;
413   }
414   return 1;
415 }
416 
fuzzy_compare_spell_id(const uint16 * id1,uint16 len1,const UserDictSearchable * searchable)417 int UserDict::fuzzy_compare_spell_id(
418     const uint16 * id1, uint16 len1, const UserDictSearchable *searchable) {
419   if (len1 < searchable->splids_len)
420     return -1;
421   if (len1 > searchable->splids_len)
422     return 1;
423 
424   SpellingTrie &spl_trie = SpellingTrie::get_instance();
425   uint32 i = 0;
426   for (i = 0; i < len1; i++) {
427     const char py1 = *spl_trie.get_spelling_str(id1[i]);
428     uint16 off = 8 * (i % 4);
429     const char py2 = ((searchable->signature[i/4] & (0xff << off)) >> off);
430     if (py1 == py2)
431       continue;
432     if (py1 > py2)
433       return 1;
434     return -1;
435   }
436   return 0;
437 }
438 
is_prefix_spell_id(const uint16 * fullids,uint16 fulllen,const UserDictSearchable * searchable)439 bool UserDict::is_prefix_spell_id(
440     const uint16 * fullids, uint16 fulllen,
441     const UserDictSearchable *searchable) {
442   if (fulllen < searchable->splids_len)
443     return false;
444 
445   uint32 i = 0;
446   for (; i < searchable->splids_len; i++) {
447     uint16 start_id = searchable->splid_start[i];
448     uint16 count = searchable->splid_count[i];
449     if (fullids[i] >= start_id && fullids[i] < start_id + count)
450       continue;
451     else
452       return false;
453   }
454   return true;
455 }
456 
equal_spell_id(const uint16 * fullids,uint16 fulllen,const UserDictSearchable * searchable)457 bool UserDict::equal_spell_id(
458     const uint16 * fullids, uint16 fulllen,
459     const UserDictSearchable *searchable) {
460   if (fulllen != searchable->splids_len)
461     return false;
462 
463   uint32 i = 0;
464   for (; i < fulllen; i++) {
465     uint16 start_id = searchable->splid_start[i];
466     uint16 count = searchable->splid_count[i];
467     if (fullids[i] >= start_id && fullids[i] < start_id + count)
468       continue;
469     else
470       return false;
471   }
472   return true;
473 }
474 
locate_first_in_offsets(const UserDictSearchable * searchable)475 int32 UserDict::locate_first_in_offsets(const UserDictSearchable * searchable) {
476   int32 begin = 0;
477   int32 end = dict_info_.lemma_count - 1;
478   int32 middle = -1;
479 
480   int32 first_prefix = middle;
481   int32 last_matched = middle;
482 
483   while (begin <= end) {
484     middle = (begin + end) >> 1;
485     uint32 offset = offsets_[middle];
486     uint8 nchar = get_lemma_nchar(offset);
487     const uint16 * splids = get_lemma_spell_ids(offset);
488     int cmp = fuzzy_compare_spell_id(splids, nchar, searchable);
489     int pre = is_fuzzy_prefix_spell_id(splids, nchar, searchable);
490 
491     if (pre)
492       first_prefix = middle;
493 
494     if (cmp < 0) {
495       begin = middle + 1;
496     } else if (cmp > 0) {
497       end = middle - 1;
498     } else {
499       end = middle - 1;
500       last_matched = middle;
501     }
502   }
503 
504   return first_prefix;
505 }
506 
prepare_locate(UserDictSearchable * searchable,const uint16 * splid_str,uint16 splid_str_len)507 void UserDict::prepare_locate(UserDictSearchable *searchable,
508                              const uint16 *splid_str,
509                              uint16 splid_str_len) {
510   searchable->splids_len = splid_str_len;
511   memset(searchable->signature, 0, sizeof(searchable->signature));
512 
513   SpellingTrie &spl_trie = SpellingTrie::get_instance();
514   uint32 i = 0;
515   for (; i < splid_str_len; i++) {
516     if (spl_trie.is_half_id(splid_str[i])) {
517       searchable->splid_count[i] =
518           spl_trie.half_to_full(splid_str[i],
519                                 &(searchable->splid_start[i]));
520     } else {
521       searchable->splid_count[i] = 1;
522       searchable->splid_start[i] = splid_str[i];
523     }
524     const unsigned char py = *spl_trie.get_spelling_str(splid_str[i]);
525     searchable->signature[i>>2] |= (py << (8 * (i % 4)));
526   }
527 }
528 
get_lpis(const uint16 * splid_str,uint16 splid_str_len,LmaPsbItem * lpi_items,size_t lpi_max)529 size_t UserDict::get_lpis(const uint16 *splid_str, uint16 splid_str_len,
530                           LmaPsbItem *lpi_items, size_t lpi_max) {
531   return _get_lpis(splid_str, splid_str_len, lpi_items, lpi_max, NULL);
532 }
533 
_get_lpis(const uint16 * splid_str,uint16 splid_str_len,LmaPsbItem * lpi_items,size_t lpi_max,bool * need_extend)534 size_t UserDict::_get_lpis(const uint16 *splid_str,
535                            uint16 splid_str_len, LmaPsbItem *lpi_items,
536                            size_t lpi_max, bool * need_extend) {
537   bool tmp_extend;
538   if (!need_extend)
539     need_extend = &tmp_extend;
540 
541   *need_extend = false;
542 
543   if (is_valid_state() == false)
544     return 0;
545   if (lpi_max <= 0)
546     return 0;
547 
548   if (0 == pthread_mutex_trylock(&g_mutex_)) {
549     if (load_time_.tv_sec < g_last_update_.tv_sec ||
550       (load_time_.tv_sec == g_last_update_.tv_sec &&
551        load_time_.tv_usec < g_last_update_.tv_usec)) {
552       // Others updated disk file, have to reload
553       pthread_mutex_unlock(&g_mutex_);
554       flush_cache();
555     } else {
556       pthread_mutex_unlock(&g_mutex_);
557     }
558   } else {
559   }
560 
561   UserDictSearchable searchable;
562   prepare_locate(&searchable, splid_str, splid_str_len);
563 
564   uint32 max_off = dict_info_.lemma_count;
565 #ifdef ___CACHE_ENABLED___
566   int32 middle;
567   uint32 start, count;
568   bool cached = cache_hit(&searchable, &start, &count);
569   if (cached) {
570     middle = start;
571     max_off = start + count;
572   } else {
573     middle = locate_first_in_offsets(&searchable);
574     start = middle;
575   }
576 #else
577   int32 middle = locate_first_in_offsets(&searchable);
578 #endif
579 
580   if (middle == -1) {
581 #ifdef ___CACHE_ENABLED___
582     if (!cached)
583       cache_push(USER_DICT_MISS_CACHE, &searchable, 0, 0);
584 #endif
585     return 0;
586   }
587 
588   size_t lpi_current = 0;
589 
590   bool fuzzy_break = false;
591   bool prefix_break = false;
592   while ((size_t)middle < max_off && !fuzzy_break && !prefix_break) {
593     if (lpi_current >= lpi_max)
594       break;
595     uint32 offset = offsets_[middle];
596     // Ignore deleted lemmas
597     if (offset & kUserDictOffsetFlagRemove) {
598       middle++;
599       continue;
600     }
601     uint8 nchar = get_lemma_nchar(offset);
602     uint16 * splids = get_lemma_spell_ids(offset);
603 #ifdef ___CACHE_ENABLED___
604     if (!cached && 0 != fuzzy_compare_spell_id(splids, nchar, &searchable)) {
605 #else
606     if (0 != fuzzy_compare_spell_id(splids, nchar, &searchable)) {
607 #endif
608       fuzzy_break = true;
609     }
610 
611     if (prefix_break == false) {
612       if (is_fuzzy_prefix_spell_id(splids, nchar, &searchable)) {
613         if (*need_extend == false &&
614             is_prefix_spell_id(splids, nchar, &searchable)) {
615           *need_extend = true;
616         }
617       } else {
618         prefix_break = true;
619       }
620     }
621 
622     if (equal_spell_id(splids, nchar, &searchable) == true) {
623       lpi_items[lpi_current].psb = translate_score(scores_[middle]);
624       lpi_items[lpi_current].id = ids_[middle];
625       lpi_items[lpi_current].lma_len = nchar;
626       lpi_current++;
627     }
628     middle++;
629   }
630 
631 #ifdef ___CACHE_ENABLED___
632   if (!cached) {
633     count = middle - start;
634     cache_push(USER_DICT_CACHE, &searchable, start, count);
635   }
636 #endif
637 
638   return lpi_current;
639 }
640 
641 uint16 UserDict::get_lemma_str(LemmaIdType id_lemma, char16* str_buf,
642                                uint16 str_max) {
643   if (is_valid_state() == false)
644     return 0;
645   if (is_valid_lemma_id(id_lemma) == false)
646     return 0;
647   uint32 offset = offsets_by_id_[id_lemma - start_id_];
648   uint8 nchar = get_lemma_nchar(offset);
649   char16 * str = get_lemma_word(offset);
650   uint16 m = nchar < str_max -1 ? nchar : str_max - 1;
651   int i = 0;
652   for (; i < m; i++) {
653     str_buf[i] = str[i];
654   }
655   str_buf[i] = 0;
656   return m;
657 }
658 
659 uint16 UserDict::get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
660                                   uint16 splids_max, bool arg_valid) {
661   if (is_valid_lemma_id(id_lemma) == false)
662     return 0;
663   uint32 offset = offsets_by_id_[id_lemma - start_id_];
664   uint8 nchar = get_lemma_nchar(offset);
665   const uint16 * ids = get_lemma_spell_ids(offset);
666   int i = 0;
667   for (; i < nchar && i < splids_max; i++)
668     splids[i] = ids[i];
669   return i;
670 }
671 
672 size_t UserDict::predict(const char16 last_hzs[], uint16 hzs_len,
673                          NPredictItem *npre_items, size_t npre_max,
674                          size_t b4_used) {
675   uint32 new_added = 0;
676 #ifdef ___PREDICT_ENABLED___
677   int32 end = dict_info_.lemma_count - 1;
678   int j = locate_first_in_predicts((const uint16*)last_hzs, hzs_len);
679   if (j == -1)
680     return 0;
681 
682   while (j <= end) {
683     uint32 offset = predicts_[j];
684     // Ignore deleted lemmas
685     if (offset & kUserDictOffsetFlagRemove) {
686       j++;
687       continue;
688     }
689     uint32 nchar = get_lemma_nchar(offset);
690     uint16 * words = get_lemma_word(offset);
691     uint16 * splids = get_lemma_spell_ids(offset);
692 
693     if (nchar <= hzs_len) {
694       j++;
695       continue;
696     }
697 
698     if (memcmp(words, last_hzs, hzs_len << 1) == 0) {
699       if (new_added >= npre_max) {
700         return new_added;
701       }
702       uint32 cpy_len =
703           (nchar < kMaxPredictSize ? (nchar << 1) : (kMaxPredictSize << 1))
704           - (hzs_len << 1);
705       npre_items[new_added].his_len = hzs_len;
706       npre_items[new_added].psb = get_lemma_score(words, splids, nchar);
707       memcpy(npre_items[new_added].pre_hzs, words + hzs_len, cpy_len);
708       if ((cpy_len >> 1) < kMaxPredictSize) {
709         npre_items[new_added].pre_hzs[cpy_len >> 1] = 0;
710       }
711       new_added++;
712     } else {
713       break;
714     }
715 
716     j++;
717   }
718 #endif
719   return new_added;
720 }
721 
722 int32 UserDict::locate_in_offsets(char16 lemma_str[], uint16 splid_str[],
723                                   uint16 lemma_len) {
724   int32 max_off = dict_info_.lemma_count;
725 
726   UserDictSearchable searchable;
727   prepare_locate(&searchable, splid_str, lemma_len);
728 #ifdef ___CACHE_ENABLED___
729   int32 off;
730   uint32 start, count;
731   bool cached = load_cache(&searchable, &start, &count);
732   if (cached) {
733     off = start;
734     max_off = start + count;
735   } else {
736     off = locate_first_in_offsets(&searchable);
737     start = off;
738   }
739 #else
740   int32 off = locate_first_in_offsets(&searchable);
741 #endif
742 
743   if (off == -1) {
744     return off;
745   }
746 
747   while (off < max_off) {
748     uint32 offset = offsets_[off];
749     if (offset & kUserDictOffsetFlagRemove) {
750       off++;
751       continue;
752     }
753     uint16 * splids = get_lemma_spell_ids(offset);
754 #ifdef ___CACHE_ENABLED___
755     if (!cached && 0 != fuzzy_compare_spell_id(splids, lemma_len, &searchable))
756       break;
757 #else
758     if (0 != fuzzy_compare_spell_id(splids, lemma_len, &searchable))
759       break;
760 #endif
761     if (equal_spell_id(splids, lemma_len, &searchable) == true) {
762       uint16 * str = get_lemma_word(offset);
763       uint32 i = 0;
764       for (i = 0; i < lemma_len; i++) {
765         if (str[i] == lemma_str[i])
766           continue;
767         break;
768       }
769       if (i < lemma_len) {
770         off++;
771         continue;
772       }
773 #ifdef ___CACHE_ENABLED___
774       // No need to save_cache here, since current function is invoked by
775       // put_lemma. It's rarely possible for a user input same lemma twice.
776       // That means first time user type a new lemma, it is newly added into
777       // user dictionary, then it's possible that user type the same lemma
778       // again.
779       // Another reason save_cache can not be invoked here is this function
780       // aborts when lemma is found, and it never knows the count.
781 #endif
782       return off;
783     }
784     off++;
785   }
786 
787   return -1;
788 }
789 
790 #ifdef ___PREDICT_ENABLED___
791 uint32 UserDict::locate_where_to_insert_in_predicts(
792     const uint16 * words, int lemma_len) {
793   int32 begin = 0;
794   int32 end = dict_info_.lemma_count - 1;
795   int32 middle = end;
796 
797   uint32 last_matched = middle;
798 
799   while (begin <= end) {
800     middle = (begin + end) >> 1;
801     uint32 offset = offsets_[middle];
802     uint8 nchar = get_lemma_nchar(offset);
803     const uint16 * ws = get_lemma_word(offset);
804 
805     uint32 minl = nchar < lemma_len ? nchar : lemma_len;
806     uint32 k = 0;
807     int cmp = 0;
808 
809     for (; k < minl; k++) {
810       if (ws[k] < words[k]) {
811         cmp = -1;
812         break;
813       } else if (ws[k] > words[k]) {
814         cmp = 1;
815         break;
816       }
817     }
818     if (cmp == 0) {
819       if (nchar < lemma_len)
820         cmp = -1;
821       else if (nchar > lemma_len)
822         cmp = 1;
823     }
824 
825     if (cmp < 0) {
826       begin = middle + 1;
827       last_matched = middle;
828     } else if (cmp > 0) {
829       end = middle - 1;
830     } else {
831       end = middle - 1;
832       last_matched = middle;
833     }
834   }
835 
836   return last_matched;
837 }
838 
839 int32 UserDict::locate_first_in_predicts(const uint16 * words, int lemma_len) {
840   int32 begin = 0;
841   int32 end = dict_info_.lemma_count - 1;
842   int32 middle = -1;
843 
844   int32 last_matched = middle;
845 
846   while (begin <= end) {
847     middle = (begin + end) >> 1;
848     uint32 offset = offsets_[middle];
849     uint8 nchar = get_lemma_nchar(offset);
850     const uint16 * ws = get_lemma_word(offset);
851 
852     uint32 minl = nchar < lemma_len ? nchar : lemma_len;
853     uint32 k = 0;
854     int cmp = 0;
855 
856     for (; k < minl; k++) {
857       if (ws[k] < words[k]) {
858         cmp = -1;
859         break;
860       } else if (ws[k] > words[k]) {
861         cmp = 1;
862         break;
863       }
864     }
865     if (cmp == 0) {
866       if (nchar >= lemma_len)
867         last_matched = middle;
868       if (nchar < lemma_len)
869         cmp = -1;
870       else if (nchar > lemma_len)
871         cmp = 1;
872     }
873 
874     if (cmp < 0) {
875       begin = middle + 1;
876     } else if (cmp > 0) {
877       end = middle - 1;
878     } else {
879       end = middle - 1;
880     }
881   }
882 
883   return last_matched;
884 }
885 
886 #endif
887 
888 LemmaIdType UserDict::get_lemma_id(char16 lemma_str[], uint16 splids[],
889                                    uint16 lemma_len) {
890   int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
891   if (off == -1) {
892     return 0;
893   }
894 
895   return ids_[off];
896 }
897 
898 LmaScoreType UserDict::get_lemma_score(LemmaIdType lemma_id) {
899   if (is_valid_state() == false)
900     return 0;
901   if (is_valid_lemma_id(lemma_id) == false)
902     return 0;
903 
904   return translate_score(_get_lemma_score(lemma_id));
905 }
906 
907 LmaScoreType UserDict::get_lemma_score(char16 lemma_str[], uint16 splids[],
908                                 uint16 lemma_len) {
909   if (is_valid_state() == false)
910     return 0;
911   return translate_score(_get_lemma_score(lemma_str, splids, lemma_len));
912 }
913 
914 int UserDict::_get_lemma_score(LemmaIdType lemma_id) {
915   if (is_valid_state() == false)
916     return 0;
917   if (is_valid_lemma_id(lemma_id) == false)
918     return 0;
919 
920   uint32 offset = offsets_by_id_[lemma_id - start_id_];
921 
922   uint32 nchar = get_lemma_nchar(offset);
923   uint16 * spl = get_lemma_spell_ids(offset);
924   uint16 * wrd = get_lemma_word(offset);
925 
926   int32 off = locate_in_offsets(wrd, spl, nchar);
927   if (off == -1) {
928     return 0;
929   }
930 
931   return scores_[off];
932 }
933 
934 int UserDict::_get_lemma_score(char16 lemma_str[], uint16 splids[],
935                                 uint16 lemma_len) {
936   if (is_valid_state() == false)
937     return 0;
938 
939   int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
940   if (off == -1) {
941     return 0;
942   }
943 
944   return scores_[off];
945 }
946 
947 #ifdef ___SYNC_ENABLED___
948 void UserDict::remove_lemma_from_sync_list(uint32 offset) {
949   offset &= kUserDictOffsetMask;
950   uint32 i = 0;
951   for (; i < dict_info_.sync_count; i++) {
952     unsigned int off = (syncs_[i] & kUserDictOffsetMask);
953     if (off == offset)
954       break;
955   }
956   if (i < dict_info_.sync_count) {
957     syncs_[i] = syncs_[dict_info_.sync_count - 1];
958     dict_info_.sync_count--;
959   }
960 }
961 #endif
962 
963 #ifdef ___PREDICT_ENABLED___
964 void UserDict::remove_lemma_from_predict_list(uint32 offset) {
965   offset &= kUserDictOffsetMask;
966   uint32 i = 0;
967   for (; i < dict_info_.lemma_count; i++) {
968     unsigned int off = (predicts_[i] & kUserDictOffsetMask);
969     if (off == offset) {
970       predicts_[i] |= kUserDictOffsetFlagRemove;
971       break;
972     }
973   }
974 }
975 #endif
976 
977 bool UserDict::remove_lemma_by_offset_index(int offset_index) {
978   if (is_valid_state() == false)
979     return 0;
980 
981   int32 off = offset_index;
982   if (off == -1) {
983     return false;
984   }
985 
986   uint32 offset = offsets_[off];
987   uint32 nchar = get_lemma_nchar(offset);
988 
989   offsets_[off] |= kUserDictOffsetFlagRemove;
990 
991 #ifdef ___SYNC_ENABLED___
992   // Remove corresponding sync item
993   remove_lemma_from_sync_list(offset);
994 #endif
995 
996 #ifdef ___PREDICT_ENABLED___
997   remove_lemma_from_predict_list(offset);
998 #endif
999   dict_info_.free_count++;
1000   dict_info_.free_size += (2 + (nchar << 2));
1001 
1002   if (state_ < USER_DICT_OFFSET_DIRTY)
1003     state_ = USER_DICT_OFFSET_DIRTY;
1004   return true;
1005 }
1006 
1007 bool UserDict::remove_lemma(LemmaIdType lemma_id) {
1008   if (is_valid_state() == false)
1009     return 0;
1010   if (is_valid_lemma_id(lemma_id) == false)
1011     return false;
1012   uint32 offset = offsets_by_id_[lemma_id - start_id_];
1013 
1014   uint32 nchar = get_lemma_nchar(offset);
1015   uint16 * spl = get_lemma_spell_ids(offset);
1016   uint16 * wrd = get_lemma_word(offset);
1017 
1018   int32 off = locate_in_offsets(wrd, spl, nchar);
1019 
1020   return remove_lemma_by_offset_index(off);
1021 }
1022 
1023 void UserDict::flush_cache() {
1024   LemmaIdType start_id = start_id_;
1025   if (!dict_file_)
1026     return;
1027   const char * file = strdup(dict_file_);
1028   if (!file)
1029     return;
1030   close_dict();
1031   load_dict(file, start_id, kUserDictIdEnd);
1032   free((void*)file);
1033 #ifdef ___CACHE_ENABLED___
1034   cache_init();
1035 #endif
1036   return;
1037 }
1038 
1039 bool UserDict::reset(const char *file) {
1040   FILE *fp = fopen(file, "w+");
1041   if (!fp) {
1042     return false;
1043   }
1044   uint32 version = kUserDictVersion;
1045   size_t wred = fwrite(&version, 1, 4, fp);
1046   UserDictInfo info;
1047   memset(&info, 0, sizeof(info));
1048   // By default, no limitation for lemma count and size
1049   // thereby, reclaim_ratio is never used
1050   wred += fwrite(&info, 1, sizeof(info), fp);
1051   if (wred != sizeof(info) + sizeof(version)) {
1052     fclose(fp);
1053     unlink(file);
1054     return false;
1055   }
1056   fclose(fp);
1057   return true;
1058 }
1059 
1060 bool UserDict::validate(const char *file) {
1061   // b is ignored in POSIX compatible os including Linux
1062   // while b is important flag for Windows to specify binary mode
1063   FILE *fp = fopen(file, "rb");
1064   if (!fp) {
1065     return false;
1066   }
1067 
1068   size_t size;
1069   size_t readed;
1070   uint32 version;
1071   UserDictInfo dict_info;
1072 
1073   // validate
1074   int err = fseek(fp, 0, SEEK_END);
1075   if (err) {
1076     goto error;
1077   }
1078 
1079   size = ftell(fp);
1080   if (size < 4 + sizeof(dict_info)) {
1081     goto error;
1082   }
1083 
1084   err = fseek(fp, 0, SEEK_SET);
1085   if (err) {
1086     goto error;
1087   }
1088 
1089   readed = fread(&version, 1, sizeof(version), fp);
1090   if (readed < sizeof(version)) {
1091     goto error;
1092   }
1093   if (version != kUserDictVersion) {
1094     goto error;
1095   }
1096 
1097   err = fseek(fp, -1 * sizeof(dict_info), SEEK_END);
1098   if (err) {
1099     goto error;
1100   }
1101 
1102   readed = fread(&dict_info, 1, sizeof(dict_info), fp);
1103   if (readed != sizeof(dict_info)) {
1104     goto error;
1105   }
1106 
1107   if (size != get_dict_file_size(&dict_info)) {
1108     goto error;
1109   }
1110 
1111   fclose(fp);
1112   return true;
1113 
1114  error:
1115   fclose(fp);
1116   return false;
1117 }
1118 
1119 bool UserDict::load(const char *file, LemmaIdType start_id) {
1120   if (0 != pthread_mutex_trylock(&g_mutex_)) {
1121     return false;
1122   }
1123   // b is ignored in POSIX compatible os including Linux
1124   // while b is important flag for Windows to specify binary mode
1125   FILE *fp = fopen(file, "rb");
1126   if (!fp) {
1127     pthread_mutex_unlock(&g_mutex_);
1128     return false;
1129   }
1130 
1131   size_t readed, toread;
1132   UserDictInfo dict_info;
1133   uint8 *lemmas = NULL;
1134   uint32 *offsets = NULL;
1135 #ifdef ___SYNC_ENABLED___
1136   uint32 *syncs = NULL;
1137 #endif
1138   uint32 *scores = NULL;
1139   uint32 *ids = NULL;
1140   uint32 *offsets_by_id = NULL;
1141 #ifdef ___PREDICT_ENABLED___
1142   uint32 *predicts = NULL;
1143 #endif
1144   size_t i;
1145   int err;
1146 
1147   err = fseek(fp, -1 * sizeof(dict_info), SEEK_END);
1148   if (err) goto error;
1149 
1150   readed = fread(&dict_info, 1, sizeof(dict_info), fp);
1151   if (readed != sizeof(dict_info)) goto error;
1152 
1153   lemmas = (uint8 *)malloc(
1154       dict_info.lemma_size +
1155       (kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2))));
1156 
1157   if (!lemmas) goto error;
1158 
1159   offsets = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
1160   if (!offsets) goto error;
1161 
1162 #ifdef ___PREDICT_ENABLED___
1163   predicts = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
1164   if (!predicts) goto error;
1165 #endif
1166 
1167 #ifdef ___SYNC_ENABLED___
1168   syncs = (uint32 *)malloc((dict_info.sync_count + kUserDictPreAlloc) << 2);
1169   if (!syncs) goto error;
1170 #endif
1171 
1172   scores = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
1173   if (!scores) goto error;
1174 
1175   ids = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
1176   if (!ids) goto error;
1177 
1178   offsets_by_id = (uint32 *)malloc(
1179       (dict_info.lemma_count + kUserDictPreAlloc) << 2);
1180   if (!offsets_by_id) goto error;
1181 
1182   err = fseek(fp, 4, SEEK_SET);
1183   if (err) goto error;
1184 
1185   readed = 0;
1186   while (readed < dict_info.lemma_size && !ferror(fp) && !feof(fp)) {
1187     readed += fread(lemmas + readed, 1, dict_info.lemma_size - readed, fp);
1188   }
1189   if (readed < dict_info.lemma_size)
1190     goto error;
1191 
1192   toread = (dict_info.lemma_count << 2);
1193   readed = 0;
1194   while (readed < toread && !ferror(fp) && !feof(fp)) {
1195     readed += fread((((uint8*)offsets) + readed), 1, toread - readed, fp);
1196   }
1197   if (readed < toread)
1198     goto error;
1199 
1200 #ifdef ___PREDICT_ENABLED___
1201   toread = (dict_info.lemma_count << 2);
1202   readed = 0;
1203   while (readed < toread && !ferror(fp) && !feof(fp)) {
1204     readed += fread((((uint8*)predicts) + readed), 1, toread - readed, fp);
1205   }
1206   if (readed < toread)
1207     goto error;
1208 #endif
1209 
1210   readed = 0;
1211   while (readed < toread && !ferror(fp) && !feof(fp)) {
1212     readed += fread((((uint8*)scores) + readed), 1, toread - readed, fp);
1213   }
1214   if (readed < toread)
1215     goto error;
1216 
1217 #ifdef ___SYNC_ENABLED___
1218   toread = (dict_info.sync_count << 2);
1219   readed = 0;
1220   while (readed < toread && !ferror(fp) && !feof(fp)) {
1221     readed += fread((((uint8*)syncs) + readed), 1, toread - readed, fp);
1222   }
1223   if (readed < toread)
1224     goto error;
1225 #endif
1226 
1227   for (i = 0; i < dict_info.lemma_count; i++) {
1228     ids[i] = start_id + i;
1229     offsets_by_id[i] = offsets[i];
1230   }
1231 
1232   lemmas_ = lemmas;
1233   offsets_ = offsets;
1234 #ifdef ___SYNC_ENABLED___
1235   syncs_ = syncs;
1236   sync_count_size_ = dict_info.sync_count + kUserDictPreAlloc;
1237 #endif
1238   offsets_by_id_ = offsets_by_id;
1239   scores_ = scores;
1240   ids_ = ids;
1241 #ifdef ___PREDICT_ENABLED___
1242   predicts_ = predicts;
1243 #endif
1244   lemma_count_left_ = kUserDictPreAlloc;
1245   lemma_size_left_ = kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2));
1246   memcpy(&dict_info_, &dict_info, sizeof(dict_info));
1247   state_ = USER_DICT_SYNC;
1248 
1249   fclose(fp);
1250 
1251   pthread_mutex_unlock(&g_mutex_);
1252   return true;
1253 
1254  error:
1255   if (lemmas) free(lemmas);
1256   if (offsets) free(offsets);
1257 #ifdef ___SYNC_ENABLED___
1258   if (syncs) free(syncs);
1259 #endif
1260   if (scores) free(scores);
1261   if (ids) free(ids);
1262   if (offsets_by_id) free(offsets_by_id);
1263 #ifdef ___PREDICT_ENABLED___
1264   if (predicts) free(predicts);
1265 #endif
1266   fclose(fp);
1267   pthread_mutex_unlock(&g_mutex_);
1268   return false;
1269 }
1270 
1271 void UserDict::write_back() {
1272   // XXX write back is only allowed from close_dict due to thread-safe sake
1273   if (state_ == USER_DICT_NONE || state_ == USER_DICT_SYNC)
1274     return;
1275   int fd = open(dict_file_, O_WRONLY);
1276   if (fd == -1)
1277     return;
1278   switch (state_) {
1279     case USER_DICT_DEFRAGMENTED:
1280       write_back_all(fd);
1281       break;
1282     case USER_DICT_LEMMA_DIRTY:
1283       write_back_lemma(fd);
1284       break;
1285     case USER_DICT_OFFSET_DIRTY:
1286       write_back_offset(fd);
1287       break;
1288     case USER_DICT_SCORE_DIRTY:
1289       write_back_score(fd);
1290       break;
1291 #ifdef ___SYNC_ENABLED___
1292     case USER_DICT_SYNC_DIRTY:
1293       write_back_sync(fd);
1294       break;
1295 #endif
1296     default:
1297       break;
1298   }
1299   // It seems truncate is not need on Linux, Windows except Mac
1300   // I am doing it here anyway for safety.
1301   off_t cur = lseek(fd, 0, SEEK_CUR);
1302 #ifndef _WIN32
1303   ftruncate(fd, cur);
1304 #endif
1305   close(fd);
1306   state_ = USER_DICT_SYNC;
1307 }
1308 
1309 #ifdef ___SYNC_ENABLED___
1310 void UserDict::write_back_sync(int fd) {
1311   int err = lseek(fd, 4 + dict_info_.lemma_size
1312                   + (dict_info_.lemma_count << 3)
1313 #ifdef ___PREDICT_ENABLED___
1314                   + (dict_info_.lemma_count << 2)
1315 #endif
1316                   , SEEK_SET);
1317   if (err == -1)
1318     return;
1319   write(fd, syncs_, dict_info_.sync_count << 2);
1320   write(fd, &dict_info_, sizeof(dict_info_));
1321 }
1322 #endif
1323 
1324 void UserDict::write_back_offset(int fd) {
1325   int err = lseek(fd, 4 + dict_info_.lemma_size, SEEK_SET);
1326   if (err == -1)
1327     return;
1328   write(fd, offsets_, dict_info_.lemma_count << 2);
1329 #ifdef ___PREDICT_ENABLED___
1330   write(fd, predicts_, dict_info_.lemma_count << 2);
1331 #endif
1332   write(fd, scores_, dict_info_.lemma_count << 2);
1333 #ifdef ___SYNC_ENABLED___
1334   write(fd, syncs_, dict_info_.sync_count << 2);
1335 #endif
1336   write(fd, &dict_info_, sizeof(dict_info_));
1337 }
1338 
1339 void UserDict::write_back_score(int fd) {
1340   int err = lseek(fd, 4 + dict_info_.lemma_size
1341                   + (dict_info_.lemma_count << 2)
1342 #ifdef ___PREDICT_ENABLED___
1343                   + (dict_info_.lemma_count << 2)
1344 #endif
1345                   , SEEK_SET);
1346   if (err == -1)
1347     return;
1348   write(fd, scores_, dict_info_.lemma_count << 2);
1349 #ifdef ___SYNC_ENABLED___
1350   write(fd, syncs_, dict_info_.sync_count << 2);
1351 #endif
1352   write(fd, &dict_info_, sizeof(dict_info_));
1353 }
1354 
1355 void UserDict::write_back_lemma(int fd) {
1356   int err = lseek(fd, 4, SEEK_SET);
1357   if (err == -1)
1358     return;
1359   // New lemmas are always appended, no need to write whole lemma block
1360   size_t need_write = kUserDictPreAlloc *
1361       (2 + (kUserDictAverageNchar << 2)) - lemma_size_left_;
1362   err = lseek(fd, dict_info_.lemma_size - need_write, SEEK_CUR);
1363   if (err == -1)
1364     return;
1365   write(fd, lemmas_ + dict_info_.lemma_size - need_write, need_write);
1366 
1367   write(fd, offsets_,  dict_info_.lemma_count << 2);
1368 #ifdef ___PREDICT_ENABLED___
1369   write(fd, predicts_,  dict_info_.lemma_count << 2);
1370 #endif
1371   write(fd, scores_, dict_info_.lemma_count << 2);
1372 #ifdef ___SYNC_ENABLED___
1373   write(fd, syncs_, dict_info_.sync_count << 2);
1374 #endif
1375   write(fd, &dict_info_, sizeof(dict_info_));
1376 }
1377 
1378 void UserDict::write_back_all(int fd) {
1379   // XXX lemma_size is handled differently in writeall
1380   // and writelemma. I update lemma_size and lemma_count in different
1381   // places for these two cases. Should fix it to make it consistent.
1382   int err = lseek(fd, 4, SEEK_SET);
1383   if (err == -1)
1384     return;
1385   write(fd, lemmas_, dict_info_.lemma_size);
1386   write(fd, offsets_, dict_info_.lemma_count << 2);
1387 #ifdef ___PREDICT_ENABLED___
1388   write(fd, predicts_, dict_info_.lemma_count << 2);
1389 #endif
1390   write(fd, scores_, dict_info_.lemma_count << 2);
1391 #ifdef ___SYNC_ENABLED___
1392   write(fd, syncs_, dict_info_.sync_count << 2);
1393 #endif
1394   write(fd, &dict_info_, sizeof(dict_info_));
1395 }
1396 
1397 #ifdef ___CACHE_ENABLED___
1398 bool UserDict::load_cache(UserDictSearchable *searchable,
1399                           uint32 *offset, uint32 *length) {
1400   UserDictCache *cache = &caches_[searchable->splids_len - 1];
1401   if (cache->head == cache->tail)
1402     return false;
1403 
1404   uint16 j, sig_len = kMaxLemmaSize / 4;
1405   uint16 i = cache->head;
1406   while (1) {
1407     j = 0;
1408     for (; j < sig_len; j++) {
1409       if (cache->signatures[i][j] != searchable->signature[j])
1410         break;
1411     }
1412     if (j < sig_len) {
1413       i++;
1414       if (i >= kUserDictCacheSize)
1415         i -= kUserDictCacheSize;
1416       if (i == cache->tail)
1417         break;
1418       continue;
1419     }
1420     *offset = cache->offsets[i];
1421     *length = cache->lengths[i];
1422     return true;
1423   }
1424   return false;
1425 }
1426 
1427 void UserDict::save_cache(UserDictSearchable *searchable,
1428                           uint32 offset, uint32 length) {
1429   UserDictCache *cache = &caches_[searchable->splids_len - 1];
1430   uint16 next = cache->tail;
1431 
1432   cache->offsets[next] = offset;
1433   cache->lengths[next] = length;
1434   uint16 sig_len = kMaxLemmaSize / 4;
1435   uint16 j = 0;
1436   for (; j < sig_len; j++) {
1437     cache->signatures[next][j] = searchable->signature[j];
1438   }
1439 
1440   if (++next >= kUserDictCacheSize) {
1441     next -= kUserDictCacheSize;
1442   }
1443   if (next == cache->head) {
1444     cache->head++;
1445     if (cache->head >= kUserDictCacheSize) {
1446       cache->head -= kUserDictCacheSize;
1447     }
1448   }
1449   cache->tail = next;
1450 }
1451 
1452 void UserDict::reset_cache() {
1453   memset(caches_, 0, sizeof(caches_));
1454 }
1455 
1456 bool UserDict::load_miss_cache(UserDictSearchable *searchable) {
1457   UserDictMissCache *cache = &miss_caches_[searchable->splids_len - 1];
1458   if (cache->head == cache->tail)
1459     return false;
1460 
1461   uint16 j, sig_len = kMaxLemmaSize / 4;
1462   uint16 i = cache->head;
1463   while (1) {
1464     j = 0;
1465     for (; j < sig_len; j++) {
1466       if (cache->signatures[i][j] != searchable->signature[j])
1467         break;
1468     }
1469     if (j < sig_len) {
1470       i++;
1471       if (i >= kUserDictMissCacheSize)
1472         i -= kUserDictMissCacheSize;
1473       if (i == cache->tail)
1474         break;
1475       continue;
1476     }
1477     return true;
1478   }
1479   return false;
1480 }
1481 
1482 void UserDict::save_miss_cache(UserDictSearchable *searchable) {
1483   UserDictMissCache *cache = &miss_caches_[searchable->splids_len - 1];
1484   uint16 next = cache->tail;
1485 
1486   uint16 sig_len = kMaxLemmaSize / 4;
1487   uint16 j = 0;
1488   for (; j < sig_len; j++) {
1489     cache->signatures[next][j] = searchable->signature[j];
1490   }
1491 
1492   if (++next >= kUserDictMissCacheSize) {
1493     next -= kUserDictMissCacheSize;
1494   }
1495   if (next == cache->head) {
1496     cache->head++;
1497     if (cache->head >= kUserDictMissCacheSize) {
1498       cache->head -= kUserDictMissCacheSize;
1499     }
1500   }
1501   cache->tail = next;
1502 }
1503 
1504 void UserDict::reset_miss_cache() {
1505   memset(miss_caches_, 0, sizeof(miss_caches_));
1506 }
1507 
1508 void UserDict::cache_init() {
1509   reset_cache();
1510   reset_miss_cache();
1511 }
1512 
1513 bool UserDict::cache_hit(UserDictSearchable *searchable,
1514                          uint32 *offset, uint32 *length) {
1515   bool hit = load_miss_cache(searchable);
1516   if (hit) {
1517     *offset = 0;
1518     *length = 0;
1519     return true;
1520   }
1521   hit = load_cache(searchable, offset, length);
1522   if (hit) {
1523     return true;
1524   }
1525   return false;
1526 }
1527 
1528 void UserDict::cache_push(UserDictCacheType type,
1529                          UserDictSearchable *searchable,
1530                          uint32 offset, uint32 length) {
1531   switch (type) {
1532     case USER_DICT_MISS_CACHE:
1533       save_miss_cache(searchable);
1534       break;
1535     case USER_DICT_CACHE:
1536       save_cache(searchable, offset, length);
1537       break;
1538     default:
1539       break;
1540   }
1541 }
1542 
1543 #endif
1544 
1545 void UserDict::defragment(void) {
1546 #ifdef ___DEBUG_PERF___
1547   DEBUG_PERF_BEGIN;
1548 #endif
1549   if (is_valid_state() == false)
1550     return;
1551   // Fixup offsets_, set REMOVE flag to lemma's flag if needed
1552   size_t first_freed = 0;
1553   size_t first_inuse = 0;
1554   while (first_freed < dict_info_.lemma_count) {
1555     // Find first freed offset
1556     while ((offsets_[first_freed] & kUserDictOffsetFlagRemove) == 0 &&
1557             first_freed < dict_info_.lemma_count) {
1558       first_freed++;
1559     }
1560     if (first_freed < dict_info_.lemma_count) {
1561       // Save REMOVE flag to lemma flag
1562       int off = offsets_[first_freed];
1563       set_lemma_flag(off, kUserDictLemmaFlagRemove);
1564     } else {
1565       break;
1566     }
1567     // Find first inuse offse after first_freed
1568     first_inuse = first_freed + 1;
1569     while ((offsets_[first_inuse] & kUserDictOffsetFlagRemove) &&
1570            (first_inuse < dict_info_.lemma_count)) {
1571       // Save REMOVE flag to lemma flag
1572       int off = offsets_[first_inuse];
1573       set_lemma_flag(off, kUserDictLemmaFlagRemove);
1574       first_inuse++;
1575     }
1576     if (first_inuse >= dict_info_.lemma_count) {
1577       break;
1578     }
1579     // Swap offsets_
1580     int tmp = offsets_[first_inuse];
1581     offsets_[first_inuse] = offsets_[first_freed];
1582     offsets_[first_freed] = tmp;
1583     // Move scores_, no need to swap
1584     tmp = scores_[first_inuse];
1585     scores_[first_inuse] = scores_[first_freed];
1586     scores_[first_freed] = tmp;
1587     // Swap ids_
1588     LemmaIdType tmpid = ids_[first_inuse];
1589     ids_[first_inuse] = ids_[first_freed];
1590     ids_[first_freed] = tmpid;
1591     // Go on
1592     first_freed++;
1593   }
1594 #ifdef ___PREDICT_ENABLED___
1595   // Fixup predicts_
1596   first_freed = 0;
1597   first_inuse = 0;
1598   while (first_freed < dict_info_.lemma_count) {
1599     // Find first freed offset
1600     while ((predicts_[first_freed] & kUserDictOffsetFlagRemove) == 0 &&
1601             first_freed < dict_info_.lemma_count) {
1602       first_freed++;
1603     }
1604     if (first_freed >= dict_info_.lemma_count)
1605       break;
1606     // Find first inuse offse after first_freed
1607     first_inuse = first_freed + 1;
1608     while ((predicts_[first_inuse] & kUserDictOffsetFlagRemove)
1609            && (first_inuse < dict_info_.lemma_count)) {
1610       first_inuse++;
1611     }
1612     if (first_inuse >= dict_info_.lemma_count) {
1613       break;
1614     }
1615     // Swap offsets_
1616     int tmp = predicts_[first_inuse];
1617     predicts_[first_inuse] = predicts_[first_freed];
1618     predicts_[first_freed] = tmp;
1619     // Go on
1620     first_freed++;
1621   }
1622 #endif
1623   dict_info_.lemma_count = first_freed;
1624   // Fixup lemmas_
1625   size_t begin = 0;
1626   size_t end = 0;
1627   size_t dst = 0;
1628   int total_size = dict_info_.lemma_size + lemma_size_left_;
1629   int total_count = dict_info_.lemma_count + lemma_count_left_;
1630   size_t real_size = total_size - lemma_size_left_;
1631   while (dst < real_size) {
1632     unsigned char flag = get_lemma_flag(dst);
1633     unsigned char nchr = get_lemma_nchar(dst);
1634     if ((flag & kUserDictLemmaFlagRemove) == 0) {
1635       dst += nchr * 4 + 2;
1636       continue;
1637     }
1638     break;
1639   }
1640   if (dst >= real_size)
1641     return;
1642 
1643   end = dst;
1644   while (end < real_size) {
1645     begin = end + get_lemma_nchar(end) * 4 + 2;
1646  repeat:
1647     // not used any more
1648     if (begin >= real_size)
1649       break;
1650     unsigned char flag = get_lemma_flag(begin);
1651     unsigned char nchr = get_lemma_nchar(begin);
1652     if (flag & kUserDictLemmaFlagRemove) {
1653       begin += nchr * 4 + 2;
1654       goto repeat;
1655     }
1656     end = begin + nchr * 4 + 2;
1657     while (end < real_size) {
1658       unsigned char eflag = get_lemma_flag(end);
1659       unsigned char enchr = get_lemma_nchar(end);
1660       if ((eflag & kUserDictLemmaFlagRemove) == 0) {
1661         end += enchr * 4 + 2;
1662         continue;
1663       }
1664       break;
1665     }
1666     memmove(lemmas_ + dst, lemmas_ + begin, end - begin);
1667     for (size_t j = 0; j < dict_info_.lemma_count; j++) {
1668       if (offsets_[j] >= begin && offsets_[j] < end) {
1669         offsets_[j] -= (begin - dst);
1670         offsets_by_id_[ids_[j] - start_id_] = offsets_[j];
1671       }
1672 #ifdef ___PREDICT_ENABLED___
1673       if (predicts_[j] >= begin && predicts_[j] < end) {
1674         predicts_[j] -= (begin - dst);
1675       }
1676 #endif
1677     }
1678 #ifdef ___SYNC_ENABLED___
1679     for (size_t j = 0; j < dict_info_.sync_count; j++) {
1680       if (syncs_[j] >= begin && syncs_[j] < end) {
1681         syncs_[j] -= (begin - dst);
1682       }
1683     }
1684 #endif
1685     dst += (end - begin);
1686   }
1687 
1688   dict_info_.free_count = 0;
1689   dict_info_.free_size = 0;
1690   dict_info_.lemma_size = dst;
1691   lemma_size_left_ = total_size - dict_info_.lemma_size;
1692   lemma_count_left_ = total_count - dict_info_.lemma_count;
1693 
1694   // XXX Without following code,
1695   // offsets_by_id_ is not reordered.
1696   // That's to say, all removed lemmas' ids are not collected back.
1697   // There may not be room for addition of new lemmas due to
1698   // offsests_by_id_ reason, although lemma_size_left_ is fixed.
1699   // By default, we do want defrag as fast as possible, because
1700   // during defrag procedure, other peers can not write new lemmas
1701   // to user dictionary file.
1702   // XXX If write-back is invoked immediately after
1703   // this defragment, no need to fix up following in-mem data.
1704   for (uint32 i = 0; i < dict_info_.lemma_count; i++) {
1705     ids_[i] = start_id_ + i;
1706     offsets_by_id_[i] = offsets_[i];
1707   }
1708 
1709   state_ = USER_DICT_DEFRAGMENTED;
1710 
1711 #ifdef ___DEBUG_PERF___
1712   DEBUG_PERF_END;
1713   LOGD_PERF("defragment");
1714 #endif
1715 }
1716 
1717 #ifdef ___SYNC_ENABLED___
1718 void UserDict::clear_sync_lemmas(unsigned int start, unsigned int end) {
1719   if (is_valid_state() == false)
1720     return;
1721   if (end > dict_info_.sync_count)
1722     end = dict_info_.sync_count;
1723   memmove(syncs_ + start, syncs_ + end, (dict_info_.sync_count - end) << 2);
1724   dict_info_.sync_count -= (end - start);
1725   if (state_ < USER_DICT_SYNC_DIRTY)
1726     state_ = USER_DICT_SYNC_DIRTY;
1727 }
1728 
1729 int UserDict::get_sync_count() {
1730   if (is_valid_state() == false)
1731     return 0;
1732   return dict_info_.sync_count;
1733 }
1734 
1735 LemmaIdType UserDict::put_lemma_no_sync(char16 lemma_str[], uint16 splids[],
1736                         uint16 lemma_len, uint16 count, uint64 lmt) {
1737   int again = 0;
1738  begin:
1739   LemmaIdType id;
1740   uint32 * syncs_bak = syncs_;
1741   syncs_ = NULL;
1742   id = _put_lemma(lemma_str, splids, lemma_len, count, lmt);
1743   syncs_ = syncs_bak;
1744   if (id == 0 && again == 0) {
1745     if ((dict_info_.limit_lemma_count > 0 &&
1746         dict_info_.lemma_count >= dict_info_.limit_lemma_count)
1747         || (dict_info_.limit_lemma_size > 0 &&
1748             dict_info_.lemma_size + (2 + (lemma_len << 2))
1749             > dict_info_.limit_lemma_size)) {
1750       // XXX Always reclaim and defrag in sync code path
1751       //     sync thread is background thread and ok with heavy work
1752       reclaim();
1753       defragment();
1754       flush_cache();
1755       again = 1;
1756       goto begin;
1757     }
1758   }
1759   return id;
1760 }
1761 
1762 int UserDict::put_lemmas_no_sync_from_utf16le_string(char16 * lemmas, int len) {
1763   int newly_added = 0;
1764 
1765   SpellingParser * spl_parser = new SpellingParser();
1766   if (!spl_parser) {
1767     return 0;
1768   }
1769 #ifdef ___DEBUG_PERF___
1770   DEBUG_PERF_BEGIN;
1771 #endif
1772   char16 *ptr = lemmas;
1773 
1774   // Extract pinyin,words,frequence,last_mod_time
1775   char16 * p = ptr, * py16 = ptr;
1776   char16 * hz16 = NULL;
1777   int py16_len = 0;
1778   uint16 splid[kMaxLemmaSize];
1779   int splid_len = 0;
1780   int hz16_len = 0;
1781   char16 * fr16 = NULL;
1782   int fr16_len = 0;
1783 
1784   while (p - ptr < len) {
1785     // Pinyin
1786     py16 = p;
1787     splid_len = 0;
1788     while (*p != 0x2c && (p - ptr) < len) {
1789       if (*p == 0x20)
1790         splid_len++;
1791       p++;
1792     }
1793     splid_len++;
1794     if (p - ptr == len)
1795       break;
1796     py16_len = p - py16;
1797     if (kMaxLemmaSize < splid_len) {
1798       break;
1799     }
1800     bool is_pre;
1801     int splidl = spl_parser->splstr16_to_idxs_f(
1802         py16, py16_len, splid, NULL, kMaxLemmaSize, is_pre);
1803     if (splidl != splid_len)
1804       break;
1805     // Phrase
1806     hz16 = ++p;
1807     while (*p != 0x2c && (p - ptr) < len) {
1808       p++;
1809     }
1810     hz16_len = p - hz16;
1811     if (hz16_len != splid_len)
1812       break;
1813     // Frequency
1814     fr16 = ++p;
1815     fr16_len = 0;
1816     while (*p != 0x2c && (p - ptr) < len) {
1817       p++;
1818     }
1819     fr16_len = p - fr16;
1820     uint32 intf = (uint32)utf16le_atoll(fr16, fr16_len);
1821     // Last modified time
1822     fr16 = ++p;
1823     fr16_len = 0;
1824     while (*p != 0x3b && (p - ptr) < len) {
1825       p++;
1826     }
1827     fr16_len = p - fr16;
1828     uint64 last_mod = utf16le_atoll(fr16, fr16_len);
1829 
1830     put_lemma_no_sync(hz16, splid, splid_len, intf, last_mod);
1831     newly_added++;
1832 
1833     p++;
1834   }
1835 
1836 #ifdef ___DEBUG_PERF___
1837   DEBUG_PERF_END;
1838   LOGD_PERF("put_lemmas_no_sync_from_utf16le_string");
1839 #endif
1840   return newly_added;
1841 }
1842 
1843 int UserDict::get_sync_lemmas_in_utf16le_string_from_beginning(
1844     char16 * str, int size, int * count) {
1845   int len = 0;
1846   *count = 0;
1847 
1848   int left_len = size;
1849 
1850   if (is_valid_state() == false)
1851     return len;
1852 
1853   SpellingTrie * spl_trie = &SpellingTrie::get_instance();
1854   if (!spl_trie) {
1855     return 0;
1856   }
1857 
1858   uint32 i;
1859   for (i = 0; i < dict_info_.sync_count; i++) {
1860     int offset = syncs_[i];
1861     uint32 nchar = get_lemma_nchar(offset);
1862     uint16 *spl = get_lemma_spell_ids(offset);
1863     uint16 *wrd = get_lemma_word(offset);
1864     int score = _get_lemma_score(wrd, spl, nchar);
1865 
1866     static char score_temp[32], *pscore_temp = score_temp;
1867     static char16 temp[256], *ptemp = temp;
1868 
1869     pscore_temp = score_temp;
1870     ptemp = temp;
1871 
1872     uint32 j;
1873     // Add pinyin
1874     for (j = 0; j < nchar; j++) {
1875       int ret_len = spl_trie->get_spelling_str16(
1876           spl[j], ptemp, temp + sizeof(temp) - ptemp);
1877       if (ret_len <= 0)
1878         break;
1879       ptemp += ret_len;
1880       if (ptemp < temp + sizeof(temp) - 1) {
1881         *(ptemp++) = ' ';
1882       } else {
1883         j = 0;
1884         break;
1885       }
1886     }
1887     if (j < nchar) {
1888       continue;
1889     }
1890     ptemp--;
1891     if (ptemp < temp + sizeof(temp) - 1) {
1892       *(ptemp++) = ',';
1893     } else {
1894       continue;
1895     }
1896     // Add phrase
1897     for (j = 0; j < nchar; j++) {
1898       if (ptemp < temp + sizeof(temp) - 1) {
1899         *(ptemp++) = wrd[j];
1900       } else {
1901         break;
1902       }
1903     }
1904     if (j < nchar) {
1905       continue;
1906     }
1907     if (ptemp < temp + sizeof(temp) - 1) {
1908       *(ptemp++) = ',';
1909     } else {
1910       continue;
1911     }
1912     // Add frequency
1913     uint32 intf = extract_score_freq(score);
1914     int ret_len = utf16le_lltoa(intf, ptemp, temp + sizeof(temp) - ptemp);
1915     if (ret_len <= 0)
1916       continue;
1917     ptemp += ret_len;
1918     if (ptemp < temp + sizeof(temp) - 1) {
1919       *(ptemp++) = ',';
1920     } else {
1921       continue;
1922     }
1923     // Add last modified time
1924     uint64 last_mod = extract_score_lmt(score);
1925     ret_len = utf16le_lltoa(last_mod, ptemp, temp + sizeof(temp) - ptemp);
1926     if (ret_len <= 0)
1927       continue;
1928     ptemp += ret_len;
1929     if (ptemp < temp + sizeof(temp) - 1) {
1930       *(ptemp++) = ';';
1931     } else {
1932       continue;
1933     }
1934 
1935     // Write to string
1936     int need_len = ptemp - temp;
1937     if (need_len > left_len)
1938       break;
1939     memcpy(str + len, temp, need_len * 2);
1940     left_len -= need_len;
1941 
1942     len += need_len;
1943     (*count)++;
1944   }
1945 
1946   if (len > 0) {
1947     if (state_ < USER_DICT_SYNC_DIRTY)
1948       state_ = USER_DICT_SYNC_DIRTY;
1949   }
1950   return len;
1951 }
1952 
1953 #endif
1954 
1955 bool UserDict::state(UserDictStat * stat) {
1956   if (is_valid_state() == false)
1957     return false;
1958   if (!stat)
1959     return false;
1960   stat->version = version_;
1961   stat->file_name = dict_file_;
1962   stat->load_time.tv_sec = load_time_.tv_sec;
1963   stat->load_time.tv_usec = load_time_.tv_usec;
1964   pthread_mutex_lock(&g_mutex_);
1965   stat->last_update.tv_sec = g_last_update_.tv_sec;
1966   stat->last_update.tv_usec = g_last_update_.tv_usec;
1967   pthread_mutex_unlock(&g_mutex_);
1968   stat->disk_size = get_dict_file_size(&dict_info_);
1969   stat->lemma_count = dict_info_.lemma_count;
1970   stat->lemma_size = dict_info_.lemma_size;
1971   stat->delete_count = dict_info_.free_count;
1972   stat->delete_size = dict_info_.free_size;
1973 #ifdef ___SYNC_ENABLED___
1974   stat->sync_count = dict_info_.sync_count;
1975 #endif
1976   stat->limit_lemma_count = dict_info_.limit_lemma_count;
1977   stat->limit_lemma_size = dict_info_.limit_lemma_size;
1978   stat->reclaim_ratio = dict_info_.reclaim_ratio;
1979   return true;
1980 }
1981 
1982 void UserDict::set_limit(uint32 max_lemma_count,
1983                          uint32 max_lemma_size, uint32 reclaim_ratio) {
1984   dict_info_.limit_lemma_count = max_lemma_count;
1985   dict_info_.limit_lemma_size = max_lemma_size;
1986   if (reclaim_ratio > 100)
1987     reclaim_ratio = 100;
1988   dict_info_.reclaim_ratio = reclaim_ratio;
1989 }
1990 
1991 void UserDict::reclaim() {
1992   if (is_valid_state() == false)
1993     return;
1994 
1995   switch (dict_info_.reclaim_ratio) {
1996     case 0:
1997       return;
1998     case 100:
1999       // TODO: CLEAR to be implemented
2000       assert(false);
2001       return;
2002     default:
2003       break;
2004   }
2005 
2006   // XXX Reclaim is only based on count, not size
2007   uint32 count = dict_info_.lemma_count;
2008   int rc = count * dict_info_.reclaim_ratio / 100;
2009 
2010   UserDictScoreOffsetPair * score_offset_pairs = NULL;
2011   score_offset_pairs = (UserDictScoreOffsetPair *)malloc(
2012       sizeof(UserDictScoreOffsetPair) * rc);
2013   if (score_offset_pairs == NULL) {
2014     return;
2015   }
2016 
2017   for (int i = 0; i < rc; i++) {
2018     int s = scores_[i];
2019     score_offset_pairs[i].score = s;
2020     score_offset_pairs[i].offset_index = i;
2021   }
2022 
2023   for (int i = (rc + 1) / 2; i >= 0; i--)
2024     shift_down(score_offset_pairs, i, rc);
2025 
2026   for (uint32 i = rc; i < dict_info_.lemma_count; i++) {
2027     int s = scores_[i];
2028     if (s < score_offset_pairs[0].score) {
2029       score_offset_pairs[0].score = s;
2030       score_offset_pairs[0].offset_index = i;
2031       shift_down(score_offset_pairs, 0, rc);
2032     }
2033   }
2034 
2035   for (int i = 0; i < rc; i++) {
2036     int off = score_offset_pairs[i].offset_index;
2037     remove_lemma_by_offset_index(off);
2038   }
2039   if (rc > 0) {
2040     if (state_ < USER_DICT_OFFSET_DIRTY)
2041       state_ = USER_DICT_OFFSET_DIRTY;
2042   }
2043 
2044   free(score_offset_pairs);
2045 }
2046 
2047 inline void UserDict::swap(UserDictScoreOffsetPair * sop, int i, int j) {
2048   int s = sop[i].score;
2049   int p = sop[i].offset_index;
2050   sop[i].score = sop[j].score;
2051   sop[i].offset_index = sop[j].offset_index;
2052   sop[j].score = s;
2053   sop[j].offset_index = p;
2054 }
2055 
2056 void UserDict::shift_down(UserDictScoreOffsetPair * sop, int i, int n) {
2057   int par = i;
2058   while (par < n) {
2059     int left = par * 2 + 1;
2060     int right = left + 1;
2061     if (left >= n && right >= n)
2062       break;
2063     if (right >= n) {
2064       if (sop[left].score > sop[par].score) {
2065         swap(sop, left, par);
2066         par = left;
2067         continue;
2068       }
2069     } else if (sop[left].score > sop[right].score &&
2070                sop[left].score > sop[par].score) {
2071       swap(sop, left, par);
2072       par = left;
2073       continue;
2074     } else if (sop[right].score > sop[left].score &&
2075                sop[right].score > sop[par].score) {
2076       swap(sop, right, par);
2077       par = right;
2078       continue;
2079     }
2080     break;
2081   }
2082 }
2083 
2084 LemmaIdType UserDict::put_lemma(char16 lemma_str[], uint16 splids[],
2085                                 uint16 lemma_len, uint16 count) {
2086   return _put_lemma(lemma_str, splids, lemma_len, count, time(NULL));
2087 }
2088 
2089 LemmaIdType UserDict::_put_lemma(char16 lemma_str[], uint16 splids[],
2090                                  uint16 lemma_len, uint16 count, uint64 lmt) {
2091 #ifdef ___DEBUG_PERF___
2092   DEBUG_PERF_BEGIN;
2093 #endif
2094   if (is_valid_state() == false)
2095     return 0;
2096   int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
2097   if (off != -1) {
2098     int delta_score = count - scores_[off];
2099     dict_info_.total_nfreq += delta_score;
2100     scores_[off] = build_score(lmt, count);
2101     if (state_ < USER_DICT_SCORE_DIRTY)
2102       state_ = USER_DICT_SCORE_DIRTY;
2103 #ifdef ___DEBUG_PERF___
2104     DEBUG_PERF_END;
2105     LOGD_PERF("_put_lemma(update)");
2106 #endif
2107     return ids_[off];
2108   } else {
2109     if ((dict_info_.limit_lemma_count > 0 &&
2110         dict_info_.lemma_count >= dict_info_.limit_lemma_count)
2111         || (dict_info_.limit_lemma_size > 0 &&
2112             dict_info_.lemma_size + (2 + (lemma_len << 2))
2113             > dict_info_.limit_lemma_size)) {
2114       // XXX Don't defragment here, it's too time-consuming.
2115       return 0;
2116     }
2117     int flushed = 0;
2118     if (lemma_count_left_ == 0 ||
2119         lemma_size_left_ < (size_t)(2 + (lemma_len << 2))) {
2120 
2121       // XXX When there is no space for new lemma, we flush to disk
2122       // flush_cache() may be called by upper user
2123       // and better place shoule be found instead of here
2124       flush_cache();
2125       flushed = 1;
2126       // Or simply return and do nothing
2127       // return 0;
2128     }
2129 #ifdef ___DEBUG_PERF___
2130     DEBUG_PERF_END;
2131     LOGD_PERF(flushed ? "_put_lemma(flush+add)" : "_put_lemma(add)");
2132 #endif
2133     LemmaIdType id = append_a_lemma(lemma_str, splids, lemma_len, count, lmt);
2134 #ifdef ___SYNC_ENABLED___
2135     if (syncs_ && id != 0) {
2136       queue_lemma_for_sync(id);
2137     }
2138 #endif
2139     return id;
2140   }
2141   return 0;
2142 }
2143 
2144 #ifdef ___SYNC_ENABLED___
2145 void UserDict::queue_lemma_for_sync(LemmaIdType id) {
2146   if (dict_info_.sync_count < sync_count_size_) {
2147     syncs_[dict_info_.sync_count++] = offsets_by_id_[id - start_id_];
2148   } else {
2149     uint32 * syncs = (uint32*)realloc(
2150         syncs_, (sync_count_size_ + kUserDictPreAlloc) << 2);
2151     if (syncs) {
2152       sync_count_size_ += kUserDictPreAlloc;
2153       syncs_ = syncs;
2154       syncs_[dict_info_.sync_count++] = offsets_by_id_[id - start_id_];
2155     }
2156   }
2157 }
2158 #endif
2159 
2160 LemmaIdType UserDict::update_lemma(LemmaIdType lemma_id, int16 delta_count,
2161                                    bool selected) {
2162 #ifdef ___DEBUG_PERF___
2163   DEBUG_PERF_BEGIN;
2164 #endif
2165   if (is_valid_state() == false)
2166     return 0;
2167   if (is_valid_lemma_id(lemma_id) == false)
2168     return 0;
2169   uint32 offset = offsets_by_id_[lemma_id - start_id_];
2170   uint8 lemma_len = get_lemma_nchar(offset);
2171   char16 * lemma_str = get_lemma_word(offset);
2172   uint16 * splids = get_lemma_spell_ids(offset);
2173 
2174   int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
2175   if (off != -1) {
2176     int score = scores_[off];
2177     int count = extract_score_freq(score);
2178     uint64 lmt = extract_score_lmt(score);
2179     if (count + delta_count > kUserDictMaxFrequency ||
2180         count + delta_count < count) {
2181       delta_count = kUserDictMaxFrequency - count;
2182     }
2183     count += delta_count;
2184     dict_info_.total_nfreq += delta_count;
2185     if (selected) {
2186       lmt = time(NULL);
2187     }
2188     scores_[off] = build_score(lmt, count);
2189     if (state_ < USER_DICT_SCORE_DIRTY)
2190       state_ = USER_DICT_SCORE_DIRTY;
2191 #ifdef ___DEBUG_PERF___
2192     DEBUG_PERF_END;
2193     LOGD_PERF("update_lemma");
2194 #endif
2195 #ifdef ___SYNC_ENABLED___
2196     queue_lemma_for_sync(ids_[off]);
2197 #endif
2198     return ids_[off];
2199   }
2200   return 0;
2201 }
2202 
2203 size_t UserDict::get_total_lemma_count() {
2204   return dict_info_.total_nfreq;
2205 }
2206 
2207 void UserDict::set_total_lemma_count_of_others(size_t count) {
2208   total_other_nfreq_ = count;
2209 }
2210 
2211 LemmaIdType UserDict::append_a_lemma(char16 lemma_str[], uint16 splids[],
2212                                    uint16 lemma_len, uint16 count, uint64 lmt) {
2213   LemmaIdType id = get_max_lemma_id() + 1;
2214   size_t offset = dict_info_.lemma_size;
2215   if (offset > kUserDictOffsetMask)
2216     return 0;
2217 
2218   lemmas_[offset] = 0;
2219   lemmas_[offset + 1] = (uint8)lemma_len;
2220   for (size_t i = 0; i < lemma_len; i++) {
2221     *((uint16*)&lemmas_[offset + 2 + (i << 1)]) = splids[i];
2222     *((char16*)&lemmas_[offset + 2 + (lemma_len << 1) + (i << 1)])
2223         = lemma_str[i];
2224   }
2225   uint32 off = dict_info_.lemma_count;
2226   offsets_[off] = offset;
2227   scores_[off] = build_score(lmt, count);
2228   ids_[off] = id;
2229 #ifdef ___PREDICT_ENABLED___
2230   predicts_[off] = offset;
2231 #endif
2232 
2233   offsets_by_id_[id - start_id_] = offset;
2234 
2235   dict_info_.lemma_count++;
2236   dict_info_.lemma_size += (2 + (lemma_len << 2));
2237   lemma_count_left_--;
2238   lemma_size_left_ -= (2 + (lemma_len << 2));
2239 
2240   // Sort
2241 
2242   UserDictSearchable searchable;
2243   prepare_locate(&searchable, splids, lemma_len);
2244 
2245   size_t i = 0;
2246   while (i < off) {
2247     offset = offsets_[i];
2248     uint32 nchar = get_lemma_nchar(offset);
2249     uint16 * spl = get_lemma_spell_ids(offset);
2250 
2251     if (0 <= fuzzy_compare_spell_id(spl, nchar, &searchable))
2252       break;
2253     i++;
2254   }
2255   if (i != off) {
2256     uint32 temp = offsets_[off];
2257     memmove(offsets_ + i + 1, offsets_ + i, (off - i) << 2);
2258     offsets_[i] = temp;
2259 
2260     temp = scores_[off];
2261     memmove(scores_ + i + 1, scores_ + i, (off - i) << 2);
2262     scores_[i] = temp;
2263 
2264     temp = ids_[off];
2265     memmove(ids_ + i + 1, ids_ + i, (off - i) << 2);
2266     ids_[i] = temp;
2267   }
2268 
2269 #ifdef ___PREDICT_ENABLED___
2270   uint32 j = 0;
2271   uint16 * words_new = get_lemma_word(predicts_[off]);
2272   j = locate_where_to_insert_in_predicts(words_new, lemma_len);
2273   if (j != off) {
2274     uint32 temp = predicts_[off];
2275     memmove(predicts_ + j + 1, predicts_ + j, (off - j) << 2);
2276     predicts_[j] = temp;
2277   }
2278 #endif
2279 
2280   if (state_ < USER_DICT_LEMMA_DIRTY)
2281     state_ = USER_DICT_LEMMA_DIRTY;
2282 
2283 #ifdef ___CACHE_ENABLED___
2284   cache_init();
2285 #endif
2286 
2287   dict_info_.total_nfreq += count;
2288   return id;
2289 }
2290 }
2291