pinyin/share/userdict.cpp

/*
 * Copyright (C) 2009 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "../include/userdict.h"
#include "../include/splparser.h"
#include "../include/ngram.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#ifdef ___DEBUG_PERF___
#include <cutils/log.h>
#endif
#ifdef _WIN32
#include <io.h>
#else
#include <unistd.h>
#endif
#include <fcntl.h>
#include <sys/stat.h>
#include <assert.h>
#include <ctype.h>
#include <sys/types.h>
#ifndef _WIN32
#include <sys/time.h>
#endif
#include <time.h>
#ifdef _WIN32
#undef max
#undef min
#include <QDateTime>
#include <QMutex>
#else
#include <pthread.h>
#endif
#include <math.h>

namespace ime_pinyin {

#ifdef _WIN32
static int gettimeofday(struct timeval *tp, void *) {
    const qint64 current_msecs_since_epoch = QDateTime::currentMSecsSinceEpoch();
    tp->tv_sec = (long)(current_msecs_since_epoch / 1000);
    tp->tv_usec = (long)((current_msecs_since_epoch % 1000) * 1000);
    return 0;
}
#endif

#ifdef ___DEBUG_PERF___
static uint64 _ellapse_ = 0;
static struct timeval _tv_start_, _tv_end_;
#define DEBUG_PERF_BEGIN \
    do { \
      gettimeofday(&_tv_start_, NULL); \
    } while (0)
#define DEBUG_PERF_END \
    do { \
      gettimeofday(&_tv_end_, NULL); \
      _ellapse_ = (_tv_end_.tv_sec - _tv_start_.tv_sec) * 1000000 + \
                  (_tv_end_.tv_usec - _tv_start_.tv_usec); \
    } while (0)
#define LOGD_PERF(message) \
    ALOGD("PERFORMANCE[%s] %llu usec.", message, _ellapse_);
#else
#define DEBUG_PERF_BEGIN
#define DEBUG_PERF_END
#define LOGD_PERF(message)
#endif

// XXX File load and write are thread-safe by g_mutex_
#ifdef _WIN32
static QMutex g_mutex_;
#define pthread_mutex_lock(MUTEX) ((MUTEX)->lock())
#define pthread_mutex_unlock(MUTEX) ((MUTEX)->unlock())
#define pthread_mutex_trylock(MUTEX) (!(MUTEX)->tryLock(0))
#else
static pthread_mutex_t g_mutex_ = PTHREAD_MUTEX_INITIALIZER;
#endif
static struct timeval g_last_update_ = {0, 0};

inline uint32 UserDict::get_dict_file_size(UserDictInfo * info) {
  return (4 + info->lemma_size + (info->lemma_count << 3)
#ifdef ___PREDICT_ENABLED___
          + (info->lemma_count << 2)
#endif
#ifdef ___SYNC_ENABLED___
          + (info->sync_count << 2)
#endif
          + sizeof(*info));
}

inline LmaScoreType UserDict::translate_score(int raw_score) {
  // 1) ori_freq: original user frequency
  uint32 ori_freq = extract_score_freq(raw_score);
  // 2) lmt_off: lmt index (week offset for example)
  uint64 lmt_off = ((raw_score & 0xffff0000) >> 16);
  if (kUserDictLMTBitWidth < 16) {
    uint64 mask = ~(1 << kUserDictLMTBitWidth);
    lmt_off &= mask;
  }
  // 3) now_off: current time index (current week offset for example)
  // assuming load_time_ is around current time
  uint64 now_off = load_time_.tv_sec;
  now_off = (now_off - kUserDictLMTSince) / kUserDictLMTGranularity;
  now_off = (now_off << (64 - kUserDictLMTBitWidth));
  now_off = (now_off >> (64 - kUserDictLMTBitWidth));
  // 4) factor: decide expand-factor
  int delta = now_off - lmt_off;
  if (delta > 4)
    delta = 4;
  int factor = 80 - (delta << 4);

  double tf = (double)(dict_info_.total_nfreq + total_other_nfreq_);
  return (LmaScoreType)(log((double)factor * (double)ori_freq / tf)
                        * NGram::kLogValueAmplifier);
}

inline int UserDict::extract_score_freq(int raw_score) {
  // Frequence stored in lowest 16 bits
  int freq = (raw_score & 0x0000ffff);
  return freq;
}

inline uint64 UserDict::extract_score_lmt(int raw_score) {
  uint64 lmt = ((raw_score & 0xffff0000) >> 16);
  if (kUserDictLMTBitWidth < 16) {
    uint64 mask = ~(1 << kUserDictLMTBitWidth);
    lmt &= mask;
  }
  lmt = lmt * kUserDictLMTGranularity + kUserDictLMTSince;
  return lmt;
}

inline int UserDict::build_score(uint64 lmt, int freq) {
  lmt = (lmt - kUserDictLMTSince) / kUserDictLMTGranularity;
  lmt = (lmt << (64 - kUserDictLMTBitWidth));
  lmt = (lmt >> (64 - kUserDictLMTBitWidth));
  uint16 lmt16 = (uint16)lmt;
  int s = freq;
  s &= 0x0000ffff;
  s = (lmt16 << 16) | s;
  return s;
}

inline int64 UserDict::utf16le_atoll(uint16 *s, int len) {
  int64 ret = 0;
  if (len <= 0)
    return ret;

  int flag = 1;
  const uint16 * endp = s + len;
  if (*s == '-') {
    flag = -1;
    s++;
  } else if (*s == '+') {
    s++;
  }

  while (*s >= '0' && *s <= '9' && s < endp) {
    ret += ret * 10 + (*s) - '0';
    s++;
  }
  return ret * flag;
}

inline int UserDict::utf16le_lltoa(int64 v, uint16 *s, int size) {
  if (!s || size <= 0)
    return 0;
  uint16 *endp = s + size;
  int ret_len = 0;
  if (v < 0) {
    *(s++) = '-';
    ++ret_len;
    v *= -1;
  }

  uint16 *b = s;
  while (s < endp && v != 0) {
    *(s++) = '0' + (v % 10);
    v = v / 10;
    ++ret_len;
  }

  if (v != 0)
    return 0;

  --s;

  while (b < s) {
    *b = *s;
    ++b, --s;
  }

  return ret_len;
}

inline void UserDict::set_lemma_flag(uint32 offset, uint8 flag) {
  offset &= kUserDictOffsetMask;
  lemmas_[offset] |= flag;
}

inline char UserDict::get_lemma_flag(uint32 offset) {
  offset &= kUserDictOffsetMask;
  return (char)(lemmas_[offset]);
}

inline char UserDict::get_lemma_nchar(uint32 offset) {
  offset &= kUserDictOffsetMask;
  return (char)(lemmas_[offset + 1]);
}

inline uint16 * UserDict::get_lemma_spell_ids(uint32 offset) {
  offset &= kUserDictOffsetMask;
  return (uint16 *)(lemmas_ + offset + 2);
}

inline uint16 * UserDict::get_lemma_word(uint32 offset) {
  offset &= kUserDictOffsetMask;
  uint8 nchar = get_lemma_nchar(offset);
  return (uint16 *)(lemmas_ + offset + 2 + (nchar << 1));
}

inline LemmaIdType UserDict::get_max_lemma_id() {
  // When a lemma is deleted, we don't not claim its id back for
  // simplicity and performance
  return start_id_ + dict_info_.lemma_count - 1;
}

inline bool UserDict::is_valid_lemma_id(LemmaIdType id) {
  if (id >= start_id_ && id <= get_max_lemma_id())
    return true;
  return false;
}

inline bool UserDict::is_valid_state() {
  if (state_ == USER_DICT_NONE)
    return false;
  return true;
}

UserDict::UserDict()
    : start_id_(0),
      version_(0),
      lemmas_(NULL),
      offsets_(NULL),
      scores_(NULL),
      ids_(NULL),
#ifdef ___PREDICT_ENABLED___
      predicts_(NULL),
#endif
#ifdef ___SYNC_ENABLED___
      syncs_(NULL),
      sync_count_size_(0),
#endif
      offsets_by_id_(NULL),
      lemma_count_left_(0),
      lemma_size_left_(0),
      dict_file_(NULL),
      state_(USER_DICT_NONE) {
  memset(&dict_info_, 0, sizeof(dict_info_));
  memset(&load_time_, 0, sizeof(load_time_));
#ifdef ___CACHE_ENABLED___
  cache_init();
#endif
}

UserDict::~UserDict() {
  close_dict();
}

bool UserDict::load_dict(const char *file_name, LemmaIdType start_id,
                         LemmaIdType end_id) {
#ifdef ___DEBUG_PERF___
  DEBUG_PERF_BEGIN;
#endif
  dict_file_ = strdup(file_name);
  if (!dict_file_)
    return false;

  start_id_ = start_id;

  if (false == validate(file_name) && false == reset(file_name)) {
    goto error;
  }
  if (false == load(file_name, start_id)) {
    goto error;
  }

  state_ = USER_DICT_SYNC;

  gettimeofday(&load_time_, NULL);

#ifdef ___DEBUG_PERF___
  DEBUG_PERF_END;
  LOGD_PERF("load_dict");
#endif
  return true;
 error:
  free((void*)dict_file_);
  dict_file_ = NULL;
  start_id_ = 0;
  return false;
}

bool UserDict::close_dict() {
  if (state_ == USER_DICT_NONE)
    return true;
  if (state_ == USER_DICT_SYNC)
    goto out;

  // If dictionary is written back by others,
  // we can not simply write back here
  // To do a safe flush, we have to discard all newly added
  // lemmas and try to reload dict file.
  pthread_mutex_lock(&g_mutex_);
  if (load_time_.tv_sec > g_last_update_.tv_sec ||
    (load_time_.tv_sec == g_last_update_.tv_sec &&
     load_time_.tv_usec > g_last_update_.tv_usec)) {
    write_back();
    gettimeofday(&g_last_update_, NULL);
  }
  pthread_mutex_unlock(&g_mutex_);

 out:
  free((void*)dict_file_);
  free(lemmas_);
  free(offsets_);
  free(offsets_by_id_);
  free(scores_);
  free(ids_);
#ifdef ___PREDICT_ENABLED___
  free(predicts_);
#endif

  version_ = 0;
  dict_file_ = NULL;
  lemmas_ = NULL;
#ifdef ___SYNC_ENABLED___
  syncs_ = NULL;
  sync_count_size_ = 0;
#endif
  offsets_ = NULL;
  offsets_by_id_ = NULL;
  scores_ = NULL;
  ids_ = NULL;
#ifdef ___PREDICT_ENABLED___
  predicts_ = NULL;
#endif

  memset(&dict_info_, 0, sizeof(dict_info_));
  lemma_count_left_ = 0;
  lemma_size_left_ = 0;
  state_ = USER_DICT_NONE;

  return true;
}

size_t UserDict::number_of_lemmas() {
  return dict_info_.lemma_count;
}

void UserDict::reset_milestones(uint16 from_step, MileStoneHandle from_handle) {
  return;
}

MileStoneHandle UserDict::extend_dict(MileStoneHandle from_handle,
                                      const DictExtPara *dep,
                                      LmaPsbItem *lpi_items,
                                      size_t lpi_max, size_t *lpi_num) {
  if (is_valid_state() == false)
    return 0;

  bool need_extend = false;

#ifdef ___DEBUG_PERF___
  DEBUG_PERF_BEGIN;
#endif
  *lpi_num = _get_lpis(dep->splids, dep->splids_extended + 1,
                       lpi_items, lpi_max, &need_extend);
#ifdef ___DEBUG_PERF___
  DEBUG_PERF_END;
  LOGD_PERF("extend_dict");
#endif
  return ((*lpi_num > 0 || need_extend) ? 1 : 0);
}

int UserDict::is_fuzzy_prefix_spell_id(
    const uint16 * id1, uint16 len1, const UserDictSearchable *searchable) {
  if (len1 < searchable->splids_len)
    return 0;

  SpellingTrie &spl_trie = SpellingTrie::get_instance();
  uint32 i = 0;
  for (i = 0; i < searchable->splids_len; i++) {
    const char py1 = *spl_trie.get_spelling_str(id1[i]);
    uint16 off = 8 * (i % 4);
    const char py2 = ((searchable->signature[i/4] & (0xff << off)) >> off);
    if (py1 == py2)
      continue;
    return 0;
  }
  return 1;
}

int UserDict::fuzzy_compare_spell_id(
    const uint16 * id1, uint16 len1, const UserDictSearchable *searchable) {
  if (len1 < searchable->splids_len)
    return -1;
  if (len1 > searchable->splids_len)
    return 1;

  SpellingTrie &spl_trie = SpellingTrie::get_instance();
  uint32 i = 0;
  for (i = 0; i < len1; i++) {
    const char py1 = *spl_trie.get_spelling_str(id1[i]);
    uint16 off = 8 * (i % 4);
    const char py2 = ((searchable->signature[i/4] & (0xff << off)) >> off);
    if (py1 == py2)
      continue;
    if (py1 > py2)
      return 1;
    return -1;
  }
  return 0;
}

bool UserDict::is_prefix_spell_id(
    const uint16 * fullids, uint16 fulllen,
    const UserDictSearchable *searchable) {
  if (fulllen < searchable->splids_len)
    return false;

  uint32 i = 0;
  for (; i < searchable->splids_len; i++) {
    uint16 start_id = searchable->splid_start[i];
    uint16 count = searchable->splid_count[i];
    if (fullids[i] >= start_id && fullids[i] < start_id + count)
      continue;
    else
      return false;
  }
  return true;
}

bool UserDict::equal_spell_id(
    const uint16 * fullids, uint16 fulllen,
    const UserDictSearchable *searchable) {
  if (fulllen != searchable->splids_len)
    return false;

  uint32 i = 0;
  for (; i < fulllen; i++) {
    uint16 start_id = searchable->splid_start[i];
    uint16 count = searchable->splid_count[i];
    if (fullids[i] >= start_id && fullids[i] < start_id + count)
      continue;
    else
      return false;
  }
  return true;
}

int32 UserDict::locate_first_in_offsets(const UserDictSearchable * searchable) {
  int32 begin = 0;
  int32 end = dict_info_.lemma_count - 1;
  int32 middle = -1;

  int32 first_prefix = middle;
  int32 last_matched = middle;

  while (begin <= end) {
    middle = (begin + end) >> 1;
    uint32 offset = offsets_[middle];
    uint8 nchar = get_lemma_nchar(offset);
    const uint16 * splids = get_lemma_spell_ids(offset);
    int cmp = fuzzy_compare_spell_id(splids, nchar, searchable);
    int pre = is_fuzzy_prefix_spell_id(splids, nchar, searchable);

    if (pre)
      first_prefix = middle;

    if (cmp < 0) {
      begin = middle + 1;
    } else if (cmp > 0) {
      end = middle - 1;
    } else {
      end = middle - 1;
      last_matched = middle;
    }
  }

  return first_prefix;
}

void UserDict::prepare_locate(UserDictSearchable *searchable,
                             const uint16 *splid_str,
                             uint16 splid_str_len) {
  searchable->splids_len = splid_str_len;
  memset(searchable->signature, 0, sizeof(searchable->signature));

  SpellingTrie &spl_trie = SpellingTrie::get_instance();
  uint32 i = 0;
  for (; i < splid_str_len; i++) {
    if (spl_trie.is_half_id(splid_str[i])) {
      searchable->splid_count[i] =
          spl_trie.half_to_full(splid_str[i],
                                &(searchable->splid_start[i]));
    } else {
      searchable->splid_count[i] = 1;
      searchable->splid_start[i] = splid_str[i];
    }
    const unsigned char py = *spl_trie.get_spelling_str(splid_str[i]);
    searchable->signature[i>>2] |= (py << (8 * (i % 4)));
  }
}

size_t UserDict::get_lpis(const uint16 *splid_str, uint16 splid_str_len,
                          LmaPsbItem *lpi_items, size_t lpi_max) {
  return _get_lpis(splid_str, splid_str_len, lpi_items, lpi_max, NULL);
}

size_t UserDict::_get_lpis(const uint16 *splid_str,
                           uint16 splid_str_len, LmaPsbItem *lpi_items,
                           size_t lpi_max, bool * need_extend) {
  bool tmp_extend;
  if (!need_extend)
    need_extend = &tmp_extend;

  *need_extend = false;

  if (is_valid_state() == false)
    return 0;
  if (lpi_max <= 0)
    return 0;

  if (0 == pthread_mutex_trylock(&g_mutex_)) {
    if (load_time_.tv_sec < g_last_update_.tv_sec ||
      (load_time_.tv_sec == g_last_update_.tv_sec &&
       load_time_.tv_usec < g_last_update_.tv_usec)) {
      // Others updated disk file, have to reload
      pthread_mutex_unlock(&g_mutex_);
      flush_cache();
    } else {
      pthread_mutex_unlock(&g_mutex_);
    }
  } else {
  }

  UserDictSearchable searchable;
  prepare_locate(&searchable, splid_str, splid_str_len);

  uint32 max_off = dict_info_.lemma_count;
#ifdef ___CACHE_ENABLED___
  int32 middle;
  uint32 start, count;
  bool cached = cache_hit(&searchable, &start, &count);
  if (cached) {
    middle = start;
    max_off = start + count;
  } else {
    middle = locate_first_in_offsets(&searchable);
    start = middle;
  }
#else
  int32 middle = locate_first_in_offsets(&searchable);
#endif

  if (middle == -1) {
#ifdef ___CACHE_ENABLED___
    if (!cached)
      cache_push(USER_DICT_MISS_CACHE, &searchable, 0, 0);
#endif
    return 0;
  }

  size_t lpi_current = 0;

  bool fuzzy_break = false;
  bool prefix_break = false;
  while ((size_t)middle < max_off && !fuzzy_break && !prefix_break) {
    if (lpi_current >= lpi_max)
      break;
    uint32 offset = offsets_[middle];
    // Ignore deleted lemmas
    if (offset & kUserDictOffsetFlagRemove) {
      middle++;
      continue;
    }
    uint8 nchar = get_lemma_nchar(offset);
    uint16 * splids = get_lemma_spell_ids(offset);
#ifdef ___CACHE_ENABLED___
    if (!cached && 0 != fuzzy_compare_spell_id(splids, nchar, &searchable)) {
#else
    if (0 != fuzzy_compare_spell_id(splids, nchar, &searchable)) {
#endif
      fuzzy_break = true;
    }

    if (prefix_break == false) {
      if (is_fuzzy_prefix_spell_id(splids, nchar, &searchable)) {
        if (*need_extend == false &&
            is_prefix_spell_id(splids, nchar, &searchable)) {
          *need_extend = true;
        }
      } else {
        prefix_break = true;
      }
    }

    if (equal_spell_id(splids, nchar, &searchable) == true) {
      lpi_items[lpi_current].psb = translate_score(scores_[middle]);
      lpi_items[lpi_current].id = ids_[middle];
      lpi_items[lpi_current].lma_len = nchar;
      lpi_current++;
    }
    middle++;
  }

#ifdef ___CACHE_ENABLED___
  if (!cached) {
    count = middle - start;
    cache_push(USER_DICT_CACHE, &searchable, start, count);
  }
#endif

  return lpi_current;
}

uint16 UserDict::get_lemma_str(LemmaIdType id_lemma, char16* str_buf,
                               uint16 str_max) {
  if (is_valid_state() == false)
    return 0;
  if (is_valid_lemma_id(id_lemma) == false)
    return 0;
  uint32 offset = offsets_by_id_[id_lemma - start_id_];
  uint8 nchar = get_lemma_nchar(offset);
  char16 * str = get_lemma_word(offset);
  uint16 m = nchar < str_max -1 ? nchar : str_max - 1;
  int i = 0;
  for (; i < m; i++) {
    str_buf[i] = str[i];
  }
  str_buf[i] = 0;
  return m;
}

uint16 UserDict::get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
                                  uint16 splids_max, bool arg_valid) {
  if (is_valid_lemma_id(id_lemma) == false)
    return 0;
  uint32 offset = offsets_by_id_[id_lemma - start_id_];
  uint8 nchar = get_lemma_nchar(offset);
  const uint16 * ids = get_lemma_spell_ids(offset);
  int i = 0;
  for (; i < nchar && i < splids_max; i++)
    splids[i] = ids[i];
  return i;
}

size_t UserDict::predict(const char16 last_hzs[], uint16 hzs_len,
                         NPredictItem *npre_items, size_t npre_max,
                         size_t b4_used) {
  uint32 new_added = 0;
#ifdef ___PREDICT_ENABLED___
  int32 end = dict_info_.lemma_count - 1;
  int j = locate_first_in_predicts((const uint16*)last_hzs, hzs_len);
  if (j == -1)
    return 0;

  while (j <= end) {
    uint32 offset = predicts_[j];
    // Ignore deleted lemmas
    if (offset & kUserDictOffsetFlagRemove) {
      j++;
      continue;
    }
    uint32 nchar = get_lemma_nchar(offset);
    uint16 * words = get_lemma_word(offset);
    uint16 * splids = get_lemma_spell_ids(offset);

    if (nchar <= hzs_len) {
      j++;
      continue;
    }

    if (memcmp(words, last_hzs, hzs_len << 1) == 0) {
      if (new_added >= npre_max) {
        return new_added;
      }
      uint32 cpy_len =
          (nchar < kMaxPredictSize ? (nchar << 1) : (kMaxPredictSize << 1))
          - (hzs_len << 1);
      npre_items[new_added].his_len = hzs_len;
      npre_items[new_added].psb = get_lemma_score(words, splids, nchar);
      memcpy(npre_items[new_added].pre_hzs, words + hzs_len, cpy_len);
      if ((cpy_len >> 1) < kMaxPredictSize) {
        npre_items[new_added].pre_hzs[cpy_len >> 1] = 0;
      }
      new_added++;
    } else {
      break;
    }

    j++;
  }
#endif
  return new_added;
}

int32 UserDict::locate_in_offsets(char16 lemma_str[], uint16 splid_str[],
                                  uint16 lemma_len) {
  int32 max_off = dict_info_.lemma_count;

  UserDictSearchable searchable;
  prepare_locate(&searchable, splid_str, lemma_len);
#ifdef ___CACHE_ENABLED___
  int32 off;
  uint32 start, count;
  bool cached = load_cache(&searchable, &start, &count);
  if (cached) {
    off = start;
    max_off = start + count;
  } else {
    off = locate_first_in_offsets(&searchable);
    start = off;
  }
#else
  int32 off = locate_first_in_offsets(&searchable);
#endif

  if (off == -1) {
    return off;
  }

  while (off < max_off) {
    uint32 offset = offsets_[off];
    if (offset & kUserDictOffsetFlagRemove) {
      off++;
      continue;
    }
    uint16 * splids = get_lemma_spell_ids(offset);
#ifdef ___CACHE_ENABLED___
    if (!cached && 0 != fuzzy_compare_spell_id(splids, lemma_len, &searchable))
      break;
#else
    if (0 != fuzzy_compare_spell_id(splids, lemma_len, &searchable))
      break;
#endif
    if (equal_spell_id(splids, lemma_len, &searchable) == true) {
      uint16 * str = get_lemma_word(offset);
      uint32 i = 0;
      for (i = 0; i < lemma_len; i++) {
        if (str[i] == lemma_str[i])
          continue;
        break;
      }
      if (i < lemma_len) {
        off++;
        continue;
      }
#ifdef ___CACHE_ENABLED___
      // No need to save_cache here, since current function is invoked by
      // put_lemma. It's rarely possible for a user input same lemma twice.
      // That means first time user type a new lemma, it is newly added into
      // user dictionary, then it's possible that user type the same lemma
      // again.
      // Another reason save_cache can not be invoked here is this function
      // aborts when lemma is found, and it never knows the count.
#endif
      return off;
    }
    off++;
  }

  return -1;
}

#ifdef ___PREDICT_ENABLED___
uint32 UserDict::locate_where_to_insert_in_predicts(
    const uint16 * words, int lemma_len) {
  int32 begin = 0;
  int32 end = dict_info_.lemma_count - 1;
  int32 middle = end;

  uint32 last_matched = middle;

  while (begin <= end) {
    middle = (begin + end) >> 1;
    uint32 offset = offsets_[middle];
    uint8 nchar = get_lemma_nchar(offset);
    const uint16 * ws = get_lemma_word(offset);

    uint32 minl = nchar < lemma_len ? nchar : lemma_len;
    uint32 k = 0;
    int cmp = 0;

    for (; k < minl; k++) {
      if (ws[k] < words[k]) {
        cmp = -1;
        break;
      } else if (ws[k] > words[k]) {
        cmp = 1;
        break;
      }
    }
    if (cmp == 0) {
      if (nchar < lemma_len)
        cmp = -1;
      else if (nchar > lemma_len)
        cmp = 1;
    }

    if (cmp < 0) {
      begin = middle + 1;
      last_matched = middle;
    } else if (cmp > 0) {
      end = middle - 1;
    } else {
      end = middle - 1;
      last_matched = middle;
    }
  }

  return last_matched;
}

int32 UserDict::locate_first_in_predicts(const uint16 * words, int lemma_len) {
  int32 begin = 0;
  int32 end = dict_info_.lemma_count - 1;
  int32 middle = -1;

  int32 last_matched = middle;

  while (begin <= end) {
    middle = (begin + end) >> 1;
    uint32 offset = offsets_[middle];
    uint8 nchar = get_lemma_nchar(offset);
    const uint16 * ws = get_lemma_word(offset);

    uint32 minl = nchar < lemma_len ? nchar : lemma_len;
    uint32 k = 0;
    int cmp = 0;

    for (; k < minl; k++) {
      if (ws[k] < words[k]) {
        cmp = -1;
        break;
      } else if (ws[k] > words[k]) {
        cmp = 1;
        break;
      }
    }
    if (cmp == 0) {
      if (nchar >= lemma_len)
        last_matched = middle;
      if (nchar < lemma_len)
        cmp = -1;
      else if (nchar > lemma_len)
        cmp = 1;
    }

    if (cmp < 0) {
      begin = middle + 1;
    } else if (cmp > 0) {
      end = middle - 1;
    } else {
      end = middle - 1;
    }
  }

  return last_matched;
}

#endif

LemmaIdType UserDict::get_lemma_id(char16 lemma_str[], uint16 splids[],
                                   uint16 lemma_len) {
  int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
  if (off == -1) {
    return 0;
  }

  return ids_[off];
}

LmaScoreType UserDict::get_lemma_score(LemmaIdType lemma_id) {
  if (is_valid_state() == false)
    return 0;
  if (is_valid_lemma_id(lemma_id) == false)
    return 0;

  return translate_score(_get_lemma_score(lemma_id));
}

LmaScoreType UserDict::get_lemma_score(char16 lemma_str[], uint16 splids[],
                                uint16 lemma_len) {
  if (is_valid_state() == false)
    return 0;
  return translate_score(_get_lemma_score(lemma_str, splids, lemma_len));
}

int UserDict::_get_lemma_score(LemmaIdType lemma_id) {
  if (is_valid_state() == false)
    return 0;
  if (is_valid_lemma_id(lemma_id) == false)
    return 0;

  uint32 offset = offsets_by_id_[lemma_id - start_id_];

  uint32 nchar = get_lemma_nchar(offset);
  uint16 * spl = get_lemma_spell_ids(offset);
  uint16 * wrd = get_lemma_word(offset);

  int32 off = locate_in_offsets(wrd, spl, nchar);
  if (off == -1) {
    return 0;
  }

  return scores_[off];
}

int UserDict::_get_lemma_score(char16 lemma_str[], uint16 splids[],
                                uint16 lemma_len) {
  if (is_valid_state() == false)
    return 0;

  int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
  if (off == -1) {
    return 0;
  }

  return scores_[off];
}

#ifdef ___SYNC_ENABLED___
void UserDict::remove_lemma_from_sync_list(uint32 offset) {
  offset &= kUserDictOffsetMask;
  uint32 i = 0;
  for (; i < dict_info_.sync_count; i++) {
    unsigned int off = (syncs_[i] & kUserDictOffsetMask);
    if (off == offset)
      break;
  }
  if (i < dict_info_.sync_count) {
    syncs_[i] = syncs_[dict_info_.sync_count - 1];
    dict_info_.sync_count--;
  }
}
#endif

#ifdef ___PREDICT_ENABLED___
void UserDict::remove_lemma_from_predict_list(uint32 offset) {
  offset &= kUserDictOffsetMask;
  uint32 i = 0;
  for (; i < dict_info_.lemma_count; i++) {
    unsigned int off = (predicts_[i] & kUserDictOffsetMask);
    if (off == offset) {
      predicts_[i] |= kUserDictOffsetFlagRemove;
      break;
    }
  }
}
#endif

bool UserDict::remove_lemma_by_offset_index(int offset_index) {
  if (is_valid_state() == false)
    return 0;

  int32 off = offset_index;
  if (off == -1) {
    return false;
  }

  uint32 offset = offsets_[off];
  uint32 nchar = get_lemma_nchar(offset);

  offsets_[off] |= kUserDictOffsetFlagRemove;

#ifdef ___SYNC_ENABLED___
  // Remove corresponding sync item
  remove_lemma_from_sync_list(offset);
#endif

#ifdef ___PREDICT_ENABLED___
  remove_lemma_from_predict_list(offset);
#endif
  dict_info_.free_count++;
  dict_info_.free_size += (2 + (nchar << 2));

  if (state_ < USER_DICT_OFFSET_DIRTY)
    state_ = USER_DICT_OFFSET_DIRTY;
  return true;
}

bool UserDict::remove_lemma(LemmaIdType lemma_id) {
  if (is_valid_state() == false)
    return 0;
  if (is_valid_lemma_id(lemma_id) == false)
    return false;
  uint32 offset = offsets_by_id_[lemma_id - start_id_];

  uint32 nchar = get_lemma_nchar(offset);
  uint16 * spl = get_lemma_spell_ids(offset);
  uint16 * wrd = get_lemma_word(offset);

  int32 off = locate_in_offsets(wrd, spl, nchar);

  return remove_lemma_by_offset_index(off);
}

void UserDict::flush_cache() {
  LemmaIdType start_id = start_id_;
  if (!dict_file_)
    return;
  const char * file = strdup(dict_file_);
  if (!file)
    return;
  close_dict();
  load_dict(file, start_id, kUserDictIdEnd);
  free((void*)file);
#ifdef ___CACHE_ENABLED___
  cache_init();
#endif
  return;
}

bool UserDict::reset(const char *file) {
  FILE *fp = fopen(file, "w+");
  if (!fp) {
    return false;
  }
  uint32 version = kUserDictVersion;
  size_t wred = fwrite(&version, 1, 4, fp);
  UserDictInfo info;
  memset(&info, 0, sizeof(info));
  // By default, no limitation for lemma count and size
  // thereby, reclaim_ratio is never used
  wred += fwrite(&info, 1, sizeof(info), fp);
  if (wred != sizeof(info) + sizeof(version)) {
    fclose(fp);
    unlink(file);
    return false;
  }
  fclose(fp);
  return true;
}

bool UserDict::validate(const char *file) {
  // b is ignored in POSIX compatible os including Linux
  // while b is important flag for Windows to specify binary mode
  FILE *fp = fopen(file, "rb");
  if (!fp) {
    return false;
  }

  size_t size;
  size_t readed;
  uint32 version;
  UserDictInfo dict_info;

  // validate
  int err = fseek(fp, 0, SEEK_END);
  if (err) {
    goto error;
  }

  size = ftell(fp);
  if (size < 4 + sizeof(dict_info)) {
    goto error;
  }

  err = fseek(fp, 0, SEEK_SET);
  if (err) {
    goto error;
  }

  readed = fread(&version, 1, sizeof(version), fp);
  if (readed < sizeof(version)) {
    goto error;
  }
  if (version != kUserDictVersion) {
    goto error;
  }

  err = fseek(fp, -1 * sizeof(dict_info), SEEK_END);
  if (err) {
    goto error;
  }

  readed = fread(&dict_info, 1, sizeof(dict_info), fp);
  if (readed != sizeof(dict_info)) {
    goto error;
  }

  if (size != get_dict_file_size(&dict_info)) {
    goto error;
  }

  fclose(fp);
  return true;

 error:
  fclose(fp);
  return false;
}

bool UserDict::load(const char *file, LemmaIdType start_id) {
  if (0 != pthread_mutex_trylock(&g_mutex_)) {
    return false;
  }
  // b is ignored in POSIX compatible os including Linux
  // while b is important flag for Windows to specify binary mode
  FILE *fp = fopen(file, "rb");
  if (!fp) {
    pthread_mutex_unlock(&g_mutex_);
    return false;
  }

  size_t readed, toread;
  UserDictInfo dict_info;
  uint8 *lemmas = NULL;
  uint32 *offsets = NULL;
#ifdef ___SYNC_ENABLED___
  uint32 *syncs = NULL;
#endif
  uint32 *scores = NULL;
  uint32 *ids = NULL;
  uint32 *offsets_by_id = NULL;
#ifdef ___PREDICT_ENABLED___
  uint32 *predicts = NULL;
#endif
  size_t i;
  int err;

  err = fseek(fp, -1 * sizeof(dict_info), SEEK_END);
  if (err) goto error;

  readed = fread(&dict_info, 1, sizeof(dict_info), fp);
  if (readed != sizeof(dict_info)) goto error;

  lemmas = (uint8 *)malloc(
      dict_info.lemma_size +
      (kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2))));

  if (!lemmas) goto error;

  offsets = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
  if (!offsets) goto error;

#ifdef ___PREDICT_ENABLED___
  predicts = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
  if (!predicts) goto error;
#endif

#ifdef ___SYNC_ENABLED___
  syncs = (uint32 *)malloc((dict_info.sync_count + kUserDictPreAlloc) << 2);
  if (!syncs) goto error;
#endif

  scores = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
  if (!scores) goto error;

  ids = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
  if (!ids) goto error;

  offsets_by_id = (uint32 *)malloc(
      (dict_info.lemma_count + kUserDictPreAlloc) << 2);
  if (!offsets_by_id) goto error;

  err = fseek(fp, 4, SEEK_SET);
  if (err) goto error;

  readed = 0;
  while (readed < dict_info.lemma_size && !ferror(fp) && !feof(fp)) {
    readed += fread(lemmas + readed, 1, dict_info.lemma_size - readed, fp);
  }
  if (readed < dict_info.lemma_size)
    goto error;

  toread = (dict_info.lemma_count << 2);
  readed = 0;
  while (readed < toread && !ferror(fp) && !feof(fp)) {
    readed += fread((((uint8*)offsets) + readed), 1, toread - readed, fp);
  }
  if (readed < toread)
    goto error;

#ifdef ___PREDICT_ENABLED___
  toread = (dict_info.lemma_count << 2);
  readed = 0;
  while (readed < toread && !ferror(fp) && !feof(fp)) {
    readed += fread((((uint8*)predicts) + readed), 1, toread - readed, fp);
  }
  if (readed < toread)
    goto error;
#endif

  readed = 0;
  while (readed < toread && !ferror(fp) && !feof(fp)) {
    readed += fread((((uint8*)scores) + readed), 1, toread - readed, fp);
  }
  if (readed < toread)
    goto error;

#ifdef ___SYNC_ENABLED___
  toread = (dict_info.sync_count << 2);
  readed = 0;
  while (readed < toread && !ferror(fp) && !feof(fp)) {
    readed += fread((((uint8*)syncs) + readed), 1, toread - readed, fp);
  }
  if (readed < toread)
    goto error;
#endif

  for (i = 0; i < dict_info.lemma_count; i++) {
    ids[i] = start_id + i;
    offsets_by_id[i] = offsets[i];
  }

  lemmas_ = lemmas;
  offsets_ = offsets;
#ifdef ___SYNC_ENABLED___
  syncs_ = syncs;
  sync_count_size_ = dict_info.sync_count + kUserDictPreAlloc;
#endif
  offsets_by_id_ = offsets_by_id;
  scores_ = scores;
  ids_ = ids;
#ifdef ___PREDICT_ENABLED___
  predicts_ = predicts;
#endif
  lemma_count_left_ = kUserDictPreAlloc;
  lemma_size_left_ = kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2));
  memcpy(&dict_info_, &dict_info, sizeof(dict_info));
  state_ = USER_DICT_SYNC;

  fclose(fp);

  pthread_mutex_unlock(&g_mutex_);
  return true;

 error:
  if (lemmas) free(lemmas);
  if (offsets) free(offsets);
#ifdef ___SYNC_ENABLED___
  if (syncs) free(syncs);
#endif
  if (scores) free(scores);
  if (ids) free(ids);
  if (offsets_by_id) free(offsets_by_id);
#ifdef ___PREDICT_ENABLED___
  if (predicts) free(predicts);
#endif
  fclose(fp);
  pthread_mutex_unlock(&g_mutex_);
  return false;
}

void UserDict::write_back() {
  // XXX write back is only allowed from close_dict due to thread-safe sake
  if (state_ == USER_DICT_NONE || state_ == USER_DICT_SYNC)
    return;
  int fd = open(dict_file_, O_WRONLY);
  if (fd == -1)
    return;
  switch (state_) {
    case USER_DICT_DEFRAGMENTED:
      write_back_all(fd);
      break;
    case USER_DICT_LEMMA_DIRTY:
      write_back_lemma(fd);
      break;
    case USER_DICT_OFFSET_DIRTY:
      write_back_offset(fd);
      break;
    case USER_DICT_SCORE_DIRTY:
      write_back_score(fd);
      break;
#ifdef ___SYNC_ENABLED___
    case USER_DICT_SYNC_DIRTY:
      write_back_sync(fd);
      break;
#endif
    default:
      break;
  }
  // It seems truncate is not need on Linux, Windows except Mac
  // I am doing it here anyway for safety.
  off_t cur = lseek(fd, 0, SEEK_CUR);
#ifndef _WIN32
  ftruncate(fd, cur);
#endif
  close(fd);
  state_ = USER_DICT_SYNC;
}

#ifdef ___SYNC_ENABLED___
void UserDict::write_back_sync(int fd) {
  int err = lseek(fd, 4 + dict_info_.lemma_size
                  + (dict_info_.lemma_count << 3)
#ifdef ___PREDICT_ENABLED___
                  + (dict_info_.lemma_count << 2)
#endif
                  , SEEK_SET);
  if (err == -1)
    return;
  write(fd, syncs_, dict_info_.sync_count << 2);
  write(fd, &dict_info_, sizeof(dict_info_));
}
#endif

void UserDict::write_back_offset(int fd) {
  int err = lseek(fd, 4 + dict_info_.lemma_size, SEEK_SET);
  if (err == -1)
    return;
  write(fd, offsets_, dict_info_.lemma_count << 2);
#ifdef ___PREDICT_ENABLED___
  write(fd, predicts_, dict_info_.lemma_count << 2);
#endif
  write(fd, scores_, dict_info_.lemma_count << 2);
#ifdef ___SYNC_ENABLED___
  write(fd, syncs_, dict_info_.sync_count << 2);
#endif
  write(fd, &dict_info_, sizeof(dict_info_));
}

void UserDict::write_back_score(int fd) {
  int err = lseek(fd, 4 + dict_info_.lemma_size
                  + (dict_info_.lemma_count << 2)
#ifdef ___PREDICT_ENABLED___
                  + (dict_info_.lemma_count << 2)
#endif
                  , SEEK_SET);
  if (err == -1)
    return;
  write(fd, scores_, dict_info_.lemma_count << 2);
#ifdef ___SYNC_ENABLED___
  write(fd, syncs_, dict_info_.sync_count << 2);
#endif
  write(fd, &dict_info_, sizeof(dict_info_));
}

void UserDict::write_back_lemma(int fd) {
  int err = lseek(fd, 4, SEEK_SET);
  if (err == -1)
    return;
  // New lemmas are always appended, no need to write whole lemma block
  size_t need_write = kUserDictPreAlloc *
      (2 + (kUserDictAverageNchar << 2)) - lemma_size_left_;
  err = lseek(fd, dict_info_.lemma_size - need_write, SEEK_CUR);
  if (err == -1)
    return;
  write(fd, lemmas_ + dict_info_.lemma_size - need_write, need_write);

  write(fd, offsets_,  dict_info_.lemma_count << 2);
#ifdef ___PREDICT_ENABLED___
  write(fd, predicts_,  dict_info_.lemma_count << 2);
#endif
  write(fd, scores_, dict_info_.lemma_count << 2);
#ifdef ___SYNC_ENABLED___
  write(fd, syncs_, dict_info_.sync_count << 2);
#endif
  write(fd, &dict_info_, sizeof(dict_info_));
}

void UserDict::write_back_all(int fd) {
  // XXX lemma_size is handled differently in writeall
  // and writelemma. I update lemma_size and lemma_count in different
  // places for these two cases. Should fix it to make it consistent.
  int err = lseek(fd, 4, SEEK_SET);
  if (err == -1)
    return;
  write(fd, lemmas_, dict_info_.lemma_size);
  write(fd, offsets_, dict_info_.lemma_count << 2);
#ifdef ___PREDICT_ENABLED___
  write(fd, predicts_, dict_info_.lemma_count << 2);
#endif
  write(fd, scores_, dict_info_.lemma_count << 2);
#ifdef ___SYNC_ENABLED___
  write(fd, syncs_, dict_info_.sync_count << 2);
#endif
  write(fd, &dict_info_, sizeof(dict_info_));
}

#ifdef ___CACHE_ENABLED___
bool UserDict::load_cache(UserDictSearchable *searchable,
                          uint32 *offset, uint32 *length) {
  UserDictCache *cache = &caches_[searchable->splids_len - 1];
  if (cache->head == cache->tail)
    return false;

  uint16 j, sig_len = kMaxLemmaSize / 4;
  uint16 i = cache->head;
  while (1) {
    j = 0;
    for (; j < sig_len; j++) {
      if (cache->signatures[i][j] != searchable->signature[j])
        break;
    }
    if (j < sig_len) {
      i++;
      if (i >= kUserDictCacheSize)
        i -= kUserDictCacheSize;
      if (i == cache->tail)
        break;
      continue;
    }
    *offset = cache->offsets[i];
    *length = cache->lengths[i];
    return true;
  }
  return false;
}

void UserDict::save_cache(UserDictSearchable *searchable,
                          uint32 offset, uint32 length) {
  UserDictCache *cache = &caches_[searchable->splids_len - 1];
  uint16 next = cache->tail;

  cache->offsets[next] = offset;
  cache->lengths[next] = length;
  uint16 sig_len = kMaxLemmaSize / 4;
  uint16 j = 0;
  for (; j < sig_len; j++) {
    cache->signatures[next][j] = searchable->signature[j];
  }

  if (++next >= kUserDictCacheSize) {
    next -= kUserDictCacheSize;
  }
  if (next == cache->head) {
    cache->head++;
    if (cache->head >= kUserDictCacheSize) {
      cache->head -= kUserDictCacheSize;
    }
  }
  cache->tail = next;
}

void UserDict::reset_cache() {
  memset(caches_, 0, sizeof(caches_));
}

bool UserDict::load_miss_cache(UserDictSearchable *searchable) {
  UserDictMissCache *cache = &miss_caches_[searchable->splids_len - 1];
  if (cache->head == cache->tail)
    return false;

  uint16 j, sig_len = kMaxLemmaSize / 4;
  uint16 i = cache->head;
  while (1) {
    j = 0;
    for (; j < sig_len; j++) {
      if (cache->signatures[i][j] != searchable->signature[j])
        break;
    }
    if (j < sig_len) {
      i++;
      if (i >= kUserDictMissCacheSize)
        i -= kUserDictMissCacheSize;
      if (i == cache->tail)
        break;
      continue;
    }
    return true;
  }
  return false;
}

void UserDict::save_miss_cache(UserDictSearchable *searchable) {
  UserDictMissCache *cache = &miss_caches_[searchable->splids_len - 1];
  uint16 next = cache->tail;

  uint16 sig_len = kMaxLemmaSize / 4;
  uint16 j = 0;
  for (; j < sig_len; j++) {
    cache->signatures[next][j] = searchable->signature[j];
  }

  if (++next >= kUserDictMissCacheSize) {
    next -= kUserDictMissCacheSize;
  }
  if (next == cache->head) {
    cache->head++;
    if (cache->head >= kUserDictMissCacheSize) {
      cache->head -= kUserDictMissCacheSize;
    }
  }
  cache->tail = next;
}

void UserDict::reset_miss_cache() {
  memset(miss_caches_, 0, sizeof(miss_caches_));
}

void UserDict::cache_init() {
  reset_cache();
  reset_miss_cache();
}

bool UserDict::cache_hit(UserDictSearchable *searchable,
                         uint32 *offset, uint32 *length) {
  bool hit = load_miss_cache(searchable);
  if (hit) {
    *offset = 0;
    *length = 0;
    return true;
  }
  hit = load_cache(searchable, offset, length);
  if (hit) {
    return true;
  }
  return false;
}

void UserDict::cache_push(UserDictCacheType type,
                         UserDictSearchable *searchable,
                         uint32 offset, uint32 length) {
  switch (type) {
    case USER_DICT_MISS_CACHE:
      save_miss_cache(searchable);
      break;
    case USER_DICT_CACHE:
      save_cache(searchable, offset, length);
      break;
    default:
      break;
  }
}

#endif

void UserDict::defragment(void) {
#ifdef ___DEBUG_PERF___
  DEBUG_PERF_BEGIN;
#endif
  if (is_valid_state() == false)
    return;
  // Fixup offsets_, set REMOVE flag to lemma's flag if needed
  size_t first_freed = 0;
  size_t first_inuse = 0;
  while (first_freed < dict_info_.lemma_count) {
    // Find first freed offset
    while ((offsets_[first_freed] & kUserDictOffsetFlagRemove) == 0 &&
            first_freed < dict_info_.lemma_count) {
      first_freed++;
    }
    if (first_freed < dict_info_.lemma_count) {
      // Save REMOVE flag to lemma flag
      int off = offsets_[first_freed];
      set_lemma_flag(off, kUserDictLemmaFlagRemove);
    } else {
      break;
    }
    // Find first inuse offse after first_freed
    first_inuse = first_freed + 1;
    while ((offsets_[first_inuse] & kUserDictOffsetFlagRemove) &&
           (first_inuse < dict_info_.lemma_count)) {
      // Save REMOVE flag to lemma flag
      int off = offsets_[first_inuse];
      set_lemma_flag(off, kUserDictLemmaFlagRemove);
      first_inuse++;
    }
    if (first_inuse >= dict_info_.lemma_count) {
      break;
    }
    // Swap offsets_
    int tmp = offsets_[first_inuse];
    offsets_[first_inuse] = offsets_[first_freed];
    offsets_[first_freed] = tmp;
    // Move scores_, no need to swap
    tmp = scores_[first_inuse];
    scores_[first_inuse] = scores_[first_freed];
    scores_[first_freed] = tmp;
    // Swap ids_
    LemmaIdType tmpid = ids_[first_inuse];
    ids_[first_inuse] = ids_[first_freed];
    ids_[first_freed] = tmpid;
    // Go on
    first_freed++;
  }
#ifdef ___PREDICT_ENABLED___
  // Fixup predicts_
  first_freed = 0;
  first_inuse = 0;
  while (first_freed < dict_info_.lemma_count) {
    // Find first freed offset
    while ((predicts_[first_freed] & kUserDictOffsetFlagRemove) == 0 &&
            first_freed < dict_info_.lemma_count) {
      first_freed++;
    }
    if (first_freed >= dict_info_.lemma_count)
      break;
    // Find first inuse offse after first_freed
    first_inuse = first_freed + 1;
    while ((predicts_[first_inuse] & kUserDictOffsetFlagRemove)
           && (first_inuse < dict_info_.lemma_count)) {
      first_inuse++;
    }
    if (first_inuse >= dict_info_.lemma_count) {
      break;
    }
    // Swap offsets_
    int tmp = predicts_[first_inuse];
    predicts_[first_inuse] = predicts_[first_freed];
    predicts_[first_freed] = tmp;
    // Go on
    first_freed++;
  }
#endif
  dict_info_.lemma_count = first_freed;
  // Fixup lemmas_
  size_t begin = 0;
  size_t end = 0;
  size_t dst = 0;
  int total_size = dict_info_.lemma_size + lemma_size_left_;
  int total_count = dict_info_.lemma_count + lemma_count_left_;
  size_t real_size = total_size - lemma_size_left_;
  while (dst < real_size) {
    unsigned char flag = get_lemma_flag(dst);
    unsigned char nchr = get_lemma_nchar(dst);
    if ((flag & kUserDictLemmaFlagRemove) == 0) {
      dst += nchr * 4 + 2;
      continue;
    }
    break;
  }
  if (dst >= real_size)
    return;

  end = dst;
  while (end < real_size) {
    begin = end + get_lemma_nchar(end) * 4 + 2;
 repeat:
    // not used any more
    if (begin >= real_size)
      break;
    unsigned char flag = get_lemma_flag(begin);
    unsigned char nchr = get_lemma_nchar(begin);
    if (flag & kUserDictLemmaFlagRemove) {
      begin += nchr * 4 + 2;
      goto repeat;
    }
    end = begin + nchr * 4 + 2;
    while (end < real_size) {
      unsigned char eflag = get_lemma_flag(end);
      unsigned char enchr = get_lemma_nchar(end);
      if ((eflag & kUserDictLemmaFlagRemove) == 0) {
        end += enchr * 4 + 2;
        continue;
      }
      break;
    }
    memmove(lemmas_ + dst, lemmas_ + begin, end - begin);
    for (size_t j = 0; j < dict_info_.lemma_count; j++) {
      if (offsets_[j] >= begin && offsets_[j] < end) {
        offsets_[j] -= (begin - dst);
        offsets_by_id_[ids_[j] - start_id_] = offsets_[j];
      }
#ifdef ___PREDICT_ENABLED___
      if (predicts_[j] >= begin && predicts_[j] < end) {
        predicts_[j] -= (begin - dst);
      }
#endif
    }
#ifdef ___SYNC_ENABLED___
    for (size_t j = 0; j < dict_info_.sync_count; j++) {
      if (syncs_[j] >= begin && syncs_[j] < end) {
        syncs_[j] -= (begin - dst);
      }
    }
#endif
    dst += (end - begin);
  }

  dict_info_.free_count = 0;
  dict_info_.free_size = 0;
  dict_info_.lemma_size = dst;
  lemma_size_left_ = total_size - dict_info_.lemma_size;
  lemma_count_left_ = total_count - dict_info_.lemma_count;

  // XXX Without following code,
  // offsets_by_id_ is not reordered.
  // That's to say, all removed lemmas' ids are not collected back.
  // There may not be room for addition of new lemmas due to
  // offsests_by_id_ reason, although lemma_size_left_ is fixed.
  // By default, we do want defrag as fast as possible, because
  // during defrag procedure, other peers can not write new lemmas
  // to user dictionary file.
  // XXX If write-back is invoked immediately after
  // this defragment, no need to fix up following in-mem data.
  for (uint32 i = 0; i < dict_info_.lemma_count; i++) {
    ids_[i] = start_id_ + i;
    offsets_by_id_[i] = offsets_[i];
  }

  state_ = USER_DICT_DEFRAGMENTED;

#ifdef ___DEBUG_PERF___
  DEBUG_PERF_END;
  LOGD_PERF("defragment");
#endif
}

#ifdef ___SYNC_ENABLED___
void UserDict::clear_sync_lemmas(unsigned int start, unsigned int end) {
  if (is_valid_state() == false)
    return;
  if (end > dict_info_.sync_count)
    end = dict_info_.sync_count;
  memmove(syncs_ + start, syncs_ + end, (dict_info_.sync_count - end) << 2);
  dict_info_.sync_count -= (end - start);
  if (state_ < USER_DICT_SYNC_DIRTY)
    state_ = USER_DICT_SYNC_DIRTY;
}

int UserDict::get_sync_count() {
  if (is_valid_state() == false)
    return 0;
  return dict_info_.sync_count;
}

LemmaIdType UserDict::put_lemma_no_sync(char16 lemma_str[], uint16 splids[],
                        uint16 lemma_len, uint16 count, uint64 lmt) {
  int again = 0;
 begin:
  LemmaIdType id;
  uint32 * syncs_bak = syncs_;
  syncs_ = NULL;
  id = _put_lemma(lemma_str, splids, lemma_len, count, lmt);
  syncs_ = syncs_bak;
  if (id == 0 && again == 0) {
    if ((dict_info_.limit_lemma_count > 0 &&
        dict_info_.lemma_count >= dict_info_.limit_lemma_count)
        || (dict_info_.limit_lemma_size > 0 &&
            dict_info_.lemma_size + (2 + (lemma_len << 2))
            > dict_info_.limit_lemma_size)) {
      // XXX Always reclaim and defrag in sync code path
      //     sync thread is background thread and ok with heavy work
      reclaim();
      defragment();
      flush_cache();
      again = 1;
      goto begin;
    }
  }
  return id;
}

int UserDict::put_lemmas_no_sync_from_utf16le_string(char16 * lemmas, int len) {
  int newly_added = 0;

  SpellingParser * spl_parser = new SpellingParser();
  if (!spl_parser) {
    return 0;
  }
#ifdef ___DEBUG_PERF___
  DEBUG_PERF_BEGIN;
#endif
  char16 *ptr = lemmas;

  // Extract pinyin,words,frequence,last_mod_time
  char16 * p = ptr, * py16 = ptr;
  char16 * hz16 = NULL;
  int py16_len = 0;
  uint16 splid[kMaxLemmaSize];
  int splid_len = 0;
  int hz16_len = 0;
  char16 * fr16 = NULL;
  int fr16_len = 0;

  while (p - ptr < len) {
    // Pinyin
    py16 = p;
    splid_len = 0;
    while (*p != 0x2c && (p - ptr) < len) {
      if (*p == 0x20)
        splid_len++;
      p++;
    }
    splid_len++;
    if (p - ptr == len)
      break;
    py16_len = p - py16;
    if (kMaxLemmaSize < splid_len) {
      break;
    }
    bool is_pre;
    int splidl = spl_parser->splstr16_to_idxs_f(
        py16, py16_len, splid, NULL, kMaxLemmaSize, is_pre);
    if (splidl != splid_len)
      break;
    // Phrase
    hz16 = ++p;
    while (*p != 0x2c && (p - ptr) < len) {
      p++;
    }
    hz16_len = p - hz16;
    if (hz16_len != splid_len)
      break;
    // Frequency
    fr16 = ++p;
    fr16_len = 0;
    while (*p != 0x2c && (p - ptr) < len) {
      p++;
    }
    fr16_len = p - fr16;
    uint32 intf = (uint32)utf16le_atoll(fr16, fr16_len);
    // Last modified time
    fr16 = ++p;
    fr16_len = 0;
    while (*p != 0x3b && (p - ptr) < len) {
      p++;
    }
    fr16_len = p - fr16;
    uint64 last_mod = utf16le_atoll(fr16, fr16_len);

    put_lemma_no_sync(hz16, splid, splid_len, intf, last_mod);
    newly_added++;

    p++;
  }

#ifdef ___DEBUG_PERF___
  DEBUG_PERF_END;
  LOGD_PERF("put_lemmas_no_sync_from_utf16le_string");
#endif
  return newly_added;
}

int UserDict::get_sync_lemmas_in_utf16le_string_from_beginning(
    char16 * str, int size, int * count) {
  int len = 0;
  *count = 0;

  int left_len = size;

  if (is_valid_state() == false)
    return len;

  SpellingTrie * spl_trie = &SpellingTrie::get_instance();
  if (!spl_trie) {
    return 0;
  }

  uint32 i;
  for (i = 0; i < dict_info_.sync_count; i++) {
    int offset = syncs_[i];
    uint32 nchar = get_lemma_nchar(offset);
    uint16 *spl = get_lemma_spell_ids(offset);
    uint16 *wrd = get_lemma_word(offset);
    int score = _get_lemma_score(wrd, spl, nchar);

    static char score_temp[32], *pscore_temp = score_temp;
    static char16 temp[256], *ptemp = temp;

    pscore_temp = score_temp;
    ptemp = temp;

    uint32 j;
    // Add pinyin
    for (j = 0; j < nchar; j++) {
      int ret_len = spl_trie->get_spelling_str16(
          spl[j], ptemp, temp + sizeof(temp) - ptemp);
      if (ret_len <= 0)
        break;
      ptemp += ret_len;
      if (ptemp < temp + sizeof(temp) - 1) {
        *(ptemp++) = ' ';
      } else {
        j = 0;
        break;
      }
    }
    if (j < nchar) {
      continue;
    }
    ptemp--;
    if (ptemp < temp + sizeof(temp) - 1) {
      *(ptemp++) = ',';
    } else {
      continue;
    }
    // Add phrase
    for (j = 0; j < nchar; j++) {
      if (ptemp < temp + sizeof(temp) - 1) {
        *(ptemp++) = wrd[j];
      } else {
        break;
      }
    }
    if (j < nchar) {
      continue;
    }
    if (ptemp < temp + sizeof(temp) - 1) {
      *(ptemp++) = ',';
    } else {
      continue;
    }
    // Add frequency
    uint32 intf = extract_score_freq(score);
    int ret_len = utf16le_lltoa(intf, ptemp, temp + sizeof(temp) - ptemp);
    if (ret_len <= 0)
      continue;
    ptemp += ret_len;
    if (ptemp < temp + sizeof(temp) - 1) {
      *(ptemp++) = ',';
    } else {
      continue;
    }
    // Add last modified time
    uint64 last_mod = extract_score_lmt(score);
    ret_len = utf16le_lltoa(last_mod, ptemp, temp + sizeof(temp) - ptemp);
    if (ret_len <= 0)
      continue;
    ptemp += ret_len;
    if (ptemp < temp + sizeof(temp) - 1) {
      *(ptemp++) = ';';
    } else {
      continue;
    }

    // Write to string
    int need_len = ptemp - temp;
    if (need_len > left_len)
      break;
    memcpy(str + len, temp, need_len * 2);
    left_len -= need_len;

    len += need_len;
    (*count)++;
  }

  if (len > 0) {
    if (state_ < USER_DICT_SYNC_DIRTY)
      state_ = USER_DICT_SYNC_DIRTY;
  }
  return len;
}

#endif

bool UserDict::state(UserDictStat * stat) {
  if (is_valid_state() == false)
    return false;
  if (!stat)
    return false;
  stat->version = version_;
  stat->file_name = dict_file_;
  stat->load_time.tv_sec = load_time_.tv_sec;
  stat->load_time.tv_usec = load_time_.tv_usec;
  pthread_mutex_lock(&g_mutex_);
  stat->last_update.tv_sec = g_last_update_.tv_sec;
  stat->last_update.tv_usec = g_last_update_.tv_usec;
  pthread_mutex_unlock(&g_mutex_);
  stat->disk_size = get_dict_file_size(&dict_info_);
  stat->lemma_count = dict_info_.lemma_count;
  stat->lemma_size = dict_info_.lemma_size;
  stat->delete_count = dict_info_.free_count;
  stat->delete_size = dict_info_.free_size;
#ifdef ___SYNC_ENABLED___
  stat->sync_count = dict_info_.sync_count;
#endif
  stat->limit_lemma_count = dict_info_.limit_lemma_count;
  stat->limit_lemma_size = dict_info_.limit_lemma_size;
  stat->reclaim_ratio = dict_info_.reclaim_ratio;
  return true;
}

void UserDict::set_limit(uint32 max_lemma_count,
                         uint32 max_lemma_size, uint32 reclaim_ratio) {
  dict_info_.limit_lemma_count = max_lemma_count;
  dict_info_.limit_lemma_size = max_lemma_size;
  if (reclaim_ratio > 100)
    reclaim_ratio = 100;
  dict_info_.reclaim_ratio = reclaim_ratio;
}

void UserDict::reclaim() {
  if (is_valid_state() == false)
    return;

  switch (dict_info_.reclaim_ratio) {
    case 0:
      return;
    case 100:
      // TODO: CLEAR to be implemented
      assert(false);
      return;
    default:
      break;
  }

  // XXX Reclaim is only based on count, not size
  uint32 count = dict_info_.lemma_count;
  int rc = count * dict_info_.reclaim_ratio / 100;

  UserDictScoreOffsetPair * score_offset_pairs = NULL;
  score_offset_pairs = (UserDictScoreOffsetPair *)malloc(
      sizeof(UserDictScoreOffsetPair) * rc);
  if (score_offset_pairs == NULL) {
    return;
  }

  for (int i = 0; i < rc; i++) {
    int s = scores_[i];
    score_offset_pairs[i].score = s;
    score_offset_pairs[i].offset_index = i;
  }

  for (int i = (rc + 1) / 2; i >= 0; i--)
    shift_down(score_offset_pairs, i, rc);

  for (uint32 i = rc; i < dict_info_.lemma_count; i++) {
    int s = scores_[i];
    if (s < score_offset_pairs[0].score) {
      score_offset_pairs[0].score = s;
      score_offset_pairs[0].offset_index = i;
      shift_down(score_offset_pairs, 0, rc);
    }
  }

  for (int i = 0; i < rc; i++) {
    int off = score_offset_pairs[i].offset_index;
    remove_lemma_by_offset_index(off);
  }
  if (rc > 0) {
    if (state_ < USER_DICT_OFFSET_DIRTY)
      state_ = USER_DICT_OFFSET_DIRTY;
  }

  free(score_offset_pairs);
}

inline void UserDict::swap(UserDictScoreOffsetPair * sop, int i, int j) {
  int s = sop[i].score;
  int p = sop[i].offset_index;
  sop[i].score = sop[j].score;
  sop[i].offset_index = sop[j].offset_index;
  sop[j].score = s;
  sop[j].offset_index = p;
}

void UserDict::shift_down(UserDictScoreOffsetPair * sop, int i, int n) {
  int par = i;
  while (par < n) {
    int left = par * 2 + 1;
    int right = left + 1;
    if (left >= n && right >= n)
      break;
    if (right >= n) {
      if (sop[left].score > sop[par].score) {
        swap(sop, left, par);
        par = left;
        continue;
      }
    } else if (sop[left].score > sop[right].score &&
               sop[left].score > sop[par].score) {
      swap(sop, left, par);
      par = left;
      continue;
    } else if (sop[right].score > sop[left].score &&
               sop[right].score > sop[par].score) {
      swap(sop, right, par);
      par = right;
      continue;
    }
    break;
  }
}

LemmaIdType UserDict::put_lemma(char16 lemma_str[], uint16 splids[],
                                uint16 lemma_len, uint16 count) {
  return _put_lemma(lemma_str, splids, lemma_len, count, time(NULL));
}

LemmaIdType UserDict::_put_lemma(char16 lemma_str[], uint16 splids[],
                                 uint16 lemma_len, uint16 count, uint64 lmt) {
#ifdef ___DEBUG_PERF___
  DEBUG_PERF_BEGIN;
#endif
  if (is_valid_state() == false)
    return 0;
  int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
  if (off != -1) {
    int delta_score = count - scores_[off];
    dict_info_.total_nfreq += delta_score;
    scores_[off] = build_score(lmt, count);
    if (state_ < USER_DICT_SCORE_DIRTY)
      state_ = USER_DICT_SCORE_DIRTY;
#ifdef ___DEBUG_PERF___
    DEBUG_PERF_END;
    LOGD_PERF("_put_lemma(update)");
#endif
    return ids_[off];
  } else {
    if ((dict_info_.limit_lemma_count > 0 &&
        dict_info_.lemma_count >= dict_info_.limit_lemma_count)
        || (dict_info_.limit_lemma_size > 0 &&
            dict_info_.lemma_size + (2 + (lemma_len << 2))
            > dict_info_.limit_lemma_size)) {
      // XXX Don't defragment here, it's too time-consuming.
      return 0;
    }
    int flushed = 0;
    if (lemma_count_left_ == 0 ||
        lemma_size_left_ < (size_t)(2 + (lemma_len << 2))) {

      // XXX When there is no space for new lemma, we flush to disk
      // flush_cache() may be called by upper user
      // and better place shoule be found instead of here
      flush_cache();
      flushed = 1;
      // Or simply return and do nothing
      // return 0;
    }
#ifdef ___DEBUG_PERF___
    DEBUG_PERF_END;
    LOGD_PERF(flushed ? "_put_lemma(flush+add)" : "_put_lemma(add)");
#endif
    LemmaIdType id = append_a_lemma(lemma_str, splids, lemma_len, count, lmt);
#ifdef ___SYNC_ENABLED___
    if (syncs_ && id != 0) {
      queue_lemma_for_sync(id);
    }
#endif
    return id;
  }
  return 0;
}

#ifdef ___SYNC_ENABLED___
void UserDict::queue_lemma_for_sync(LemmaIdType id) {
  if (dict_info_.sync_count < sync_count_size_) {
    syncs_[dict_info_.sync_count++] = offsets_by_id_[id - start_id_];
  } else {
    uint32 * syncs = (uint32*)realloc(
        syncs_, (sync_count_size_ + kUserDictPreAlloc) << 2);
    if (syncs) {
      sync_count_size_ += kUserDictPreAlloc;
      syncs_ = syncs;
      syncs_[dict_info_.sync_count++] = offsets_by_id_[id - start_id_];
    }
  }
}
#endif

LemmaIdType UserDict::update_lemma(LemmaIdType lemma_id, int16 delta_count,
                                   bool selected) {
#ifdef ___DEBUG_PERF___
  DEBUG_PERF_BEGIN;
#endif
  if (is_valid_state() == false)
    return 0;
  if (is_valid_lemma_id(lemma_id) == false)
    return 0;
  uint32 offset = offsets_by_id_[lemma_id - start_id_];
  uint8 lemma_len = get_lemma_nchar(offset);
  char16 * lemma_str = get_lemma_word(offset);
  uint16 * splids = get_lemma_spell_ids(offset);

  int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
  if (off != -1) {
    int score = scores_[off];
    int count = extract_score_freq(score);
    uint64 lmt = extract_score_lmt(score);
    if (count + delta_count > kUserDictMaxFrequency ||
        count + delta_count < count) {
      delta_count = kUserDictMaxFrequency - count;
    }
    count += delta_count;
    dict_info_.total_nfreq += delta_count;
    if (selected) {
      lmt = time(NULL);
    }
    scores_[off] = build_score(lmt, count);
    if (state_ < USER_DICT_SCORE_DIRTY)
      state_ = USER_DICT_SCORE_DIRTY;
#ifdef ___DEBUG_PERF___
    DEBUG_PERF_END;
    LOGD_PERF("update_lemma");
#endif
#ifdef ___SYNC_ENABLED___
    queue_lemma_for_sync(ids_[off]);
#endif
    return ids_[off];
  }
  return 0;
}

size_t UserDict::get_total_lemma_count() {
  return dict_info_.total_nfreq;
}

void UserDict::set_total_lemma_count_of_others(size_t count) {
  total_other_nfreq_ = count;
}

LemmaIdType UserDict::append_a_lemma(char16 lemma_str[], uint16 splids[],
                                   uint16 lemma_len, uint16 count, uint64 lmt) {
  LemmaIdType id = get_max_lemma_id() + 1;
  size_t offset = dict_info_.lemma_size;
  if (offset > kUserDictOffsetMask)
    return 0;

  lemmas_[offset] = 0;
  lemmas_[offset + 1] = (uint8)lemma_len;
  for (size_t i = 0; i < lemma_len; i++) {
    *((uint16*)&lemmas_[offset + 2 + (i << 1)]) = splids[i];
    *((char16*)&lemmas_[offset + 2 + (lemma_len << 1) + (i << 1)])
        = lemma_str[i];
  }
  uint32 off = dict_info_.lemma_count;
  offsets_[off] = offset;
  scores_[off] = build_score(lmt, count);
  ids_[off] = id;
#ifdef ___PREDICT_ENABLED___
  predicts_[off] = offset;
#endif

  offsets_by_id_[id - start_id_] = offset;

  dict_info_.lemma_count++;
  dict_info_.lemma_size += (2 + (lemma_len << 2));
  lemma_count_left_--;
  lemma_size_left_ -= (2 + (lemma_len << 2));

  // Sort

  UserDictSearchable searchable;
  prepare_locate(&searchable, splids, lemma_len);

  size_t i = 0;
  while (i < off) {
    offset = offsets_[i];
    uint32 nchar = get_lemma_nchar(offset);
    uint16 * spl = get_lemma_spell_ids(offset);

    if (0 <= fuzzy_compare_spell_id(spl, nchar, &searchable))
      break;
    i++;
  }
  if (i != off) {
    uint32 temp = offsets_[off];
    memmove(offsets_ + i + 1, offsets_ + i, (off - i) << 2);
    offsets_[i] = temp;

    temp = scores_[off];
    memmove(scores_ + i + 1, scores_ + i, (off - i) << 2);
    scores_[i] = temp;

    temp = ids_[off];
    memmove(ids_ + i + 1, ids_ + i, (off - i) << 2);
    ids_[i] = temp;
  }

#ifdef ___PREDICT_ENABLED___
  uint32 j = 0;
  uint16 * words_new = get_lemma_word(predicts_[off]);
  j = locate_where_to_insert_in_predicts(words_new, lemma_len);
  if (j != off) {
    uint32 temp = predicts_[off];
    memmove(predicts_ + j + 1, predicts_ + j, (off - j) << 2);
    predicts_[j] = temp;
  }
#endif

  if (state_ < USER_DICT_LEMMA_DIRTY)
    state_ = USER_DICT_LEMMA_DIRTY;

#ifdef ___CACHE_ENABLED___
  cache_init();
#endif

  dict_info_.total_nfreq += count;
  return id;
}
}