1 /* 2 * Copyright (C) 2009 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef PINYINIME_INCLUDE_USERDICT_H__ 18 #define PINYINIME_INCLUDE_USERDICT_H__ 19 20 #define ___CACHE_ENABLED___ 21 #define ___SYNC_ENABLED___ 22 #define ___PREDICT_ENABLED___ 23 24 // Debug performance for operations 25 // #define ___DEBUG_PERF___ 26 27 #ifdef _WIN32 28 #include <winsock.h> // timeval 29 #else 30 #include <pthread.h> 31 #endif 32 #include "atomdictbase.h" 33 34 namespace ime_pinyin { 35 36 class UserDict : public AtomDictBase { 37 public: 38 UserDict(); 39 ~UserDict(); 40 41 bool load_dict(const char *file_name, LemmaIdType start_id, 42 LemmaIdType end_id); 43 44 bool close_dict(); 45 46 size_t number_of_lemmas(); 47 48 void reset_milestones(uint16 from_step, MileStoneHandle from_handle); 49 50 MileStoneHandle extend_dict(MileStoneHandle from_handle, 51 const DictExtPara *dep, LmaPsbItem *lpi_items, 52 size_t lpi_max, size_t *lpi_num); 53 54 size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len, 55 LmaPsbItem *lpi_items, size_t lpi_max); 56 57 uint16 get_lemma_str(LemmaIdType id_lemma, char16* str_buf, 58 uint16 str_max); 59 60 uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, 61 uint16 splids_max, bool arg_valid); 62 63 size_t predict(const char16 last_hzs[], uint16 hzs_len, 64 NPredictItem *npre_items, size_t npre_max, 65 size_t b4_used); 66 67 // Full spelling ids are required 68 LemmaIdType put_lemma(char16 lemma_str[], uint16 splids[], 69 uint16 lemma_len, uint16 count); 70 71 LemmaIdType update_lemma(LemmaIdType lemma_id, int16 delta_count, 72 bool selected); 73 74 LemmaIdType get_lemma_id(char16 lemma_str[], uint16 splids[], 75 uint16 lemma_len); 76 77 LmaScoreType get_lemma_score(LemmaIdType lemma_id); 78 79 LmaScoreType get_lemma_score(char16 lemma_str[], uint16 splids[], 80 uint16 lemma_len); 81 82 bool remove_lemma(LemmaIdType lemma_id); 83 84 size_t get_total_lemma_count(); 85 void set_total_lemma_count_of_others(size_t count); 86 87 void flush_cache(); 88 89 void set_limit(uint32 max_lemma_count, uint32 max_lemma_size, 90 uint32 reclaim_ratio); 91 92 void reclaim(); 93 94 void defragment(); 95 96 #ifdef ___SYNC_ENABLED___ 97 void clear_sync_lemmas(unsigned int start, unsigned int end); 98 99 int get_sync_count(); 100 101 LemmaIdType put_lemma_no_sync(char16 lemma_str[], uint16 splids[], 102 uint16 lemma_len, uint16 count, uint64 lmt); 103 /** 104 * Add lemmas encoded in UTF-16LE into dictionary without adding sync flag. 105 * 106 * @param lemmas in format of 'wo men,WM,0.32;da jia,DJ,0.12' 107 * @param len length of lemmas string in UTF-16LE 108 * @return newly added lemma count 109 */ 110 int put_lemmas_no_sync_from_utf16le_string(char16 * lemmas, int len); 111 112 /** 113 * Get lemmas need sync to a UTF-16LE string of above format. 114 * Note: input buffer (str) must not be too small. If str is too small to 115 * contain single one lemma, there might be a dead loop. 116 * 117 * @param str buffer to write lemmas 118 * @param size buffer size in UTF-16LE 119 * @param count output value of lemma returned 120 * @return UTF-16LE string length 121 */ 122 int get_sync_lemmas_in_utf16le_string_from_beginning( 123 char16 * str, int size, int * count); 124 125 #endif 126 127 struct UserDictStat { 128 uint32 version; 129 const char * file_name; 130 struct timeval load_time; 131 struct timeval last_update; 132 uint32 disk_size; 133 uint32 lemma_count; 134 uint32 lemma_size; 135 uint32 delete_count; 136 uint32 delete_size; 137 #ifdef ___SYNC_ENABLED___ 138 uint32 sync_count; 139 #endif 140 uint32 reclaim_ratio; 141 uint32 limit_lemma_count; 142 uint32 limit_lemma_size; 143 }; 144 145 bool state(UserDictStat * stat); 146 147 private: 148 uint32 total_other_nfreq_; 149 struct timeval load_time_; 150 LemmaIdType start_id_; 151 uint32 version_; 152 uint8 * lemmas_; 153 154 // In-Memory-Only flag for each lemma 155 static const uint8 kUserDictLemmaFlagRemove = 1; 156 // Inuse lemmas' offset 157 uint32 * offsets_; 158 // Highest bit in offset tells whether corresponding lemma is removed 159 static const uint32 kUserDictOffsetFlagRemove = (1 << 31); 160 // Maximum possible for the offset 161 static const uint32 kUserDictOffsetMask = ~(kUserDictOffsetFlagRemove); 162 // Bit width for last modified time, from 1 to 16 163 static const uint32 kUserDictLMTBitWidth = 16; 164 // Granularity for last modified time in second 165 static const uint32 kUserDictLMTGranularity = 60 * 60 * 24 * 7; 166 // Maximum frequency count 167 static const uint16 kUserDictMaxFrequency = 0xFFFF; 168 169 #define COARSE_UTC(year, month, day, hour, minute, second) \ 170 ( \ 171 (year - 1970) * 365 * 24 * 60 * 60 + \ 172 (month - 1) * 30 * 24 * 60 * 60 + \ 173 (day - 1) * 24 * 60 * 60 + \ 174 (hour - 0) * 60 * 60 + \ 175 (minute - 0) * 60 + \ 176 (second - 0) \ 177 ) 178 static const uint64 kUserDictLMTSince = COARSE_UTC(2009, 1, 1, 0, 0, 0); 179 180 // Correspond to offsets_ 181 uint32 * scores_; 182 // Following two fields are only valid in memory 183 uint32 * ids_; 184 #ifdef ___PREDICT_ENABLED___ 185 uint32 * predicts_; 186 #endif 187 #ifdef ___SYNC_ENABLED___ 188 uint32 * syncs_; 189 size_t sync_count_size_; 190 #endif 191 uint32 * offsets_by_id_; 192 193 size_t lemma_count_left_; 194 size_t lemma_size_left_; 195 196 const char * dict_file_; 197 198 // Be sure size is 4xN 199 struct UserDictInfo { 200 // When limitation reached, how much percentage will be reclaimed (1 ~ 100) 201 uint32 reclaim_ratio; 202 // maximum lemma count, 0 means no limitation 203 uint32 limit_lemma_count; 204 // Maximum lemma size, it's different from 205 // whole disk file size or in-mem dict size 206 // 0 means no limitation 207 uint32 limit_lemma_size; 208 // Total lemma count including deleted and inuse 209 // Also indicate offsets_ size 210 uint32 lemma_count; 211 // Total size of lemmas including used and freed 212 uint32 lemma_size; 213 // Freed lemma count 214 uint32 free_count; 215 // Freed lemma size in byte 216 uint32 free_size; 217 #ifdef ___SYNC_ENABLED___ 218 uint32 sync_count; 219 #endif 220 int32 total_nfreq; 221 } dict_info_; 222 223 static const uint32 kUserDictVersion = 0x0ABCDEF0; 224 225 static const uint32 kUserDictPreAlloc = 32; 226 static const uint32 kUserDictAverageNchar = 8; 227 228 enum UserDictState { 229 // Keep in order 230 USER_DICT_NONE = 0, 231 USER_DICT_SYNC, 232 #ifdef ___SYNC_ENABLED___ 233 USER_DICT_SYNC_DIRTY, 234 #endif 235 USER_DICT_SCORE_DIRTY, 236 USER_DICT_OFFSET_DIRTY, 237 USER_DICT_LEMMA_DIRTY, 238 239 USER_DICT_DEFRAGMENTED, 240 } state_; 241 242 struct UserDictSearchable { 243 uint16 splids_len; 244 uint16 splid_start[kMaxLemmaSize]; 245 uint16 splid_count[kMaxLemmaSize]; 246 // Compact inital letters for both FuzzyCompareSpellId and cache system 247 uint32 signature[kMaxLemmaSize / 4]; 248 }; 249 250 #ifdef ___CACHE_ENABLED___ 251 enum UserDictCacheType { 252 USER_DICT_CACHE, 253 USER_DICT_MISS_CACHE, 254 }; 255 256 static const int kUserDictCacheSize = 4; 257 static const int kUserDictMissCacheSize = kMaxLemmaSize - 1; 258 259 struct UserDictMissCache { 260 uint32 signatures[kUserDictMissCacheSize][kMaxLemmaSize / 4]; 261 uint16 head, tail; 262 } miss_caches_[kMaxLemmaSize]; 263 264 struct UserDictCache { 265 uint32 signatures[kUserDictCacheSize][kMaxLemmaSize / 4]; 266 uint32 offsets[kUserDictCacheSize]; 267 uint32 lengths[kUserDictCacheSize]; 268 // Ring buffer 269 uint16 head, tail; 270 } caches_[kMaxLemmaSize]; 271 272 void cache_init(); 273 274 void cache_push(UserDictCacheType type, 275 UserDictSearchable *searchable, 276 uint32 offset, uint32 length); 277 278 bool cache_hit(UserDictSearchable *searchable, 279 uint32 *offset, uint32 *length); 280 281 bool load_cache(UserDictSearchable *searchable, 282 uint32 *offset, uint32 *length); 283 284 void save_cache(UserDictSearchable *searchable, 285 uint32 offset, uint32 length); 286 287 void reset_cache(); 288 289 bool load_miss_cache(UserDictSearchable *searchable); 290 291 void save_miss_cache(UserDictSearchable *searchable); 292 293 void reset_miss_cache(); 294 #endif 295 296 LmaScoreType translate_score(int f); 297 298 int extract_score_freq(int raw_score); 299 300 uint64 extract_score_lmt(int raw_score); 301 302 inline int build_score(uint64 lmt, int freq); 303 304 inline int64 utf16le_atoll(uint16 *s, int len); 305 306 inline int utf16le_lltoa(int64 v, uint16 *s, int size); 307 308 LemmaIdType _put_lemma(char16 lemma_str[], uint16 splids[], 309 uint16 lemma_len, uint16 count, uint64 lmt); 310 311 size_t _get_lpis(const uint16 *splid_str, uint16 splid_str_len, 312 LmaPsbItem *lpi_items, size_t lpi_max, bool * need_extend); 313 314 int _get_lemma_score(char16 lemma_str[], uint16 splids[], uint16 lemma_len); 315 316 int _get_lemma_score(LemmaIdType lemma_id); 317 318 int is_fuzzy_prefix_spell_id(const uint16 * id1, uint16 len1, 319 const UserDictSearchable *searchable); 320 321 bool is_prefix_spell_id(const uint16 * fullids, 322 uint16 fulllen, const UserDictSearchable *searchable); 323 324 uint32 get_dict_file_size(UserDictInfo * info); 325 326 bool reset(const char *file); 327 328 bool validate(const char *file); 329 330 bool load(const char *file, LemmaIdType start_id); 331 332 bool is_valid_state(); 333 334 bool is_valid_lemma_id(LemmaIdType id); 335 336 LemmaIdType get_max_lemma_id(); 337 338 void set_lemma_flag(uint32 offset, uint8 flag); 339 340 char get_lemma_flag(uint32 offset); 341 342 char get_lemma_nchar(uint32 offset); 343 344 uint16 * get_lemma_spell_ids(uint32 offset); 345 346 uint16 * get_lemma_word(uint32 offset); 347 348 // Prepare searchable to fasten locate process 349 void prepare_locate(UserDictSearchable *searchable, 350 const uint16 * splids, uint16 len); 351 352 // Compare initial letters only 353 int32 fuzzy_compare_spell_id(const uint16 * id1, uint16 len1, 354 const UserDictSearchable *searchable); 355 356 // Compare exactly two spell ids 357 // First argument must be a full id spell id 358 bool equal_spell_id(const uint16 * fullids, 359 uint16 fulllen, const UserDictSearchable *searchable); 360 361 // Find first item by initial letters 362 int32 locate_first_in_offsets(const UserDictSearchable *searchable); 363 364 LemmaIdType append_a_lemma(char16 lemma_str[], uint16 splids[], 365 uint16 lemma_len, uint16 count, uint64 lmt); 366 367 // Check if a lemma is in dictionary 368 int32 locate_in_offsets(char16 lemma_str[], 369 uint16 splid_str[], uint16 lemma_len); 370 371 bool remove_lemma_by_offset_index(int offset_index); 372 #ifdef ___PREDICT_ENABLED___ 373 uint32 locate_where_to_insert_in_predicts(const uint16 * words, 374 int lemma_len); 375 376 int32 locate_first_in_predicts(const uint16 * words, int lemma_len); 377 378 void remove_lemma_from_predict_list(uint32 offset); 379 #endif 380 #ifdef ___SYNC_ENABLED___ 381 void queue_lemma_for_sync(LemmaIdType id); 382 383 void remove_lemma_from_sync_list(uint32 offset); 384 385 void write_back_sync(int fd); 386 #endif 387 void write_back_score(int fd); 388 void write_back_offset(int fd); 389 void write_back_lemma(int fd); 390 void write_back_all(int fd); 391 void write_back(); 392 393 struct UserDictScoreOffsetPair { 394 int score; 395 uint32 offset_index; 396 }; 397 398 inline void swap(UserDictScoreOffsetPair * sop, int i, int j); 399 400 void shift_down(UserDictScoreOffsetPair * sop, int i, int n); 401 402 // On-disk format for each lemma 403 // +-------------+ 404 // | Version (4) | 405 // +-------------+ 406 // +-----------+-----------+--------------------+-------------------+ 407 // | Spare (1) | Nchar (1) | Splids (2 x Nchar) | Lemma (2 x Nchar) | 408 // +-----------+-----------+--------------------+-------------------+ 409 // ... 410 // +-----------------------+ +-------------+ <---Offset of offset 411 // | Offset1 by_splids (4) | ... | OffsetN (4) | 412 // +-----------------------+ +-------------+ 413 #ifdef ___PREDICT_ENABLED___ 414 // +----------------------+ +-------------+ 415 // | Offset1 by_lemma (4) | ... | OffsetN (4) | 416 // +----------------------+ +-------------+ 417 #endif 418 // +------------+ +------------+ 419 // | Score1 (4) | ... | ScoreN (4) | 420 // +------------+ +------------+ 421 #ifdef ___SYNC_ENABLED___ 422 // +-------------+ +-------------+ 423 // | NewAdd1 (4) | ... | NewAddN (4) | 424 // +-------------+ +-------------+ 425 #endif 426 // +----------------+ 427 // | Dict Info (4x) | 428 // +----------------+ 429 }; 430 } 431 432 #endif 433