xref: /OK3568_Linux_fs/app/forlinx/flapp/src/keyboard/pinyin/include/userdict.h (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1 /*
2  * Copyright (C) 2009 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef PINYINIME_INCLUDE_USERDICT_H__
18 #define PINYINIME_INCLUDE_USERDICT_H__
19 
20 #define ___CACHE_ENABLED___
21 #define ___SYNC_ENABLED___
22 #define ___PREDICT_ENABLED___
23 
24 // Debug performance for operations
25 // #define ___DEBUG_PERF___
26 
27 #ifdef _WIN32
28 #include <winsock.h> // timeval
29 #else
30 #include <pthread.h>
31 #endif
32 #include "atomdictbase.h"
33 
34 namespace ime_pinyin {
35 
36 class UserDict : public AtomDictBase {
37  public:
38   UserDict();
39   ~UserDict();
40 
41   bool load_dict(const char *file_name, LemmaIdType start_id,
42                  LemmaIdType end_id);
43 
44   bool close_dict();
45 
46   size_t number_of_lemmas();
47 
48   void reset_milestones(uint16 from_step, MileStoneHandle from_handle);
49 
50   MileStoneHandle extend_dict(MileStoneHandle from_handle,
51                               const DictExtPara *dep, LmaPsbItem *lpi_items,
52                               size_t lpi_max, size_t *lpi_num);
53 
54   size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len,
55                   LmaPsbItem *lpi_items, size_t lpi_max);
56 
57   uint16 get_lemma_str(LemmaIdType id_lemma, char16* str_buf,
58                        uint16 str_max);
59 
60   uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
61                           uint16 splids_max, bool arg_valid);
62 
63   size_t predict(const char16 last_hzs[], uint16 hzs_len,
64                  NPredictItem *npre_items, size_t npre_max,
65                  size_t b4_used);
66 
67   // Full spelling ids are required
68   LemmaIdType put_lemma(char16 lemma_str[], uint16 splids[],
69                         uint16 lemma_len, uint16 count);
70 
71   LemmaIdType update_lemma(LemmaIdType lemma_id, int16 delta_count,
72                            bool selected);
73 
74   LemmaIdType get_lemma_id(char16 lemma_str[], uint16 splids[],
75                            uint16 lemma_len);
76 
77   LmaScoreType get_lemma_score(LemmaIdType lemma_id);
78 
79   LmaScoreType get_lemma_score(char16 lemma_str[], uint16 splids[],
80                         uint16 lemma_len);
81 
82   bool remove_lemma(LemmaIdType lemma_id);
83 
84   size_t get_total_lemma_count();
85   void set_total_lemma_count_of_others(size_t count);
86 
87   void flush_cache();
88 
89   void set_limit(uint32 max_lemma_count, uint32 max_lemma_size,
90                  uint32 reclaim_ratio);
91 
92   void reclaim();
93 
94   void defragment();
95 
96 #ifdef ___SYNC_ENABLED___
97   void clear_sync_lemmas(unsigned int start, unsigned int end);
98 
99   int get_sync_count();
100 
101   LemmaIdType put_lemma_no_sync(char16 lemma_str[], uint16 splids[],
102                         uint16 lemma_len, uint16 count, uint64 lmt);
103    /**
104     * Add lemmas encoded in UTF-16LE into dictionary without adding sync flag.
105     *
106     * @param lemmas in format of 'wo men,WM,0.32;da jia,DJ,0.12'
107     * @param len length of lemmas string in UTF-16LE
108     * @return newly added lemma count
109     */
110   int put_lemmas_no_sync_from_utf16le_string(char16 * lemmas, int len);
111 
112   /**
113    * Get lemmas need sync to a UTF-16LE string of above format.
114    * Note: input buffer (str) must not be too small. If str is too small to
115    *       contain single one lemma, there might be a dead loop.
116    *
117    * @param str buffer to write lemmas
118    * @param size buffer size in UTF-16LE
119    * @param count output value of lemma returned
120    * @return UTF-16LE string length
121    */
122   int get_sync_lemmas_in_utf16le_string_from_beginning(
123       char16 * str, int size, int * count);
124 
125 #endif
126 
127   struct UserDictStat {
128     uint32 version;
129     const char * file_name;
130     struct timeval load_time;
131     struct timeval last_update;
132     uint32 disk_size;
133     uint32 lemma_count;
134     uint32 lemma_size;
135     uint32 delete_count;
136     uint32 delete_size;
137 #ifdef ___SYNC_ENABLED___
138     uint32 sync_count;
139 #endif
140     uint32 reclaim_ratio;
141     uint32 limit_lemma_count;
142     uint32 limit_lemma_size;
143   };
144 
145   bool state(UserDictStat * stat);
146 
147  private:
148   uint32 total_other_nfreq_;
149   struct timeval load_time_;
150   LemmaIdType start_id_;
151   uint32 version_;
152   uint8 * lemmas_;
153 
154   // In-Memory-Only flag for each lemma
155   static const uint8 kUserDictLemmaFlagRemove = 1;
156   // Inuse lemmas' offset
157   uint32 * offsets_;
158   // Highest bit in offset tells whether corresponding lemma is removed
159   static const uint32 kUserDictOffsetFlagRemove = (1 << 31);
160   // Maximum possible for the offset
161   static const uint32 kUserDictOffsetMask = ~(kUserDictOffsetFlagRemove);
162   // Bit width for last modified time, from 1 to 16
163   static const uint32 kUserDictLMTBitWidth = 16;
164   // Granularity for last modified time in second
165   static const uint32 kUserDictLMTGranularity = 60 * 60 * 24 * 7;
166   // Maximum frequency count
167   static const uint16 kUserDictMaxFrequency = 0xFFFF;
168 
169 #define COARSE_UTC(year, month, day, hour, minute, second) \
170   ( \
171     (year - 1970) * 365 * 24 * 60 * 60 + \
172     (month - 1) * 30 * 24 * 60 * 60 + \
173     (day - 1) * 24 * 60 * 60 + \
174     (hour - 0) * 60 * 60 + \
175     (minute - 0) * 60 + \
176     (second - 0) \
177   )
178   static const uint64 kUserDictLMTSince = COARSE_UTC(2009, 1, 1, 0, 0, 0);
179 
180   // Correspond to offsets_
181   uint32 * scores_;
182   // Following two fields are only valid in memory
183   uint32 * ids_;
184 #ifdef ___PREDICT_ENABLED___
185   uint32 * predicts_;
186 #endif
187 #ifdef ___SYNC_ENABLED___
188   uint32 * syncs_;
189   size_t sync_count_size_;
190 #endif
191   uint32 * offsets_by_id_;
192 
193   size_t lemma_count_left_;
194   size_t lemma_size_left_;
195 
196   const char * dict_file_;
197 
198   // Be sure size is 4xN
199   struct UserDictInfo {
200     // When limitation reached, how much percentage will be reclaimed (1 ~ 100)
201     uint32 reclaim_ratio;
202     // maximum lemma count, 0 means no limitation
203     uint32 limit_lemma_count;
204     // Maximum lemma size, it's different from
205     // whole disk file size or in-mem dict size
206     // 0 means no limitation
207     uint32 limit_lemma_size;
208     // Total lemma count including deleted and inuse
209     // Also indicate offsets_ size
210     uint32 lemma_count;
211     // Total size of lemmas including used and freed
212     uint32 lemma_size;
213     // Freed lemma count
214     uint32 free_count;
215     // Freed lemma size in byte
216     uint32 free_size;
217 #ifdef ___SYNC_ENABLED___
218     uint32 sync_count;
219 #endif
220     int32 total_nfreq;
221   } dict_info_;
222 
223   static const uint32 kUserDictVersion = 0x0ABCDEF0;
224 
225   static const uint32 kUserDictPreAlloc = 32;
226   static const uint32 kUserDictAverageNchar = 8;
227 
228   enum UserDictState {
229     // Keep in order
230     USER_DICT_NONE = 0,
231     USER_DICT_SYNC,
232 #ifdef ___SYNC_ENABLED___
233     USER_DICT_SYNC_DIRTY,
234 #endif
235     USER_DICT_SCORE_DIRTY,
236     USER_DICT_OFFSET_DIRTY,
237     USER_DICT_LEMMA_DIRTY,
238 
239     USER_DICT_DEFRAGMENTED,
240   } state_;
241 
242   struct UserDictSearchable {
243     uint16 splids_len;
244     uint16 splid_start[kMaxLemmaSize];
245     uint16 splid_count[kMaxLemmaSize];
246     // Compact inital letters for both FuzzyCompareSpellId and cache system
247     uint32 signature[kMaxLemmaSize / 4];
248   };
249 
250 #ifdef ___CACHE_ENABLED___
251   enum UserDictCacheType {
252     USER_DICT_CACHE,
253     USER_DICT_MISS_CACHE,
254   };
255 
256   static const int kUserDictCacheSize = 4;
257   static const int kUserDictMissCacheSize = kMaxLemmaSize - 1;
258 
259   struct UserDictMissCache {
260     uint32 signatures[kUserDictMissCacheSize][kMaxLemmaSize / 4];
261     uint16 head, tail;
262   } miss_caches_[kMaxLemmaSize];
263 
264   struct UserDictCache {
265     uint32 signatures[kUserDictCacheSize][kMaxLemmaSize / 4];
266     uint32 offsets[kUserDictCacheSize];
267     uint32 lengths[kUserDictCacheSize];
268     // Ring buffer
269     uint16 head, tail;
270   } caches_[kMaxLemmaSize];
271 
272   void cache_init();
273 
274   void cache_push(UserDictCacheType type,
275                  UserDictSearchable *searchable,
276                  uint32 offset, uint32 length);
277 
278   bool cache_hit(UserDictSearchable *searchable,
279                  uint32 *offset, uint32 *length);
280 
281   bool load_cache(UserDictSearchable *searchable,
282                   uint32 *offset, uint32 *length);
283 
284   void save_cache(UserDictSearchable *searchable,
285                   uint32 offset, uint32 length);
286 
287   void reset_cache();
288 
289   bool load_miss_cache(UserDictSearchable *searchable);
290 
291   void save_miss_cache(UserDictSearchable *searchable);
292 
293   void reset_miss_cache();
294 #endif
295 
296   LmaScoreType translate_score(int f);
297 
298   int extract_score_freq(int raw_score);
299 
300   uint64 extract_score_lmt(int raw_score);
301 
302   inline int build_score(uint64 lmt, int freq);
303 
304   inline int64 utf16le_atoll(uint16 *s, int len);
305 
306   inline int utf16le_lltoa(int64 v, uint16 *s, int size);
307 
308   LemmaIdType _put_lemma(char16 lemma_str[], uint16 splids[],
309                         uint16 lemma_len, uint16 count, uint64 lmt);
310 
311   size_t _get_lpis(const uint16 *splid_str, uint16 splid_str_len,
312                    LmaPsbItem *lpi_items, size_t lpi_max, bool * need_extend);
313 
314   int _get_lemma_score(char16 lemma_str[], uint16 splids[], uint16 lemma_len);
315 
316   int _get_lemma_score(LemmaIdType lemma_id);
317 
318   int is_fuzzy_prefix_spell_id(const uint16 * id1, uint16 len1,
319                                const UserDictSearchable *searchable);
320 
321   bool is_prefix_spell_id(const uint16 * fullids,
322                           uint16 fulllen, const UserDictSearchable *searchable);
323 
324   uint32 get_dict_file_size(UserDictInfo * info);
325 
326   bool reset(const char *file);
327 
328   bool validate(const char *file);
329 
330   bool load(const char *file, LemmaIdType start_id);
331 
332   bool is_valid_state();
333 
334   bool is_valid_lemma_id(LemmaIdType id);
335 
336   LemmaIdType get_max_lemma_id();
337 
338   void set_lemma_flag(uint32 offset, uint8 flag);
339 
340   char get_lemma_flag(uint32 offset);
341 
342   char get_lemma_nchar(uint32 offset);
343 
344   uint16 * get_lemma_spell_ids(uint32 offset);
345 
346   uint16 * get_lemma_word(uint32 offset);
347 
348   // Prepare searchable to fasten locate process
349   void prepare_locate(UserDictSearchable *searchable,
350                       const uint16 * splids, uint16 len);
351 
352   // Compare initial letters only
353   int32 fuzzy_compare_spell_id(const uint16 * id1, uint16 len1,
354                                const UserDictSearchable *searchable);
355 
356   // Compare exactly two spell ids
357   // First argument must be a full id spell id
358   bool equal_spell_id(const uint16 * fullids,
359                       uint16 fulllen, const UserDictSearchable *searchable);
360 
361   // Find first item by initial letters
362   int32 locate_first_in_offsets(const UserDictSearchable *searchable);
363 
364   LemmaIdType append_a_lemma(char16 lemma_str[], uint16 splids[],
365                            uint16 lemma_len, uint16 count, uint64 lmt);
366 
367   // Check if a lemma is in dictionary
368   int32 locate_in_offsets(char16 lemma_str[],
369                           uint16 splid_str[], uint16 lemma_len);
370 
371   bool remove_lemma_by_offset_index(int offset_index);
372 #ifdef ___PREDICT_ENABLED___
373   uint32 locate_where_to_insert_in_predicts(const uint16 * words,
374                                             int lemma_len);
375 
376   int32 locate_first_in_predicts(const uint16 * words, int lemma_len);
377 
378   void remove_lemma_from_predict_list(uint32 offset);
379 #endif
380 #ifdef ___SYNC_ENABLED___
381   void queue_lemma_for_sync(LemmaIdType id);
382 
383   void remove_lemma_from_sync_list(uint32 offset);
384 
385   void write_back_sync(int fd);
386 #endif
387   void write_back_score(int fd);
388   void write_back_offset(int fd);
389   void write_back_lemma(int fd);
390   void write_back_all(int fd);
391   void write_back();
392 
393   struct UserDictScoreOffsetPair {
394     int score;
395     uint32 offset_index;
396   };
397 
398   inline void swap(UserDictScoreOffsetPair * sop, int i, int j);
399 
400   void shift_down(UserDictScoreOffsetPair * sop, int i, int n);
401 
402   // On-disk format for each lemma
403   // +-------------+
404   // | Version (4) |
405   // +-------------+
406   // +-----------+-----------+--------------------+-------------------+
407   // | Spare (1) | Nchar (1) | Splids (2 x Nchar) | Lemma (2 x Nchar) |
408   // +-----------+-----------+--------------------+-------------------+
409   // ...
410   // +-----------------------+     +-------------+      <---Offset of offset
411   // | Offset1 by_splids (4) | ... | OffsetN (4) |
412   // +-----------------------+     +-------------+
413 #ifdef ___PREDICT_ENABLED___
414   // +----------------------+     +-------------+
415   // | Offset1 by_lemma (4) | ... | OffsetN (4) |
416   // +----------------------+     +-------------+
417 #endif
418   // +------------+     +------------+
419   // | Score1 (4) | ... | ScoreN (4) |
420   // +------------+     +------------+
421 #ifdef ___SYNC_ENABLED___
422   // +-------------+     +-------------+
423   // | NewAdd1 (4) | ... | NewAddN (4) |
424   // +-------------+     +-------------+
425 #endif
426   // +----------------+
427   // | Dict Info (4x) |
428   // +----------------+
429 };
430 }
431 
432 #endif
433