1*4882a593Smuzhiyun /* 2*4882a593Smuzhiyun * Copyright (C) 2009 The Android Open Source Project 3*4882a593Smuzhiyun * 4*4882a593Smuzhiyun * Licensed under the Apache License, Version 2.0 (the "License"); 5*4882a593Smuzhiyun * you may not use this file except in compliance with the License. 6*4882a593Smuzhiyun * You may obtain a copy of the License at 7*4882a593Smuzhiyun * 8*4882a593Smuzhiyun * http://www.apache.org/licenses/LICENSE-2.0 9*4882a593Smuzhiyun * 10*4882a593Smuzhiyun * Unless required by applicable law or agreed to in writing, software 11*4882a593Smuzhiyun * distributed under the License is distributed on an "AS IS" BASIS, 12*4882a593Smuzhiyun * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13*4882a593Smuzhiyun * See the License for the specific language governing permissions and 14*4882a593Smuzhiyun * limitations under the License. 15*4882a593Smuzhiyun */ 16*4882a593Smuzhiyun 17*4882a593Smuzhiyun #ifndef PINYINIME_INCLUDE_NGRAM_H__ 18*4882a593Smuzhiyun #define PINYINIME_INCLUDE_NGRAM_H__ 19*4882a593Smuzhiyun 20*4882a593Smuzhiyun #include <stdio.h> 21*4882a593Smuzhiyun #include <stdlib.h> 22*4882a593Smuzhiyun #include "./dictdef.h" 23*4882a593Smuzhiyun 24*4882a593Smuzhiyun namespace ime_pinyin { 25*4882a593Smuzhiyun 26*4882a593Smuzhiyun typedef unsigned char CODEBOOK_TYPE; 27*4882a593Smuzhiyun 28*4882a593Smuzhiyun static const size_t kCodeBookSize = 256; 29*4882a593Smuzhiyun 30*4882a593Smuzhiyun class NGram { 31*4882a593Smuzhiyun public: 32*4882a593Smuzhiyun // The maximum score of a lemma item. 33*4882a593Smuzhiyun static const LmaScoreType kMaxScore = 0x3fff; 34*4882a593Smuzhiyun 35*4882a593Smuzhiyun // In order to reduce the storage size, the original log value is amplified by 36*4882a593Smuzhiyun // kScoreAmplifier, and we use LmaScoreType to store. 37*4882a593Smuzhiyun // After this process, an item with a lower score has a higher frequency. 38*4882a593Smuzhiyun static const int kLogValueAmplifier = -800; 39*4882a593Smuzhiyun 40*4882a593Smuzhiyun // System words' total frequency. It is not the real total frequency, instead, 41*4882a593Smuzhiyun // It is only used to adjust system lemmas' scores when the user dictionary's 42*4882a593Smuzhiyun // total frequency changes. 43*4882a593Smuzhiyun // In this version, frequencies of system lemmas are fixed. We are considering 44*4882a593Smuzhiyun // to make them changable in next version. 45*4882a593Smuzhiyun static const size_t kSysDictTotalFreq = 100000000; 46*4882a593Smuzhiyun 47*4882a593Smuzhiyun private: 48*4882a593Smuzhiyun 49*4882a593Smuzhiyun static NGram* instance_; 50*4882a593Smuzhiyun 51*4882a593Smuzhiyun bool initialized_; 52*4882a593Smuzhiyun uint32 idx_num_; 53*4882a593Smuzhiyun 54*4882a593Smuzhiyun size_t total_freq_none_sys_; 55*4882a593Smuzhiyun 56*4882a593Smuzhiyun // Score compensation for system dictionary lemmas. 57*4882a593Smuzhiyun // Because after user adds some user lemmas, the total frequency changes, and 58*4882a593Smuzhiyun // we use this value to normalize the score. 59*4882a593Smuzhiyun float sys_score_compensation_; 60*4882a593Smuzhiyun 61*4882a593Smuzhiyun #ifdef ___BUILD_MODEL___ 62*4882a593Smuzhiyun double *freq_codes_df_; 63*4882a593Smuzhiyun #endif 64*4882a593Smuzhiyun LmaScoreType *freq_codes_; 65*4882a593Smuzhiyun CODEBOOK_TYPE *lma_freq_idx_; 66*4882a593Smuzhiyun 67*4882a593Smuzhiyun public: 68*4882a593Smuzhiyun NGram(); 69*4882a593Smuzhiyun ~NGram(); 70*4882a593Smuzhiyun 71*4882a593Smuzhiyun static NGram& get_instance(); 72*4882a593Smuzhiyun 73*4882a593Smuzhiyun bool save_ngram(FILE *fp); 74*4882a593Smuzhiyun bool load_ngram(FILE *fp); 75*4882a593Smuzhiyun 76*4882a593Smuzhiyun // Set the total frequency of all none system dictionaries. 77*4882a593Smuzhiyun void set_total_freq_none_sys(size_t freq_none_sys); 78*4882a593Smuzhiyun 79*4882a593Smuzhiyun float get_uni_psb(LemmaIdType lma_id); 80*4882a593Smuzhiyun 81*4882a593Smuzhiyun // Convert a probability to score. Actually, the score will be limited to 82*4882a593Smuzhiyun // kMaxScore, but at runtime, we also need float expression to get accurate 83*4882a593Smuzhiyun // value of the score. 84*4882a593Smuzhiyun // After the conversion, a lower score indicates a higher probability of the 85*4882a593Smuzhiyun // item. 86*4882a593Smuzhiyun static float convert_psb_to_score(double psb); 87*4882a593Smuzhiyun 88*4882a593Smuzhiyun #ifdef ___BUILD_MODEL___ 89*4882a593Smuzhiyun // For constructing the unigram mode model. 90*4882a593Smuzhiyun bool build_unigram(LemmaEntry *lemma_arr, size_t num, 91*4882a593Smuzhiyun LemmaIdType next_idx_unused); 92*4882a593Smuzhiyun #endif 93*4882a593Smuzhiyun }; 94*4882a593Smuzhiyun } 95*4882a593Smuzhiyun 96*4882a593Smuzhiyun #endif // PINYINIME_INCLUDE_NGRAM_H__ 97