xref: /OK3568_Linux_fs/kernel/lib/xz/xz_lzma2.h (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun /*
2*4882a593Smuzhiyun  * LZMA2 definitions
3*4882a593Smuzhiyun  *
4*4882a593Smuzhiyun  * Authors: Lasse Collin <lasse.collin@tukaani.org>
5*4882a593Smuzhiyun  *          Igor Pavlov <https://7-zip.org/>
6*4882a593Smuzhiyun  *
7*4882a593Smuzhiyun  * This file has been put into the public domain.
8*4882a593Smuzhiyun  * You can do whatever you want with this file.
9*4882a593Smuzhiyun  */
10*4882a593Smuzhiyun 
11*4882a593Smuzhiyun #ifndef XZ_LZMA2_H
12*4882a593Smuzhiyun #define XZ_LZMA2_H
13*4882a593Smuzhiyun 
14*4882a593Smuzhiyun /* Range coder constants */
15*4882a593Smuzhiyun #define RC_SHIFT_BITS 8
16*4882a593Smuzhiyun #define RC_TOP_BITS 24
17*4882a593Smuzhiyun #define RC_TOP_VALUE (1 << RC_TOP_BITS)
18*4882a593Smuzhiyun #define RC_BIT_MODEL_TOTAL_BITS 11
19*4882a593Smuzhiyun #define RC_BIT_MODEL_TOTAL (1 << RC_BIT_MODEL_TOTAL_BITS)
20*4882a593Smuzhiyun #define RC_MOVE_BITS 5
21*4882a593Smuzhiyun 
22*4882a593Smuzhiyun /*
23*4882a593Smuzhiyun  * Maximum number of position states. A position state is the lowest pb
24*4882a593Smuzhiyun  * number of bits of the current uncompressed offset. In some places there
25*4882a593Smuzhiyun  * are different sets of probabilities for different position states.
26*4882a593Smuzhiyun  */
27*4882a593Smuzhiyun #define POS_STATES_MAX (1 << 4)
28*4882a593Smuzhiyun 
29*4882a593Smuzhiyun /*
30*4882a593Smuzhiyun  * This enum is used to track which LZMA symbols have occurred most recently
31*4882a593Smuzhiyun  * and in which order. This information is used to predict the next symbol.
32*4882a593Smuzhiyun  *
33*4882a593Smuzhiyun  * Symbols:
34*4882a593Smuzhiyun  *  - Literal: One 8-bit byte
35*4882a593Smuzhiyun  *  - Match: Repeat a chunk of data at some distance
36*4882a593Smuzhiyun  *  - Long repeat: Multi-byte match at a recently seen distance
37*4882a593Smuzhiyun  *  - Short repeat: One-byte repeat at a recently seen distance
38*4882a593Smuzhiyun  *
39*4882a593Smuzhiyun  * The symbol names are in from STATE_oldest_older_previous. REP means
40*4882a593Smuzhiyun  * either short or long repeated match, and NONLIT means any non-literal.
41*4882a593Smuzhiyun  */
42*4882a593Smuzhiyun enum lzma_state {
43*4882a593Smuzhiyun 	STATE_LIT_LIT,
44*4882a593Smuzhiyun 	STATE_MATCH_LIT_LIT,
45*4882a593Smuzhiyun 	STATE_REP_LIT_LIT,
46*4882a593Smuzhiyun 	STATE_SHORTREP_LIT_LIT,
47*4882a593Smuzhiyun 	STATE_MATCH_LIT,
48*4882a593Smuzhiyun 	STATE_REP_LIT,
49*4882a593Smuzhiyun 	STATE_SHORTREP_LIT,
50*4882a593Smuzhiyun 	STATE_LIT_MATCH,
51*4882a593Smuzhiyun 	STATE_LIT_LONGREP,
52*4882a593Smuzhiyun 	STATE_LIT_SHORTREP,
53*4882a593Smuzhiyun 	STATE_NONLIT_MATCH,
54*4882a593Smuzhiyun 	STATE_NONLIT_REP
55*4882a593Smuzhiyun };
56*4882a593Smuzhiyun 
57*4882a593Smuzhiyun /* Total number of states */
58*4882a593Smuzhiyun #define STATES 12
59*4882a593Smuzhiyun 
60*4882a593Smuzhiyun /* The lowest 7 states indicate that the previous state was a literal. */
61*4882a593Smuzhiyun #define LIT_STATES 7
62*4882a593Smuzhiyun 
63*4882a593Smuzhiyun /* Indicate that the latest symbol was a literal. */
lzma_state_literal(enum lzma_state * state)64*4882a593Smuzhiyun static inline void lzma_state_literal(enum lzma_state *state)
65*4882a593Smuzhiyun {
66*4882a593Smuzhiyun 	if (*state <= STATE_SHORTREP_LIT_LIT)
67*4882a593Smuzhiyun 		*state = STATE_LIT_LIT;
68*4882a593Smuzhiyun 	else if (*state <= STATE_LIT_SHORTREP)
69*4882a593Smuzhiyun 		*state -= 3;
70*4882a593Smuzhiyun 	else
71*4882a593Smuzhiyun 		*state -= 6;
72*4882a593Smuzhiyun }
73*4882a593Smuzhiyun 
74*4882a593Smuzhiyun /* Indicate that the latest symbol was a match. */
lzma_state_match(enum lzma_state * state)75*4882a593Smuzhiyun static inline void lzma_state_match(enum lzma_state *state)
76*4882a593Smuzhiyun {
77*4882a593Smuzhiyun 	*state = *state < LIT_STATES ? STATE_LIT_MATCH : STATE_NONLIT_MATCH;
78*4882a593Smuzhiyun }
79*4882a593Smuzhiyun 
80*4882a593Smuzhiyun /* Indicate that the latest state was a long repeated match. */
lzma_state_long_rep(enum lzma_state * state)81*4882a593Smuzhiyun static inline void lzma_state_long_rep(enum lzma_state *state)
82*4882a593Smuzhiyun {
83*4882a593Smuzhiyun 	*state = *state < LIT_STATES ? STATE_LIT_LONGREP : STATE_NONLIT_REP;
84*4882a593Smuzhiyun }
85*4882a593Smuzhiyun 
86*4882a593Smuzhiyun /* Indicate that the latest symbol was a short match. */
lzma_state_short_rep(enum lzma_state * state)87*4882a593Smuzhiyun static inline void lzma_state_short_rep(enum lzma_state *state)
88*4882a593Smuzhiyun {
89*4882a593Smuzhiyun 	*state = *state < LIT_STATES ? STATE_LIT_SHORTREP : STATE_NONLIT_REP;
90*4882a593Smuzhiyun }
91*4882a593Smuzhiyun 
92*4882a593Smuzhiyun /* Test if the previous symbol was a literal. */
lzma_state_is_literal(enum lzma_state state)93*4882a593Smuzhiyun static inline bool lzma_state_is_literal(enum lzma_state state)
94*4882a593Smuzhiyun {
95*4882a593Smuzhiyun 	return state < LIT_STATES;
96*4882a593Smuzhiyun }
97*4882a593Smuzhiyun 
98*4882a593Smuzhiyun /* Each literal coder is divided in three sections:
99*4882a593Smuzhiyun  *   - 0x001-0x0FF: Without match byte
100*4882a593Smuzhiyun  *   - 0x101-0x1FF: With match byte; match bit is 0
101*4882a593Smuzhiyun  *   - 0x201-0x2FF: With match byte; match bit is 1
102*4882a593Smuzhiyun  *
103*4882a593Smuzhiyun  * Match byte is used when the previous LZMA symbol was something else than
104*4882a593Smuzhiyun  * a literal (that is, it was some kind of match).
105*4882a593Smuzhiyun  */
106*4882a593Smuzhiyun #define LITERAL_CODER_SIZE 0x300
107*4882a593Smuzhiyun 
108*4882a593Smuzhiyun /* Maximum number of literal coders */
109*4882a593Smuzhiyun #define LITERAL_CODERS_MAX (1 << 4)
110*4882a593Smuzhiyun 
111*4882a593Smuzhiyun /* Minimum length of a match is two bytes. */
112*4882a593Smuzhiyun #define MATCH_LEN_MIN 2
113*4882a593Smuzhiyun 
114*4882a593Smuzhiyun /* Match length is encoded with 4, 5, or 10 bits.
115*4882a593Smuzhiyun  *
116*4882a593Smuzhiyun  * Length   Bits
117*4882a593Smuzhiyun  *  2-9      4 = Choice=0 + 3 bits
118*4882a593Smuzhiyun  * 10-17     5 = Choice=1 + Choice2=0 + 3 bits
119*4882a593Smuzhiyun  * 18-273   10 = Choice=1 + Choice2=1 + 8 bits
120*4882a593Smuzhiyun  */
121*4882a593Smuzhiyun #define LEN_LOW_BITS 3
122*4882a593Smuzhiyun #define LEN_LOW_SYMBOLS (1 << LEN_LOW_BITS)
123*4882a593Smuzhiyun #define LEN_MID_BITS 3
124*4882a593Smuzhiyun #define LEN_MID_SYMBOLS (1 << LEN_MID_BITS)
125*4882a593Smuzhiyun #define LEN_HIGH_BITS 8
126*4882a593Smuzhiyun #define LEN_HIGH_SYMBOLS (1 << LEN_HIGH_BITS)
127*4882a593Smuzhiyun #define LEN_SYMBOLS (LEN_LOW_SYMBOLS + LEN_MID_SYMBOLS + LEN_HIGH_SYMBOLS)
128*4882a593Smuzhiyun 
129*4882a593Smuzhiyun /*
130*4882a593Smuzhiyun  * Maximum length of a match is 273 which is a result of the encoding
131*4882a593Smuzhiyun  * described above.
132*4882a593Smuzhiyun  */
133*4882a593Smuzhiyun #define MATCH_LEN_MAX (MATCH_LEN_MIN + LEN_SYMBOLS - 1)
134*4882a593Smuzhiyun 
135*4882a593Smuzhiyun /*
136*4882a593Smuzhiyun  * Different sets of probabilities are used for match distances that have
137*4882a593Smuzhiyun  * very short match length: Lengths of 2, 3, and 4 bytes have a separate
138*4882a593Smuzhiyun  * set of probabilities for each length. The matches with longer length
139*4882a593Smuzhiyun  * use a shared set of probabilities.
140*4882a593Smuzhiyun  */
141*4882a593Smuzhiyun #define DIST_STATES 4
142*4882a593Smuzhiyun 
143*4882a593Smuzhiyun /*
144*4882a593Smuzhiyun  * Get the index of the appropriate probability array for decoding
145*4882a593Smuzhiyun  * the distance slot.
146*4882a593Smuzhiyun  */
lzma_get_dist_state(uint32_t len)147*4882a593Smuzhiyun static inline uint32_t lzma_get_dist_state(uint32_t len)
148*4882a593Smuzhiyun {
149*4882a593Smuzhiyun 	return len < DIST_STATES + MATCH_LEN_MIN
150*4882a593Smuzhiyun 			? len - MATCH_LEN_MIN : DIST_STATES - 1;
151*4882a593Smuzhiyun }
152*4882a593Smuzhiyun 
153*4882a593Smuzhiyun /*
154*4882a593Smuzhiyun  * The highest two bits of a 32-bit match distance are encoded using six bits.
155*4882a593Smuzhiyun  * This six-bit value is called a distance slot. This way encoding a 32-bit
156*4882a593Smuzhiyun  * value takes 6-36 bits, larger values taking more bits.
157*4882a593Smuzhiyun  */
158*4882a593Smuzhiyun #define DIST_SLOT_BITS 6
159*4882a593Smuzhiyun #define DIST_SLOTS (1 << DIST_SLOT_BITS)
160*4882a593Smuzhiyun 
161*4882a593Smuzhiyun /* Match distances up to 127 are fully encoded using probabilities. Since
162*4882a593Smuzhiyun  * the highest two bits (distance slot) are always encoded using six bits,
163*4882a593Smuzhiyun  * the distances 0-3 don't need any additional bits to encode, since the
164*4882a593Smuzhiyun  * distance slot itself is the same as the actual distance. DIST_MODEL_START
165*4882a593Smuzhiyun  * indicates the first distance slot where at least one additional bit is
166*4882a593Smuzhiyun  * needed.
167*4882a593Smuzhiyun  */
168*4882a593Smuzhiyun #define DIST_MODEL_START 4
169*4882a593Smuzhiyun 
170*4882a593Smuzhiyun /*
171*4882a593Smuzhiyun  * Match distances greater than 127 are encoded in three pieces:
172*4882a593Smuzhiyun  *   - distance slot: the highest two bits
173*4882a593Smuzhiyun  *   - direct bits: 2-26 bits below the highest two bits
174*4882a593Smuzhiyun  *   - alignment bits: four lowest bits
175*4882a593Smuzhiyun  *
176*4882a593Smuzhiyun  * Direct bits don't use any probabilities.
177*4882a593Smuzhiyun  *
178*4882a593Smuzhiyun  * The distance slot value of 14 is for distances 128-191.
179*4882a593Smuzhiyun  */
180*4882a593Smuzhiyun #define DIST_MODEL_END 14
181*4882a593Smuzhiyun 
182*4882a593Smuzhiyun /* Distance slots that indicate a distance <= 127. */
183*4882a593Smuzhiyun #define FULL_DISTANCES_BITS (DIST_MODEL_END / 2)
184*4882a593Smuzhiyun #define FULL_DISTANCES (1 << FULL_DISTANCES_BITS)
185*4882a593Smuzhiyun 
186*4882a593Smuzhiyun /*
187*4882a593Smuzhiyun  * For match distances greater than 127, only the highest two bits and the
188*4882a593Smuzhiyun  * lowest four bits (alignment) is encoded using probabilities.
189*4882a593Smuzhiyun  */
190*4882a593Smuzhiyun #define ALIGN_BITS 4
191*4882a593Smuzhiyun #define ALIGN_SIZE (1 << ALIGN_BITS)
192*4882a593Smuzhiyun #define ALIGN_MASK (ALIGN_SIZE - 1)
193*4882a593Smuzhiyun 
194*4882a593Smuzhiyun /* Total number of all probability variables */
195*4882a593Smuzhiyun #define PROBS_TOTAL (1846 + LITERAL_CODERS_MAX * LITERAL_CODER_SIZE)
196*4882a593Smuzhiyun 
197*4882a593Smuzhiyun /*
198*4882a593Smuzhiyun  * LZMA remembers the four most recent match distances. Reusing these
199*4882a593Smuzhiyun  * distances tends to take less space than re-encoding the actual
200*4882a593Smuzhiyun  * distance value.
201*4882a593Smuzhiyun  */
202*4882a593Smuzhiyun #define REPS 4
203*4882a593Smuzhiyun 
204*4882a593Smuzhiyun #endif
205