1*4882a593Smuzhiyun /*
2*4882a593Smuzhiyun * LZMA2 definitions
3*4882a593Smuzhiyun *
4*4882a593Smuzhiyun * Authors: Lasse Collin <lasse.collin@tukaani.org>
5*4882a593Smuzhiyun * Igor Pavlov <https://7-zip.org/>
6*4882a593Smuzhiyun *
7*4882a593Smuzhiyun * This file has been put into the public domain.
8*4882a593Smuzhiyun * You can do whatever you want with this file.
9*4882a593Smuzhiyun */
10*4882a593Smuzhiyun
11*4882a593Smuzhiyun #ifndef XZ_LZMA2_H
12*4882a593Smuzhiyun #define XZ_LZMA2_H
13*4882a593Smuzhiyun
14*4882a593Smuzhiyun /* Range coder constants */
15*4882a593Smuzhiyun #define RC_SHIFT_BITS 8
16*4882a593Smuzhiyun #define RC_TOP_BITS 24
17*4882a593Smuzhiyun #define RC_TOP_VALUE (1 << RC_TOP_BITS)
18*4882a593Smuzhiyun #define RC_BIT_MODEL_TOTAL_BITS 11
19*4882a593Smuzhiyun #define RC_BIT_MODEL_TOTAL (1 << RC_BIT_MODEL_TOTAL_BITS)
20*4882a593Smuzhiyun #define RC_MOVE_BITS 5
21*4882a593Smuzhiyun
22*4882a593Smuzhiyun /*
23*4882a593Smuzhiyun * Maximum number of position states. A position state is the lowest pb
24*4882a593Smuzhiyun * number of bits of the current uncompressed offset. In some places there
25*4882a593Smuzhiyun * are different sets of probabilities for different position states.
26*4882a593Smuzhiyun */
27*4882a593Smuzhiyun #define POS_STATES_MAX (1 << 4)
28*4882a593Smuzhiyun
29*4882a593Smuzhiyun /*
30*4882a593Smuzhiyun * This enum is used to track which LZMA symbols have occurred most recently
31*4882a593Smuzhiyun * and in which order. This information is used to predict the next symbol.
32*4882a593Smuzhiyun *
33*4882a593Smuzhiyun * Symbols:
34*4882a593Smuzhiyun * - Literal: One 8-bit byte
35*4882a593Smuzhiyun * - Match: Repeat a chunk of data at some distance
36*4882a593Smuzhiyun * - Long repeat: Multi-byte match at a recently seen distance
37*4882a593Smuzhiyun * - Short repeat: One-byte repeat at a recently seen distance
38*4882a593Smuzhiyun *
39*4882a593Smuzhiyun * The symbol names are in from STATE_oldest_older_previous. REP means
40*4882a593Smuzhiyun * either short or long repeated match, and NONLIT means any non-literal.
41*4882a593Smuzhiyun */
42*4882a593Smuzhiyun enum lzma_state {
43*4882a593Smuzhiyun STATE_LIT_LIT,
44*4882a593Smuzhiyun STATE_MATCH_LIT_LIT,
45*4882a593Smuzhiyun STATE_REP_LIT_LIT,
46*4882a593Smuzhiyun STATE_SHORTREP_LIT_LIT,
47*4882a593Smuzhiyun STATE_MATCH_LIT,
48*4882a593Smuzhiyun STATE_REP_LIT,
49*4882a593Smuzhiyun STATE_SHORTREP_LIT,
50*4882a593Smuzhiyun STATE_LIT_MATCH,
51*4882a593Smuzhiyun STATE_LIT_LONGREP,
52*4882a593Smuzhiyun STATE_LIT_SHORTREP,
53*4882a593Smuzhiyun STATE_NONLIT_MATCH,
54*4882a593Smuzhiyun STATE_NONLIT_REP
55*4882a593Smuzhiyun };
56*4882a593Smuzhiyun
57*4882a593Smuzhiyun /* Total number of states */
58*4882a593Smuzhiyun #define STATES 12
59*4882a593Smuzhiyun
60*4882a593Smuzhiyun /* The lowest 7 states indicate that the previous state was a literal. */
61*4882a593Smuzhiyun #define LIT_STATES 7
62*4882a593Smuzhiyun
63*4882a593Smuzhiyun /* Indicate that the latest symbol was a literal. */
lzma_state_literal(enum lzma_state * state)64*4882a593Smuzhiyun static inline void lzma_state_literal(enum lzma_state *state)
65*4882a593Smuzhiyun {
66*4882a593Smuzhiyun if (*state <= STATE_SHORTREP_LIT_LIT)
67*4882a593Smuzhiyun *state = STATE_LIT_LIT;
68*4882a593Smuzhiyun else if (*state <= STATE_LIT_SHORTREP)
69*4882a593Smuzhiyun *state -= 3;
70*4882a593Smuzhiyun else
71*4882a593Smuzhiyun *state -= 6;
72*4882a593Smuzhiyun }
73*4882a593Smuzhiyun
74*4882a593Smuzhiyun /* Indicate that the latest symbol was a match. */
lzma_state_match(enum lzma_state * state)75*4882a593Smuzhiyun static inline void lzma_state_match(enum lzma_state *state)
76*4882a593Smuzhiyun {
77*4882a593Smuzhiyun *state = *state < LIT_STATES ? STATE_LIT_MATCH : STATE_NONLIT_MATCH;
78*4882a593Smuzhiyun }
79*4882a593Smuzhiyun
80*4882a593Smuzhiyun /* Indicate that the latest state was a long repeated match. */
lzma_state_long_rep(enum lzma_state * state)81*4882a593Smuzhiyun static inline void lzma_state_long_rep(enum lzma_state *state)
82*4882a593Smuzhiyun {
83*4882a593Smuzhiyun *state = *state < LIT_STATES ? STATE_LIT_LONGREP : STATE_NONLIT_REP;
84*4882a593Smuzhiyun }
85*4882a593Smuzhiyun
86*4882a593Smuzhiyun /* Indicate that the latest symbol was a short match. */
lzma_state_short_rep(enum lzma_state * state)87*4882a593Smuzhiyun static inline void lzma_state_short_rep(enum lzma_state *state)
88*4882a593Smuzhiyun {
89*4882a593Smuzhiyun *state = *state < LIT_STATES ? STATE_LIT_SHORTREP : STATE_NONLIT_REP;
90*4882a593Smuzhiyun }
91*4882a593Smuzhiyun
92*4882a593Smuzhiyun /* Test if the previous symbol was a literal. */
lzma_state_is_literal(enum lzma_state state)93*4882a593Smuzhiyun static inline bool lzma_state_is_literal(enum lzma_state state)
94*4882a593Smuzhiyun {
95*4882a593Smuzhiyun return state < LIT_STATES;
96*4882a593Smuzhiyun }
97*4882a593Smuzhiyun
98*4882a593Smuzhiyun /* Each literal coder is divided in three sections:
99*4882a593Smuzhiyun * - 0x001-0x0FF: Without match byte
100*4882a593Smuzhiyun * - 0x101-0x1FF: With match byte; match bit is 0
101*4882a593Smuzhiyun * - 0x201-0x2FF: With match byte; match bit is 1
102*4882a593Smuzhiyun *
103*4882a593Smuzhiyun * Match byte is used when the previous LZMA symbol was something else than
104*4882a593Smuzhiyun * a literal (that is, it was some kind of match).
105*4882a593Smuzhiyun */
106*4882a593Smuzhiyun #define LITERAL_CODER_SIZE 0x300
107*4882a593Smuzhiyun
108*4882a593Smuzhiyun /* Maximum number of literal coders */
109*4882a593Smuzhiyun #define LITERAL_CODERS_MAX (1 << 4)
110*4882a593Smuzhiyun
111*4882a593Smuzhiyun /* Minimum length of a match is two bytes. */
112*4882a593Smuzhiyun #define MATCH_LEN_MIN 2
113*4882a593Smuzhiyun
114*4882a593Smuzhiyun /* Match length is encoded with 4, 5, or 10 bits.
115*4882a593Smuzhiyun *
116*4882a593Smuzhiyun * Length Bits
117*4882a593Smuzhiyun * 2-9 4 = Choice=0 + 3 bits
118*4882a593Smuzhiyun * 10-17 5 = Choice=1 + Choice2=0 + 3 bits
119*4882a593Smuzhiyun * 18-273 10 = Choice=1 + Choice2=1 + 8 bits
120*4882a593Smuzhiyun */
121*4882a593Smuzhiyun #define LEN_LOW_BITS 3
122*4882a593Smuzhiyun #define LEN_LOW_SYMBOLS (1 << LEN_LOW_BITS)
123*4882a593Smuzhiyun #define LEN_MID_BITS 3
124*4882a593Smuzhiyun #define LEN_MID_SYMBOLS (1 << LEN_MID_BITS)
125*4882a593Smuzhiyun #define LEN_HIGH_BITS 8
126*4882a593Smuzhiyun #define LEN_HIGH_SYMBOLS (1 << LEN_HIGH_BITS)
127*4882a593Smuzhiyun #define LEN_SYMBOLS (LEN_LOW_SYMBOLS + LEN_MID_SYMBOLS + LEN_HIGH_SYMBOLS)
128*4882a593Smuzhiyun
129*4882a593Smuzhiyun /*
130*4882a593Smuzhiyun * Maximum length of a match is 273 which is a result of the encoding
131*4882a593Smuzhiyun * described above.
132*4882a593Smuzhiyun */
133*4882a593Smuzhiyun #define MATCH_LEN_MAX (MATCH_LEN_MIN + LEN_SYMBOLS - 1)
134*4882a593Smuzhiyun
135*4882a593Smuzhiyun /*
136*4882a593Smuzhiyun * Different sets of probabilities are used for match distances that have
137*4882a593Smuzhiyun * very short match length: Lengths of 2, 3, and 4 bytes have a separate
138*4882a593Smuzhiyun * set of probabilities for each length. The matches with longer length
139*4882a593Smuzhiyun * use a shared set of probabilities.
140*4882a593Smuzhiyun */
141*4882a593Smuzhiyun #define DIST_STATES 4
142*4882a593Smuzhiyun
143*4882a593Smuzhiyun /*
144*4882a593Smuzhiyun * Get the index of the appropriate probability array for decoding
145*4882a593Smuzhiyun * the distance slot.
146*4882a593Smuzhiyun */
lzma_get_dist_state(uint32_t len)147*4882a593Smuzhiyun static inline uint32_t lzma_get_dist_state(uint32_t len)
148*4882a593Smuzhiyun {
149*4882a593Smuzhiyun return len < DIST_STATES + MATCH_LEN_MIN
150*4882a593Smuzhiyun ? len - MATCH_LEN_MIN : DIST_STATES - 1;
151*4882a593Smuzhiyun }
152*4882a593Smuzhiyun
153*4882a593Smuzhiyun /*
154*4882a593Smuzhiyun * The highest two bits of a 32-bit match distance are encoded using six bits.
155*4882a593Smuzhiyun * This six-bit value is called a distance slot. This way encoding a 32-bit
156*4882a593Smuzhiyun * value takes 6-36 bits, larger values taking more bits.
157*4882a593Smuzhiyun */
158*4882a593Smuzhiyun #define DIST_SLOT_BITS 6
159*4882a593Smuzhiyun #define DIST_SLOTS (1 << DIST_SLOT_BITS)
160*4882a593Smuzhiyun
161*4882a593Smuzhiyun /* Match distances up to 127 are fully encoded using probabilities. Since
162*4882a593Smuzhiyun * the highest two bits (distance slot) are always encoded using six bits,
163*4882a593Smuzhiyun * the distances 0-3 don't need any additional bits to encode, since the
164*4882a593Smuzhiyun * distance slot itself is the same as the actual distance. DIST_MODEL_START
165*4882a593Smuzhiyun * indicates the first distance slot where at least one additional bit is
166*4882a593Smuzhiyun * needed.
167*4882a593Smuzhiyun */
168*4882a593Smuzhiyun #define DIST_MODEL_START 4
169*4882a593Smuzhiyun
170*4882a593Smuzhiyun /*
171*4882a593Smuzhiyun * Match distances greater than 127 are encoded in three pieces:
172*4882a593Smuzhiyun * - distance slot: the highest two bits
173*4882a593Smuzhiyun * - direct bits: 2-26 bits below the highest two bits
174*4882a593Smuzhiyun * - alignment bits: four lowest bits
175*4882a593Smuzhiyun *
176*4882a593Smuzhiyun * Direct bits don't use any probabilities.
177*4882a593Smuzhiyun *
178*4882a593Smuzhiyun * The distance slot value of 14 is for distances 128-191.
179*4882a593Smuzhiyun */
180*4882a593Smuzhiyun #define DIST_MODEL_END 14
181*4882a593Smuzhiyun
182*4882a593Smuzhiyun /* Distance slots that indicate a distance <= 127. */
183*4882a593Smuzhiyun #define FULL_DISTANCES_BITS (DIST_MODEL_END / 2)
184*4882a593Smuzhiyun #define FULL_DISTANCES (1 << FULL_DISTANCES_BITS)
185*4882a593Smuzhiyun
186*4882a593Smuzhiyun /*
187*4882a593Smuzhiyun * For match distances greater than 127, only the highest two bits and the
188*4882a593Smuzhiyun * lowest four bits (alignment) is encoded using probabilities.
189*4882a593Smuzhiyun */
190*4882a593Smuzhiyun #define ALIGN_BITS 4
191*4882a593Smuzhiyun #define ALIGN_SIZE (1 << ALIGN_BITS)
192*4882a593Smuzhiyun #define ALIGN_MASK (ALIGN_SIZE - 1)
193*4882a593Smuzhiyun
194*4882a593Smuzhiyun /* Total number of all probability variables */
195*4882a593Smuzhiyun #define PROBS_TOTAL (1846 + LITERAL_CODERS_MAX * LITERAL_CODER_SIZE)
196*4882a593Smuzhiyun
197*4882a593Smuzhiyun /*
198*4882a593Smuzhiyun * LZMA remembers the four most recent match distances. Reusing these
199*4882a593Smuzhiyun * distances tends to take less space than re-encoding the actual
200*4882a593Smuzhiyun * distance value.
201*4882a593Smuzhiyun */
202*4882a593Smuzhiyun #define REPS 4
203*4882a593Smuzhiyun
204*4882a593Smuzhiyun #endif
205