xref: /OK3568_Linux_fs/kernel/drivers/media/test-drivers/vicodec/codec-fwht.c (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun // SPDX-License-Identifier: LGPL-2.1+
2*4882a593Smuzhiyun /*
3*4882a593Smuzhiyun  * Copyright 2016 Tom aan de Wiel
4*4882a593Smuzhiyun  * Copyright 2018 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
5*4882a593Smuzhiyun  *
6*4882a593Smuzhiyun  * 8x8 Fast Walsh Hadamard Transform in sequency order based on the paper:
7*4882a593Smuzhiyun  *
8*4882a593Smuzhiyun  * A Recursive Algorithm for Sequency-Ordered Fast Walsh Transforms,
9*4882a593Smuzhiyun  * R.D. Brown, 1977
10*4882a593Smuzhiyun  */
11*4882a593Smuzhiyun 
12*4882a593Smuzhiyun #include <linux/string.h>
13*4882a593Smuzhiyun #include <linux/kernel.h>
14*4882a593Smuzhiyun #include "codec-fwht.h"
15*4882a593Smuzhiyun 
/*
 * Flag bits of the 16-bit word that starts every run-length coded block.
 *
 * Note: bit 0 of the header must always be 0. Otherwise it cannot
 * be guaranteed that the magic 8 byte sequence (see below) can
 * never occur in the rlc output.
 */
#define PFRAME_BIT BIT(15)	/* header written by rlc() for a PBLOCK */
#define OVERFLOW_BIT BIT(14)	/* returned by derlc() when input runs out */
/* NOTE(review): presumably masks a duplicate count; no user in this part */
#define DUPS_MASK 0x1ffe

/* Block types: passed to rlc(), returned by decide_blocktype() */
#define PBLOCK 0
#define IBLOCK 1

/* Run-length value meaning "every remaining coefficient is zero" */
#define ALL_ZEROS 15
30*4882a593Smuzhiyun 
/*
 * Scan order for an 8x8 block: entry n is the row-major index
 * (x + y * 8) of the n-th coefficient visited. The block is walked
 * one anti-diagonal at a time, starting at the top-left corner; each
 * line below is one diagonal.
 */
static const uint8_t zigzag[8 * 8] = {
	 0,					/* diagonal  0 */
	 1,  8,					/* diagonal  1 */
	 2,  9, 16,				/* diagonal  2 */
	 3, 10, 17, 24,				/* diagonal  3 */
	 4, 11, 18, 25, 32,			/* diagonal  4 */
	 5, 12, 19, 26, 33, 40,			/* diagonal  5 */
	 6, 13, 20, 27, 34, 41, 48,		/* diagonal  6 */
	 7, 14, 21, 28, 35, 42, 49, 56,		/* diagonal  7 */
	15, 22, 29, 36, 43, 50, 57,		/* diagonal  8 */
	23, 30, 37, 44, 51, 58,			/* diagonal  9 */
	31, 38, 45, 52, 59,			/* diagonal 10 */
	39, 46, 53, 60,				/* diagonal 11 */
	47, 54, 61,				/* diagonal 12 */
	55, 62,					/* diagonal 13 */
	63,					/* diagonal 14 */
};
48*4882a593Smuzhiyun 
49*4882a593Smuzhiyun /*
50*4882a593Smuzhiyun  * noinline_for_stack to work around
51*4882a593Smuzhiyun  * https://bugs.llvm.org/show_bug.cgi?id=38809
52*4882a593Smuzhiyun  */
53*4882a593Smuzhiyun static int noinline_for_stack
rlc(const s16 * in,__be16 * output,int blocktype)54*4882a593Smuzhiyun rlc(const s16 *in, __be16 *output, int blocktype)
55*4882a593Smuzhiyun {
56*4882a593Smuzhiyun 	s16 block[8 * 8];
57*4882a593Smuzhiyun 	s16 *wp = block;
58*4882a593Smuzhiyun 	int i = 0;
59*4882a593Smuzhiyun 	int x, y;
60*4882a593Smuzhiyun 	int ret = 0;
61*4882a593Smuzhiyun 
62*4882a593Smuzhiyun 	/* read in block from framebuffer */
63*4882a593Smuzhiyun 	int lastzero_run = 0;
64*4882a593Smuzhiyun 	int to_encode;
65*4882a593Smuzhiyun 
66*4882a593Smuzhiyun 	for (y = 0; y < 8; y++) {
67*4882a593Smuzhiyun 		for (x = 0; x < 8; x++) {
68*4882a593Smuzhiyun 			*wp = in[x + y * 8];
69*4882a593Smuzhiyun 			wp++;
70*4882a593Smuzhiyun 		}
71*4882a593Smuzhiyun 	}
72*4882a593Smuzhiyun 
73*4882a593Smuzhiyun 	/* keep track of amount of trailing zeros */
74*4882a593Smuzhiyun 	for (i = 63; i >= 0 && !block[zigzag[i]]; i--)
75*4882a593Smuzhiyun 		lastzero_run++;
76*4882a593Smuzhiyun 
77*4882a593Smuzhiyun 	*output++ = (blocktype == PBLOCK ? htons(PFRAME_BIT) : 0);
78*4882a593Smuzhiyun 	ret++;
79*4882a593Smuzhiyun 
80*4882a593Smuzhiyun 	to_encode = 8 * 8 - (lastzero_run > 14 ? lastzero_run : 0);
81*4882a593Smuzhiyun 
82*4882a593Smuzhiyun 	i = 0;
83*4882a593Smuzhiyun 	while (i < to_encode) {
84*4882a593Smuzhiyun 		int cnt = 0;
85*4882a593Smuzhiyun 		int tmp;
86*4882a593Smuzhiyun 
87*4882a593Smuzhiyun 		/* count leading zeros */
88*4882a593Smuzhiyun 		while ((tmp = block[zigzag[i]]) == 0 && cnt < 14) {
89*4882a593Smuzhiyun 			cnt++;
90*4882a593Smuzhiyun 			i++;
91*4882a593Smuzhiyun 			if (i == to_encode) {
92*4882a593Smuzhiyun 				cnt--;
93*4882a593Smuzhiyun 				break;
94*4882a593Smuzhiyun 			}
95*4882a593Smuzhiyun 		}
96*4882a593Smuzhiyun 		/* 4 bits for run, 12 for coefficient (quantization by 4) */
97*4882a593Smuzhiyun 		*output++ = htons((cnt | tmp << 4));
98*4882a593Smuzhiyun 		i++;
99*4882a593Smuzhiyun 		ret++;
100*4882a593Smuzhiyun 	}
101*4882a593Smuzhiyun 	if (lastzero_run > 14) {
102*4882a593Smuzhiyun 		*output = htons(ALL_ZEROS | 0);
103*4882a593Smuzhiyun 		ret++;
104*4882a593Smuzhiyun 	}
105*4882a593Smuzhiyun 
106*4882a593Smuzhiyun 	return ret;
107*4882a593Smuzhiyun }
108*4882a593Smuzhiyun 
109*4882a593Smuzhiyun /*
110*4882a593Smuzhiyun  * This function will worst-case increase rlc_in by 65*2 bytes:
111*4882a593Smuzhiyun  * one s16 value for the header and 8 * 8 coefficients of type s16.
112*4882a593Smuzhiyun  */
113*4882a593Smuzhiyun static noinline_for_stack u16
derlc(const __be16 ** rlc_in,s16 * dwht_out,const __be16 * end_of_input)114*4882a593Smuzhiyun derlc(const __be16 **rlc_in, s16 *dwht_out, const __be16 *end_of_input)
115*4882a593Smuzhiyun {
116*4882a593Smuzhiyun 	/* header */
117*4882a593Smuzhiyun 	const __be16 *input = *rlc_in;
118*4882a593Smuzhiyun 	u16 stat;
119*4882a593Smuzhiyun 	int dec_count = 0;
120*4882a593Smuzhiyun 	s16 block[8 * 8 + 16];
121*4882a593Smuzhiyun 	s16 *wp = block;
122*4882a593Smuzhiyun 	int i;
123*4882a593Smuzhiyun 
124*4882a593Smuzhiyun 	if (input > end_of_input)
125*4882a593Smuzhiyun 		return OVERFLOW_BIT;
126*4882a593Smuzhiyun 	stat = ntohs(*input++);
127*4882a593Smuzhiyun 
128*4882a593Smuzhiyun 	/*
129*4882a593Smuzhiyun 	 * Now de-compress, it expands one byte to up to 15 bytes
130*4882a593Smuzhiyun 	 * (or fills the remainder of the 64 bytes with zeroes if it
131*4882a593Smuzhiyun 	 * is the last byte to expand).
132*4882a593Smuzhiyun 	 *
133*4882a593Smuzhiyun 	 * So block has to be 8 * 8 + 16 bytes, the '+ 16' is to
134*4882a593Smuzhiyun 	 * allow for overflow if the incoming data was malformed.
135*4882a593Smuzhiyun 	 */
136*4882a593Smuzhiyun 	while (dec_count < 8 * 8) {
137*4882a593Smuzhiyun 		s16 in;
138*4882a593Smuzhiyun 		int length;
139*4882a593Smuzhiyun 		int coeff;
140*4882a593Smuzhiyun 
141*4882a593Smuzhiyun 		if (input > end_of_input)
142*4882a593Smuzhiyun 			return OVERFLOW_BIT;
143*4882a593Smuzhiyun 		in = ntohs(*input++);
144*4882a593Smuzhiyun 		length = in & 0xf;
145*4882a593Smuzhiyun 		coeff = in >> 4;
146*4882a593Smuzhiyun 
147*4882a593Smuzhiyun 		/* fill remainder with zeros */
148*4882a593Smuzhiyun 		if (length == 15) {
149*4882a593Smuzhiyun 			for (i = 0; i < 64 - dec_count; i++)
150*4882a593Smuzhiyun 				*wp++ = 0;
151*4882a593Smuzhiyun 			break;
152*4882a593Smuzhiyun 		}
153*4882a593Smuzhiyun 
154*4882a593Smuzhiyun 		for (i = 0; i < length; i++)
155*4882a593Smuzhiyun 			*wp++ = 0;
156*4882a593Smuzhiyun 		*wp++ = coeff;
157*4882a593Smuzhiyun 		dec_count += length + 1;
158*4882a593Smuzhiyun 	}
159*4882a593Smuzhiyun 
160*4882a593Smuzhiyun 	wp = block;
161*4882a593Smuzhiyun 
162*4882a593Smuzhiyun 	for (i = 0; i < 64; i++) {
163*4882a593Smuzhiyun 		int pos = zigzag[i];
164*4882a593Smuzhiyun 		int y = pos / 8;
165*4882a593Smuzhiyun 		int x = pos % 8;
166*4882a593Smuzhiyun 
167*4882a593Smuzhiyun 		dwht_out[x + y * 8] = *wp++;
168*4882a593Smuzhiyun 	}
169*4882a593Smuzhiyun 	*rlc_in = input;
170*4882a593Smuzhiyun 	return stat;
171*4882a593Smuzhiyun }
172*4882a593Smuzhiyun 
/*
 * Per-coefficient shift counts used by quantize_intra() and
 * dequantize_intra(), laid out row-major like the 8x8 block itself.
 */
static const int quant_table[8 * 8] = {
	2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 3,
	2, 2, 2, 2, 2, 2, 3, 6,
	2, 2, 2, 2, 2, 3, 6, 6,
	2, 2, 2, 2, 3, 6, 6, 6,
	2, 2, 2, 3, 6, 6, 6, 6,
	2, 2, 3, 6, 6, 6, 6, 8,
};
183*4882a593Smuzhiyun 
/*
 * Per-coefficient shift counts used by quantize_inter() and
 * dequantize_inter(), laid out row-major like the 8x8 block itself.
 */
static const int quant_table_p[8 * 8] = {
	3, 3, 3, 3, 3, 3, 3, 3,
	3, 3, 3, 3, 3, 3, 3, 3,
	3, 3, 3, 3, 3, 3, 3, 3,
	3, 3, 3, 3, 3, 3, 3, 6,
	3, 3, 3, 3, 3, 3, 6, 6,
	3, 3, 3, 3, 3, 6, 6, 9,
	3, 3, 3, 3, 6, 6, 9, 9,
	3, 3, 3, 6, 6, 9, 9, 10,
};
194*4882a593Smuzhiyun 
/*
 * quantize_intra() - quantize the coefficients of a block in place
 * @coeff:	64 coefficients, overwritten with their quantized values
 * @de_coeff:	receives the 64 values obtained by scaling the quantized
 *		coefficients back up again
 * @qp:		quantized values within [-qp, qp] are forced to zero
 *
 * Each coefficient is shifted right by the matching quant_table entry.
 */
static void quantize_intra(s16 *coeff, s16 *de_coeff, u16 qp)
{
	int i;

	for (i = 0; i < 8 * 8; i++) {
		const int shift = quant_table[i];

		coeff[i] >>= shift;
		if (coeff[i] >= -qp && coeff[i] <= qp) {
			coeff[i] = de_coeff[i] = 0;
		} else {
			/*
			 * Scale by multiplying: the value may be negative
			 * and left-shifting a negative value is undefined
			 * in ISO C.
			 */
			de_coeff[i] = coeff[i] * (1 << shift);
		}
	}
}
210*4882a593Smuzhiyun 
/*
 * dequantize_intra() - scale quantized coefficients back up in place
 * @coeff:	64 quantized coefficients
 *
 * Each coefficient is multiplied by 2 to the power of the matching
 * quant_table entry. A multiplication is used instead of a left shift
 * because the coefficients may be negative, and left-shifting a
 * negative value is undefined in ISO C.
 */
static void dequantize_intra(s16 *coeff)
{
	int i;

	for (i = 0; i < 8 * 8; i++)
		coeff[i] = coeff[i] * (1 << quant_table[i]);
}
220*4882a593Smuzhiyun 
/*
 * quantize_inter() - quantize the coefficients of a block in place
 * @coeff:	64 coefficients, overwritten with their quantized values
 * @de_coeff:	receives the 64 values obtained by scaling the quantized
 *		coefficients back up again
 * @qp:		quantized values within [-qp, qp] are forced to zero
 *
 * Each coefficient is shifted right by the matching quant_table_p entry.
 */
static void quantize_inter(s16 *coeff, s16 *de_coeff, u16 qp)
{
	int i;

	for (i = 0; i < 8 * 8; i++) {
		const int shift = quant_table_p[i];

		coeff[i] >>= shift;
		if (coeff[i] >= -qp && coeff[i] <= qp) {
			coeff[i] = de_coeff[i] = 0;
		} else {
			/*
			 * Scale by multiplying: the value may be negative
			 * and left-shifting a negative value is undefined
			 * in ISO C.
			 */
			de_coeff[i] = coeff[i] * (1 << shift);
		}
	}
}
236*4882a593Smuzhiyun 
/*
 * dequantize_inter() - scale quantized coefficients back up in place
 * @coeff:	64 quantized coefficients
 *
 * Each coefficient is multiplied by 2 to the power of the matching
 * quant_table_p entry. A multiplication is used instead of a left shift
 * because the coefficients may be negative, and left-shifting a
 * negative value is undefined in ISO C.
 */
static void dequantize_inter(s16 *coeff)
{
	int i;

	for (i = 0; i < 8 * 8; i++)
		coeff[i] = coeff[i] * (1 << quant_table_p[i]);
}
246*4882a593Smuzhiyun 
fwht(const u8 * block,s16 * output_block,unsigned int stride,unsigned int input_step,bool intra)247*4882a593Smuzhiyun static void noinline_for_stack fwht(const u8 *block, s16 *output_block,
248*4882a593Smuzhiyun 				    unsigned int stride,
249*4882a593Smuzhiyun 				    unsigned int input_step, bool intra)
250*4882a593Smuzhiyun {
251*4882a593Smuzhiyun 	/* we'll need more than 8 bits for the transformed coefficients */
252*4882a593Smuzhiyun 	s32 workspace1[8], workspace2[8];
253*4882a593Smuzhiyun 	const u8 *tmp = block;
254*4882a593Smuzhiyun 	s16 *out = output_block;
255*4882a593Smuzhiyun 	int add = intra ? 256 : 0;
256*4882a593Smuzhiyun 	unsigned int i;
257*4882a593Smuzhiyun 
258*4882a593Smuzhiyun 	/* stage 1 */
259*4882a593Smuzhiyun 	for (i = 0; i < 8; i++, tmp += stride, out += 8) {
260*4882a593Smuzhiyun 		switch (input_step) {
261*4882a593Smuzhiyun 		case 1:
262*4882a593Smuzhiyun 			workspace1[0]  = tmp[0] + tmp[1] - add;
263*4882a593Smuzhiyun 			workspace1[1]  = tmp[0] - tmp[1];
264*4882a593Smuzhiyun 
265*4882a593Smuzhiyun 			workspace1[2]  = tmp[2] + tmp[3] - add;
266*4882a593Smuzhiyun 			workspace1[3]  = tmp[2] - tmp[3];
267*4882a593Smuzhiyun 
268*4882a593Smuzhiyun 			workspace1[4]  = tmp[4] + tmp[5] - add;
269*4882a593Smuzhiyun 			workspace1[5]  = tmp[4] - tmp[5];
270*4882a593Smuzhiyun 
271*4882a593Smuzhiyun 			workspace1[6]  = tmp[6] + tmp[7] - add;
272*4882a593Smuzhiyun 			workspace1[7]  = tmp[6] - tmp[7];
273*4882a593Smuzhiyun 			break;
274*4882a593Smuzhiyun 		case 2:
275*4882a593Smuzhiyun 			workspace1[0]  = tmp[0] + tmp[2] - add;
276*4882a593Smuzhiyun 			workspace1[1]  = tmp[0] - tmp[2];
277*4882a593Smuzhiyun 
278*4882a593Smuzhiyun 			workspace1[2]  = tmp[4] + tmp[6] - add;
279*4882a593Smuzhiyun 			workspace1[3]  = tmp[4] - tmp[6];
280*4882a593Smuzhiyun 
281*4882a593Smuzhiyun 			workspace1[4]  = tmp[8] + tmp[10] - add;
282*4882a593Smuzhiyun 			workspace1[5]  = tmp[8] - tmp[10];
283*4882a593Smuzhiyun 
284*4882a593Smuzhiyun 			workspace1[6]  = tmp[12] + tmp[14] - add;
285*4882a593Smuzhiyun 			workspace1[7]  = tmp[12] - tmp[14];
286*4882a593Smuzhiyun 			break;
287*4882a593Smuzhiyun 		case 3:
288*4882a593Smuzhiyun 			workspace1[0]  = tmp[0] + tmp[3] - add;
289*4882a593Smuzhiyun 			workspace1[1]  = tmp[0] - tmp[3];
290*4882a593Smuzhiyun 
291*4882a593Smuzhiyun 			workspace1[2]  = tmp[6] + tmp[9] - add;
292*4882a593Smuzhiyun 			workspace1[3]  = tmp[6] - tmp[9];
293*4882a593Smuzhiyun 
294*4882a593Smuzhiyun 			workspace1[4]  = tmp[12] + tmp[15] - add;
295*4882a593Smuzhiyun 			workspace1[5]  = tmp[12] - tmp[15];
296*4882a593Smuzhiyun 
297*4882a593Smuzhiyun 			workspace1[6]  = tmp[18] + tmp[21] - add;
298*4882a593Smuzhiyun 			workspace1[7]  = tmp[18] - tmp[21];
299*4882a593Smuzhiyun 			break;
300*4882a593Smuzhiyun 		default:
301*4882a593Smuzhiyun 			workspace1[0]  = tmp[0] + tmp[4] - add;
302*4882a593Smuzhiyun 			workspace1[1]  = tmp[0] - tmp[4];
303*4882a593Smuzhiyun 
304*4882a593Smuzhiyun 			workspace1[2]  = tmp[8] + tmp[12] - add;
305*4882a593Smuzhiyun 			workspace1[3]  = tmp[8] - tmp[12];
306*4882a593Smuzhiyun 
307*4882a593Smuzhiyun 			workspace1[4]  = tmp[16] + tmp[20] - add;
308*4882a593Smuzhiyun 			workspace1[5]  = tmp[16] - tmp[20];
309*4882a593Smuzhiyun 
310*4882a593Smuzhiyun 			workspace1[6]  = tmp[24] + tmp[28] - add;
311*4882a593Smuzhiyun 			workspace1[7]  = tmp[24] - tmp[28];
312*4882a593Smuzhiyun 			break;
313*4882a593Smuzhiyun 		}
314*4882a593Smuzhiyun 
315*4882a593Smuzhiyun 		/* stage 2 */
316*4882a593Smuzhiyun 		workspace2[0] = workspace1[0] + workspace1[2];
317*4882a593Smuzhiyun 		workspace2[1] = workspace1[0] - workspace1[2];
318*4882a593Smuzhiyun 		workspace2[2] = workspace1[1] - workspace1[3];
319*4882a593Smuzhiyun 		workspace2[3] = workspace1[1] + workspace1[3];
320*4882a593Smuzhiyun 
321*4882a593Smuzhiyun 		workspace2[4] = workspace1[4] + workspace1[6];
322*4882a593Smuzhiyun 		workspace2[5] = workspace1[4] - workspace1[6];
323*4882a593Smuzhiyun 		workspace2[6] = workspace1[5] - workspace1[7];
324*4882a593Smuzhiyun 		workspace2[7] = workspace1[5] + workspace1[7];
325*4882a593Smuzhiyun 
326*4882a593Smuzhiyun 		/* stage 3 */
327*4882a593Smuzhiyun 		out[0] = workspace2[0] + workspace2[4];
328*4882a593Smuzhiyun 		out[1] = workspace2[0] - workspace2[4];
329*4882a593Smuzhiyun 		out[2] = workspace2[1] - workspace2[5];
330*4882a593Smuzhiyun 		out[3] = workspace2[1] + workspace2[5];
331*4882a593Smuzhiyun 		out[4] = workspace2[2] + workspace2[6];
332*4882a593Smuzhiyun 		out[5] = workspace2[2] - workspace2[6];
333*4882a593Smuzhiyun 		out[6] = workspace2[3] - workspace2[7];
334*4882a593Smuzhiyun 		out[7] = workspace2[3] + workspace2[7];
335*4882a593Smuzhiyun 	}
336*4882a593Smuzhiyun 
337*4882a593Smuzhiyun 	out = output_block;
338*4882a593Smuzhiyun 
339*4882a593Smuzhiyun 	for (i = 0; i < 8; i++, out++) {
340*4882a593Smuzhiyun 		/* stage 1 */
341*4882a593Smuzhiyun 		workspace1[0]  = out[0] + out[1 * 8];
342*4882a593Smuzhiyun 		workspace1[1]  = out[0] - out[1 * 8];
343*4882a593Smuzhiyun 
344*4882a593Smuzhiyun 		workspace1[2]  = out[2 * 8] + out[3 * 8];
345*4882a593Smuzhiyun 		workspace1[3]  = out[2 * 8] - out[3 * 8];
346*4882a593Smuzhiyun 
347*4882a593Smuzhiyun 		workspace1[4]  = out[4 * 8] + out[5 * 8];
348*4882a593Smuzhiyun 		workspace1[5]  = out[4 * 8] - out[5 * 8];
349*4882a593Smuzhiyun 
350*4882a593Smuzhiyun 		workspace1[6]  = out[6 * 8] + out[7 * 8];
351*4882a593Smuzhiyun 		workspace1[7]  = out[6 * 8] - out[7 * 8];
352*4882a593Smuzhiyun 
353*4882a593Smuzhiyun 		/* stage 2 */
354*4882a593Smuzhiyun 		workspace2[0] = workspace1[0] + workspace1[2];
355*4882a593Smuzhiyun 		workspace2[1] = workspace1[0] - workspace1[2];
356*4882a593Smuzhiyun 		workspace2[2] = workspace1[1] - workspace1[3];
357*4882a593Smuzhiyun 		workspace2[3] = workspace1[1] + workspace1[3];
358*4882a593Smuzhiyun 
359*4882a593Smuzhiyun 		workspace2[4] = workspace1[4] + workspace1[6];
360*4882a593Smuzhiyun 		workspace2[5] = workspace1[4] - workspace1[6];
361*4882a593Smuzhiyun 		workspace2[6] = workspace1[5] - workspace1[7];
362*4882a593Smuzhiyun 		workspace2[7] = workspace1[5] + workspace1[7];
363*4882a593Smuzhiyun 		/* stage 3 */
364*4882a593Smuzhiyun 		out[0 * 8] = workspace2[0] + workspace2[4];
365*4882a593Smuzhiyun 		out[1 * 8] = workspace2[0] - workspace2[4];
366*4882a593Smuzhiyun 		out[2 * 8] = workspace2[1] - workspace2[5];
367*4882a593Smuzhiyun 		out[3 * 8] = workspace2[1] + workspace2[5];
368*4882a593Smuzhiyun 		out[4 * 8] = workspace2[2] + workspace2[6];
369*4882a593Smuzhiyun 		out[5 * 8] = workspace2[2] - workspace2[6];
370*4882a593Smuzhiyun 		out[6 * 8] = workspace2[3] - workspace2[7];
371*4882a593Smuzhiyun 		out[7 * 8] = workspace2[3] + workspace2[7];
372*4882a593Smuzhiyun 	}
373*4882a593Smuzhiyun }
374*4882a593Smuzhiyun 
375*4882a593Smuzhiyun /*
376*4882a593Smuzhiyun  * Not the nicest way of doing it, but P-blocks get twice the range of
377*4882a593Smuzhiyun  * that of the I-blocks. Therefore we need a type bigger than 8 bits.
378*4882a593Smuzhiyun  * Furthermore values can be negative... This is just a version that
379*4882a593Smuzhiyun  * works with 16 signed data
380*4882a593Smuzhiyun  */
381*4882a593Smuzhiyun static void noinline_for_stack
fwht16(const s16 * block,s16 * output_block,int stride,int intra)382*4882a593Smuzhiyun fwht16(const s16 *block, s16 *output_block, int stride, int intra)
383*4882a593Smuzhiyun {
384*4882a593Smuzhiyun 	/* we'll need more than 8 bits for the transformed coefficients */
385*4882a593Smuzhiyun 	s32 workspace1[8], workspace2[8];
386*4882a593Smuzhiyun 	const s16 *tmp = block;
387*4882a593Smuzhiyun 	s16 *out = output_block;
388*4882a593Smuzhiyun 	int i;
389*4882a593Smuzhiyun 
390*4882a593Smuzhiyun 	for (i = 0; i < 8; i++, tmp += stride, out += 8) {
391*4882a593Smuzhiyun 		/* stage 1 */
392*4882a593Smuzhiyun 		workspace1[0]  = tmp[0] + tmp[1];
393*4882a593Smuzhiyun 		workspace1[1]  = tmp[0] - tmp[1];
394*4882a593Smuzhiyun 
395*4882a593Smuzhiyun 		workspace1[2]  = tmp[2] + tmp[3];
396*4882a593Smuzhiyun 		workspace1[3]  = tmp[2] - tmp[3];
397*4882a593Smuzhiyun 
398*4882a593Smuzhiyun 		workspace1[4]  = tmp[4] + tmp[5];
399*4882a593Smuzhiyun 		workspace1[5]  = tmp[4] - tmp[5];
400*4882a593Smuzhiyun 
401*4882a593Smuzhiyun 		workspace1[6]  = tmp[6] + tmp[7];
402*4882a593Smuzhiyun 		workspace1[7]  = tmp[6] - tmp[7];
403*4882a593Smuzhiyun 
404*4882a593Smuzhiyun 		/* stage 2 */
405*4882a593Smuzhiyun 		workspace2[0] = workspace1[0] + workspace1[2];
406*4882a593Smuzhiyun 		workspace2[1] = workspace1[0] - workspace1[2];
407*4882a593Smuzhiyun 		workspace2[2] = workspace1[1] - workspace1[3];
408*4882a593Smuzhiyun 		workspace2[3] = workspace1[1] + workspace1[3];
409*4882a593Smuzhiyun 
410*4882a593Smuzhiyun 		workspace2[4] = workspace1[4] + workspace1[6];
411*4882a593Smuzhiyun 		workspace2[5] = workspace1[4] - workspace1[6];
412*4882a593Smuzhiyun 		workspace2[6] = workspace1[5] - workspace1[7];
413*4882a593Smuzhiyun 		workspace2[7] = workspace1[5] + workspace1[7];
414*4882a593Smuzhiyun 
415*4882a593Smuzhiyun 		/* stage 3 */
416*4882a593Smuzhiyun 		out[0] = workspace2[0] + workspace2[4];
417*4882a593Smuzhiyun 		out[1] = workspace2[0] - workspace2[4];
418*4882a593Smuzhiyun 		out[2] = workspace2[1] - workspace2[5];
419*4882a593Smuzhiyun 		out[3] = workspace2[1] + workspace2[5];
420*4882a593Smuzhiyun 		out[4] = workspace2[2] + workspace2[6];
421*4882a593Smuzhiyun 		out[5] = workspace2[2] - workspace2[6];
422*4882a593Smuzhiyun 		out[6] = workspace2[3] - workspace2[7];
423*4882a593Smuzhiyun 		out[7] = workspace2[3] + workspace2[7];
424*4882a593Smuzhiyun 	}
425*4882a593Smuzhiyun 
426*4882a593Smuzhiyun 	out = output_block;
427*4882a593Smuzhiyun 
428*4882a593Smuzhiyun 	for (i = 0; i < 8; i++, out++) {
429*4882a593Smuzhiyun 		/* stage 1 */
430*4882a593Smuzhiyun 		workspace1[0]  = out[0] + out[1*8];
431*4882a593Smuzhiyun 		workspace1[1]  = out[0] - out[1*8];
432*4882a593Smuzhiyun 
433*4882a593Smuzhiyun 		workspace1[2]  = out[2*8] + out[3*8];
434*4882a593Smuzhiyun 		workspace1[3]  = out[2*8] - out[3*8];
435*4882a593Smuzhiyun 
436*4882a593Smuzhiyun 		workspace1[4]  = out[4*8] + out[5*8];
437*4882a593Smuzhiyun 		workspace1[5]  = out[4*8] - out[5*8];
438*4882a593Smuzhiyun 
439*4882a593Smuzhiyun 		workspace1[6]  = out[6*8] + out[7*8];
440*4882a593Smuzhiyun 		workspace1[7]  = out[6*8] - out[7*8];
441*4882a593Smuzhiyun 
442*4882a593Smuzhiyun 		/* stage 2 */
443*4882a593Smuzhiyun 		workspace2[0] = workspace1[0] + workspace1[2];
444*4882a593Smuzhiyun 		workspace2[1] = workspace1[0] - workspace1[2];
445*4882a593Smuzhiyun 		workspace2[2] = workspace1[1] - workspace1[3];
446*4882a593Smuzhiyun 		workspace2[3] = workspace1[1] + workspace1[3];
447*4882a593Smuzhiyun 
448*4882a593Smuzhiyun 		workspace2[4] = workspace1[4] + workspace1[6];
449*4882a593Smuzhiyun 		workspace2[5] = workspace1[4] - workspace1[6];
450*4882a593Smuzhiyun 		workspace2[6] = workspace1[5] - workspace1[7];
451*4882a593Smuzhiyun 		workspace2[7] = workspace1[5] + workspace1[7];
452*4882a593Smuzhiyun 
453*4882a593Smuzhiyun 		/* stage 3 */
454*4882a593Smuzhiyun 		out[0*8] = workspace2[0] + workspace2[4];
455*4882a593Smuzhiyun 		out[1*8] = workspace2[0] - workspace2[4];
456*4882a593Smuzhiyun 		out[2*8] = workspace2[1] - workspace2[5];
457*4882a593Smuzhiyun 		out[3*8] = workspace2[1] + workspace2[5];
458*4882a593Smuzhiyun 		out[4*8] = workspace2[2] + workspace2[6];
459*4882a593Smuzhiyun 		out[5*8] = workspace2[2] - workspace2[6];
460*4882a593Smuzhiyun 		out[6*8] = workspace2[3] - workspace2[7];
461*4882a593Smuzhiyun 		out[7*8] = workspace2[3] + workspace2[7];
462*4882a593Smuzhiyun 	}
463*4882a593Smuzhiyun }
464*4882a593Smuzhiyun 
465*4882a593Smuzhiyun static noinline_for_stack void
ifwht(const s16 * block,s16 * output_block,int intra)466*4882a593Smuzhiyun ifwht(const s16 *block, s16 *output_block, int intra)
467*4882a593Smuzhiyun {
468*4882a593Smuzhiyun 	/*
469*4882a593Smuzhiyun 	 * we'll need more than 8 bits for the transformed coefficients
470*4882a593Smuzhiyun 	 * use native unit of cpu
471*4882a593Smuzhiyun 	 */
472*4882a593Smuzhiyun 	int workspace1[8], workspace2[8];
473*4882a593Smuzhiyun 	int inter = intra ? 0 : 1;
474*4882a593Smuzhiyun 	const s16 *tmp = block;
475*4882a593Smuzhiyun 	s16 *out = output_block;
476*4882a593Smuzhiyun 	int i;
477*4882a593Smuzhiyun 
478*4882a593Smuzhiyun 	for (i = 0; i < 8; i++, tmp += 8, out += 8) {
479*4882a593Smuzhiyun 		/* stage 1 */
480*4882a593Smuzhiyun 		workspace1[0]  = tmp[0] + tmp[1];
481*4882a593Smuzhiyun 		workspace1[1]  = tmp[0] - tmp[1];
482*4882a593Smuzhiyun 
483*4882a593Smuzhiyun 		workspace1[2]  = tmp[2] + tmp[3];
484*4882a593Smuzhiyun 		workspace1[3]  = tmp[2] - tmp[3];
485*4882a593Smuzhiyun 
486*4882a593Smuzhiyun 		workspace1[4]  = tmp[4] + tmp[5];
487*4882a593Smuzhiyun 		workspace1[5]  = tmp[4] - tmp[5];
488*4882a593Smuzhiyun 
489*4882a593Smuzhiyun 		workspace1[6]  = tmp[6] + tmp[7];
490*4882a593Smuzhiyun 		workspace1[7]  = tmp[6] - tmp[7];
491*4882a593Smuzhiyun 
492*4882a593Smuzhiyun 		/* stage 2 */
493*4882a593Smuzhiyun 		workspace2[0] = workspace1[0] + workspace1[2];
494*4882a593Smuzhiyun 		workspace2[1] = workspace1[0] - workspace1[2];
495*4882a593Smuzhiyun 		workspace2[2] = workspace1[1] - workspace1[3];
496*4882a593Smuzhiyun 		workspace2[3] = workspace1[1] + workspace1[3];
497*4882a593Smuzhiyun 
498*4882a593Smuzhiyun 		workspace2[4] = workspace1[4] + workspace1[6];
499*4882a593Smuzhiyun 		workspace2[5] = workspace1[4] - workspace1[6];
500*4882a593Smuzhiyun 		workspace2[6] = workspace1[5] - workspace1[7];
501*4882a593Smuzhiyun 		workspace2[7] = workspace1[5] + workspace1[7];
502*4882a593Smuzhiyun 
503*4882a593Smuzhiyun 		/* stage 3 */
504*4882a593Smuzhiyun 		out[0] = workspace2[0] + workspace2[4];
505*4882a593Smuzhiyun 		out[1] = workspace2[0] - workspace2[4];
506*4882a593Smuzhiyun 		out[2] = workspace2[1] - workspace2[5];
507*4882a593Smuzhiyun 		out[3] = workspace2[1] + workspace2[5];
508*4882a593Smuzhiyun 		out[4] = workspace2[2] + workspace2[6];
509*4882a593Smuzhiyun 		out[5] = workspace2[2] - workspace2[6];
510*4882a593Smuzhiyun 		out[6] = workspace2[3] - workspace2[7];
511*4882a593Smuzhiyun 		out[7] = workspace2[3] + workspace2[7];
512*4882a593Smuzhiyun 	}
513*4882a593Smuzhiyun 
514*4882a593Smuzhiyun 	out = output_block;
515*4882a593Smuzhiyun 
516*4882a593Smuzhiyun 	for (i = 0; i < 8; i++, out++) {
517*4882a593Smuzhiyun 		/* stage 1 */
518*4882a593Smuzhiyun 		workspace1[0]  = out[0] + out[1 * 8];
519*4882a593Smuzhiyun 		workspace1[1]  = out[0] - out[1 * 8];
520*4882a593Smuzhiyun 
521*4882a593Smuzhiyun 		workspace1[2]  = out[2 * 8] + out[3 * 8];
522*4882a593Smuzhiyun 		workspace1[3]  = out[2 * 8] - out[3 * 8];
523*4882a593Smuzhiyun 
524*4882a593Smuzhiyun 		workspace1[4]  = out[4 * 8] + out[5 * 8];
525*4882a593Smuzhiyun 		workspace1[5]  = out[4 * 8] - out[5 * 8];
526*4882a593Smuzhiyun 
527*4882a593Smuzhiyun 		workspace1[6]  = out[6 * 8] + out[7 * 8];
528*4882a593Smuzhiyun 		workspace1[7]  = out[6 * 8] - out[7 * 8];
529*4882a593Smuzhiyun 
530*4882a593Smuzhiyun 		/* stage 2 */
531*4882a593Smuzhiyun 		workspace2[0] = workspace1[0] + workspace1[2];
532*4882a593Smuzhiyun 		workspace2[1] = workspace1[0] - workspace1[2];
533*4882a593Smuzhiyun 		workspace2[2] = workspace1[1] - workspace1[3];
534*4882a593Smuzhiyun 		workspace2[3] = workspace1[1] + workspace1[3];
535*4882a593Smuzhiyun 
536*4882a593Smuzhiyun 		workspace2[4] = workspace1[4] + workspace1[6];
537*4882a593Smuzhiyun 		workspace2[5] = workspace1[4] - workspace1[6];
538*4882a593Smuzhiyun 		workspace2[6] = workspace1[5] - workspace1[7];
539*4882a593Smuzhiyun 		workspace2[7] = workspace1[5] + workspace1[7];
540*4882a593Smuzhiyun 
541*4882a593Smuzhiyun 		/* stage 3 */
542*4882a593Smuzhiyun 		if (inter) {
543*4882a593Smuzhiyun 			int d;
544*4882a593Smuzhiyun 
545*4882a593Smuzhiyun 			out[0 * 8] = workspace2[0] + workspace2[4];
546*4882a593Smuzhiyun 			out[1 * 8] = workspace2[0] - workspace2[4];
547*4882a593Smuzhiyun 			out[2 * 8] = workspace2[1] - workspace2[5];
548*4882a593Smuzhiyun 			out[3 * 8] = workspace2[1] + workspace2[5];
549*4882a593Smuzhiyun 			out[4 * 8] = workspace2[2] + workspace2[6];
550*4882a593Smuzhiyun 			out[5 * 8] = workspace2[2] - workspace2[6];
551*4882a593Smuzhiyun 			out[6 * 8] = workspace2[3] - workspace2[7];
552*4882a593Smuzhiyun 			out[7 * 8] = workspace2[3] + workspace2[7];
553*4882a593Smuzhiyun 
554*4882a593Smuzhiyun 			for (d = 0; d < 8; d++)
555*4882a593Smuzhiyun 				out[8 * d] >>= 6;
556*4882a593Smuzhiyun 		} else {
557*4882a593Smuzhiyun 			int d;
558*4882a593Smuzhiyun 
559*4882a593Smuzhiyun 			out[0 * 8] = workspace2[0] + workspace2[4];
560*4882a593Smuzhiyun 			out[1 * 8] = workspace2[0] - workspace2[4];
561*4882a593Smuzhiyun 			out[2 * 8] = workspace2[1] - workspace2[5];
562*4882a593Smuzhiyun 			out[3 * 8] = workspace2[1] + workspace2[5];
563*4882a593Smuzhiyun 			out[4 * 8] = workspace2[2] + workspace2[6];
564*4882a593Smuzhiyun 			out[5 * 8] = workspace2[2] - workspace2[6];
565*4882a593Smuzhiyun 			out[6 * 8] = workspace2[3] - workspace2[7];
566*4882a593Smuzhiyun 			out[7 * 8] = workspace2[3] + workspace2[7];
567*4882a593Smuzhiyun 
568*4882a593Smuzhiyun 			for (d = 0; d < 8; d++) {
569*4882a593Smuzhiyun 				out[8 * d] >>= 6;
570*4882a593Smuzhiyun 				out[8 * d] += 128;
571*4882a593Smuzhiyun 			}
572*4882a593Smuzhiyun 		}
573*4882a593Smuzhiyun 	}
574*4882a593Smuzhiyun }
575*4882a593Smuzhiyun 
/*
 * fill_encoder_block() - gather an 8x8 block of 8-bit samples
 * @input:	first sample of the block
 * @dst:	receives the 64 samples, widened to s16, in row-major order
 * @stride:	offset between the starts of two consecutive input rows
 * @input_step:	offset between two horizontally adjacent samples
 */
static void fill_encoder_block(const u8 *input, s16 *dst,
			       unsigned int stride, unsigned int input_step)
{
	/* what remains of a stride after stepping over 8 samples */
	const unsigned int row_gap = stride - 8 * input_step;
	s16 *const block_end = dst + 8 * 8;

	while (dst < block_end) {
		s16 *const row_end = dst + 8;

		while (dst < row_end) {
			*dst++ = *input;
			input += input_step;
		}
		input += row_gap;
	}
}
587*4882a593Smuzhiyun 
var_intra(const s16 * input)588*4882a593Smuzhiyun static int var_intra(const s16 *input)
589*4882a593Smuzhiyun {
590*4882a593Smuzhiyun 	int32_t mean = 0;
591*4882a593Smuzhiyun 	int32_t ret = 0;
592*4882a593Smuzhiyun 	const s16 *tmp = input;
593*4882a593Smuzhiyun 	int i;
594*4882a593Smuzhiyun 
595*4882a593Smuzhiyun 	for (i = 0; i < 8 * 8; i++, tmp++)
596*4882a593Smuzhiyun 		mean += *tmp;
597*4882a593Smuzhiyun 	mean /= 64;
598*4882a593Smuzhiyun 	tmp = input;
599*4882a593Smuzhiyun 	for (i = 0; i < 8 * 8; i++, tmp++)
600*4882a593Smuzhiyun 		ret += (*tmp - mean) < 0 ? -(*tmp - mean) : (*tmp - mean);
601*4882a593Smuzhiyun 	return ret;
602*4882a593Smuzhiyun }
603*4882a593Smuzhiyun 
var_inter(const s16 * old,const s16 * new)604*4882a593Smuzhiyun static int var_inter(const s16 *old, const s16 *new)
605*4882a593Smuzhiyun {
606*4882a593Smuzhiyun 	int32_t ret = 0;
607*4882a593Smuzhiyun 	int i;
608*4882a593Smuzhiyun 
609*4882a593Smuzhiyun 	for (i = 0; i < 8 * 8; i++, old++, new++)
610*4882a593Smuzhiyun 		ret += (*old - *new) < 0 ? -(*old - *new) : (*old - *new);
611*4882a593Smuzhiyun 	return ret;
612*4882a593Smuzhiyun }
613*4882a593Smuzhiyun 
614*4882a593Smuzhiyun static noinline_for_stack int
decide_blocktype(const u8 * cur,const u8 * reference,s16 * deltablock,unsigned int stride,unsigned int input_step)615*4882a593Smuzhiyun decide_blocktype(const u8 *cur, const u8 *reference, s16 *deltablock,
616*4882a593Smuzhiyun 		 unsigned int stride, unsigned int input_step)
617*4882a593Smuzhiyun {
618*4882a593Smuzhiyun 	s16 tmp[64];
619*4882a593Smuzhiyun 	s16 old[64];
620*4882a593Smuzhiyun 	s16 *work = tmp;
621*4882a593Smuzhiyun 	unsigned int k, l;
622*4882a593Smuzhiyun 	int vari;
623*4882a593Smuzhiyun 	int vard;
624*4882a593Smuzhiyun 
625*4882a593Smuzhiyun 	fill_encoder_block(cur, tmp, stride, input_step);
626*4882a593Smuzhiyun 	fill_encoder_block(reference, old, 8, 1);
627*4882a593Smuzhiyun 	vari = var_intra(tmp);
628*4882a593Smuzhiyun 
629*4882a593Smuzhiyun 	for (k = 0; k < 8; k++) {
630*4882a593Smuzhiyun 		for (l = 0; l < 8; l++) {
631*4882a593Smuzhiyun 			*deltablock = *work - *reference;
632*4882a593Smuzhiyun 			deltablock++;
633*4882a593Smuzhiyun 			work++;
634*4882a593Smuzhiyun 			reference++;
635*4882a593Smuzhiyun 		}
636*4882a593Smuzhiyun 	}
637*4882a593Smuzhiyun 	deltablock -= 64;
638*4882a593Smuzhiyun 	vard = var_inter(old, tmp);
639*4882a593Smuzhiyun 	return vari <= vard ? IBLOCK : PBLOCK;
640*4882a593Smuzhiyun }
641*4882a593Smuzhiyun 
/*
 * fill_decoder_block() - store a decoded 8x8 block as 8-bit samples
 * @dst:	first destination sample
 * @input:	the 64 decoded values in row-major order
 * @stride:	offset between the starts of two consecutive rows of @dst
 * @dst_step:	offset between two horizontally adjacent samples of @dst
 *
 * Values are clamped to the range 0..255 on the way out.
 */
static void fill_decoder_block(u8 *dst, const s16 *input, int stride,
			       unsigned int dst_step)
{
	/* what remains of a stride after stepping over 8 samples */
	const unsigned int row_gap = stride - (8 * dst_step);
	int row, col;

	for (row = 0; row < 8; row++) {
		for (col = 0; col < 8; col++) {
			const s16 v = *input++;

			if (v < 0)
				*dst = 0;
			else if (v > 255)
				*dst = 255;
			else
				*dst = v;
			dst += dst_step;
		}
		dst += row_gap;
	}
}
659*4882a593Smuzhiyun 
/*
 * add_deltas() - turn decoded differences back into sample values
 * @deltas:	64 differences in row-major order, replaced by the result
 * @ref:	first reference sample
 * @stride:	offset between the starts of two consecutive rows of @ref
 * @ref_step:	offset between two horizontally adjacent samples of @ref
 *
 * Adds the reference sample to each difference and clamps the sum to
 * the range 0..255.
 */
static void add_deltas(s16 *deltas, const u8 *ref, int stride,
		       unsigned int ref_step)
{
	/* what remains of a stride after stepping over 8 samples */
	const unsigned int row_gap = stride - (8 * ref_step);
	int row, col;

	for (row = 0; row < 8; row++) {
		for (col = 0; col < 8; col++, deltas++) {
			/* the sum is kept in 16 bits, as the in-place add did */
			s16 sum = *deltas + *ref;

			ref += ref_step;
			/*
			 * Due to quantizing, it might be possible that the
			 * decoded coefficients are slightly out of range
			 */
			if (sum < 0)
				sum = 0;
			else if (sum > 255)
				sum = 255;
			*deltas = sum;
		}
		ref += row_gap;
	}
}
682*4882a593Smuzhiyun 
/*
 * Encode a single plane, 8x8 block by 8x8 block, into the output stream.
 *
 * @input:         first sample of the plane to encode
 * @refp:          reference data, stored as consecutive 8x8 blocks of 64
 *                 bytes; rewritten with the reconstructed blocks unless
 *                 @next_is_intra is set
 * @rlco:          in/out: current write position in the output stream
 * @rlco_max:      once the write position reaches this address the plane
 *                 is stored as raw samples instead
 * @cf:            compressed frame state providing the scratch buffers and
 *                 the quantization parameters
 * @height/@width: plane dimensions; both are rounded up to a multiple of 8
 * @stride:        distance in bytes between two input rows
 * @input_step:    distance in bytes between two input samples in a row
 * @is_intra:      when set every block is coded as IBLOCK
 * @next_is_intra: when set the reference data is not updated
 *
 * Returns a mask of FWHT_FRAME_PCODED (at least one block was inter coded)
 * and FWHT_FRAME_UNENCODED (the plane was stored as raw samples, in which
 * case FWHT_FRAME_PCODED is cleared).
 */
static u32 encode_plane(u8 *input, u8 *refp, __be16 **rlco, __be16 *rlco_max,
			struct fwht_cframe *cf, u32 height, u32 width,
			u32 stride, unsigned int input_step,
			bool is_intra, bool next_is_intra)
{
	u8 * const plane_start = input;
	__be16 * const out_start = *rlco;
	const __be16 pframe_bit = htons(PFRAME_BIT);
	s16 deltablock[64];
	unsigned int prev_size = 0;
	unsigned int bx, by;
	u32 encoding = 0;
	u8 *raw;

	width = round_up(width, 8);
	height = round_up(height, 8);

	for (by = 0; by < height / 8; by++) {
		input = plane_start + by * 8 * stride;

		for (bx = 0; bx < width / 8; bx++) {
			/* The first frame is always intra coded. */
			int blocktype = is_intra ? IBLOCK :
				decide_blocktype(input, refp, deltablock,
						 stride, input_step);
			bool merged = false;
			unsigned int size;

			if (blocktype == IBLOCK) {
				fwht(input, cf->coeffs, stride, input_step, 1);
				quantize_intra(cf->coeffs, cf->de_coeffs,
					       cf->i_frame_qp);
			} else {
				/* Inter coded: transform the deltas instead. */
				encoding |= FWHT_FRAME_PCODED;
				fwht16(deltablock, cf->coeffs, 8, 0);
				quantize_inter(cf->coeffs, cf->de_coeffs,
					       cf->p_frame_qp);
			}

			/*
			 * Rebuild the block from the dequantized coefficients
			 * and store it as the reference for the next frame.
			 */
			if (!next_is_intra) {
				ifwht(cf->de_coeffs, cf->de_fwht, blocktype);

				if (blocktype == PBLOCK)
					add_deltas(cf->de_fwht, refp, 8, 1);
				fill_decoder_block(refp, cf->de_fwht, 8, 1);
			}

			input += 8 * input_step;
			refp += 8 * 8;

			size = rlc(cf->coeffs, *rlco, blocktype);

			/*
			 * If everything after the header word matches the
			 * previously emitted block, bump the duplicate count
			 * in that block's header rather than emitting the new
			 * block. This is only done when both headers agree on
			 * the P-frame bit and the count is not yet saturated.
			 */
			if (prev_size == size &&
			    !memcmp(*rlco + 1, *rlco - size + 1,
				    2 * size - 2)) {
				__be16 *prev = *rlco - size;
				s16 hdr = ntohs(*prev);

				if (!((*prev ^ **rlco) & pframe_bit) &&
				    (hdr & DUPS_MASK) < DUPS_MASK) {
					*prev = htons(hdr + 2);
					merged = true;
				}
			}
			if (!merged)
				*rlco += size;

			if (*rlco >= rlco_max) {
				encoding |= FWHT_FRAME_UNENCODED;
				goto store_raw;
			}
			prev_size = size;
		}
	}
	return encoding;

store_raw:
	/*
	 * The compressed stream should never contain the magic
	 * header, so when we copy the YUV data we replace 0xff
	 * by 0xfe. Since YUV is limited range such values
	 * shouldn't appear anyway.
	 */
	raw = (u8 *)out_start;
	input = plane_start;
	for (by = 0; by < height; by++, input += stride) {
		const u8 *src = input;

		for (bx = 0; bx < width; bx++, src += input_step)
			*raw++ = (*src == 0xff) ? 0xfe : *src;
	}
	*rlco = (__be16 *)raw;

	return encoding & ~FWHT_FRAME_PCODED;
}
775*4882a593Smuzhiyun 
fwht_encode_frame(struct fwht_raw_frame * frm,struct fwht_raw_frame * ref_frm,struct fwht_cframe * cf,bool is_intra,bool next_is_intra,unsigned int width,unsigned int height,unsigned int stride,unsigned int chroma_stride)776*4882a593Smuzhiyun u32 fwht_encode_frame(struct fwht_raw_frame *frm,
777*4882a593Smuzhiyun 		      struct fwht_raw_frame *ref_frm,
778*4882a593Smuzhiyun 		      struct fwht_cframe *cf,
779*4882a593Smuzhiyun 		      bool is_intra, bool next_is_intra,
780*4882a593Smuzhiyun 		      unsigned int width, unsigned int height,
781*4882a593Smuzhiyun 		      unsigned int stride, unsigned int chroma_stride)
782*4882a593Smuzhiyun {
783*4882a593Smuzhiyun 	unsigned int size = height * width;
784*4882a593Smuzhiyun 	__be16 *rlco = cf->rlc_data;
785*4882a593Smuzhiyun 	__be16 *rlco_max;
786*4882a593Smuzhiyun 	u32 encoding;
787*4882a593Smuzhiyun 
788*4882a593Smuzhiyun 	rlco_max = rlco + size / 2 - 256;
789*4882a593Smuzhiyun 	encoding = encode_plane(frm->luma, ref_frm->luma, &rlco, rlco_max, cf,
790*4882a593Smuzhiyun 				height, width, stride,
791*4882a593Smuzhiyun 				frm->luma_alpha_step, is_intra, next_is_intra);
792*4882a593Smuzhiyun 	if (encoding & FWHT_FRAME_UNENCODED)
793*4882a593Smuzhiyun 		encoding |= FWHT_LUMA_UNENCODED;
794*4882a593Smuzhiyun 	encoding &= ~FWHT_FRAME_UNENCODED;
795*4882a593Smuzhiyun 
796*4882a593Smuzhiyun 	if (frm->components_num >= 3) {
797*4882a593Smuzhiyun 		u32 chroma_h = height / frm->height_div;
798*4882a593Smuzhiyun 		u32 chroma_w = width / frm->width_div;
799*4882a593Smuzhiyun 		unsigned int chroma_size = chroma_h * chroma_w;
800*4882a593Smuzhiyun 
801*4882a593Smuzhiyun 		rlco_max = rlco + chroma_size / 2 - 256;
802*4882a593Smuzhiyun 		encoding |= encode_plane(frm->cb, ref_frm->cb, &rlco, rlco_max,
803*4882a593Smuzhiyun 					 cf, chroma_h, chroma_w,
804*4882a593Smuzhiyun 					 chroma_stride, frm->chroma_step,
805*4882a593Smuzhiyun 					 is_intra, next_is_intra);
806*4882a593Smuzhiyun 		if (encoding & FWHT_FRAME_UNENCODED)
807*4882a593Smuzhiyun 			encoding |= FWHT_CB_UNENCODED;
808*4882a593Smuzhiyun 		encoding &= ~FWHT_FRAME_UNENCODED;
809*4882a593Smuzhiyun 		rlco_max = rlco + chroma_size / 2 - 256;
810*4882a593Smuzhiyun 		encoding |= encode_plane(frm->cr, ref_frm->cr, &rlco, rlco_max,
811*4882a593Smuzhiyun 					 cf, chroma_h, chroma_w,
812*4882a593Smuzhiyun 					 chroma_stride, frm->chroma_step,
813*4882a593Smuzhiyun 					 is_intra, next_is_intra);
814*4882a593Smuzhiyun 		if (encoding & FWHT_FRAME_UNENCODED)
815*4882a593Smuzhiyun 			encoding |= FWHT_CR_UNENCODED;
816*4882a593Smuzhiyun 		encoding &= ~FWHT_FRAME_UNENCODED;
817*4882a593Smuzhiyun 	}
818*4882a593Smuzhiyun 
819*4882a593Smuzhiyun 	if (frm->components_num == 4) {
820*4882a593Smuzhiyun 		rlco_max = rlco + size / 2 - 256;
821*4882a593Smuzhiyun 		encoding |= encode_plane(frm->alpha, ref_frm->alpha, &rlco,
822*4882a593Smuzhiyun 					 rlco_max, cf, height, width,
823*4882a593Smuzhiyun 					 stride, frm->luma_alpha_step,
824*4882a593Smuzhiyun 					 is_intra, next_is_intra);
825*4882a593Smuzhiyun 		if (encoding & FWHT_FRAME_UNENCODED)
826*4882a593Smuzhiyun 			encoding |= FWHT_ALPHA_UNENCODED;
827*4882a593Smuzhiyun 		encoding &= ~FWHT_FRAME_UNENCODED;
828*4882a593Smuzhiyun 	}
829*4882a593Smuzhiyun 
830*4882a593Smuzhiyun 	cf->size = (rlco - cf->rlc_data) * sizeof(*rlco);
831*4882a593Smuzhiyun 	return encoding;
832*4882a593Smuzhiyun }
833*4882a593Smuzhiyun 
/*
 * Decode a single plane from the compressed stream.
 *
 * @cf:              compressed frame state providing the scratch buffers
 * @rlco:            in/out: current read position in the compressed stream
 * @height/@width:   plane dimensions; both are rounded up to a multiple
 *                   of 8
 * @ref:             reference plane, or NULL when there is none, in which
 *                   case every block is treated as intra coded
 * @ref_stride:      distance in bytes between two reference rows
 * @ref_step:        distance in bytes between two reference samples
 * @dst:             destination plane
 * @dst_stride:      distance in bytes between two destination rows
 * @dst_step:        distance in bytes between two destination samples
 * @uncompressed:    the plane is stored as raw samples, one byte each
 * @end_of_rlco_buf: last valid 16-bit word of the compressed stream
 *
 * Returns false when the stream is too short for an uncompressed plane or
 * when derlc() reports an overflow, true otherwise.
 */
static bool decode_plane(struct fwht_cframe *cf, const __be16 **rlco,
			 u32 height, u32 width, const u8 *ref, u32 ref_stride,
			 unsigned int ref_step, u8 *dst,
			 unsigned int dst_stride, unsigned int dst_step,
			 bool uncompressed, const __be16 *end_of_rlco_buf)
{
	unsigned int copies = 0;
	s16 copy[8 * 8];
	u16 stat = 0;
	unsigned int i, j;
	bool is_intra = !ref;

	width = round_up(width, 8);
	height = round_up(height, 8);

	if (uncompressed) {
		if (end_of_rlco_buf + 1 < *rlco + width * height / 2)
			return false;
		for (j = 0; j < height; j++) {
			const u8 *src = (const u8 *)*rlco;

			/*
			 * The raw samples are packed back to back in the
			 * stream, but the destination samples are dst_step
			 * bytes apart, exactly as for compressed blocks.
			 * A plain memcpy() is only correct for a step of 1.
			 */
			if (dst_step == 1) {
				memcpy(dst, src, width);
			} else {
				for (i = 0; i < width; i++)
					dst[i * dst_step] = src[i];
			}
			dst += dst_stride;
			*rlco += width / 2;
		}
		return true;
	}

	/*
	 * When decoding each macroblock the rlco pointer will be increased
	 * by 65 * 2 bytes worst-case.
	 * To avoid overflow the buffer has to be 65/64th of the actual raw
	 * image size, just in case someone feeds it malicious data.
	 */
	for (j = 0; j < height / 8; j++) {
		for (i = 0; i < width / 8; i++) {
			/* No pointer arithmetic on a NULL reference plane. */
			const u8 *refp = is_intra ? NULL :
				ref + j * 8 * ref_stride + i * 8 * ref_step;
			u8 *dstp = dst + j * 8 * dst_stride + i * 8 * dst_step;

			if (copies) {
				/*
				 * Repeat of the previous block: reuse its
				 * saved output and its header word.
				 */
				memcpy(cf->de_fwht, copy, sizeof(copy));
				copies--;
			} else {
				bool inter;

				stat = derlc(rlco, cf->coeffs, end_of_rlco_buf);
				if (stat & OVERFLOW_BIT)
					return false;

				inter = (stat & PFRAME_BIT) && !is_intra;
				if (inter)
					dequantize_inter(cf->coeffs);
				else
					dequantize_intra(cf->coeffs);

				ifwht(cf->coeffs, cf->de_fwht,
				      inter ? PBLOCK : IBLOCK);

				/*
				 * Save the block before the reference samples
				 * are added, so that each repeat can be
				 * combined with its own reference block.
				 */
				copies = (stat & DUPS_MASK) >> 1;
				if (copies)
					memcpy(copy, cf->de_fwht, sizeof(copy));
			}

			if ((stat & PFRAME_BIT) && !is_intra)
				add_deltas(cf->de_fwht, refp,
					   ref_stride, ref_step);
			fill_decoder_block(dstp, cf->de_fwht, dst_stride,
					   dst_step);
		}
	}
	return true;
}
908*4882a593Smuzhiyun 
fwht_decode_frame(struct fwht_cframe * cf,u32 hdr_flags,unsigned int components_num,unsigned int width,unsigned int height,const struct fwht_raw_frame * ref,unsigned int ref_stride,unsigned int ref_chroma_stride,struct fwht_raw_frame * dst,unsigned int dst_stride,unsigned int dst_chroma_stride)909*4882a593Smuzhiyun bool fwht_decode_frame(struct fwht_cframe *cf, u32 hdr_flags,
910*4882a593Smuzhiyun 		       unsigned int components_num, unsigned int width,
911*4882a593Smuzhiyun 		       unsigned int height, const struct fwht_raw_frame *ref,
912*4882a593Smuzhiyun 		       unsigned int ref_stride, unsigned int ref_chroma_stride,
913*4882a593Smuzhiyun 		       struct fwht_raw_frame *dst, unsigned int dst_stride,
914*4882a593Smuzhiyun 		       unsigned int dst_chroma_stride)
915*4882a593Smuzhiyun {
916*4882a593Smuzhiyun 	const __be16 *rlco = cf->rlc_data;
917*4882a593Smuzhiyun 	const __be16 *end_of_rlco_buf = cf->rlc_data +
918*4882a593Smuzhiyun 			(cf->size / sizeof(*rlco)) - 1;
919*4882a593Smuzhiyun 
920*4882a593Smuzhiyun 	if (!decode_plane(cf, &rlco, height, width, ref->luma, ref_stride,
921*4882a593Smuzhiyun 			  ref->luma_alpha_step, dst->luma, dst_stride,
922*4882a593Smuzhiyun 			  dst->luma_alpha_step,
923*4882a593Smuzhiyun 			  hdr_flags & FWHT_FL_LUMA_IS_UNCOMPRESSED,
924*4882a593Smuzhiyun 			  end_of_rlco_buf))
925*4882a593Smuzhiyun 		return false;
926*4882a593Smuzhiyun 
927*4882a593Smuzhiyun 	if (components_num >= 3) {
928*4882a593Smuzhiyun 		u32 h = height;
929*4882a593Smuzhiyun 		u32 w = width;
930*4882a593Smuzhiyun 
931*4882a593Smuzhiyun 		if (!(hdr_flags & FWHT_FL_CHROMA_FULL_HEIGHT))
932*4882a593Smuzhiyun 			h /= 2;
933*4882a593Smuzhiyun 		if (!(hdr_flags & FWHT_FL_CHROMA_FULL_WIDTH))
934*4882a593Smuzhiyun 			w /= 2;
935*4882a593Smuzhiyun 
936*4882a593Smuzhiyun 		if (!decode_plane(cf, &rlco, h, w, ref->cb, ref_chroma_stride,
937*4882a593Smuzhiyun 				  ref->chroma_step, dst->cb, dst_chroma_stride,
938*4882a593Smuzhiyun 				  dst->chroma_step,
939*4882a593Smuzhiyun 				  hdr_flags & FWHT_FL_CB_IS_UNCOMPRESSED,
940*4882a593Smuzhiyun 				  end_of_rlco_buf))
941*4882a593Smuzhiyun 			return false;
942*4882a593Smuzhiyun 		if (!decode_plane(cf, &rlco, h, w, ref->cr, ref_chroma_stride,
943*4882a593Smuzhiyun 				  ref->chroma_step, dst->cr, dst_chroma_stride,
944*4882a593Smuzhiyun 				  dst->chroma_step,
945*4882a593Smuzhiyun 				  hdr_flags & FWHT_FL_CR_IS_UNCOMPRESSED,
946*4882a593Smuzhiyun 				  end_of_rlco_buf))
947*4882a593Smuzhiyun 			return false;
948*4882a593Smuzhiyun 	}
949*4882a593Smuzhiyun 
950*4882a593Smuzhiyun 	if (components_num == 4)
951*4882a593Smuzhiyun 		if (!decode_plane(cf, &rlco, height, width, ref->alpha, ref_stride,
952*4882a593Smuzhiyun 				  ref->luma_alpha_step, dst->alpha, dst_stride,
953*4882a593Smuzhiyun 				  dst->luma_alpha_step,
954*4882a593Smuzhiyun 				  hdr_flags & FWHT_FL_ALPHA_IS_UNCOMPRESSED,
955*4882a593Smuzhiyun 				  end_of_rlco_buf))
956*4882a593Smuzhiyun 			return false;
957*4882a593Smuzhiyun 	return true;
958*4882a593Smuzhiyun }
959