xref: /OK3568_Linux_fs/kernel/drivers/gpu/drm/vc4/vc4_validate_shaders.c (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun /*
2*4882a593Smuzhiyun  * Copyright © 2014 Broadcom
3*4882a593Smuzhiyun  *
4*4882a593Smuzhiyun  * Permission is hereby granted, free of charge, to any person obtaining a
5*4882a593Smuzhiyun  * copy of this software and associated documentation files (the "Software"),
6*4882a593Smuzhiyun  * to deal in the Software without restriction, including without limitation
7*4882a593Smuzhiyun  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8*4882a593Smuzhiyun  * and/or sell copies of the Software, and to permit persons to whom the
9*4882a593Smuzhiyun  * Software is furnished to do so, subject to the following conditions:
10*4882a593Smuzhiyun  *
11*4882a593Smuzhiyun  * The above copyright notice and this permission notice (including the next
12*4882a593Smuzhiyun  * paragraph) shall be included in all copies or substantial portions of the
13*4882a593Smuzhiyun  * Software.
14*4882a593Smuzhiyun  *
15*4882a593Smuzhiyun  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16*4882a593Smuzhiyun  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17*4882a593Smuzhiyun  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18*4882a593Smuzhiyun  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19*4882a593Smuzhiyun  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20*4882a593Smuzhiyun  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21*4882a593Smuzhiyun  * IN THE SOFTWARE.
22*4882a593Smuzhiyun  */
23*4882a593Smuzhiyun 
/**
 * DOC: Shader validator for VC4.
 *
 * Since the VC4 has no IOMMU between it and system memory, a user
 * with access to execute shaders could escalate privilege by
 * overwriting system memory (using the VPM write address register in
 * the general-purpose DMA mode) or reading system memory it shouldn't
 * (reading it as a texture, uniform data, or direct-addressed TMU
 * lookup).
 *
 * The shader validator walks over a shader's BO, ensuring that its
 * accesses are appropriately bounded, and recording where texture
 * accesses are made so that we can do relocations for them in the
 * uniform stream.
 *
 * Shader BOs are immutable for their lifetimes (enforced by not
 * allowing mmaps, GEM prime export, or rendering to them from a CL), so
 * this validation is only performed at BO creation time.
 */
43*4882a593Smuzhiyun 
44*4882a593Smuzhiyun #include "vc4_drv.h"
45*4882a593Smuzhiyun #include "vc4_qpu_defines.h"
46*4882a593Smuzhiyun 
47*4882a593Smuzhiyun #define LIVE_REG_COUNT (32 + 32 + 4)
48*4882a593Smuzhiyun 
49*4882a593Smuzhiyun struct vc4_shader_validation_state {
50*4882a593Smuzhiyun 	/* Current IP being validated. */
51*4882a593Smuzhiyun 	uint32_t ip;
52*4882a593Smuzhiyun 
53*4882a593Smuzhiyun 	/* IP at the end of the BO, do not read shader[max_ip] */
54*4882a593Smuzhiyun 	uint32_t max_ip;
55*4882a593Smuzhiyun 
56*4882a593Smuzhiyun 	uint64_t *shader;
57*4882a593Smuzhiyun 
58*4882a593Smuzhiyun 	struct vc4_texture_sample_info tmu_setup[2];
59*4882a593Smuzhiyun 	int tmu_write_count[2];
60*4882a593Smuzhiyun 
61*4882a593Smuzhiyun 	/* For registers that were last written to by a MIN instruction with
62*4882a593Smuzhiyun 	 * one argument being a uniform, the address of the uniform.
63*4882a593Smuzhiyun 	 * Otherwise, ~0.
64*4882a593Smuzhiyun 	 *
65*4882a593Smuzhiyun 	 * This is used for the validation of direct address memory reads.
66*4882a593Smuzhiyun 	 */
67*4882a593Smuzhiyun 	uint32_t live_min_clamp_offsets[LIVE_REG_COUNT];
68*4882a593Smuzhiyun 	bool live_max_clamp_regs[LIVE_REG_COUNT];
69*4882a593Smuzhiyun 	uint32_t live_immediates[LIVE_REG_COUNT];
70*4882a593Smuzhiyun 
71*4882a593Smuzhiyun 	/* Bitfield of which IPs are used as branch targets.
72*4882a593Smuzhiyun 	 *
73*4882a593Smuzhiyun 	 * Used for validation that the uniform stream is updated at the right
74*4882a593Smuzhiyun 	 * points and clearing the texturing/clamping state.
75*4882a593Smuzhiyun 	 */
76*4882a593Smuzhiyun 	unsigned long *branch_targets;
77*4882a593Smuzhiyun 
78*4882a593Smuzhiyun 	/* Set when entering a basic block, and cleared when the uniform
79*4882a593Smuzhiyun 	 * address update is found.  This is used to make sure that we don't
80*4882a593Smuzhiyun 	 * read uniforms when the address is undefined.
81*4882a593Smuzhiyun 	 */
82*4882a593Smuzhiyun 	bool needs_uniform_address_update;
83*4882a593Smuzhiyun 
84*4882a593Smuzhiyun 	/* Set when we find a backwards branch.  If the branch is backwards,
85*4882a593Smuzhiyun 	 * the taraget is probably doing an address reset to read uniforms,
86*4882a593Smuzhiyun 	 * and so we need to be sure that a uniforms address is present in the
87*4882a593Smuzhiyun 	 * stream, even if the shader didn't need to read uniforms in later
88*4882a593Smuzhiyun 	 * basic blocks.
89*4882a593Smuzhiyun 	 */
90*4882a593Smuzhiyun 	bool needs_uniform_address_for_loop;
91*4882a593Smuzhiyun 
92*4882a593Smuzhiyun 	/* Set when we find an instruction writing the top half of the
93*4882a593Smuzhiyun 	 * register files.  If we allowed writing the unusable regs in
94*4882a593Smuzhiyun 	 * a threaded shader, then the other shader running on our
95*4882a593Smuzhiyun 	 * QPU's clamp validation would be invalid.
96*4882a593Smuzhiyun 	 */
97*4882a593Smuzhiyun 	bool all_registers_used;
98*4882a593Smuzhiyun };
99*4882a593Smuzhiyun 
100*4882a593Smuzhiyun static uint32_t
waddr_to_live_reg_index(uint32_t waddr,bool is_b)101*4882a593Smuzhiyun waddr_to_live_reg_index(uint32_t waddr, bool is_b)
102*4882a593Smuzhiyun {
103*4882a593Smuzhiyun 	if (waddr < 32) {
104*4882a593Smuzhiyun 		if (is_b)
105*4882a593Smuzhiyun 			return 32 + waddr;
106*4882a593Smuzhiyun 		else
107*4882a593Smuzhiyun 			return waddr;
108*4882a593Smuzhiyun 	} else if (waddr <= QPU_W_ACC3) {
109*4882a593Smuzhiyun 		return 64 + waddr - QPU_W_ACC0;
110*4882a593Smuzhiyun 	} else {
111*4882a593Smuzhiyun 		return ~0;
112*4882a593Smuzhiyun 	}
113*4882a593Smuzhiyun }
114*4882a593Smuzhiyun 
115*4882a593Smuzhiyun static uint32_t
raddr_add_a_to_live_reg_index(uint64_t inst)116*4882a593Smuzhiyun raddr_add_a_to_live_reg_index(uint64_t inst)
117*4882a593Smuzhiyun {
118*4882a593Smuzhiyun 	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
119*4882a593Smuzhiyun 	uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
120*4882a593Smuzhiyun 	uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
121*4882a593Smuzhiyun 	uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
122*4882a593Smuzhiyun 
123*4882a593Smuzhiyun 	if (add_a == QPU_MUX_A)
124*4882a593Smuzhiyun 		return raddr_a;
125*4882a593Smuzhiyun 	else if (add_a == QPU_MUX_B && sig != QPU_SIG_SMALL_IMM)
126*4882a593Smuzhiyun 		return 32 + raddr_b;
127*4882a593Smuzhiyun 	else if (add_a <= QPU_MUX_R3)
128*4882a593Smuzhiyun 		return 64 + add_a;
129*4882a593Smuzhiyun 	else
130*4882a593Smuzhiyun 		return ~0;
131*4882a593Smuzhiyun }
132*4882a593Smuzhiyun 
/* Report whether a live-register slot lies in the upper 16 entries of
 * either 32-entry register file (slots 16-31 or 48-63).  Slots from 64
 * upwards, including the ~0 "untracked" marker, are never upper-half.
 */
static bool
live_reg_is_upper_half(uint32_t lri)
{
	if (lri >= 32 + 32)
		return false;

	return (lri % 32) >= 16;
}
139*4882a593Smuzhiyun 
140*4882a593Smuzhiyun static bool
is_tmu_submit(uint32_t waddr)141*4882a593Smuzhiyun is_tmu_submit(uint32_t waddr)
142*4882a593Smuzhiyun {
143*4882a593Smuzhiyun 	return (waddr == QPU_W_TMU0_S ||
144*4882a593Smuzhiyun 		waddr == QPU_W_TMU1_S);
145*4882a593Smuzhiyun }
146*4882a593Smuzhiyun 
147*4882a593Smuzhiyun static bool
is_tmu_write(uint32_t waddr)148*4882a593Smuzhiyun is_tmu_write(uint32_t waddr)
149*4882a593Smuzhiyun {
150*4882a593Smuzhiyun 	return (waddr >= QPU_W_TMU0_S &&
151*4882a593Smuzhiyun 		waddr <= QPU_W_TMU1_B);
152*4882a593Smuzhiyun }
153*4882a593Smuzhiyun 
154*4882a593Smuzhiyun static bool
record_texture_sample(struct vc4_validated_shader_info * validated_shader,struct vc4_shader_validation_state * validation_state,int tmu)155*4882a593Smuzhiyun record_texture_sample(struct vc4_validated_shader_info *validated_shader,
156*4882a593Smuzhiyun 		      struct vc4_shader_validation_state *validation_state,
157*4882a593Smuzhiyun 		      int tmu)
158*4882a593Smuzhiyun {
159*4882a593Smuzhiyun 	uint32_t s = validated_shader->num_texture_samples;
160*4882a593Smuzhiyun 	int i;
161*4882a593Smuzhiyun 	struct vc4_texture_sample_info *temp_samples;
162*4882a593Smuzhiyun 
163*4882a593Smuzhiyun 	temp_samples = krealloc(validated_shader->texture_samples,
164*4882a593Smuzhiyun 				(s + 1) * sizeof(*temp_samples),
165*4882a593Smuzhiyun 				GFP_KERNEL);
166*4882a593Smuzhiyun 	if (!temp_samples)
167*4882a593Smuzhiyun 		return false;
168*4882a593Smuzhiyun 
169*4882a593Smuzhiyun 	memcpy(&temp_samples[s],
170*4882a593Smuzhiyun 	       &validation_state->tmu_setup[tmu],
171*4882a593Smuzhiyun 	       sizeof(*temp_samples));
172*4882a593Smuzhiyun 
173*4882a593Smuzhiyun 	validated_shader->num_texture_samples = s + 1;
174*4882a593Smuzhiyun 	validated_shader->texture_samples = temp_samples;
175*4882a593Smuzhiyun 
176*4882a593Smuzhiyun 	for (i = 0; i < 4; i++)
177*4882a593Smuzhiyun 		validation_state->tmu_setup[tmu].p_offset[i] = ~0;
178*4882a593Smuzhiyun 
179*4882a593Smuzhiyun 	return true;
180*4882a593Smuzhiyun }
181*4882a593Smuzhiyun 
182*4882a593Smuzhiyun static bool
check_tmu_write(struct vc4_validated_shader_info * validated_shader,struct vc4_shader_validation_state * validation_state,bool is_mul)183*4882a593Smuzhiyun check_tmu_write(struct vc4_validated_shader_info *validated_shader,
184*4882a593Smuzhiyun 		struct vc4_shader_validation_state *validation_state,
185*4882a593Smuzhiyun 		bool is_mul)
186*4882a593Smuzhiyun {
187*4882a593Smuzhiyun 	uint64_t inst = validation_state->shader[validation_state->ip];
188*4882a593Smuzhiyun 	uint32_t waddr = (is_mul ?
189*4882a593Smuzhiyun 			  QPU_GET_FIELD(inst, QPU_WADDR_MUL) :
190*4882a593Smuzhiyun 			  QPU_GET_FIELD(inst, QPU_WADDR_ADD));
191*4882a593Smuzhiyun 	uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
192*4882a593Smuzhiyun 	uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
193*4882a593Smuzhiyun 	int tmu = waddr > QPU_W_TMU0_B;
194*4882a593Smuzhiyun 	bool submit = is_tmu_submit(waddr);
195*4882a593Smuzhiyun 	bool is_direct = submit && validation_state->tmu_write_count[tmu] == 0;
196*4882a593Smuzhiyun 	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
197*4882a593Smuzhiyun 
198*4882a593Smuzhiyun 	if (is_direct) {
199*4882a593Smuzhiyun 		uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
200*4882a593Smuzhiyun 		uint32_t clamp_reg, clamp_offset;
201*4882a593Smuzhiyun 
202*4882a593Smuzhiyun 		if (sig == QPU_SIG_SMALL_IMM) {
203*4882a593Smuzhiyun 			DRM_DEBUG("direct TMU read used small immediate\n");
204*4882a593Smuzhiyun 			return false;
205*4882a593Smuzhiyun 		}
206*4882a593Smuzhiyun 
207*4882a593Smuzhiyun 		/* Make sure that this texture load is an add of the base
208*4882a593Smuzhiyun 		 * address of the UBO to a clamped offset within the UBO.
209*4882a593Smuzhiyun 		 */
210*4882a593Smuzhiyun 		if (is_mul ||
211*4882a593Smuzhiyun 		    QPU_GET_FIELD(inst, QPU_OP_ADD) != QPU_A_ADD) {
212*4882a593Smuzhiyun 			DRM_DEBUG("direct TMU load wasn't an add\n");
213*4882a593Smuzhiyun 			return false;
214*4882a593Smuzhiyun 		}
215*4882a593Smuzhiyun 
216*4882a593Smuzhiyun 		/* We assert that the clamped address is the first
217*4882a593Smuzhiyun 		 * argument, and the UBO base address is the second argument.
218*4882a593Smuzhiyun 		 * This is arbitrary, but simpler than supporting flipping the
219*4882a593Smuzhiyun 		 * two either way.
220*4882a593Smuzhiyun 		 */
221*4882a593Smuzhiyun 		clamp_reg = raddr_add_a_to_live_reg_index(inst);
222*4882a593Smuzhiyun 		if (clamp_reg == ~0) {
223*4882a593Smuzhiyun 			DRM_DEBUG("direct TMU load wasn't clamped\n");
224*4882a593Smuzhiyun 			return false;
225*4882a593Smuzhiyun 		}
226*4882a593Smuzhiyun 
227*4882a593Smuzhiyun 		clamp_offset = validation_state->live_min_clamp_offsets[clamp_reg];
228*4882a593Smuzhiyun 		if (clamp_offset == ~0) {
229*4882a593Smuzhiyun 			DRM_DEBUG("direct TMU load wasn't clamped\n");
230*4882a593Smuzhiyun 			return false;
231*4882a593Smuzhiyun 		}
232*4882a593Smuzhiyun 
233*4882a593Smuzhiyun 		/* Store the clamp value's offset in p1 (see reloc_tex() in
234*4882a593Smuzhiyun 		 * vc4_validate.c).
235*4882a593Smuzhiyun 		 */
236*4882a593Smuzhiyun 		validation_state->tmu_setup[tmu].p_offset[1] =
237*4882a593Smuzhiyun 			clamp_offset;
238*4882a593Smuzhiyun 
239*4882a593Smuzhiyun 		if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
240*4882a593Smuzhiyun 		    !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF)) {
241*4882a593Smuzhiyun 			DRM_DEBUG("direct TMU load didn't add to a uniform\n");
242*4882a593Smuzhiyun 			return false;
243*4882a593Smuzhiyun 		}
244*4882a593Smuzhiyun 
245*4882a593Smuzhiyun 		validation_state->tmu_setup[tmu].is_direct = true;
246*4882a593Smuzhiyun 	} else {
247*4882a593Smuzhiyun 		if (raddr_a == QPU_R_UNIF || (sig != QPU_SIG_SMALL_IMM &&
248*4882a593Smuzhiyun 					      raddr_b == QPU_R_UNIF)) {
249*4882a593Smuzhiyun 			DRM_DEBUG("uniform read in the same instruction as "
250*4882a593Smuzhiyun 				  "texture setup.\n");
251*4882a593Smuzhiyun 			return false;
252*4882a593Smuzhiyun 		}
253*4882a593Smuzhiyun 	}
254*4882a593Smuzhiyun 
255*4882a593Smuzhiyun 	if (validation_state->tmu_write_count[tmu] >= 4) {
256*4882a593Smuzhiyun 		DRM_DEBUG("TMU%d got too many parameters before dispatch\n",
257*4882a593Smuzhiyun 			  tmu);
258*4882a593Smuzhiyun 		return false;
259*4882a593Smuzhiyun 	}
260*4882a593Smuzhiyun 	validation_state->tmu_setup[tmu].p_offset[validation_state->tmu_write_count[tmu]] =
261*4882a593Smuzhiyun 		validated_shader->uniforms_size;
262*4882a593Smuzhiyun 	validation_state->tmu_write_count[tmu]++;
263*4882a593Smuzhiyun 	/* Since direct uses a RADDR uniform reference, it will get counted in
264*4882a593Smuzhiyun 	 * check_instruction_reads()
265*4882a593Smuzhiyun 	 */
266*4882a593Smuzhiyun 	if (!is_direct) {
267*4882a593Smuzhiyun 		if (validation_state->needs_uniform_address_update) {
268*4882a593Smuzhiyun 			DRM_DEBUG("Texturing with undefined uniform address\n");
269*4882a593Smuzhiyun 			return false;
270*4882a593Smuzhiyun 		}
271*4882a593Smuzhiyun 
272*4882a593Smuzhiyun 		validated_shader->uniforms_size += 4;
273*4882a593Smuzhiyun 	}
274*4882a593Smuzhiyun 
275*4882a593Smuzhiyun 	if (submit) {
276*4882a593Smuzhiyun 		if (!record_texture_sample(validated_shader,
277*4882a593Smuzhiyun 					   validation_state, tmu)) {
278*4882a593Smuzhiyun 			return false;
279*4882a593Smuzhiyun 		}
280*4882a593Smuzhiyun 
281*4882a593Smuzhiyun 		validation_state->tmu_write_count[tmu] = 0;
282*4882a593Smuzhiyun 	}
283*4882a593Smuzhiyun 
284*4882a593Smuzhiyun 	return true;
285*4882a593Smuzhiyun }
286*4882a593Smuzhiyun 
require_uniform_address_uniform(struct vc4_validated_shader_info * validated_shader)287*4882a593Smuzhiyun static bool require_uniform_address_uniform(struct vc4_validated_shader_info *validated_shader)
288*4882a593Smuzhiyun {
289*4882a593Smuzhiyun 	uint32_t o = validated_shader->num_uniform_addr_offsets;
290*4882a593Smuzhiyun 	uint32_t num_uniforms = validated_shader->uniforms_size / 4;
291*4882a593Smuzhiyun 
292*4882a593Smuzhiyun 	validated_shader->uniform_addr_offsets =
293*4882a593Smuzhiyun 		krealloc(validated_shader->uniform_addr_offsets,
294*4882a593Smuzhiyun 			 (o + 1) *
295*4882a593Smuzhiyun 			 sizeof(*validated_shader->uniform_addr_offsets),
296*4882a593Smuzhiyun 			 GFP_KERNEL);
297*4882a593Smuzhiyun 	if (!validated_shader->uniform_addr_offsets)
298*4882a593Smuzhiyun 		return false;
299*4882a593Smuzhiyun 
300*4882a593Smuzhiyun 	validated_shader->uniform_addr_offsets[o] = num_uniforms;
301*4882a593Smuzhiyun 	validated_shader->num_uniform_addr_offsets++;
302*4882a593Smuzhiyun 
303*4882a593Smuzhiyun 	return true;
304*4882a593Smuzhiyun }
305*4882a593Smuzhiyun 
306*4882a593Smuzhiyun static bool
validate_uniform_address_write(struct vc4_validated_shader_info * validated_shader,struct vc4_shader_validation_state * validation_state,bool is_mul)307*4882a593Smuzhiyun validate_uniform_address_write(struct vc4_validated_shader_info *validated_shader,
308*4882a593Smuzhiyun 			       struct vc4_shader_validation_state *validation_state,
309*4882a593Smuzhiyun 			       bool is_mul)
310*4882a593Smuzhiyun {
311*4882a593Smuzhiyun 	uint64_t inst = validation_state->shader[validation_state->ip];
312*4882a593Smuzhiyun 	u32 add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
313*4882a593Smuzhiyun 	u32 raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
314*4882a593Smuzhiyun 	u32 raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
315*4882a593Smuzhiyun 	u32 add_lri = raddr_add_a_to_live_reg_index(inst);
316*4882a593Smuzhiyun 	/* We want our reset to be pointing at whatever uniform follows the
317*4882a593Smuzhiyun 	 * uniforms base address.
318*4882a593Smuzhiyun 	 */
319*4882a593Smuzhiyun 	u32 expected_offset = validated_shader->uniforms_size + 4;
320*4882a593Smuzhiyun 
321*4882a593Smuzhiyun 	/* We only support absolute uniform address changes, and we
322*4882a593Smuzhiyun 	 * require that they be in the current basic block before any
323*4882a593Smuzhiyun 	 * of its uniform reads.
324*4882a593Smuzhiyun 	 *
325*4882a593Smuzhiyun 	 * One could potentially emit more efficient QPU code, by
326*4882a593Smuzhiyun 	 * noticing that (say) an if statement does uniform control
327*4882a593Smuzhiyun 	 * flow for all threads and that the if reads the same number
328*4882a593Smuzhiyun 	 * of uniforms on each side.  However, this scheme is easy to
329*4882a593Smuzhiyun 	 * validate so it's all we allow for now.
330*4882a593Smuzhiyun 	 */
331*4882a593Smuzhiyun 	switch (QPU_GET_FIELD(inst, QPU_SIG)) {
332*4882a593Smuzhiyun 	case QPU_SIG_NONE:
333*4882a593Smuzhiyun 	case QPU_SIG_SCOREBOARD_UNLOCK:
334*4882a593Smuzhiyun 	case QPU_SIG_COLOR_LOAD:
335*4882a593Smuzhiyun 	case QPU_SIG_LOAD_TMU0:
336*4882a593Smuzhiyun 	case QPU_SIG_LOAD_TMU1:
337*4882a593Smuzhiyun 		break;
338*4882a593Smuzhiyun 	default:
339*4882a593Smuzhiyun 		DRM_DEBUG("uniforms address change must be "
340*4882a593Smuzhiyun 			  "normal math\n");
341*4882a593Smuzhiyun 		return false;
342*4882a593Smuzhiyun 	}
343*4882a593Smuzhiyun 
344*4882a593Smuzhiyun 	if (is_mul || QPU_GET_FIELD(inst, QPU_OP_ADD) != QPU_A_ADD) {
345*4882a593Smuzhiyun 		DRM_DEBUG("Uniform address reset must be an ADD.\n");
346*4882a593Smuzhiyun 		return false;
347*4882a593Smuzhiyun 	}
348*4882a593Smuzhiyun 
349*4882a593Smuzhiyun 	if (QPU_GET_FIELD(inst, QPU_COND_ADD) != QPU_COND_ALWAYS) {
350*4882a593Smuzhiyun 		DRM_DEBUG("Uniform address reset must be unconditional.\n");
351*4882a593Smuzhiyun 		return false;
352*4882a593Smuzhiyun 	}
353*4882a593Smuzhiyun 
354*4882a593Smuzhiyun 	if (QPU_GET_FIELD(inst, QPU_PACK) != QPU_PACK_A_NOP &&
355*4882a593Smuzhiyun 	    !(inst & QPU_PM)) {
356*4882a593Smuzhiyun 		DRM_DEBUG("No packing allowed on uniforms reset\n");
357*4882a593Smuzhiyun 		return false;
358*4882a593Smuzhiyun 	}
359*4882a593Smuzhiyun 
360*4882a593Smuzhiyun 	if (add_lri == -1) {
361*4882a593Smuzhiyun 		DRM_DEBUG("First argument of uniform address write must be "
362*4882a593Smuzhiyun 			  "an immediate value.\n");
363*4882a593Smuzhiyun 		return false;
364*4882a593Smuzhiyun 	}
365*4882a593Smuzhiyun 
366*4882a593Smuzhiyun 	if (validation_state->live_immediates[add_lri] != expected_offset) {
367*4882a593Smuzhiyun 		DRM_DEBUG("Resetting uniforms with offset %db instead of %db\n",
368*4882a593Smuzhiyun 			  validation_state->live_immediates[add_lri],
369*4882a593Smuzhiyun 			  expected_offset);
370*4882a593Smuzhiyun 		return false;
371*4882a593Smuzhiyun 	}
372*4882a593Smuzhiyun 
373*4882a593Smuzhiyun 	if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
374*4882a593Smuzhiyun 	    !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF)) {
375*4882a593Smuzhiyun 		DRM_DEBUG("Second argument of uniform address write must be "
376*4882a593Smuzhiyun 			  "a uniform.\n");
377*4882a593Smuzhiyun 		return false;
378*4882a593Smuzhiyun 	}
379*4882a593Smuzhiyun 
380*4882a593Smuzhiyun 	validation_state->needs_uniform_address_update = false;
381*4882a593Smuzhiyun 	validation_state->needs_uniform_address_for_loop = false;
382*4882a593Smuzhiyun 	return require_uniform_address_uniform(validated_shader);
383*4882a593Smuzhiyun }
384*4882a593Smuzhiyun 
385*4882a593Smuzhiyun static bool
check_reg_write(struct vc4_validated_shader_info * validated_shader,struct vc4_shader_validation_state * validation_state,bool is_mul)386*4882a593Smuzhiyun check_reg_write(struct vc4_validated_shader_info *validated_shader,
387*4882a593Smuzhiyun 		struct vc4_shader_validation_state *validation_state,
388*4882a593Smuzhiyun 		bool is_mul)
389*4882a593Smuzhiyun {
390*4882a593Smuzhiyun 	uint64_t inst = validation_state->shader[validation_state->ip];
391*4882a593Smuzhiyun 	uint32_t waddr = (is_mul ?
392*4882a593Smuzhiyun 			  QPU_GET_FIELD(inst, QPU_WADDR_MUL) :
393*4882a593Smuzhiyun 			  QPU_GET_FIELD(inst, QPU_WADDR_ADD));
394*4882a593Smuzhiyun 	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
395*4882a593Smuzhiyun 	bool ws = inst & QPU_WS;
396*4882a593Smuzhiyun 	bool is_b = is_mul ^ ws;
397*4882a593Smuzhiyun 	u32 lri = waddr_to_live_reg_index(waddr, is_b);
398*4882a593Smuzhiyun 
399*4882a593Smuzhiyun 	if (lri != -1) {
400*4882a593Smuzhiyun 		uint32_t cond_add = QPU_GET_FIELD(inst, QPU_COND_ADD);
401*4882a593Smuzhiyun 		uint32_t cond_mul = QPU_GET_FIELD(inst, QPU_COND_MUL);
402*4882a593Smuzhiyun 
403*4882a593Smuzhiyun 		if (sig == QPU_SIG_LOAD_IMM &&
404*4882a593Smuzhiyun 		    QPU_GET_FIELD(inst, QPU_PACK) == QPU_PACK_A_NOP &&
405*4882a593Smuzhiyun 		    ((is_mul && cond_mul == QPU_COND_ALWAYS) ||
406*4882a593Smuzhiyun 		     (!is_mul && cond_add == QPU_COND_ALWAYS))) {
407*4882a593Smuzhiyun 			validation_state->live_immediates[lri] =
408*4882a593Smuzhiyun 				QPU_GET_FIELD(inst, QPU_LOAD_IMM);
409*4882a593Smuzhiyun 		} else {
410*4882a593Smuzhiyun 			validation_state->live_immediates[lri] = ~0;
411*4882a593Smuzhiyun 		}
412*4882a593Smuzhiyun 
413*4882a593Smuzhiyun 		if (live_reg_is_upper_half(lri))
414*4882a593Smuzhiyun 			validation_state->all_registers_used = true;
415*4882a593Smuzhiyun 	}
416*4882a593Smuzhiyun 
417*4882a593Smuzhiyun 	switch (waddr) {
418*4882a593Smuzhiyun 	case QPU_W_UNIFORMS_ADDRESS:
419*4882a593Smuzhiyun 		if (is_b) {
420*4882a593Smuzhiyun 			DRM_DEBUG("relative uniforms address change "
421*4882a593Smuzhiyun 				  "unsupported\n");
422*4882a593Smuzhiyun 			return false;
423*4882a593Smuzhiyun 		}
424*4882a593Smuzhiyun 
425*4882a593Smuzhiyun 		return validate_uniform_address_write(validated_shader,
426*4882a593Smuzhiyun 						      validation_state,
427*4882a593Smuzhiyun 						      is_mul);
428*4882a593Smuzhiyun 
429*4882a593Smuzhiyun 	case QPU_W_TLB_COLOR_MS:
430*4882a593Smuzhiyun 	case QPU_W_TLB_COLOR_ALL:
431*4882a593Smuzhiyun 	case QPU_W_TLB_Z:
432*4882a593Smuzhiyun 		/* These only interact with the tile buffer, not main memory,
433*4882a593Smuzhiyun 		 * so they're safe.
434*4882a593Smuzhiyun 		 */
435*4882a593Smuzhiyun 		return true;
436*4882a593Smuzhiyun 
437*4882a593Smuzhiyun 	case QPU_W_TMU0_S:
438*4882a593Smuzhiyun 	case QPU_W_TMU0_T:
439*4882a593Smuzhiyun 	case QPU_W_TMU0_R:
440*4882a593Smuzhiyun 	case QPU_W_TMU0_B:
441*4882a593Smuzhiyun 	case QPU_W_TMU1_S:
442*4882a593Smuzhiyun 	case QPU_W_TMU1_T:
443*4882a593Smuzhiyun 	case QPU_W_TMU1_R:
444*4882a593Smuzhiyun 	case QPU_W_TMU1_B:
445*4882a593Smuzhiyun 		return check_tmu_write(validated_shader, validation_state,
446*4882a593Smuzhiyun 				       is_mul);
447*4882a593Smuzhiyun 
448*4882a593Smuzhiyun 	case QPU_W_HOST_INT:
449*4882a593Smuzhiyun 	case QPU_W_TMU_NOSWAP:
450*4882a593Smuzhiyun 	case QPU_W_TLB_ALPHA_MASK:
451*4882a593Smuzhiyun 	case QPU_W_MUTEX_RELEASE:
452*4882a593Smuzhiyun 		/* XXX: I haven't thought about these, so don't support them
453*4882a593Smuzhiyun 		 * for now.
454*4882a593Smuzhiyun 		 */
455*4882a593Smuzhiyun 		DRM_DEBUG("Unsupported waddr %d\n", waddr);
456*4882a593Smuzhiyun 		return false;
457*4882a593Smuzhiyun 
458*4882a593Smuzhiyun 	case QPU_W_VPM_ADDR:
459*4882a593Smuzhiyun 		DRM_DEBUG("General VPM DMA unsupported\n");
460*4882a593Smuzhiyun 		return false;
461*4882a593Smuzhiyun 
462*4882a593Smuzhiyun 	case QPU_W_VPM:
463*4882a593Smuzhiyun 	case QPU_W_VPMVCD_SETUP:
464*4882a593Smuzhiyun 		/* We allow VPM setup in general, even including VPM DMA
465*4882a593Smuzhiyun 		 * configuration setup, because the (unsafe) DMA can only be
466*4882a593Smuzhiyun 		 * triggered by QPU_W_VPM_ADDR writes.
467*4882a593Smuzhiyun 		 */
468*4882a593Smuzhiyun 		return true;
469*4882a593Smuzhiyun 
470*4882a593Smuzhiyun 	case QPU_W_TLB_STENCIL_SETUP:
471*4882a593Smuzhiyun 		return true;
472*4882a593Smuzhiyun 	}
473*4882a593Smuzhiyun 
474*4882a593Smuzhiyun 	return true;
475*4882a593Smuzhiyun }
476*4882a593Smuzhiyun 
477*4882a593Smuzhiyun static void
track_live_clamps(struct vc4_validated_shader_info * validated_shader,struct vc4_shader_validation_state * validation_state)478*4882a593Smuzhiyun track_live_clamps(struct vc4_validated_shader_info *validated_shader,
479*4882a593Smuzhiyun 		  struct vc4_shader_validation_state *validation_state)
480*4882a593Smuzhiyun {
481*4882a593Smuzhiyun 	uint64_t inst = validation_state->shader[validation_state->ip];
482*4882a593Smuzhiyun 	uint32_t op_add = QPU_GET_FIELD(inst, QPU_OP_ADD);
483*4882a593Smuzhiyun 	uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
484*4882a593Smuzhiyun 	uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
485*4882a593Smuzhiyun 	uint32_t cond_add = QPU_GET_FIELD(inst, QPU_COND_ADD);
486*4882a593Smuzhiyun 	uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
487*4882a593Smuzhiyun 	uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
488*4882a593Smuzhiyun 	uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
489*4882a593Smuzhiyun 	uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
490*4882a593Smuzhiyun 	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
491*4882a593Smuzhiyun 	bool ws = inst & QPU_WS;
492*4882a593Smuzhiyun 	uint32_t lri_add_a, lri_add, lri_mul;
493*4882a593Smuzhiyun 	bool add_a_is_min_0;
494*4882a593Smuzhiyun 
495*4882a593Smuzhiyun 	/* Check whether OP_ADD's A argumennt comes from a live MAX(x, 0),
496*4882a593Smuzhiyun 	 * before we clear previous live state.
497*4882a593Smuzhiyun 	 */
498*4882a593Smuzhiyun 	lri_add_a = raddr_add_a_to_live_reg_index(inst);
499*4882a593Smuzhiyun 	add_a_is_min_0 = (lri_add_a != ~0 &&
500*4882a593Smuzhiyun 			  validation_state->live_max_clamp_regs[lri_add_a]);
501*4882a593Smuzhiyun 
502*4882a593Smuzhiyun 	/* Clear live state for registers written by our instruction. */
503*4882a593Smuzhiyun 	lri_add = waddr_to_live_reg_index(waddr_add, ws);
504*4882a593Smuzhiyun 	lri_mul = waddr_to_live_reg_index(waddr_mul, !ws);
505*4882a593Smuzhiyun 	if (lri_mul != ~0) {
506*4882a593Smuzhiyun 		validation_state->live_max_clamp_regs[lri_mul] = false;
507*4882a593Smuzhiyun 		validation_state->live_min_clamp_offsets[lri_mul] = ~0;
508*4882a593Smuzhiyun 	}
509*4882a593Smuzhiyun 	if (lri_add != ~0) {
510*4882a593Smuzhiyun 		validation_state->live_max_clamp_regs[lri_add] = false;
511*4882a593Smuzhiyun 		validation_state->live_min_clamp_offsets[lri_add] = ~0;
512*4882a593Smuzhiyun 	} else {
513*4882a593Smuzhiyun 		/* Nothing further to do for live tracking, since only ADDs
514*4882a593Smuzhiyun 		 * generate new live clamp registers.
515*4882a593Smuzhiyun 		 */
516*4882a593Smuzhiyun 		return;
517*4882a593Smuzhiyun 	}
518*4882a593Smuzhiyun 
519*4882a593Smuzhiyun 	/* Now, handle remaining live clamp tracking for the ADD operation. */
520*4882a593Smuzhiyun 
521*4882a593Smuzhiyun 	if (cond_add != QPU_COND_ALWAYS)
522*4882a593Smuzhiyun 		return;
523*4882a593Smuzhiyun 
524*4882a593Smuzhiyun 	if (op_add == QPU_A_MAX) {
525*4882a593Smuzhiyun 		/* Track live clamps of a value to a minimum of 0 (in either
526*4882a593Smuzhiyun 		 * arg).
527*4882a593Smuzhiyun 		 */
528*4882a593Smuzhiyun 		if (sig != QPU_SIG_SMALL_IMM || raddr_b != 0 ||
529*4882a593Smuzhiyun 		    (add_a != QPU_MUX_B && add_b != QPU_MUX_B)) {
530*4882a593Smuzhiyun 			return;
531*4882a593Smuzhiyun 		}
532*4882a593Smuzhiyun 
533*4882a593Smuzhiyun 		validation_state->live_max_clamp_regs[lri_add] = true;
534*4882a593Smuzhiyun 	} else if (op_add == QPU_A_MIN) {
535*4882a593Smuzhiyun 		/* Track live clamps of a value clamped to a minimum of 0 and
536*4882a593Smuzhiyun 		 * a maximum of some uniform's offset.
537*4882a593Smuzhiyun 		 */
538*4882a593Smuzhiyun 		if (!add_a_is_min_0)
539*4882a593Smuzhiyun 			return;
540*4882a593Smuzhiyun 
541*4882a593Smuzhiyun 		if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
542*4882a593Smuzhiyun 		    !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF &&
543*4882a593Smuzhiyun 		      sig != QPU_SIG_SMALL_IMM)) {
544*4882a593Smuzhiyun 			return;
545*4882a593Smuzhiyun 		}
546*4882a593Smuzhiyun 
547*4882a593Smuzhiyun 		validation_state->live_min_clamp_offsets[lri_add] =
548*4882a593Smuzhiyun 			validated_shader->uniforms_size;
549*4882a593Smuzhiyun 	}
550*4882a593Smuzhiyun }
551*4882a593Smuzhiyun 
552*4882a593Smuzhiyun static bool
check_instruction_writes(struct vc4_validated_shader_info * validated_shader,struct vc4_shader_validation_state * validation_state)553*4882a593Smuzhiyun check_instruction_writes(struct vc4_validated_shader_info *validated_shader,
554*4882a593Smuzhiyun 			 struct vc4_shader_validation_state *validation_state)
555*4882a593Smuzhiyun {
556*4882a593Smuzhiyun 	uint64_t inst = validation_state->shader[validation_state->ip];
557*4882a593Smuzhiyun 	uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
558*4882a593Smuzhiyun 	uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
559*4882a593Smuzhiyun 	bool ok;
560*4882a593Smuzhiyun 
561*4882a593Smuzhiyun 	if (is_tmu_write(waddr_add) && is_tmu_write(waddr_mul)) {
562*4882a593Smuzhiyun 		DRM_DEBUG("ADD and MUL both set up textures\n");
563*4882a593Smuzhiyun 		return false;
564*4882a593Smuzhiyun 	}
565*4882a593Smuzhiyun 
566*4882a593Smuzhiyun 	ok = (check_reg_write(validated_shader, validation_state, false) &&
567*4882a593Smuzhiyun 	      check_reg_write(validated_shader, validation_state, true));
568*4882a593Smuzhiyun 
569*4882a593Smuzhiyun 	track_live_clamps(validated_shader, validation_state);
570*4882a593Smuzhiyun 
571*4882a593Smuzhiyun 	return ok;
572*4882a593Smuzhiyun }
573*4882a593Smuzhiyun 
574*4882a593Smuzhiyun static bool
check_branch(uint64_t inst,struct vc4_validated_shader_info * validated_shader,struct vc4_shader_validation_state * validation_state,int ip)575*4882a593Smuzhiyun check_branch(uint64_t inst,
576*4882a593Smuzhiyun 	     struct vc4_validated_shader_info *validated_shader,
577*4882a593Smuzhiyun 	     struct vc4_shader_validation_state *validation_state,
578*4882a593Smuzhiyun 	     int ip)
579*4882a593Smuzhiyun {
580*4882a593Smuzhiyun 	int32_t branch_imm = QPU_GET_FIELD(inst, QPU_BRANCH_TARGET);
581*4882a593Smuzhiyun 	uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
582*4882a593Smuzhiyun 	uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
583*4882a593Smuzhiyun 
584*4882a593Smuzhiyun 	if ((int)branch_imm < 0)
585*4882a593Smuzhiyun 		validation_state->needs_uniform_address_for_loop = true;
586*4882a593Smuzhiyun 
587*4882a593Smuzhiyun 	/* We don't want to have to worry about validation of this, and
588*4882a593Smuzhiyun 	 * there's no need for it.
589*4882a593Smuzhiyun 	 */
590*4882a593Smuzhiyun 	if (waddr_add != QPU_W_NOP || waddr_mul != QPU_W_NOP) {
591*4882a593Smuzhiyun 		DRM_DEBUG("branch instruction at %d wrote a register.\n",
592*4882a593Smuzhiyun 			  validation_state->ip);
593*4882a593Smuzhiyun 		return false;
594*4882a593Smuzhiyun 	}
595*4882a593Smuzhiyun 
596*4882a593Smuzhiyun 	return true;
597*4882a593Smuzhiyun }
598*4882a593Smuzhiyun 
599*4882a593Smuzhiyun static bool
check_instruction_reads(struct vc4_validated_shader_info * validated_shader,struct vc4_shader_validation_state * validation_state)600*4882a593Smuzhiyun check_instruction_reads(struct vc4_validated_shader_info *validated_shader,
601*4882a593Smuzhiyun 			struct vc4_shader_validation_state *validation_state)
602*4882a593Smuzhiyun {
603*4882a593Smuzhiyun 	uint64_t inst = validation_state->shader[validation_state->ip];
604*4882a593Smuzhiyun 	uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
605*4882a593Smuzhiyun 	uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
606*4882a593Smuzhiyun 	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
607*4882a593Smuzhiyun 
608*4882a593Smuzhiyun 	if (raddr_a == QPU_R_UNIF ||
609*4882a593Smuzhiyun 	    (raddr_b == QPU_R_UNIF && sig != QPU_SIG_SMALL_IMM)) {
610*4882a593Smuzhiyun 		/* This can't overflow the uint32_t, because we're reading 8
611*4882a593Smuzhiyun 		 * bytes of instruction to increment by 4 here, so we'd
612*4882a593Smuzhiyun 		 * already be OOM.
613*4882a593Smuzhiyun 		 */
614*4882a593Smuzhiyun 		validated_shader->uniforms_size += 4;
615*4882a593Smuzhiyun 
616*4882a593Smuzhiyun 		if (validation_state->needs_uniform_address_update) {
617*4882a593Smuzhiyun 			DRM_DEBUG("Uniform read with undefined uniform "
618*4882a593Smuzhiyun 				  "address\n");
619*4882a593Smuzhiyun 			return false;
620*4882a593Smuzhiyun 		}
621*4882a593Smuzhiyun 	}
622*4882a593Smuzhiyun 
623*4882a593Smuzhiyun 	if ((raddr_a >= 16 && raddr_a < 32) ||
624*4882a593Smuzhiyun 	    (raddr_b >= 16 && raddr_b < 32 && sig != QPU_SIG_SMALL_IMM)) {
625*4882a593Smuzhiyun 		validation_state->all_registers_used = true;
626*4882a593Smuzhiyun 	}
627*4882a593Smuzhiyun 
628*4882a593Smuzhiyun 	return true;
629*4882a593Smuzhiyun }
630*4882a593Smuzhiyun 
631*4882a593Smuzhiyun /* Make sure that all branches are absolute and point within the shader, and
632*4882a593Smuzhiyun  * note their targets for later.
633*4882a593Smuzhiyun  */
634*4882a593Smuzhiyun static bool
vc4_validate_branches(struct vc4_shader_validation_state * validation_state)635*4882a593Smuzhiyun vc4_validate_branches(struct vc4_shader_validation_state *validation_state)
636*4882a593Smuzhiyun {
637*4882a593Smuzhiyun 	uint32_t max_branch_target = 0;
638*4882a593Smuzhiyun 	int ip;
639*4882a593Smuzhiyun 	int last_branch = -2;
640*4882a593Smuzhiyun 
641*4882a593Smuzhiyun 	for (ip = 0; ip < validation_state->max_ip; ip++) {
642*4882a593Smuzhiyun 		uint64_t inst = validation_state->shader[ip];
643*4882a593Smuzhiyun 		int32_t branch_imm = QPU_GET_FIELD(inst, QPU_BRANCH_TARGET);
644*4882a593Smuzhiyun 		uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
645*4882a593Smuzhiyun 		uint32_t after_delay_ip = ip + 4;
646*4882a593Smuzhiyun 		uint32_t branch_target_ip;
647*4882a593Smuzhiyun 
648*4882a593Smuzhiyun 		if (sig == QPU_SIG_PROG_END) {
649*4882a593Smuzhiyun 			/* There are two delay slots after program end is
650*4882a593Smuzhiyun 			 * signaled that are still executed, then we're
651*4882a593Smuzhiyun 			 * finished.  validation_state->max_ip is the
652*4882a593Smuzhiyun 			 * instruction after the last valid instruction in the
653*4882a593Smuzhiyun 			 * program.
654*4882a593Smuzhiyun 			 */
655*4882a593Smuzhiyun 			validation_state->max_ip = ip + 3;
656*4882a593Smuzhiyun 			continue;
657*4882a593Smuzhiyun 		}
658*4882a593Smuzhiyun 
659*4882a593Smuzhiyun 		if (sig != QPU_SIG_BRANCH)
660*4882a593Smuzhiyun 			continue;
661*4882a593Smuzhiyun 
662*4882a593Smuzhiyun 		if (ip - last_branch < 4) {
663*4882a593Smuzhiyun 			DRM_DEBUG("Branch at %d during delay slots\n", ip);
664*4882a593Smuzhiyun 			return false;
665*4882a593Smuzhiyun 		}
666*4882a593Smuzhiyun 		last_branch = ip;
667*4882a593Smuzhiyun 
668*4882a593Smuzhiyun 		if (inst & QPU_BRANCH_REG) {
669*4882a593Smuzhiyun 			DRM_DEBUG("branching from register relative "
670*4882a593Smuzhiyun 				  "not supported\n");
671*4882a593Smuzhiyun 			return false;
672*4882a593Smuzhiyun 		}
673*4882a593Smuzhiyun 
674*4882a593Smuzhiyun 		if (!(inst & QPU_BRANCH_REL)) {
675*4882a593Smuzhiyun 			DRM_DEBUG("relative branching required\n");
676*4882a593Smuzhiyun 			return false;
677*4882a593Smuzhiyun 		}
678*4882a593Smuzhiyun 
679*4882a593Smuzhiyun 		/* The actual branch target is the instruction after the delay
680*4882a593Smuzhiyun 		 * slots, plus whatever byte offset is in the low 32 bits of
681*4882a593Smuzhiyun 		 * the instruction.  Make sure we're not branching beyond the
682*4882a593Smuzhiyun 		 * end of the shader object.
683*4882a593Smuzhiyun 		 */
684*4882a593Smuzhiyun 		if (branch_imm % sizeof(inst) != 0) {
685*4882a593Smuzhiyun 			DRM_DEBUG("branch target not aligned\n");
686*4882a593Smuzhiyun 			return false;
687*4882a593Smuzhiyun 		}
688*4882a593Smuzhiyun 
689*4882a593Smuzhiyun 		branch_target_ip = after_delay_ip + (branch_imm >> 3);
690*4882a593Smuzhiyun 		if (branch_target_ip >= validation_state->max_ip) {
691*4882a593Smuzhiyun 			DRM_DEBUG("Branch at %d outside of shader (ip %d/%d)\n",
692*4882a593Smuzhiyun 				  ip, branch_target_ip,
693*4882a593Smuzhiyun 				  validation_state->max_ip);
694*4882a593Smuzhiyun 			return false;
695*4882a593Smuzhiyun 		}
696*4882a593Smuzhiyun 		set_bit(branch_target_ip, validation_state->branch_targets);
697*4882a593Smuzhiyun 
698*4882a593Smuzhiyun 		/* Make sure that the non-branching path is also not outside
699*4882a593Smuzhiyun 		 * the shader.
700*4882a593Smuzhiyun 		 */
701*4882a593Smuzhiyun 		if (after_delay_ip >= validation_state->max_ip) {
702*4882a593Smuzhiyun 			DRM_DEBUG("Branch at %d continues past shader end "
703*4882a593Smuzhiyun 				  "(%d/%d)\n",
704*4882a593Smuzhiyun 				  ip, after_delay_ip, validation_state->max_ip);
705*4882a593Smuzhiyun 			return false;
706*4882a593Smuzhiyun 		}
707*4882a593Smuzhiyun 		set_bit(after_delay_ip, validation_state->branch_targets);
708*4882a593Smuzhiyun 		max_branch_target = max(max_branch_target, after_delay_ip);
709*4882a593Smuzhiyun 	}
710*4882a593Smuzhiyun 
711*4882a593Smuzhiyun 	if (max_branch_target > validation_state->max_ip - 3) {
712*4882a593Smuzhiyun 		DRM_DEBUG("Branch landed after QPU_SIG_PROG_END");
713*4882a593Smuzhiyun 		return false;
714*4882a593Smuzhiyun 	}
715*4882a593Smuzhiyun 
716*4882a593Smuzhiyun 	return true;
717*4882a593Smuzhiyun }
718*4882a593Smuzhiyun 
719*4882a593Smuzhiyun /* Resets any known state for the shader, used when we may be branched to from
720*4882a593Smuzhiyun  * multiple locations in the program (or at shader start).
721*4882a593Smuzhiyun  */
722*4882a593Smuzhiyun static void
reset_validation_state(struct vc4_shader_validation_state * validation_state)723*4882a593Smuzhiyun reset_validation_state(struct vc4_shader_validation_state *validation_state)
724*4882a593Smuzhiyun {
725*4882a593Smuzhiyun 	int i;
726*4882a593Smuzhiyun 
727*4882a593Smuzhiyun 	for (i = 0; i < 8; i++)
728*4882a593Smuzhiyun 		validation_state->tmu_setup[i / 4].p_offset[i % 4] = ~0;
729*4882a593Smuzhiyun 
730*4882a593Smuzhiyun 	for (i = 0; i < LIVE_REG_COUNT; i++) {
731*4882a593Smuzhiyun 		validation_state->live_min_clamp_offsets[i] = ~0;
732*4882a593Smuzhiyun 		validation_state->live_max_clamp_regs[i] = false;
733*4882a593Smuzhiyun 		validation_state->live_immediates[i] = ~0;
734*4882a593Smuzhiyun 	}
735*4882a593Smuzhiyun }
736*4882a593Smuzhiyun 
737*4882a593Smuzhiyun static bool
texturing_in_progress(struct vc4_shader_validation_state * validation_state)738*4882a593Smuzhiyun texturing_in_progress(struct vc4_shader_validation_state *validation_state)
739*4882a593Smuzhiyun {
740*4882a593Smuzhiyun 	return (validation_state->tmu_write_count[0] != 0 ||
741*4882a593Smuzhiyun 		validation_state->tmu_write_count[1] != 0);
742*4882a593Smuzhiyun }
743*4882a593Smuzhiyun 
744*4882a593Smuzhiyun static bool
vc4_handle_branch_target(struct vc4_shader_validation_state * validation_state)745*4882a593Smuzhiyun vc4_handle_branch_target(struct vc4_shader_validation_state *validation_state)
746*4882a593Smuzhiyun {
747*4882a593Smuzhiyun 	uint32_t ip = validation_state->ip;
748*4882a593Smuzhiyun 
749*4882a593Smuzhiyun 	if (!test_bit(ip, validation_state->branch_targets))
750*4882a593Smuzhiyun 		return true;
751*4882a593Smuzhiyun 
752*4882a593Smuzhiyun 	if (texturing_in_progress(validation_state)) {
753*4882a593Smuzhiyun 		DRM_DEBUG("Branch target landed during TMU setup\n");
754*4882a593Smuzhiyun 		return false;
755*4882a593Smuzhiyun 	}
756*4882a593Smuzhiyun 
757*4882a593Smuzhiyun 	/* Reset our live values tracking, since this instruction may have
758*4882a593Smuzhiyun 	 * multiple predecessors.
759*4882a593Smuzhiyun 	 *
760*4882a593Smuzhiyun 	 * One could potentially do analysis to determine that, for
761*4882a593Smuzhiyun 	 * example, all predecessors have a live max clamp in the same
762*4882a593Smuzhiyun 	 * register, but we don't bother with that.
763*4882a593Smuzhiyun 	 */
764*4882a593Smuzhiyun 	reset_validation_state(validation_state);
765*4882a593Smuzhiyun 
766*4882a593Smuzhiyun 	/* Since we've entered a basic block from potentially multiple
767*4882a593Smuzhiyun 	 * predecessors, we need the uniforms address to be updated before any
768*4882a593Smuzhiyun 	 * unforms are read.  We require that after any branch point, the next
769*4882a593Smuzhiyun 	 * uniform to be loaded is a uniform address offset.  That uniform's
770*4882a593Smuzhiyun 	 * offset will be marked by the uniform address register write
771*4882a593Smuzhiyun 	 * validation, or a one-off the end-of-program check.
772*4882a593Smuzhiyun 	 */
773*4882a593Smuzhiyun 	validation_state->needs_uniform_address_update = true;
774*4882a593Smuzhiyun 
775*4882a593Smuzhiyun 	return true;
776*4882a593Smuzhiyun }
777*4882a593Smuzhiyun 
778*4882a593Smuzhiyun struct vc4_validated_shader_info *
vc4_validate_shader(struct drm_gem_cma_object * shader_obj)779*4882a593Smuzhiyun vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
780*4882a593Smuzhiyun {
781*4882a593Smuzhiyun 	bool found_shader_end = false;
782*4882a593Smuzhiyun 	int shader_end_ip = 0;
783*4882a593Smuzhiyun 	uint32_t last_thread_switch_ip = -3;
784*4882a593Smuzhiyun 	uint32_t ip;
785*4882a593Smuzhiyun 	struct vc4_validated_shader_info *validated_shader = NULL;
786*4882a593Smuzhiyun 	struct vc4_shader_validation_state validation_state;
787*4882a593Smuzhiyun 
788*4882a593Smuzhiyun 	memset(&validation_state, 0, sizeof(validation_state));
789*4882a593Smuzhiyun 	validation_state.shader = shader_obj->vaddr;
790*4882a593Smuzhiyun 	validation_state.max_ip = shader_obj->base.size / sizeof(uint64_t);
791*4882a593Smuzhiyun 
792*4882a593Smuzhiyun 	reset_validation_state(&validation_state);
793*4882a593Smuzhiyun 
794*4882a593Smuzhiyun 	validation_state.branch_targets =
795*4882a593Smuzhiyun 		kcalloc(BITS_TO_LONGS(validation_state.max_ip),
796*4882a593Smuzhiyun 			sizeof(unsigned long), GFP_KERNEL);
797*4882a593Smuzhiyun 	if (!validation_state.branch_targets)
798*4882a593Smuzhiyun 		goto fail;
799*4882a593Smuzhiyun 
800*4882a593Smuzhiyun 	validated_shader = kcalloc(1, sizeof(*validated_shader), GFP_KERNEL);
801*4882a593Smuzhiyun 	if (!validated_shader)
802*4882a593Smuzhiyun 		goto fail;
803*4882a593Smuzhiyun 
804*4882a593Smuzhiyun 	if (!vc4_validate_branches(&validation_state))
805*4882a593Smuzhiyun 		goto fail;
806*4882a593Smuzhiyun 
807*4882a593Smuzhiyun 	for (ip = 0; ip < validation_state.max_ip; ip++) {
808*4882a593Smuzhiyun 		uint64_t inst = validation_state.shader[ip];
809*4882a593Smuzhiyun 		uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
810*4882a593Smuzhiyun 
811*4882a593Smuzhiyun 		validation_state.ip = ip;
812*4882a593Smuzhiyun 
813*4882a593Smuzhiyun 		if (!vc4_handle_branch_target(&validation_state))
814*4882a593Smuzhiyun 			goto fail;
815*4882a593Smuzhiyun 
816*4882a593Smuzhiyun 		if (ip == last_thread_switch_ip + 3) {
817*4882a593Smuzhiyun 			/* Reset r0-r3 live clamp data */
818*4882a593Smuzhiyun 			int i;
819*4882a593Smuzhiyun 
820*4882a593Smuzhiyun 			for (i = 64; i < LIVE_REG_COUNT; i++) {
821*4882a593Smuzhiyun 				validation_state.live_min_clamp_offsets[i] = ~0;
822*4882a593Smuzhiyun 				validation_state.live_max_clamp_regs[i] = false;
823*4882a593Smuzhiyun 				validation_state.live_immediates[i] = ~0;
824*4882a593Smuzhiyun 			}
825*4882a593Smuzhiyun 		}
826*4882a593Smuzhiyun 
827*4882a593Smuzhiyun 		switch (sig) {
828*4882a593Smuzhiyun 		case QPU_SIG_NONE:
829*4882a593Smuzhiyun 		case QPU_SIG_WAIT_FOR_SCOREBOARD:
830*4882a593Smuzhiyun 		case QPU_SIG_SCOREBOARD_UNLOCK:
831*4882a593Smuzhiyun 		case QPU_SIG_COLOR_LOAD:
832*4882a593Smuzhiyun 		case QPU_SIG_LOAD_TMU0:
833*4882a593Smuzhiyun 		case QPU_SIG_LOAD_TMU1:
834*4882a593Smuzhiyun 		case QPU_SIG_PROG_END:
835*4882a593Smuzhiyun 		case QPU_SIG_SMALL_IMM:
836*4882a593Smuzhiyun 		case QPU_SIG_THREAD_SWITCH:
837*4882a593Smuzhiyun 		case QPU_SIG_LAST_THREAD_SWITCH:
838*4882a593Smuzhiyun 			if (!check_instruction_writes(validated_shader,
839*4882a593Smuzhiyun 						      &validation_state)) {
840*4882a593Smuzhiyun 				DRM_DEBUG("Bad write at ip %d\n", ip);
841*4882a593Smuzhiyun 				goto fail;
842*4882a593Smuzhiyun 			}
843*4882a593Smuzhiyun 
844*4882a593Smuzhiyun 			if (!check_instruction_reads(validated_shader,
845*4882a593Smuzhiyun 						     &validation_state))
846*4882a593Smuzhiyun 				goto fail;
847*4882a593Smuzhiyun 
848*4882a593Smuzhiyun 			if (sig == QPU_SIG_PROG_END) {
849*4882a593Smuzhiyun 				found_shader_end = true;
850*4882a593Smuzhiyun 				shader_end_ip = ip;
851*4882a593Smuzhiyun 			}
852*4882a593Smuzhiyun 
853*4882a593Smuzhiyun 			if (sig == QPU_SIG_THREAD_SWITCH ||
854*4882a593Smuzhiyun 			    sig == QPU_SIG_LAST_THREAD_SWITCH) {
855*4882a593Smuzhiyun 				validated_shader->is_threaded = true;
856*4882a593Smuzhiyun 
857*4882a593Smuzhiyun 				if (ip < last_thread_switch_ip + 3) {
858*4882a593Smuzhiyun 					DRM_DEBUG("Thread switch too soon after "
859*4882a593Smuzhiyun 						  "last switch at ip %d\n", ip);
860*4882a593Smuzhiyun 					goto fail;
861*4882a593Smuzhiyun 				}
862*4882a593Smuzhiyun 				last_thread_switch_ip = ip;
863*4882a593Smuzhiyun 			}
864*4882a593Smuzhiyun 
865*4882a593Smuzhiyun 			break;
866*4882a593Smuzhiyun 
867*4882a593Smuzhiyun 		case QPU_SIG_LOAD_IMM:
868*4882a593Smuzhiyun 			if (!check_instruction_writes(validated_shader,
869*4882a593Smuzhiyun 						      &validation_state)) {
870*4882a593Smuzhiyun 				DRM_DEBUG("Bad LOAD_IMM write at ip %d\n", ip);
871*4882a593Smuzhiyun 				goto fail;
872*4882a593Smuzhiyun 			}
873*4882a593Smuzhiyun 			break;
874*4882a593Smuzhiyun 
875*4882a593Smuzhiyun 		case QPU_SIG_BRANCH:
876*4882a593Smuzhiyun 			if (!check_branch(inst, validated_shader,
877*4882a593Smuzhiyun 					  &validation_state, ip))
878*4882a593Smuzhiyun 				goto fail;
879*4882a593Smuzhiyun 
880*4882a593Smuzhiyun 			if (ip < last_thread_switch_ip + 3) {
881*4882a593Smuzhiyun 				DRM_DEBUG("Branch in thread switch at ip %d",
882*4882a593Smuzhiyun 					  ip);
883*4882a593Smuzhiyun 				goto fail;
884*4882a593Smuzhiyun 			}
885*4882a593Smuzhiyun 
886*4882a593Smuzhiyun 			break;
887*4882a593Smuzhiyun 		default:
888*4882a593Smuzhiyun 			DRM_DEBUG("Unsupported QPU signal %d at "
889*4882a593Smuzhiyun 				  "instruction %d\n", sig, ip);
890*4882a593Smuzhiyun 			goto fail;
891*4882a593Smuzhiyun 		}
892*4882a593Smuzhiyun 
893*4882a593Smuzhiyun 		/* There are two delay slots after program end is signaled
894*4882a593Smuzhiyun 		 * that are still executed, then we're finished.
895*4882a593Smuzhiyun 		 */
896*4882a593Smuzhiyun 		if (found_shader_end && ip == shader_end_ip + 2)
897*4882a593Smuzhiyun 			break;
898*4882a593Smuzhiyun 	}
899*4882a593Smuzhiyun 
900*4882a593Smuzhiyun 	if (ip == validation_state.max_ip) {
901*4882a593Smuzhiyun 		DRM_DEBUG("shader failed to terminate before "
902*4882a593Smuzhiyun 			  "shader BO end at %zd\n",
903*4882a593Smuzhiyun 			  shader_obj->base.size);
904*4882a593Smuzhiyun 		goto fail;
905*4882a593Smuzhiyun 	}
906*4882a593Smuzhiyun 
907*4882a593Smuzhiyun 	/* Might corrupt other thread */
908*4882a593Smuzhiyun 	if (validated_shader->is_threaded &&
909*4882a593Smuzhiyun 	    validation_state.all_registers_used) {
910*4882a593Smuzhiyun 		DRM_DEBUG("Shader uses threading, but uses the upper "
911*4882a593Smuzhiyun 			  "half of the registers, too\n");
912*4882a593Smuzhiyun 		goto fail;
913*4882a593Smuzhiyun 	}
914*4882a593Smuzhiyun 
915*4882a593Smuzhiyun 	/* If we did a backwards branch and we haven't emitted a uniforms
916*4882a593Smuzhiyun 	 * reset since then, we still need the uniforms stream to have the
917*4882a593Smuzhiyun 	 * uniforms address available so that the backwards branch can do its
918*4882a593Smuzhiyun 	 * uniforms reset.
919*4882a593Smuzhiyun 	 *
920*4882a593Smuzhiyun 	 * We could potentially prove that the backwards branch doesn't
921*4882a593Smuzhiyun 	 * contain any uses of uniforms until program exit, but that doesn't
922*4882a593Smuzhiyun 	 * seem to be worth the trouble.
923*4882a593Smuzhiyun 	 */
924*4882a593Smuzhiyun 	if (validation_state.needs_uniform_address_for_loop) {
925*4882a593Smuzhiyun 		if (!require_uniform_address_uniform(validated_shader))
926*4882a593Smuzhiyun 			goto fail;
927*4882a593Smuzhiyun 		validated_shader->uniforms_size += 4;
928*4882a593Smuzhiyun 	}
929*4882a593Smuzhiyun 
930*4882a593Smuzhiyun 	/* Again, no chance of integer overflow here because the worst case
931*4882a593Smuzhiyun 	 * scenario is 8 bytes of uniforms plus handles per 8-byte
932*4882a593Smuzhiyun 	 * instruction.
933*4882a593Smuzhiyun 	 */
934*4882a593Smuzhiyun 	validated_shader->uniforms_src_size =
935*4882a593Smuzhiyun 		(validated_shader->uniforms_size +
936*4882a593Smuzhiyun 		 4 * validated_shader->num_texture_samples);
937*4882a593Smuzhiyun 
938*4882a593Smuzhiyun 	kfree(validation_state.branch_targets);
939*4882a593Smuzhiyun 
940*4882a593Smuzhiyun 	return validated_shader;
941*4882a593Smuzhiyun 
942*4882a593Smuzhiyun fail:
943*4882a593Smuzhiyun 	kfree(validation_state.branch_targets);
944*4882a593Smuzhiyun 	if (validated_shader) {
945*4882a593Smuzhiyun 		kfree(validated_shader->uniform_addr_offsets);
946*4882a593Smuzhiyun 		kfree(validated_shader->texture_samples);
947*4882a593Smuzhiyun 		kfree(validated_shader);
948*4882a593Smuzhiyun 	}
949*4882a593Smuzhiyun 	return NULL;
950*4882a593Smuzhiyun }
951