1*4882a593Smuzhiyun /*
2*4882a593Smuzhiyun * Copyright © 2014 Broadcom
3*4882a593Smuzhiyun *
4*4882a593Smuzhiyun * Permission is hereby granted, free of charge, to any person obtaining a
5*4882a593Smuzhiyun * copy of this software and associated documentation files (the "Software"),
6*4882a593Smuzhiyun * to deal in the Software without restriction, including without limitation
7*4882a593Smuzhiyun * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8*4882a593Smuzhiyun * and/or sell copies of the Software, and to permit persons to whom the
9*4882a593Smuzhiyun * Software is furnished to do so, subject to the following conditions:
10*4882a593Smuzhiyun *
11*4882a593Smuzhiyun * The above copyright notice and this permission notice (including the next
12*4882a593Smuzhiyun * paragraph) shall be included in all copies or substantial portions of the
13*4882a593Smuzhiyun * Software.
14*4882a593Smuzhiyun *
15*4882a593Smuzhiyun * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16*4882a593Smuzhiyun * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17*4882a593Smuzhiyun * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18*4882a593Smuzhiyun * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19*4882a593Smuzhiyun * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20*4882a593Smuzhiyun * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21*4882a593Smuzhiyun * IN THE SOFTWARE.
22*4882a593Smuzhiyun */
23*4882a593Smuzhiyun
24*4882a593Smuzhiyun /**
25*4882a593Smuzhiyun * DOC: Shader validator for VC4.
26*4882a593Smuzhiyun *
27*4882a593Smuzhiyun * Since the VC4 has no IOMMU between it and system memory, a user
28*4882a593Smuzhiyun * with access to execute shaders could escalate privilege by
29*4882a593Smuzhiyun * overwriting system memory (using the VPM write address register in
30*4882a593Smuzhiyun * the general-purpose DMA mode) or reading system memory it shouldn't
31*4882a593Smuzhiyun * (reading it as a texture, uniform data, or direct-addressed TMU
32*4882a593Smuzhiyun * lookup).
33*4882a593Smuzhiyun *
34*4882a593Smuzhiyun * The shader validator walks over a shader's BO, ensuring that its
35*4882a593Smuzhiyun * accesses are appropriately bounded, and recording where texture
36*4882a593Smuzhiyun * accesses are made so that we can do relocations for them in the
37*4882a593Smuzhiyun * uniform stream.
38*4882a593Smuzhiyun *
 * Shader BOs are immutable for their lifetimes (enforced by not
 * allowing mmaps, GEM prime export, or rendering to them from a CL), so
 * this validation is only performed at BO creation time.
42*4882a593Smuzhiyun */
43*4882a593Smuzhiyun
44*4882a593Smuzhiyun #include "vc4_drv.h"
45*4882a593Smuzhiyun #include "vc4_qpu_defines.h"
46*4882a593Smuzhiyun
/* Number of slots in the per-register live-state arrays below: 32 for
 * regfile A, 32 for regfile B, and 4 more for the write addresses
 * QPU_W_ACC0..QPU_W_ACC3 (see waddr_to_live_reg_index()).
 */
#define LIVE_REG_COUNT (32 + 32 + 4)

/* Scratch state carried along while a single shader is being validated. */
struct vc4_shader_validation_state {
	/* Current IP being validated. */
	uint32_t ip;

	/* IP at the end of the BO, do not read shader[max_ip] */
	uint32_t max_ip;

	/* The shader's instructions, indexed by IP (one 64-bit word each). */
	uint64_t *shader;

	/* Per-TMU sample description being accumulated until that TMU's S
	 * register is written, and the number of TMU parameter writes seen
	 * for it since the last such write (see check_tmu_write()).
	 */
	struct vc4_texture_sample_info tmu_setup[2];
	int tmu_write_count[2];

	/* For registers that were last written to by a MIN instruction with
	 * one argument being a uniform, the offset of that uniform (the
	 * value of uniforms_size at the time, see track_live_clamps()).
	 * Otherwise, ~0.
	 *
	 * This is used for the validation of direct address memory reads.
	 */
	uint32_t live_min_clamp_offsets[LIVE_REG_COUNT];

	/* True for registers whose last write was an unconditional MAX
	 * against the small immediate selected by raddr_b == 0 (see
	 * track_live_clamps()); this is the prerequisite for the MIN clamp
	 * tracked above.
	 */
	bool live_max_clamp_regs[LIVE_REG_COUNT];

	/* For registers whose last write was an unconditional, unpacked
	 * load-immediate, the immediate value.  Otherwise, ~0 (see
	 * check_reg_write()).  Consumed when validating uniform address
	 * resets.
	 */
	uint32_t live_immediates[LIVE_REG_COUNT];

	/* Bitfield of which IPs are used as branch targets.
	 *
	 * Used for validation that the uniform stream is updated at the right
	 * points and clearing the texturing/clamping state.
	 */
	unsigned long *branch_targets;

	/* Set when entering a basic block, and cleared when the uniform
	 * address update is found.  This is used to make sure that we don't
	 * read uniforms when the address is undefined.
	 */
	bool needs_uniform_address_update;

	/* Set when we find a backwards branch.  If the branch is backwards,
	 * the target is probably doing an address reset to read uniforms,
	 * and so we need to be sure that a uniforms address is present in the
	 * stream, even if the shader didn't need to read uniforms in later
	 * basic blocks.
	 */
	bool needs_uniform_address_for_loop;

	/* Set when we find an instruction writing the top half of the
	 * register files.  If we allowed writing the unusable regs in
	 * a threaded shader, then the other shader running on our
	 * QPU's clamp validation would be invalid.
	 *
	 * check_instruction_reads() also sets this when an instruction reads
	 * a register address in the 16..31 range.
	 */
	bool all_registers_used;
};
99*4882a593Smuzhiyun
100*4882a593Smuzhiyun static uint32_t
waddr_to_live_reg_index(uint32_t waddr,bool is_b)101*4882a593Smuzhiyun waddr_to_live_reg_index(uint32_t waddr, bool is_b)
102*4882a593Smuzhiyun {
103*4882a593Smuzhiyun if (waddr < 32) {
104*4882a593Smuzhiyun if (is_b)
105*4882a593Smuzhiyun return 32 + waddr;
106*4882a593Smuzhiyun else
107*4882a593Smuzhiyun return waddr;
108*4882a593Smuzhiyun } else if (waddr <= QPU_W_ACC3) {
109*4882a593Smuzhiyun return 64 + waddr - QPU_W_ACC0;
110*4882a593Smuzhiyun } else {
111*4882a593Smuzhiyun return ~0;
112*4882a593Smuzhiyun }
113*4882a593Smuzhiyun }
114*4882a593Smuzhiyun
115*4882a593Smuzhiyun static uint32_t
raddr_add_a_to_live_reg_index(uint64_t inst)116*4882a593Smuzhiyun raddr_add_a_to_live_reg_index(uint64_t inst)
117*4882a593Smuzhiyun {
118*4882a593Smuzhiyun uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
119*4882a593Smuzhiyun uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
120*4882a593Smuzhiyun uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
121*4882a593Smuzhiyun uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
122*4882a593Smuzhiyun
123*4882a593Smuzhiyun if (add_a == QPU_MUX_A)
124*4882a593Smuzhiyun return raddr_a;
125*4882a593Smuzhiyun else if (add_a == QPU_MUX_B && sig != QPU_SIG_SMALL_IMM)
126*4882a593Smuzhiyun return 32 + raddr_b;
127*4882a593Smuzhiyun else if (add_a <= QPU_MUX_R3)
128*4882a593Smuzhiyun return 64 + add_a;
129*4882a593Smuzhiyun else
130*4882a593Smuzhiyun return ~0;
131*4882a593Smuzhiyun }
132*4882a593Smuzhiyun
/* Reports whether a live-register slot is in the upper 16 entries of one of
 * the two 32-entry register files (slots 16..31 or 48..63).  Slots from 64
 * upwards, including the ~0 "untracked" marker, are never in an upper half.
 */
static bool
live_reg_is_upper_half(uint32_t lri)
{
	if (lri >= 32 + 32)
		return false;

	return (lri % 32) >= 16;
}
139*4882a593Smuzhiyun
140*4882a593Smuzhiyun static bool
is_tmu_submit(uint32_t waddr)141*4882a593Smuzhiyun is_tmu_submit(uint32_t waddr)
142*4882a593Smuzhiyun {
143*4882a593Smuzhiyun return (waddr == QPU_W_TMU0_S ||
144*4882a593Smuzhiyun waddr == QPU_W_TMU1_S);
145*4882a593Smuzhiyun }
146*4882a593Smuzhiyun
147*4882a593Smuzhiyun static bool
is_tmu_write(uint32_t waddr)148*4882a593Smuzhiyun is_tmu_write(uint32_t waddr)
149*4882a593Smuzhiyun {
150*4882a593Smuzhiyun return (waddr >= QPU_W_TMU0_S &&
151*4882a593Smuzhiyun waddr <= QPU_W_TMU1_B);
152*4882a593Smuzhiyun }
153*4882a593Smuzhiyun
154*4882a593Smuzhiyun static bool
record_texture_sample(struct vc4_validated_shader_info * validated_shader,struct vc4_shader_validation_state * validation_state,int tmu)155*4882a593Smuzhiyun record_texture_sample(struct vc4_validated_shader_info *validated_shader,
156*4882a593Smuzhiyun struct vc4_shader_validation_state *validation_state,
157*4882a593Smuzhiyun int tmu)
158*4882a593Smuzhiyun {
159*4882a593Smuzhiyun uint32_t s = validated_shader->num_texture_samples;
160*4882a593Smuzhiyun int i;
161*4882a593Smuzhiyun struct vc4_texture_sample_info *temp_samples;
162*4882a593Smuzhiyun
163*4882a593Smuzhiyun temp_samples = krealloc(validated_shader->texture_samples,
164*4882a593Smuzhiyun (s + 1) * sizeof(*temp_samples),
165*4882a593Smuzhiyun GFP_KERNEL);
166*4882a593Smuzhiyun if (!temp_samples)
167*4882a593Smuzhiyun return false;
168*4882a593Smuzhiyun
169*4882a593Smuzhiyun memcpy(&temp_samples[s],
170*4882a593Smuzhiyun &validation_state->tmu_setup[tmu],
171*4882a593Smuzhiyun sizeof(*temp_samples));
172*4882a593Smuzhiyun
173*4882a593Smuzhiyun validated_shader->num_texture_samples = s + 1;
174*4882a593Smuzhiyun validated_shader->texture_samples = temp_samples;
175*4882a593Smuzhiyun
176*4882a593Smuzhiyun for (i = 0; i < 4; i++)
177*4882a593Smuzhiyun validation_state->tmu_setup[tmu].p_offset[i] = ~0;
178*4882a593Smuzhiyun
179*4882a593Smuzhiyun return true;
180*4882a593Smuzhiyun }
181*4882a593Smuzhiyun
/* Validates a single write to a TMU parameter register made by the ADD ALU
 * (@is_mul false) or the MUL ALU (@is_mul true) of the current instruction.
 *
 * Each accepted write records the current uniforms_size as the offset of
 * the next texture parameter for that TMU.  A write to the S register
 * completes the sample: it is appended to the validated shader's sample
 * list and the per-TMU write count is reset.
 *
 * An S write that is the first TMU write since the last submit is treated
 * as a direct-addressed read, and must be an ADD of a clamped offset
 * (first operand) to a uniform (second operand).
 *
 * Returns false if the write must be rejected or if recording the sample
 * failed.
 */
static bool
check_tmu_write(struct vc4_validated_shader_info *validated_shader,
		struct vc4_shader_validation_state *validation_state,
		bool is_mul)
{
	uint64_t inst = validation_state->shader[validation_state->ip];
	uint32_t waddr = (is_mul ?
			  QPU_GET_FIELD(inst, QPU_WADDR_MUL) :
			  QPU_GET_FIELD(inst, QPU_WADDR_ADD));
	uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
	uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
	/* 0 for the TMU0 registers, 1 for any write address above
	 * QPU_W_TMU0_B (check_reg_write() only calls this for TMU waddrs).
	 */
	int tmu = waddr > QPU_W_TMU0_B;
	bool submit = is_tmu_submit(waddr);
	/* An S write with no parameter writes before it. */
	bool is_direct = submit && validation_state->tmu_write_count[tmu] == 0;
	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

	if (is_direct) {
		uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
		uint32_t clamp_reg, clamp_offset;

		if (sig == QPU_SIG_SMALL_IMM) {
			DRM_DEBUG("direct TMU read used small immediate\n");
			return false;
		}

		/* Make sure that this texture load is an add of the base
		 * address of the UBO to a clamped offset within the UBO.
		 */
		if (is_mul ||
		    QPU_GET_FIELD(inst, QPU_OP_ADD) != QPU_A_ADD) {
			DRM_DEBUG("direct TMU load wasn't an add\n");
			return false;
		}

		/* We assert that the clamped address is the first
		 * argument, and the UBO base address is the second argument.
		 * This is arbitrary, but simpler than supporting flipping the
		 * two either way.
		 */
		clamp_reg = raddr_add_a_to_live_reg_index(inst);
		if (clamp_reg == ~0) {
			DRM_DEBUG("direct TMU load wasn't clamped\n");
			return false;
		}

		/* ~0 here means the register's last write was not a tracked
		 * MIN against a uniform (see track_live_clamps()).
		 */
		clamp_offset = validation_state->live_min_clamp_offsets[clamp_reg];
		if (clamp_offset == ~0) {
			DRM_DEBUG("direct TMU load wasn't clamped\n");
			return false;
		}

		/* Store the clamp value's offset in p1 (see reloc_tex() in
		 * vc4_validate.c).
		 */
		validation_state->tmu_setup[tmu].p_offset[1] =
			clamp_offset;

		/* The second ADD operand has to be read from the uniform
		 * stream, through either register file.
		 */
		if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
		    !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF)) {
			DRM_DEBUG("direct TMU load didn't add to a uniform\n");
			return false;
		}

		validation_state->tmu_setup[tmu].is_direct = true;
	} else {
		/* For non-direct writes, the instruction must not also read
		 * a uniform (raddr_b only counts as a register read when
		 * the signal is not a small immediate).
		 */
		if (raddr_a == QPU_R_UNIF || (sig != QPU_SIG_SMALL_IMM &&
					      raddr_b == QPU_R_UNIF)) {
			DRM_DEBUG("uniform read in the same instruction as "
				  "texture setup.\n");
			return false;
		}
	}

	/* p_offset[] has room for four parameters per sample. */
	if (validation_state->tmu_write_count[tmu] >= 4) {
		DRM_DEBUG("TMU%d got too many parameters before dispatch\n",
			  tmu);
		return false;
	}
	validation_state->tmu_setup[tmu].p_offset[validation_state->tmu_write_count[tmu]] =
		validated_shader->uniforms_size;
	validation_state->tmu_write_count[tmu]++;
	/* Since direct uses a RADDR uniform reference, it will get counted in
	 * check_instruction_reads()
	 */
	if (!is_direct) {
		if (validation_state->needs_uniform_address_update) {
			DRM_DEBUG("Texturing with undefined uniform address\n");
			return false;
		}

		validated_shader->uniforms_size += 4;
	}

	if (submit) {
		if (!record_texture_sample(validated_shader,
					   validation_state, tmu)) {
			return false;
		}

		validation_state->tmu_write_count[tmu] = 0;
	}

	return true;
}
286*4882a593Smuzhiyun
require_uniform_address_uniform(struct vc4_validated_shader_info * validated_shader)287*4882a593Smuzhiyun static bool require_uniform_address_uniform(struct vc4_validated_shader_info *validated_shader)
288*4882a593Smuzhiyun {
289*4882a593Smuzhiyun uint32_t o = validated_shader->num_uniform_addr_offsets;
290*4882a593Smuzhiyun uint32_t num_uniforms = validated_shader->uniforms_size / 4;
291*4882a593Smuzhiyun
292*4882a593Smuzhiyun validated_shader->uniform_addr_offsets =
293*4882a593Smuzhiyun krealloc(validated_shader->uniform_addr_offsets,
294*4882a593Smuzhiyun (o + 1) *
295*4882a593Smuzhiyun sizeof(*validated_shader->uniform_addr_offsets),
296*4882a593Smuzhiyun GFP_KERNEL);
297*4882a593Smuzhiyun if (!validated_shader->uniform_addr_offsets)
298*4882a593Smuzhiyun return false;
299*4882a593Smuzhiyun
300*4882a593Smuzhiyun validated_shader->uniform_addr_offsets[o] = num_uniforms;
301*4882a593Smuzhiyun validated_shader->num_uniform_addr_offsets++;
302*4882a593Smuzhiyun
303*4882a593Smuzhiyun return true;
304*4882a593Smuzhiyun }
305*4882a593Smuzhiyun
306*4882a593Smuzhiyun static bool
validate_uniform_address_write(struct vc4_validated_shader_info * validated_shader,struct vc4_shader_validation_state * validation_state,bool is_mul)307*4882a593Smuzhiyun validate_uniform_address_write(struct vc4_validated_shader_info *validated_shader,
308*4882a593Smuzhiyun struct vc4_shader_validation_state *validation_state,
309*4882a593Smuzhiyun bool is_mul)
310*4882a593Smuzhiyun {
311*4882a593Smuzhiyun uint64_t inst = validation_state->shader[validation_state->ip];
312*4882a593Smuzhiyun u32 add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
313*4882a593Smuzhiyun u32 raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
314*4882a593Smuzhiyun u32 raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
315*4882a593Smuzhiyun u32 add_lri = raddr_add_a_to_live_reg_index(inst);
316*4882a593Smuzhiyun /* We want our reset to be pointing at whatever uniform follows the
317*4882a593Smuzhiyun * uniforms base address.
318*4882a593Smuzhiyun */
319*4882a593Smuzhiyun u32 expected_offset = validated_shader->uniforms_size + 4;
320*4882a593Smuzhiyun
321*4882a593Smuzhiyun /* We only support absolute uniform address changes, and we
322*4882a593Smuzhiyun * require that they be in the current basic block before any
323*4882a593Smuzhiyun * of its uniform reads.
324*4882a593Smuzhiyun *
325*4882a593Smuzhiyun * One could potentially emit more efficient QPU code, by
326*4882a593Smuzhiyun * noticing that (say) an if statement does uniform control
327*4882a593Smuzhiyun * flow for all threads and that the if reads the same number
328*4882a593Smuzhiyun * of uniforms on each side. However, this scheme is easy to
329*4882a593Smuzhiyun * validate so it's all we allow for now.
330*4882a593Smuzhiyun */
331*4882a593Smuzhiyun switch (QPU_GET_FIELD(inst, QPU_SIG)) {
332*4882a593Smuzhiyun case QPU_SIG_NONE:
333*4882a593Smuzhiyun case QPU_SIG_SCOREBOARD_UNLOCK:
334*4882a593Smuzhiyun case QPU_SIG_COLOR_LOAD:
335*4882a593Smuzhiyun case QPU_SIG_LOAD_TMU0:
336*4882a593Smuzhiyun case QPU_SIG_LOAD_TMU1:
337*4882a593Smuzhiyun break;
338*4882a593Smuzhiyun default:
339*4882a593Smuzhiyun DRM_DEBUG("uniforms address change must be "
340*4882a593Smuzhiyun "normal math\n");
341*4882a593Smuzhiyun return false;
342*4882a593Smuzhiyun }
343*4882a593Smuzhiyun
344*4882a593Smuzhiyun if (is_mul || QPU_GET_FIELD(inst, QPU_OP_ADD) != QPU_A_ADD) {
345*4882a593Smuzhiyun DRM_DEBUG("Uniform address reset must be an ADD.\n");
346*4882a593Smuzhiyun return false;
347*4882a593Smuzhiyun }
348*4882a593Smuzhiyun
349*4882a593Smuzhiyun if (QPU_GET_FIELD(inst, QPU_COND_ADD) != QPU_COND_ALWAYS) {
350*4882a593Smuzhiyun DRM_DEBUG("Uniform address reset must be unconditional.\n");
351*4882a593Smuzhiyun return false;
352*4882a593Smuzhiyun }
353*4882a593Smuzhiyun
354*4882a593Smuzhiyun if (QPU_GET_FIELD(inst, QPU_PACK) != QPU_PACK_A_NOP &&
355*4882a593Smuzhiyun !(inst & QPU_PM)) {
356*4882a593Smuzhiyun DRM_DEBUG("No packing allowed on uniforms reset\n");
357*4882a593Smuzhiyun return false;
358*4882a593Smuzhiyun }
359*4882a593Smuzhiyun
360*4882a593Smuzhiyun if (add_lri == -1) {
361*4882a593Smuzhiyun DRM_DEBUG("First argument of uniform address write must be "
362*4882a593Smuzhiyun "an immediate value.\n");
363*4882a593Smuzhiyun return false;
364*4882a593Smuzhiyun }
365*4882a593Smuzhiyun
366*4882a593Smuzhiyun if (validation_state->live_immediates[add_lri] != expected_offset) {
367*4882a593Smuzhiyun DRM_DEBUG("Resetting uniforms with offset %db instead of %db\n",
368*4882a593Smuzhiyun validation_state->live_immediates[add_lri],
369*4882a593Smuzhiyun expected_offset);
370*4882a593Smuzhiyun return false;
371*4882a593Smuzhiyun }
372*4882a593Smuzhiyun
373*4882a593Smuzhiyun if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
374*4882a593Smuzhiyun !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF)) {
375*4882a593Smuzhiyun DRM_DEBUG("Second argument of uniform address write must be "
376*4882a593Smuzhiyun "a uniform.\n");
377*4882a593Smuzhiyun return false;
378*4882a593Smuzhiyun }
379*4882a593Smuzhiyun
380*4882a593Smuzhiyun validation_state->needs_uniform_address_update = false;
381*4882a593Smuzhiyun validation_state->needs_uniform_address_for_loop = false;
382*4882a593Smuzhiyun return require_uniform_address_uniform(validated_shader);
383*4882a593Smuzhiyun }
384*4882a593Smuzhiyun
385*4882a593Smuzhiyun static bool
check_reg_write(struct vc4_validated_shader_info * validated_shader,struct vc4_shader_validation_state * validation_state,bool is_mul)386*4882a593Smuzhiyun check_reg_write(struct vc4_validated_shader_info *validated_shader,
387*4882a593Smuzhiyun struct vc4_shader_validation_state *validation_state,
388*4882a593Smuzhiyun bool is_mul)
389*4882a593Smuzhiyun {
390*4882a593Smuzhiyun uint64_t inst = validation_state->shader[validation_state->ip];
391*4882a593Smuzhiyun uint32_t waddr = (is_mul ?
392*4882a593Smuzhiyun QPU_GET_FIELD(inst, QPU_WADDR_MUL) :
393*4882a593Smuzhiyun QPU_GET_FIELD(inst, QPU_WADDR_ADD));
394*4882a593Smuzhiyun uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
395*4882a593Smuzhiyun bool ws = inst & QPU_WS;
396*4882a593Smuzhiyun bool is_b = is_mul ^ ws;
397*4882a593Smuzhiyun u32 lri = waddr_to_live_reg_index(waddr, is_b);
398*4882a593Smuzhiyun
399*4882a593Smuzhiyun if (lri != -1) {
400*4882a593Smuzhiyun uint32_t cond_add = QPU_GET_FIELD(inst, QPU_COND_ADD);
401*4882a593Smuzhiyun uint32_t cond_mul = QPU_GET_FIELD(inst, QPU_COND_MUL);
402*4882a593Smuzhiyun
403*4882a593Smuzhiyun if (sig == QPU_SIG_LOAD_IMM &&
404*4882a593Smuzhiyun QPU_GET_FIELD(inst, QPU_PACK) == QPU_PACK_A_NOP &&
405*4882a593Smuzhiyun ((is_mul && cond_mul == QPU_COND_ALWAYS) ||
406*4882a593Smuzhiyun (!is_mul && cond_add == QPU_COND_ALWAYS))) {
407*4882a593Smuzhiyun validation_state->live_immediates[lri] =
408*4882a593Smuzhiyun QPU_GET_FIELD(inst, QPU_LOAD_IMM);
409*4882a593Smuzhiyun } else {
410*4882a593Smuzhiyun validation_state->live_immediates[lri] = ~0;
411*4882a593Smuzhiyun }
412*4882a593Smuzhiyun
413*4882a593Smuzhiyun if (live_reg_is_upper_half(lri))
414*4882a593Smuzhiyun validation_state->all_registers_used = true;
415*4882a593Smuzhiyun }
416*4882a593Smuzhiyun
417*4882a593Smuzhiyun switch (waddr) {
418*4882a593Smuzhiyun case QPU_W_UNIFORMS_ADDRESS:
419*4882a593Smuzhiyun if (is_b) {
420*4882a593Smuzhiyun DRM_DEBUG("relative uniforms address change "
421*4882a593Smuzhiyun "unsupported\n");
422*4882a593Smuzhiyun return false;
423*4882a593Smuzhiyun }
424*4882a593Smuzhiyun
425*4882a593Smuzhiyun return validate_uniform_address_write(validated_shader,
426*4882a593Smuzhiyun validation_state,
427*4882a593Smuzhiyun is_mul);
428*4882a593Smuzhiyun
429*4882a593Smuzhiyun case QPU_W_TLB_COLOR_MS:
430*4882a593Smuzhiyun case QPU_W_TLB_COLOR_ALL:
431*4882a593Smuzhiyun case QPU_W_TLB_Z:
432*4882a593Smuzhiyun /* These only interact with the tile buffer, not main memory,
433*4882a593Smuzhiyun * so they're safe.
434*4882a593Smuzhiyun */
435*4882a593Smuzhiyun return true;
436*4882a593Smuzhiyun
437*4882a593Smuzhiyun case QPU_W_TMU0_S:
438*4882a593Smuzhiyun case QPU_W_TMU0_T:
439*4882a593Smuzhiyun case QPU_W_TMU0_R:
440*4882a593Smuzhiyun case QPU_W_TMU0_B:
441*4882a593Smuzhiyun case QPU_W_TMU1_S:
442*4882a593Smuzhiyun case QPU_W_TMU1_T:
443*4882a593Smuzhiyun case QPU_W_TMU1_R:
444*4882a593Smuzhiyun case QPU_W_TMU1_B:
445*4882a593Smuzhiyun return check_tmu_write(validated_shader, validation_state,
446*4882a593Smuzhiyun is_mul);
447*4882a593Smuzhiyun
448*4882a593Smuzhiyun case QPU_W_HOST_INT:
449*4882a593Smuzhiyun case QPU_W_TMU_NOSWAP:
450*4882a593Smuzhiyun case QPU_W_TLB_ALPHA_MASK:
451*4882a593Smuzhiyun case QPU_W_MUTEX_RELEASE:
452*4882a593Smuzhiyun /* XXX: I haven't thought about these, so don't support them
453*4882a593Smuzhiyun * for now.
454*4882a593Smuzhiyun */
455*4882a593Smuzhiyun DRM_DEBUG("Unsupported waddr %d\n", waddr);
456*4882a593Smuzhiyun return false;
457*4882a593Smuzhiyun
458*4882a593Smuzhiyun case QPU_W_VPM_ADDR:
459*4882a593Smuzhiyun DRM_DEBUG("General VPM DMA unsupported\n");
460*4882a593Smuzhiyun return false;
461*4882a593Smuzhiyun
462*4882a593Smuzhiyun case QPU_W_VPM:
463*4882a593Smuzhiyun case QPU_W_VPMVCD_SETUP:
464*4882a593Smuzhiyun /* We allow VPM setup in general, even including VPM DMA
465*4882a593Smuzhiyun * configuration setup, because the (unsafe) DMA can only be
466*4882a593Smuzhiyun * triggered by QPU_W_VPM_ADDR writes.
467*4882a593Smuzhiyun */
468*4882a593Smuzhiyun return true;
469*4882a593Smuzhiyun
470*4882a593Smuzhiyun case QPU_W_TLB_STENCIL_SETUP:
471*4882a593Smuzhiyun return true;
472*4882a593Smuzhiyun }
473*4882a593Smuzhiyun
474*4882a593Smuzhiyun return true;
475*4882a593Smuzhiyun }
476*4882a593Smuzhiyun
477*4882a593Smuzhiyun static void
track_live_clamps(struct vc4_validated_shader_info * validated_shader,struct vc4_shader_validation_state * validation_state)478*4882a593Smuzhiyun track_live_clamps(struct vc4_validated_shader_info *validated_shader,
479*4882a593Smuzhiyun struct vc4_shader_validation_state *validation_state)
480*4882a593Smuzhiyun {
481*4882a593Smuzhiyun uint64_t inst = validation_state->shader[validation_state->ip];
482*4882a593Smuzhiyun uint32_t op_add = QPU_GET_FIELD(inst, QPU_OP_ADD);
483*4882a593Smuzhiyun uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
484*4882a593Smuzhiyun uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
485*4882a593Smuzhiyun uint32_t cond_add = QPU_GET_FIELD(inst, QPU_COND_ADD);
486*4882a593Smuzhiyun uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
487*4882a593Smuzhiyun uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
488*4882a593Smuzhiyun uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
489*4882a593Smuzhiyun uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
490*4882a593Smuzhiyun uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
491*4882a593Smuzhiyun bool ws = inst & QPU_WS;
492*4882a593Smuzhiyun uint32_t lri_add_a, lri_add, lri_mul;
493*4882a593Smuzhiyun bool add_a_is_min_0;
494*4882a593Smuzhiyun
495*4882a593Smuzhiyun /* Check whether OP_ADD's A argumennt comes from a live MAX(x, 0),
496*4882a593Smuzhiyun * before we clear previous live state.
497*4882a593Smuzhiyun */
498*4882a593Smuzhiyun lri_add_a = raddr_add_a_to_live_reg_index(inst);
499*4882a593Smuzhiyun add_a_is_min_0 = (lri_add_a != ~0 &&
500*4882a593Smuzhiyun validation_state->live_max_clamp_regs[lri_add_a]);
501*4882a593Smuzhiyun
502*4882a593Smuzhiyun /* Clear live state for registers written by our instruction. */
503*4882a593Smuzhiyun lri_add = waddr_to_live_reg_index(waddr_add, ws);
504*4882a593Smuzhiyun lri_mul = waddr_to_live_reg_index(waddr_mul, !ws);
505*4882a593Smuzhiyun if (lri_mul != ~0) {
506*4882a593Smuzhiyun validation_state->live_max_clamp_regs[lri_mul] = false;
507*4882a593Smuzhiyun validation_state->live_min_clamp_offsets[lri_mul] = ~0;
508*4882a593Smuzhiyun }
509*4882a593Smuzhiyun if (lri_add != ~0) {
510*4882a593Smuzhiyun validation_state->live_max_clamp_regs[lri_add] = false;
511*4882a593Smuzhiyun validation_state->live_min_clamp_offsets[lri_add] = ~0;
512*4882a593Smuzhiyun } else {
513*4882a593Smuzhiyun /* Nothing further to do for live tracking, since only ADDs
514*4882a593Smuzhiyun * generate new live clamp registers.
515*4882a593Smuzhiyun */
516*4882a593Smuzhiyun return;
517*4882a593Smuzhiyun }
518*4882a593Smuzhiyun
519*4882a593Smuzhiyun /* Now, handle remaining live clamp tracking for the ADD operation. */
520*4882a593Smuzhiyun
521*4882a593Smuzhiyun if (cond_add != QPU_COND_ALWAYS)
522*4882a593Smuzhiyun return;
523*4882a593Smuzhiyun
524*4882a593Smuzhiyun if (op_add == QPU_A_MAX) {
525*4882a593Smuzhiyun /* Track live clamps of a value to a minimum of 0 (in either
526*4882a593Smuzhiyun * arg).
527*4882a593Smuzhiyun */
528*4882a593Smuzhiyun if (sig != QPU_SIG_SMALL_IMM || raddr_b != 0 ||
529*4882a593Smuzhiyun (add_a != QPU_MUX_B && add_b != QPU_MUX_B)) {
530*4882a593Smuzhiyun return;
531*4882a593Smuzhiyun }
532*4882a593Smuzhiyun
533*4882a593Smuzhiyun validation_state->live_max_clamp_regs[lri_add] = true;
534*4882a593Smuzhiyun } else if (op_add == QPU_A_MIN) {
535*4882a593Smuzhiyun /* Track live clamps of a value clamped to a minimum of 0 and
536*4882a593Smuzhiyun * a maximum of some uniform's offset.
537*4882a593Smuzhiyun */
538*4882a593Smuzhiyun if (!add_a_is_min_0)
539*4882a593Smuzhiyun return;
540*4882a593Smuzhiyun
541*4882a593Smuzhiyun if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
542*4882a593Smuzhiyun !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF &&
543*4882a593Smuzhiyun sig != QPU_SIG_SMALL_IMM)) {
544*4882a593Smuzhiyun return;
545*4882a593Smuzhiyun }
546*4882a593Smuzhiyun
547*4882a593Smuzhiyun validation_state->live_min_clamp_offsets[lri_add] =
548*4882a593Smuzhiyun validated_shader->uniforms_size;
549*4882a593Smuzhiyun }
550*4882a593Smuzhiyun }
551*4882a593Smuzhiyun
552*4882a593Smuzhiyun static bool
check_instruction_writes(struct vc4_validated_shader_info * validated_shader,struct vc4_shader_validation_state * validation_state)553*4882a593Smuzhiyun check_instruction_writes(struct vc4_validated_shader_info *validated_shader,
554*4882a593Smuzhiyun struct vc4_shader_validation_state *validation_state)
555*4882a593Smuzhiyun {
556*4882a593Smuzhiyun uint64_t inst = validation_state->shader[validation_state->ip];
557*4882a593Smuzhiyun uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
558*4882a593Smuzhiyun uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
559*4882a593Smuzhiyun bool ok;
560*4882a593Smuzhiyun
561*4882a593Smuzhiyun if (is_tmu_write(waddr_add) && is_tmu_write(waddr_mul)) {
562*4882a593Smuzhiyun DRM_DEBUG("ADD and MUL both set up textures\n");
563*4882a593Smuzhiyun return false;
564*4882a593Smuzhiyun }
565*4882a593Smuzhiyun
566*4882a593Smuzhiyun ok = (check_reg_write(validated_shader, validation_state, false) &&
567*4882a593Smuzhiyun check_reg_write(validated_shader, validation_state, true));
568*4882a593Smuzhiyun
569*4882a593Smuzhiyun track_live_clamps(validated_shader, validation_state);
570*4882a593Smuzhiyun
571*4882a593Smuzhiyun return ok;
572*4882a593Smuzhiyun }
573*4882a593Smuzhiyun
574*4882a593Smuzhiyun static bool
check_branch(uint64_t inst,struct vc4_validated_shader_info * validated_shader,struct vc4_shader_validation_state * validation_state,int ip)575*4882a593Smuzhiyun check_branch(uint64_t inst,
576*4882a593Smuzhiyun struct vc4_validated_shader_info *validated_shader,
577*4882a593Smuzhiyun struct vc4_shader_validation_state *validation_state,
578*4882a593Smuzhiyun int ip)
579*4882a593Smuzhiyun {
580*4882a593Smuzhiyun int32_t branch_imm = QPU_GET_FIELD(inst, QPU_BRANCH_TARGET);
581*4882a593Smuzhiyun uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
582*4882a593Smuzhiyun uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
583*4882a593Smuzhiyun
584*4882a593Smuzhiyun if ((int)branch_imm < 0)
585*4882a593Smuzhiyun validation_state->needs_uniform_address_for_loop = true;
586*4882a593Smuzhiyun
587*4882a593Smuzhiyun /* We don't want to have to worry about validation of this, and
588*4882a593Smuzhiyun * there's no need for it.
589*4882a593Smuzhiyun */
590*4882a593Smuzhiyun if (waddr_add != QPU_W_NOP || waddr_mul != QPU_W_NOP) {
591*4882a593Smuzhiyun DRM_DEBUG("branch instruction at %d wrote a register.\n",
592*4882a593Smuzhiyun validation_state->ip);
593*4882a593Smuzhiyun return false;
594*4882a593Smuzhiyun }
595*4882a593Smuzhiyun
596*4882a593Smuzhiyun return true;
597*4882a593Smuzhiyun }
598*4882a593Smuzhiyun
599*4882a593Smuzhiyun static bool
check_instruction_reads(struct vc4_validated_shader_info * validated_shader,struct vc4_shader_validation_state * validation_state)600*4882a593Smuzhiyun check_instruction_reads(struct vc4_validated_shader_info *validated_shader,
601*4882a593Smuzhiyun struct vc4_shader_validation_state *validation_state)
602*4882a593Smuzhiyun {
603*4882a593Smuzhiyun uint64_t inst = validation_state->shader[validation_state->ip];
604*4882a593Smuzhiyun uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
605*4882a593Smuzhiyun uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
606*4882a593Smuzhiyun uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
607*4882a593Smuzhiyun
608*4882a593Smuzhiyun if (raddr_a == QPU_R_UNIF ||
609*4882a593Smuzhiyun (raddr_b == QPU_R_UNIF && sig != QPU_SIG_SMALL_IMM)) {
610*4882a593Smuzhiyun /* This can't overflow the uint32_t, because we're reading 8
611*4882a593Smuzhiyun * bytes of instruction to increment by 4 here, so we'd
612*4882a593Smuzhiyun * already be OOM.
613*4882a593Smuzhiyun */
614*4882a593Smuzhiyun validated_shader->uniforms_size += 4;
615*4882a593Smuzhiyun
616*4882a593Smuzhiyun if (validation_state->needs_uniform_address_update) {
617*4882a593Smuzhiyun DRM_DEBUG("Uniform read with undefined uniform "
618*4882a593Smuzhiyun "address\n");
619*4882a593Smuzhiyun return false;
620*4882a593Smuzhiyun }
621*4882a593Smuzhiyun }
622*4882a593Smuzhiyun
623*4882a593Smuzhiyun if ((raddr_a >= 16 && raddr_a < 32) ||
624*4882a593Smuzhiyun (raddr_b >= 16 && raddr_b < 32 && sig != QPU_SIG_SMALL_IMM)) {
625*4882a593Smuzhiyun validation_state->all_registers_used = true;
626*4882a593Smuzhiyun }
627*4882a593Smuzhiyun
628*4882a593Smuzhiyun return true;
629*4882a593Smuzhiyun }
630*4882a593Smuzhiyun
631*4882a593Smuzhiyun /* Make sure that all branches are absolute and point within the shader, and
632*4882a593Smuzhiyun * note their targets for later.
633*4882a593Smuzhiyun */
634*4882a593Smuzhiyun static bool
vc4_validate_branches(struct vc4_shader_validation_state * validation_state)635*4882a593Smuzhiyun vc4_validate_branches(struct vc4_shader_validation_state *validation_state)
636*4882a593Smuzhiyun {
637*4882a593Smuzhiyun uint32_t max_branch_target = 0;
638*4882a593Smuzhiyun int ip;
639*4882a593Smuzhiyun int last_branch = -2;
640*4882a593Smuzhiyun
641*4882a593Smuzhiyun for (ip = 0; ip < validation_state->max_ip; ip++) {
642*4882a593Smuzhiyun uint64_t inst = validation_state->shader[ip];
643*4882a593Smuzhiyun int32_t branch_imm = QPU_GET_FIELD(inst, QPU_BRANCH_TARGET);
644*4882a593Smuzhiyun uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
645*4882a593Smuzhiyun uint32_t after_delay_ip = ip + 4;
646*4882a593Smuzhiyun uint32_t branch_target_ip;
647*4882a593Smuzhiyun
648*4882a593Smuzhiyun if (sig == QPU_SIG_PROG_END) {
649*4882a593Smuzhiyun /* There are two delay slots after program end is
650*4882a593Smuzhiyun * signaled that are still executed, then we're
651*4882a593Smuzhiyun * finished. validation_state->max_ip is the
652*4882a593Smuzhiyun * instruction after the last valid instruction in the
653*4882a593Smuzhiyun * program.
654*4882a593Smuzhiyun */
655*4882a593Smuzhiyun validation_state->max_ip = ip + 3;
656*4882a593Smuzhiyun continue;
657*4882a593Smuzhiyun }
658*4882a593Smuzhiyun
659*4882a593Smuzhiyun if (sig != QPU_SIG_BRANCH)
660*4882a593Smuzhiyun continue;
661*4882a593Smuzhiyun
662*4882a593Smuzhiyun if (ip - last_branch < 4) {
663*4882a593Smuzhiyun DRM_DEBUG("Branch at %d during delay slots\n", ip);
664*4882a593Smuzhiyun return false;
665*4882a593Smuzhiyun }
666*4882a593Smuzhiyun last_branch = ip;
667*4882a593Smuzhiyun
668*4882a593Smuzhiyun if (inst & QPU_BRANCH_REG) {
669*4882a593Smuzhiyun DRM_DEBUG("branching from register relative "
670*4882a593Smuzhiyun "not supported\n");
671*4882a593Smuzhiyun return false;
672*4882a593Smuzhiyun }
673*4882a593Smuzhiyun
674*4882a593Smuzhiyun if (!(inst & QPU_BRANCH_REL)) {
675*4882a593Smuzhiyun DRM_DEBUG("relative branching required\n");
676*4882a593Smuzhiyun return false;
677*4882a593Smuzhiyun }
678*4882a593Smuzhiyun
679*4882a593Smuzhiyun /* The actual branch target is the instruction after the delay
680*4882a593Smuzhiyun * slots, plus whatever byte offset is in the low 32 bits of
681*4882a593Smuzhiyun * the instruction. Make sure we're not branching beyond the
682*4882a593Smuzhiyun * end of the shader object.
683*4882a593Smuzhiyun */
684*4882a593Smuzhiyun if (branch_imm % sizeof(inst) != 0) {
685*4882a593Smuzhiyun DRM_DEBUG("branch target not aligned\n");
686*4882a593Smuzhiyun return false;
687*4882a593Smuzhiyun }
688*4882a593Smuzhiyun
689*4882a593Smuzhiyun branch_target_ip = after_delay_ip + (branch_imm >> 3);
690*4882a593Smuzhiyun if (branch_target_ip >= validation_state->max_ip) {
691*4882a593Smuzhiyun DRM_DEBUG("Branch at %d outside of shader (ip %d/%d)\n",
692*4882a593Smuzhiyun ip, branch_target_ip,
693*4882a593Smuzhiyun validation_state->max_ip);
694*4882a593Smuzhiyun return false;
695*4882a593Smuzhiyun }
696*4882a593Smuzhiyun set_bit(branch_target_ip, validation_state->branch_targets);
697*4882a593Smuzhiyun
698*4882a593Smuzhiyun /* Make sure that the non-branching path is also not outside
699*4882a593Smuzhiyun * the shader.
700*4882a593Smuzhiyun */
701*4882a593Smuzhiyun if (after_delay_ip >= validation_state->max_ip) {
702*4882a593Smuzhiyun DRM_DEBUG("Branch at %d continues past shader end "
703*4882a593Smuzhiyun "(%d/%d)\n",
704*4882a593Smuzhiyun ip, after_delay_ip, validation_state->max_ip);
705*4882a593Smuzhiyun return false;
706*4882a593Smuzhiyun }
707*4882a593Smuzhiyun set_bit(after_delay_ip, validation_state->branch_targets);
708*4882a593Smuzhiyun max_branch_target = max(max_branch_target, after_delay_ip);
709*4882a593Smuzhiyun }
710*4882a593Smuzhiyun
711*4882a593Smuzhiyun if (max_branch_target > validation_state->max_ip - 3) {
712*4882a593Smuzhiyun DRM_DEBUG("Branch landed after QPU_SIG_PROG_END");
713*4882a593Smuzhiyun return false;
714*4882a593Smuzhiyun }
715*4882a593Smuzhiyun
716*4882a593Smuzhiyun return true;
717*4882a593Smuzhiyun }
718*4882a593Smuzhiyun
719*4882a593Smuzhiyun /* Resets any known state for the shader, used when we may be branched to from
720*4882a593Smuzhiyun * multiple locations in the program (or at shader start).
721*4882a593Smuzhiyun */
722*4882a593Smuzhiyun static void
reset_validation_state(struct vc4_shader_validation_state * validation_state)723*4882a593Smuzhiyun reset_validation_state(struct vc4_shader_validation_state *validation_state)
724*4882a593Smuzhiyun {
725*4882a593Smuzhiyun int i;
726*4882a593Smuzhiyun
727*4882a593Smuzhiyun for (i = 0; i < 8; i++)
728*4882a593Smuzhiyun validation_state->tmu_setup[i / 4].p_offset[i % 4] = ~0;
729*4882a593Smuzhiyun
730*4882a593Smuzhiyun for (i = 0; i < LIVE_REG_COUNT; i++) {
731*4882a593Smuzhiyun validation_state->live_min_clamp_offsets[i] = ~0;
732*4882a593Smuzhiyun validation_state->live_max_clamp_regs[i] = false;
733*4882a593Smuzhiyun validation_state->live_immediates[i] = ~0;
734*4882a593Smuzhiyun }
735*4882a593Smuzhiyun }
736*4882a593Smuzhiyun
737*4882a593Smuzhiyun static bool
texturing_in_progress(struct vc4_shader_validation_state * validation_state)738*4882a593Smuzhiyun texturing_in_progress(struct vc4_shader_validation_state *validation_state)
739*4882a593Smuzhiyun {
740*4882a593Smuzhiyun return (validation_state->tmu_write_count[0] != 0 ||
741*4882a593Smuzhiyun validation_state->tmu_write_count[1] != 0);
742*4882a593Smuzhiyun }
743*4882a593Smuzhiyun
744*4882a593Smuzhiyun static bool
vc4_handle_branch_target(struct vc4_shader_validation_state * validation_state)745*4882a593Smuzhiyun vc4_handle_branch_target(struct vc4_shader_validation_state *validation_state)
746*4882a593Smuzhiyun {
747*4882a593Smuzhiyun uint32_t ip = validation_state->ip;
748*4882a593Smuzhiyun
749*4882a593Smuzhiyun if (!test_bit(ip, validation_state->branch_targets))
750*4882a593Smuzhiyun return true;
751*4882a593Smuzhiyun
752*4882a593Smuzhiyun if (texturing_in_progress(validation_state)) {
753*4882a593Smuzhiyun DRM_DEBUG("Branch target landed during TMU setup\n");
754*4882a593Smuzhiyun return false;
755*4882a593Smuzhiyun }
756*4882a593Smuzhiyun
757*4882a593Smuzhiyun /* Reset our live values tracking, since this instruction may have
758*4882a593Smuzhiyun * multiple predecessors.
759*4882a593Smuzhiyun *
760*4882a593Smuzhiyun * One could potentially do analysis to determine that, for
761*4882a593Smuzhiyun * example, all predecessors have a live max clamp in the same
762*4882a593Smuzhiyun * register, but we don't bother with that.
763*4882a593Smuzhiyun */
764*4882a593Smuzhiyun reset_validation_state(validation_state);
765*4882a593Smuzhiyun
766*4882a593Smuzhiyun /* Since we've entered a basic block from potentially multiple
767*4882a593Smuzhiyun * predecessors, we need the uniforms address to be updated before any
768*4882a593Smuzhiyun * unforms are read. We require that after any branch point, the next
769*4882a593Smuzhiyun * uniform to be loaded is a uniform address offset. That uniform's
770*4882a593Smuzhiyun * offset will be marked by the uniform address register write
771*4882a593Smuzhiyun * validation, or a one-off the end-of-program check.
772*4882a593Smuzhiyun */
773*4882a593Smuzhiyun validation_state->needs_uniform_address_update = true;
774*4882a593Smuzhiyun
775*4882a593Smuzhiyun return true;
776*4882a593Smuzhiyun }
777*4882a593Smuzhiyun
778*4882a593Smuzhiyun struct vc4_validated_shader_info *
vc4_validate_shader(struct drm_gem_cma_object * shader_obj)779*4882a593Smuzhiyun vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
780*4882a593Smuzhiyun {
781*4882a593Smuzhiyun bool found_shader_end = false;
782*4882a593Smuzhiyun int shader_end_ip = 0;
783*4882a593Smuzhiyun uint32_t last_thread_switch_ip = -3;
784*4882a593Smuzhiyun uint32_t ip;
785*4882a593Smuzhiyun struct vc4_validated_shader_info *validated_shader = NULL;
786*4882a593Smuzhiyun struct vc4_shader_validation_state validation_state;
787*4882a593Smuzhiyun
788*4882a593Smuzhiyun memset(&validation_state, 0, sizeof(validation_state));
789*4882a593Smuzhiyun validation_state.shader = shader_obj->vaddr;
790*4882a593Smuzhiyun validation_state.max_ip = shader_obj->base.size / sizeof(uint64_t);
791*4882a593Smuzhiyun
792*4882a593Smuzhiyun reset_validation_state(&validation_state);
793*4882a593Smuzhiyun
794*4882a593Smuzhiyun validation_state.branch_targets =
795*4882a593Smuzhiyun kcalloc(BITS_TO_LONGS(validation_state.max_ip),
796*4882a593Smuzhiyun sizeof(unsigned long), GFP_KERNEL);
797*4882a593Smuzhiyun if (!validation_state.branch_targets)
798*4882a593Smuzhiyun goto fail;
799*4882a593Smuzhiyun
800*4882a593Smuzhiyun validated_shader = kcalloc(1, sizeof(*validated_shader), GFP_KERNEL);
801*4882a593Smuzhiyun if (!validated_shader)
802*4882a593Smuzhiyun goto fail;
803*4882a593Smuzhiyun
804*4882a593Smuzhiyun if (!vc4_validate_branches(&validation_state))
805*4882a593Smuzhiyun goto fail;
806*4882a593Smuzhiyun
807*4882a593Smuzhiyun for (ip = 0; ip < validation_state.max_ip; ip++) {
808*4882a593Smuzhiyun uint64_t inst = validation_state.shader[ip];
809*4882a593Smuzhiyun uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
810*4882a593Smuzhiyun
811*4882a593Smuzhiyun validation_state.ip = ip;
812*4882a593Smuzhiyun
813*4882a593Smuzhiyun if (!vc4_handle_branch_target(&validation_state))
814*4882a593Smuzhiyun goto fail;
815*4882a593Smuzhiyun
816*4882a593Smuzhiyun if (ip == last_thread_switch_ip + 3) {
817*4882a593Smuzhiyun /* Reset r0-r3 live clamp data */
818*4882a593Smuzhiyun int i;
819*4882a593Smuzhiyun
820*4882a593Smuzhiyun for (i = 64; i < LIVE_REG_COUNT; i++) {
821*4882a593Smuzhiyun validation_state.live_min_clamp_offsets[i] = ~0;
822*4882a593Smuzhiyun validation_state.live_max_clamp_regs[i] = false;
823*4882a593Smuzhiyun validation_state.live_immediates[i] = ~0;
824*4882a593Smuzhiyun }
825*4882a593Smuzhiyun }
826*4882a593Smuzhiyun
827*4882a593Smuzhiyun switch (sig) {
828*4882a593Smuzhiyun case QPU_SIG_NONE:
829*4882a593Smuzhiyun case QPU_SIG_WAIT_FOR_SCOREBOARD:
830*4882a593Smuzhiyun case QPU_SIG_SCOREBOARD_UNLOCK:
831*4882a593Smuzhiyun case QPU_SIG_COLOR_LOAD:
832*4882a593Smuzhiyun case QPU_SIG_LOAD_TMU0:
833*4882a593Smuzhiyun case QPU_SIG_LOAD_TMU1:
834*4882a593Smuzhiyun case QPU_SIG_PROG_END:
835*4882a593Smuzhiyun case QPU_SIG_SMALL_IMM:
836*4882a593Smuzhiyun case QPU_SIG_THREAD_SWITCH:
837*4882a593Smuzhiyun case QPU_SIG_LAST_THREAD_SWITCH:
838*4882a593Smuzhiyun if (!check_instruction_writes(validated_shader,
839*4882a593Smuzhiyun &validation_state)) {
840*4882a593Smuzhiyun DRM_DEBUG("Bad write at ip %d\n", ip);
841*4882a593Smuzhiyun goto fail;
842*4882a593Smuzhiyun }
843*4882a593Smuzhiyun
844*4882a593Smuzhiyun if (!check_instruction_reads(validated_shader,
845*4882a593Smuzhiyun &validation_state))
846*4882a593Smuzhiyun goto fail;
847*4882a593Smuzhiyun
848*4882a593Smuzhiyun if (sig == QPU_SIG_PROG_END) {
849*4882a593Smuzhiyun found_shader_end = true;
850*4882a593Smuzhiyun shader_end_ip = ip;
851*4882a593Smuzhiyun }
852*4882a593Smuzhiyun
853*4882a593Smuzhiyun if (sig == QPU_SIG_THREAD_SWITCH ||
854*4882a593Smuzhiyun sig == QPU_SIG_LAST_THREAD_SWITCH) {
855*4882a593Smuzhiyun validated_shader->is_threaded = true;
856*4882a593Smuzhiyun
857*4882a593Smuzhiyun if (ip < last_thread_switch_ip + 3) {
858*4882a593Smuzhiyun DRM_DEBUG("Thread switch too soon after "
859*4882a593Smuzhiyun "last switch at ip %d\n", ip);
860*4882a593Smuzhiyun goto fail;
861*4882a593Smuzhiyun }
862*4882a593Smuzhiyun last_thread_switch_ip = ip;
863*4882a593Smuzhiyun }
864*4882a593Smuzhiyun
865*4882a593Smuzhiyun break;
866*4882a593Smuzhiyun
867*4882a593Smuzhiyun case QPU_SIG_LOAD_IMM:
868*4882a593Smuzhiyun if (!check_instruction_writes(validated_shader,
869*4882a593Smuzhiyun &validation_state)) {
870*4882a593Smuzhiyun DRM_DEBUG("Bad LOAD_IMM write at ip %d\n", ip);
871*4882a593Smuzhiyun goto fail;
872*4882a593Smuzhiyun }
873*4882a593Smuzhiyun break;
874*4882a593Smuzhiyun
875*4882a593Smuzhiyun case QPU_SIG_BRANCH:
876*4882a593Smuzhiyun if (!check_branch(inst, validated_shader,
877*4882a593Smuzhiyun &validation_state, ip))
878*4882a593Smuzhiyun goto fail;
879*4882a593Smuzhiyun
880*4882a593Smuzhiyun if (ip < last_thread_switch_ip + 3) {
881*4882a593Smuzhiyun DRM_DEBUG("Branch in thread switch at ip %d",
882*4882a593Smuzhiyun ip);
883*4882a593Smuzhiyun goto fail;
884*4882a593Smuzhiyun }
885*4882a593Smuzhiyun
886*4882a593Smuzhiyun break;
887*4882a593Smuzhiyun default:
888*4882a593Smuzhiyun DRM_DEBUG("Unsupported QPU signal %d at "
889*4882a593Smuzhiyun "instruction %d\n", sig, ip);
890*4882a593Smuzhiyun goto fail;
891*4882a593Smuzhiyun }
892*4882a593Smuzhiyun
893*4882a593Smuzhiyun /* There are two delay slots after program end is signaled
894*4882a593Smuzhiyun * that are still executed, then we're finished.
895*4882a593Smuzhiyun */
896*4882a593Smuzhiyun if (found_shader_end && ip == shader_end_ip + 2)
897*4882a593Smuzhiyun break;
898*4882a593Smuzhiyun }
899*4882a593Smuzhiyun
900*4882a593Smuzhiyun if (ip == validation_state.max_ip) {
901*4882a593Smuzhiyun DRM_DEBUG("shader failed to terminate before "
902*4882a593Smuzhiyun "shader BO end at %zd\n",
903*4882a593Smuzhiyun shader_obj->base.size);
904*4882a593Smuzhiyun goto fail;
905*4882a593Smuzhiyun }
906*4882a593Smuzhiyun
907*4882a593Smuzhiyun /* Might corrupt other thread */
908*4882a593Smuzhiyun if (validated_shader->is_threaded &&
909*4882a593Smuzhiyun validation_state.all_registers_used) {
910*4882a593Smuzhiyun DRM_DEBUG("Shader uses threading, but uses the upper "
911*4882a593Smuzhiyun "half of the registers, too\n");
912*4882a593Smuzhiyun goto fail;
913*4882a593Smuzhiyun }
914*4882a593Smuzhiyun
915*4882a593Smuzhiyun /* If we did a backwards branch and we haven't emitted a uniforms
916*4882a593Smuzhiyun * reset since then, we still need the uniforms stream to have the
917*4882a593Smuzhiyun * uniforms address available so that the backwards branch can do its
918*4882a593Smuzhiyun * uniforms reset.
919*4882a593Smuzhiyun *
920*4882a593Smuzhiyun * We could potentially prove that the backwards branch doesn't
921*4882a593Smuzhiyun * contain any uses of uniforms until program exit, but that doesn't
922*4882a593Smuzhiyun * seem to be worth the trouble.
923*4882a593Smuzhiyun */
924*4882a593Smuzhiyun if (validation_state.needs_uniform_address_for_loop) {
925*4882a593Smuzhiyun if (!require_uniform_address_uniform(validated_shader))
926*4882a593Smuzhiyun goto fail;
927*4882a593Smuzhiyun validated_shader->uniforms_size += 4;
928*4882a593Smuzhiyun }
929*4882a593Smuzhiyun
930*4882a593Smuzhiyun /* Again, no chance of integer overflow here because the worst case
931*4882a593Smuzhiyun * scenario is 8 bytes of uniforms plus handles per 8-byte
932*4882a593Smuzhiyun * instruction.
933*4882a593Smuzhiyun */
934*4882a593Smuzhiyun validated_shader->uniforms_src_size =
935*4882a593Smuzhiyun (validated_shader->uniforms_size +
936*4882a593Smuzhiyun 4 * validated_shader->num_texture_samples);
937*4882a593Smuzhiyun
938*4882a593Smuzhiyun kfree(validation_state.branch_targets);
939*4882a593Smuzhiyun
940*4882a593Smuzhiyun return validated_shader;
941*4882a593Smuzhiyun
942*4882a593Smuzhiyun fail:
943*4882a593Smuzhiyun kfree(validation_state.branch_targets);
944*4882a593Smuzhiyun if (validated_shader) {
945*4882a593Smuzhiyun kfree(validated_shader->uniform_addr_offsets);
946*4882a593Smuzhiyun kfree(validated_shader->texture_samples);
947*4882a593Smuzhiyun kfree(validated_shader);
948*4882a593Smuzhiyun }
949*4882a593Smuzhiyun return NULL;
950*4882a593Smuzhiyun }
951