1 /**
2 * @ClassName yolo_image
3 * @Description inference code for yolo
4 * @Author raul.rao
5 * @Date 2022/5/23 11:10
6 * @Version 1.0
7 */
8
9 #include <cstdarg>
10 #include <cstdio>
11 #include <cstdlib>
12 #include <fstream>
13 #include <iostream>
14 #include <memory>
15 #include <sstream>
16 #include <string>
17 #include <vector>
18 #include <ctime>
19
20 #include <cstdint>
21
22 #include "rknn_api.h"
23
24 #include "yolo_image.h"
25 #include "rga/rga.h"
26 #include "rga/im2d.h"
27 #include "rga/im2d_version.h"
28 #include "post_process.h"
29
// Build-time switches.
// DEBUG_DUMP: when defined, run_yolo()/yolo_post_process() dump tensors and log results.
// EVAL_TIME: when defined, run_yolo() logs how long each stage took.
30 //#define DEBUG_DUMP
31 //#define EVAL_TIME
// ZERO_COPY: non-zero selects the rknn_create_mem()/rknn_set_io_mem() path in create()
// and run_yolo(); zero selects the rknn_inputs_set()/rknn_outputs_get() path.
32 #define ZERO_COPY 1
// Value of colorConvertAndFlip()'s `flip` argument that requests colour conversion only.
33 #define DO_NOT_FLIP -1
34
// Call counter of run_yolo(); only touched when DEBUG_DUMP is defined.
35 int g_inf_count = 0;
36
// Counter used by yolo_post_process(); only touched when DEBUG_DUMP is defined.
37 int g_post_count = 0;
38
// RKNN context handle filled in by rknn_init() inside create().
39 rknn_context ctx = 0;
40
// Set to true at the end of a successful create(); checked by run_yolo() and
// yolo_post_process() before they do any work.
41 bool created = false;
42
43 int img_width = 0; // the width of the actual input image
44 int img_height = 0; // the height of the actual input image
45
46 int m_in_width = 0; // the width of the RKNN model input
47 int m_in_height = 0; // the height of the RKNN model input
48 int m_in_channel = 0; // the channel of the RKNN model input
49
// Model input size divided by image size; computed in create() and handed to post_process().
50 float scale_w = 0.0;
51 float scale_h = 0.0;
52
// Tensor counts; overwritten in create() with the values queried from the model.
53 uint32_t n_input = 1;
54 uint32_t n_output = 3;
55
// Tensor attributes queried in create(). The arrays have room for 1 input and 3 outputs.
56 rknn_tensor_attr input_attrs[1];
57 rknn_tensor_attr output_attrs[3];
58
// Tensor memory created with rknn_create_mem() in ZERO_COPY builds.
59 rknn_tensor_mem *input_mems[1];
60 rknn_tensor_mem *output_mems[3];
61
// RGA buffer descriptors: g_rga_src wraps the caller's frame in run_yolo(),
// g_rga_dst wraps the model input buffer set up in create().
62 rga_buffer_t g_rga_src;
63 rga_buffer_t g_rga_dst;
64
// Per-output `scale` and `zp` attribute values collected in create() for post_process().
65 std::vector<float> out_scales;
66 std::vector<int32_t> out_zps;
67
/**
 * Converts a timeval into a single microsecond count.
 *
 * The multiplication is carried out in double: multiplying tv_sec by an
 * integer constant would be evaluated in the integer type of tv_sec and
 * overflows when that type is only 32 bits wide.
 *
 * @param t timestamp to convert
 * @return the seconds and microseconds of t combined, expressed in microseconds
 */
double __get_us(struct timeval t) { return t.tv_sec * 1000000.0 + t.tv_usec; }
69
70
create(int im_height,int im_width,int im_channel,char * model_path)71 int create(int im_height, int im_width, int im_channel, char *model_path)
72 {
73 img_height = im_height;
74 img_width = im_width;
75
76 LOGI("try rknn_init!")
77
78 // 0. RGA version check
79 LOGI("RGA API Version: %s", RGA_API_VERSION)
80 // Please refer to the link to confirm the RGA driver version, make sure it is higher than 1.2.4
81 // https://github.com/airockchip/librga/blob/main/docs/Rockchip_FAQ_RGA_CN.md#rga-driver
82
83 // 1. Load model
84 FILE *fp = fopen(model_path, "rb");
85 if(fp == NULL) {
86 LOGE("fopen %s fail!\n", model_path);
87 return -1;
88 }
89 fseek(fp, 0, SEEK_END);
90 uint32_t model_len = ftell(fp);
91 void *model = malloc(model_len);
92 fseek(fp, 0, SEEK_SET);
93 if(model_len != fread(model, 1, model_len, fp)) {
94 LOGE("fread %s fail!\n", model_path);
95 free(model);
96 fclose(fp);
97 return -1;
98 }
99
100 fclose(fp);
101
102 // 2. Init RKNN model
103 int ret = rknn_init(&ctx, model, model_len, 0, nullptr);
104 free(model);
105
106 if(ret < 0) {
107 LOGE("rknn_init fail! ret=%d\n", ret);
108 return -1;
109 }
110
111 // 3. Query input/output attr.
112 rknn_input_output_num io_num;
113 rknn_query_cmd cmd = RKNN_QUERY_IN_OUT_NUM;
114 // 3.1 Query input/output num.
115 ret = rknn_query(ctx, cmd, &io_num, sizeof(io_num));
116 if (ret != RKNN_SUCC) {
117 LOGE("rknn_query io_num fail!ret=%d\n", ret);
118 return -1;
119 }
120 n_input = io_num.n_input;
121 n_output = io_num.n_output;
122
123 // 3.2 Query input attributes
124 memset(input_attrs, 0, n_input * sizeof(rknn_tensor_attr));
125 for (int i = 0; i < n_input; ++i) {
126 input_attrs[i].index = i;
127 cmd = RKNN_QUERY_INPUT_ATTR;
128 ret = rknn_query(ctx, cmd, &(input_attrs[i]), sizeof(rknn_tensor_attr));
129 if (ret < 0) {
130 LOGE("rknn_query input_attrs[%d] fail!ret=%d\n", i, ret);
131 return -1;
132 }
133 }
134 // 3.2.0 Update global model input shape.
135 if (RKNN_TENSOR_NHWC == input_attrs[0].fmt) {
136 m_in_height = input_attrs[0].dims[1];
137 m_in_width = input_attrs[0].dims[2];
138 m_in_channel = input_attrs[0].dims[3];
139 } else if (RKNN_TENSOR_NCHW == input_attrs[0].fmt) {
140 m_in_height = input_attrs[0].dims[2];
141 m_in_width = input_attrs[0].dims[3];
142 m_in_channel = input_attrs[0].dims[1];
143 } else {
144 LOGE("Unsupported model input layout: %d!\n", input_attrs[0].fmt);
145 return -1;
146 }
147
148 // set scale_w, scale_h for post process
149 scale_w = (float)m_in_width / img_width;
150 scale_h = (float)m_in_height / img_height;
151
152 // 3.3 Query output attributes
153 memset(output_attrs, 0, n_output * sizeof(rknn_tensor_attr));
154 for (int i = 0; i < n_output; ++i) {
155 output_attrs[i].index = i;
156 cmd = RKNN_QUERY_OUTPUT_ATTR;
157 ret = rknn_query(ctx, cmd, &(output_attrs[i]), sizeof(rknn_tensor_attr));
158 if (ret < 0) {
159 LOGE("rknn_query output_attrs[%d] fail!ret=%d\n", i, ret);
160 return -1;
161 }
162 // set out_scales/out_zps for post_process
163 out_scales.push_back(output_attrs[i].scale);
164 out_zps.push_back(output_attrs[i].zp);
165 }
166
167 #if ZERO_COPY
168 // 4. Set input/output buffer
169 // 4.1 Set inputs memory
170 // 4.1.1 Create input tensor memory, input data type is INT8, yolo has only 1 input.
171 input_mems[0] = rknn_create_mem(ctx, input_attrs[0].size_with_stride * sizeof(char));
172 memset(input_mems[0]->virt_addr, 0, input_attrs[0].size_with_stride * sizeof(char));
173 // 4.1.2 Update input attrs
174 input_attrs[0].index = 0;
175 input_attrs[0].type = RKNN_TENSOR_UINT8;
176 input_attrs[0].size = m_in_height * m_in_width * m_in_channel * sizeof(char);
177 input_attrs[0].fmt = RKNN_TENSOR_NHWC;
178 // TODO -- The efficiency of pass through will be higher, we need adjust the layout of input to
179 // meet the use condition of pass through.
180 input_attrs[0].pass_through = 0;
181 // 4.1.3 Set input buffer
182 rknn_set_io_mem(ctx, input_mems[0], &(input_attrs[0]));
183 // 4.1.4 bind virtual address to rga virtual address
184 g_rga_dst = wrapbuffer_virtualaddr((void *)input_mems[0]->virt_addr, m_in_width, m_in_height,
185 RK_FORMAT_RGB_888);
186
187 // 4.2 Set outputs memory
188 for (int i = 0; i < n_output; ++i) {
189 // 4.2.1 Create output tensor memory, output data type is int8, post_process need int8 data.
190 output_mems[i] = rknn_create_mem(ctx, output_attrs[i].n_elems * sizeof(unsigned char));
191 memset(output_mems[i]->virt_addr, 0, output_attrs[i].n_elems * sizeof(unsigned char));
192 // 4.2.2 Update input attrs
193 output_attrs[i].type = RKNN_TENSOR_INT8;
194 // 4.1.3 Set output buffer
195 rknn_set_io_mem(ctx, output_mems[i], &(output_attrs[i]));
196 }
197 #else
198 void *in_data = malloc(m_in_width * m_in_height * m_in_channel);
199 memset(in_data, 0, m_in_width * m_in_height * m_in_channel);
200 g_rga_dst = wrapbuffer_virtualaddr(in_data, m_in_width, m_in_height, RK_FORMAT_RGB_888);
201 #endif
202
203 created = true;
204
205 LOGI("rknn_init success!");
206
207 return 0;
208 }
209
destroy()210 void destroy() {
211 // LOGI("rknn_destroy!");
212 // release io_mem resource
213 for (int i = 0; i < n_input; ++i) {
214 rknn_destroy_mem(ctx, input_mems[i]);
215 }
216 for (int i = 0; i < n_output; ++i) {
217 rknn_destroy_mem(ctx, output_mems[i]);
218 }
219 rknn_destroy(ctx);
220 }
221
run_yolo(char * inDataRaw,char * y0,char * y1,char * y2)222 bool run_yolo(char *inDataRaw, char *y0, char *y1, char *y2)
223 {
224 int ret;
225 bool status = false;
226 if(!created) {
227 LOGE("run_yolo: init yolo hasn't successful!");
228 return false;
229 }
230
231 #ifdef EVAL_TIME
232 struct timeval start_time, stop_time;
233
234 gettimeofday(&start_time, NULL);
235 #endif
236 g_rga_src = wrapbuffer_virtualaddr((void *)inDataRaw, img_width, img_height,
237 RK_FORMAT_RGBA_8888);
238
239 // convert color format and resize. RGA8888 -> RGB888
240 ret = imresize(g_rga_src, g_rga_dst);
241 if (IM_STATUS_SUCCESS != ret) {
242 LOGE("run_yolo: resize image with rga failed: %s\n", imStrError((IM_STATUS)ret));
243 return false;
244 }
245 #ifdef EVAL_TIME
246 gettimeofday(&stop_time, NULL);
247 LOGI("imresize use %f ms\n", (__get_us(stop_time) - __get_us(start_time)) / 1000);
248 #endif
249
250 #ifdef DEBUG_DUMP
251 // save resized image
252 if (g_inf_count == 5) {
253 char out_img_name[1024];
254 memset(out_img_name, 0, sizeof(out_img_name));
255 sprintf(out_img_name, "/data/user/0/com.rockchip.gpadc.yolodemo/cache/resized_img_%d.rgb", g_inf_count);
256 FILE *fp = fopen(out_img_name, "w");
257 // LOGI("n_elems: %d", input_attrs[0].n_elems);
258 // fwrite(input_mems[0]->virt_addr, 1, input_attrs[0].n_elems * sizeof(unsigned char), fp);
259 // fflush(fp);
260 for (int i = 0; i < input_attrs[0].n_elems; ++i) {
261 fprintf(fp, "%d\n", *((uint8_t *)(g_rga_dst.vir_addr) + i));
262 }
263 fclose(fp);
264 }
265
266 #endif
267
268 #if ZERO_COPY
269 #else
270 rknn_input inputs[1];
271 inputs[0].index = 0;
272 inputs[0].type = RKNN_TENSOR_UINT8;
273 inputs[0].size = m_in_width * m_in_height * m_in_channel;
274 inputs[0].fmt = RKNN_TENSOR_NHWC;
275 inputs[0].pass_through = 0;
276 inputs[0].buf = g_rga_dst.vir_addr;
277 #ifdef EVAL_TIME
278 gettimeofday(&start_time, NULL);
279 #endif
280 rknn_inputs_set(ctx, 1, inputs);
281 #ifdef EVAL_TIME
282 gettimeofday(&stop_time, NULL);
283 LOGI("rknn_inputs_set use %f ms\n", (__get_us(stop_time) - __get_us(start_time)) / 1000);
284 #endif
285 #endif
286
287 #ifdef EVAL_TIME
288 gettimeofday(&start_time, NULL);
289 #endif
290 ret = rknn_run(ctx, nullptr);
291 if(ret < 0) {
292 LOGE("rknn_run fail! ret=%d\n", ret);
293 return false;
294 }
295 #ifdef EVAL_TIME
296 gettimeofday(&stop_time, NULL);
297 LOGI("inference use %f ms\n", (__get_us(stop_time) - __get_us(start_time)) / 1000);
298
299 // outputs format are all NCHW.
300 gettimeofday(&start_time, NULL);
301 #endif
302
303 #if ZERO_COPY
304 memcpy(y0, output_mems[0]->virt_addr, output_attrs[0].n_elems * sizeof(char));
305 memcpy(y1, output_mems[1]->virt_addr, output_attrs[1].n_elems * sizeof(char));
306 memcpy(y2, output_mems[2]->virt_addr, output_attrs[2].n_elems * sizeof(char));
307 #else
308 rknn_output outputs[3];
309 memset(outputs, 0, sizeof(outputs));
310 for (int i = 0; i < 3; ++i) {
311 outputs[i].want_float = 0;
312 }
313 rknn_outputs_get(ctx, 3, outputs, NULL);
314 memcpy(y0, outputs[0].buf, output_attrs[0].n_elems * sizeof(char));
315 memcpy(y1, outputs[1].buf, output_attrs[1].n_elems * sizeof(char));
316 memcpy(y2, outputs[2].buf, output_attrs[2].n_elems * sizeof(char));
317 rknn_outputs_release(ctx, 3, outputs);
318 #endif
319
320 #ifdef EVAL_TIME
321 gettimeofday(&stop_time, NULL);
322 LOGI("copy output use %f ms\n", (__get_us(stop_time) - __get_us(start_time)) / 1000);
323 #endif
324
325 #ifdef DEBUG_DUMP
326 if (g_inf_count == 5) {
327 for (int i = 0; i < n_output; ++i) {
328 char out_path[1024];
329 memset(out_path, 0, sizeof(out_path));
330 sprintf(out_path, "/data/user/0/com.rockchip.gpadc.yolodemo/cache/out_%d.tensor", i);
331 FILE *fp = fopen(out_path, "w");
332 for (int j = 0; j < output_attrs[i].n_elems; ++j) {
333 #if ZERO_COPY
334 fprintf(fp, "%d\n", *((int8_t *)(output_mems[i]->virt_addr) + i));
335 #else
336 fprintf(fp, "%d\n", *((int8_t *)(outputs[i].buf) + i));
337 #endif
338 }
339 fclose(fp);
340 }
341 }
342 if (g_inf_count < 10) {
343 g_inf_count++;
344 }
345 #endif
346
347 status = true;
348
349 // LOGI("run_yolo: end\n");
350
351 return status;
352 }
353
yolo_post_process(char * grid0_buf,char * grid1_buf,char * grid2_buf,int * ids,float * scores,float * boxes)354 int yolo_post_process(char *grid0_buf, char *grid1_buf, char *grid2_buf,
355 int *ids, float *scores, float *boxes) {
356 int ret;
357 if(!created) {
358 LOGE("yolo_post_process: init yolo hasn't successful!");
359 return false;
360 }
361
362 detect_result_group_t detect_result_group;
363 // LOGI("start yolo post.");
364 ret = post_process((int8_t *)grid0_buf, (int8_t *)grid1_buf, (int8_t *)grid2_buf,
365 m_in_height, m_in_width, BOX_THRESH, NMS_THRESH, scale_w, scale_h,
366 out_zps, out_scales, &detect_result_group);
367 if (ret < 0) {
368 LOGE("yolo_post_process: post process failed!");
369 return -1;
370 }
371 // LOGI("deteced %d objects.\n", detect_result_group.count);
372
373 memset(ids, 0, sizeof(int) * OBJ_NUMB_MAX_SIZE);
374 memset(scores, 0, sizeof(float) * OBJ_NUMB_MAX_SIZE);
375 memset(boxes, 0, sizeof(float) * OBJ_NUMB_MAX_SIZE * BOX_LEN);
376
377 int count = detect_result_group.count;
378 for (int i = 0; i < count; ++i) {
379 ids[i] = detect_result_group.results[i].class_id;
380 scores[i] = detect_result_group.results[i].prop;
381 *(boxes+4*i+0) = detect_result_group.results[i].box.left;
382 *(boxes+4*i+1) = detect_result_group.results[i].box.top;
383 *(boxes+4*i+2) = detect_result_group.results[i].box.right;
384 *(boxes+4*i+3) = detect_result_group.results[i].box.bottom;
385 #ifdef DEBUG_DUMP
386 if (g_post_count == 5) {
387 LOGI("result %2d: (%4d, %4d, %4d, %4d), %d\n", i,
388 detect_result_group.results[i].box.left,
389 detect_result_group.results[i].box.top,
390 detect_result_group.results[i].box.right,
391 detect_result_group.results[i].box.bottom,
392 detect_result_group.results->class_id)
393 }
394 if (g_post_count < 10) {
395 g_post_count++;
396 }
397 #endif
398 }
399
400 return count;
401 }
402
colorConvertAndFlip(void * src,int srcFmt,void * dst,int dstFmt,int width,int height,int flip)403 int colorConvertAndFlip(void *src, int srcFmt, void *dst, int dstFmt, int width, int height, int flip) {
404 int ret;
405 // RGA needs to ensure page alignment when using virtual addresses, otherwise it may cause
406 // internal cache flushing errors. Manually modify src/dst buf to force its 4k alignment.
407 // TODO -- convert color format and flip with OpenGL.
408 int src_len = width * height * 3 / 2; // yuv420 buffer length.
409 void *src_ = malloc(src_len + 4096);
410 void *org_src = src_;
411 memset(src_, 0, src_len + 4096);
412 src_ = (void *)((((int64_t)src_ >> 12) + 1) << 12);
413 memcpy(src_, src, src_len);
414 int dst_len = width * height * 4; // rgba buffer length.
415 void *dst_ = malloc(dst_len + 4096);
416 void *org_dst = dst_;
417 memset(dst_, 0, dst_len + 4096);
418 dst_ = (void *)((((int64_t)dst_ >> 12) + 1) << 12);
419 rga_buffer_t rga_src = wrapbuffer_virtualaddr((void *)src_, width, height, srcFmt);
420 rga_buffer_t rga_dst = wrapbuffer_virtualaddr((void *)dst_, width, height, dstFmt);
421
422 if (DO_NOT_FLIP == flip) {
423 // convert color format
424 ret = imcvtcolor(rga_src, rga_dst, rga_src.format, rga_dst.format);
425 } else {
426 // convert color format and flip.
427 ret = imflip(rga_src, rga_dst, flip);
428 }
429
430 if (IM_STATUS_SUCCESS != ret) {
431 LOGE("colorConvertAndFlip failed. Ret: %s\n", imStrError((IM_STATUS)ret));
432 }
433
434 memcpy(dst, dst_, dst_len);
435 free(org_src);
436 free(org_dst);
437
438 return ret;
439 }
440
rknn_app_destory()441 void rknn_app_destory() {
442 LOGI("rknn app destroy.\n");
443 if (g_rga_dst.vir_addr) {
444 free(g_rga_dst.vir_addr);
445 }
446 rknn_destroy(ctx);
447 }
448