1 // Copyright (c) 2021 by Rockchip Electronics Co., Ltd. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15
16 /*-------------------------------------------
17 Includes
18 -------------------------------------------*/
19 #include "rknn_api.h"
20
21 #include <float.h>
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <sys/time.h>
26 #include <fcntl.h>
27 #include <unistd.h>
28
29 #define STB_IMAGE_IMPLEMENTATION
30 #include "stb/stb_image.h"
31 #define STB_IMAGE_RESIZE_IMPLEMENTATION
32 #include <stb/stb_image_resize.h>
33
34 #define NPY_SUPPORT 0
35
36 #if NPY_SUPPORT
37 # include "cnpy/cnpy.h"
38 #endif
39
40 /*-------------------------------------------
41 Functions
42 -------------------------------------------*/
/*
 * Return the current time of day, as reported by gettimeofday(), expressed
 * in microseconds. Used by this demo to measure elapsed inference time.
 */
static inline int64_t getCurrentTimeUs()
{
  struct timeval tv;
  gettimeofday(&tv, NULL);
  // Widen before multiplying: where tv_sec is a 32-bit type, the product
  // seconds * 1000000 would overflow before being converted to int64_t.
  return (int64_t)tv.tv_sec * 1000000 + (int64_t)tv.tv_usec;
}
49
/*
 * Pick the largest values of a score array, in descending order.
 *
 * pfProb      : input scores, outputCount elements.
 * pfMaxProb   : output, topNum elements; slot j receives the j-th largest score.
 * pMaxClass   : output, topNum elements; slot j receives the index of that score.
 * outputCount : number of entries in pfProb.
 * topNum      : capacity of the two output arrays.
 *
 * Only min(outputCount, topNum) slots are searched for. Slots that are not
 * filled keep their initial values: -FLT_MAX for the score and (uint32_t)-1 for
 * the index. A score must be strictly greater than -FLT_MAX to be selected.
 * Ties are resolved in favour of the lowest index.
 *
 * Always returns 1.
 */
static int rknn_GetTopN(float* pfProb, float* pfMaxProb, uint32_t* pMaxClass, uint32_t outputCount, uint32_t topNum)
{
  uint32_t top_count = outputCount > topNum ? topNum : outputCount;

  for (uint32_t i = 0; i < topNum; ++i) {
    pfMaxProb[i] = -FLT_MAX;
    pMaxClass[i] = (uint32_t)-1;
  }

  for (uint32_t j = 0; j < top_count; j++) {
    for (uint32_t i = 0; i < outputCount; i++) {
      // Skip indices already chosen for a better rank. Only the j slots that
      // have been filled so far are inspected, so this works for any topNum
      // and never reads past the end of pMaxClass.
      int already_taken = 0;
      for (uint32_t k = 0; k < j; k++) {
        if (pMaxClass[k] == i) {
          already_taken = 1;
          break;
        }
      }
      if (already_taken) {
        continue;
      }

      if (pfProb[i] > pfMaxProb[j]) {
        pfMaxProb[j] = pfProb[i];
        pMaxClass[j] = i;
      }
    }
  }

  return 1;
}
76
77
/*
 * Print a one-line summary of a tensor attribute to stdout: index, name,
 * dimensions, element count, byte size, format, type, quantization type,
 * zero point and scale.
 */
static void dump_tensor_attr(rknn_tensor_attr* attr)
{
  // Comma-separated list of the dimensions, built with bounded writes so a
  // large n_dims or large dimension values cannot overflow the buffer; the
  // list is simply cut short if it does not fit.
  char dims[128] = {0};
  size_t used = 0;
  for (uint32_t i = 0; i < attr->n_dims; ++i) {
    const char* separator = (i == attr->n_dims - 1) ? "" : ", ";
    int written = snprintf(dims + used, sizeof(dims) - used, "%d%s", attr->dims[i], separator);
    if (written < 0 || (size_t)written >= sizeof(dims) - used) {
      break;
    }
    used += (size_t)written;
  }

  printf(" index=%d, name=%s, n_dims=%d, dims=[%s], n_elems=%d, size=%d, fmt=%s, type=%s, qnt_type=%s, "
         "zp=%d, scale=%f\n",
         attr->index, attr->name, attr->n_dims, dims, attr->n_elems, attr->size, get_format_string(attr->fmt),
         get_type_string(attr->type), get_qnt_type_string(attr->qnt_type), attr->zp, attr->scale);
}
90
/*
 * Read a whole file into a freshly allocated buffer.
 *
 * file_path : path of the file to read.
 * file_size : receives the number of bytes read; written only on success.
 *
 * Returns the buffer (release it with free()) or NULL when the file cannot be
 * opened, its size cannot be determined, memory cannot be allocated or the
 * read comes up short. A message is printed to stdout for each failure.
 */
static void* load_file(const char* file_path, size_t* file_size)
{
  // Binary mode: the contents are returned byte for byte.
  FILE* fp = fopen(file_path, "rb");
  if (fp == NULL) {
    printf("failed to open file: %s\n", file_path);
    return NULL;
  }

  // Determine the size by seeking to the end. ftell() reports failure as -1,
  // which must not be turned into a huge size_t.
  long end_pos = -1;
  if (fseek(fp, 0, SEEK_END) == 0) {
    end_pos = ftell(fp);
  }
  if (end_pos < 0 || fseek(fp, 0, SEEK_SET) != 0) {
    fclose(fp);
    printf("failed to get file size: %s\n", file_path);
    return NULL;
  }
  size_t size = (size_t)end_pos;

  // Ask for at least one byte so an empty file still yields a valid pointer
  // (malloc(0) is allowed to return NULL).
  void* file_data = malloc(size > 0 ? size : 1);
  if (file_data == NULL) {
    fclose(fp);
    printf("failed allocate file size: %zu\n", size);
    return NULL;
  }

  if (fread(file_data, 1, size, fp) != size) {
    fclose(fp);
    free(file_data);
    printf("failed to read file data!\n");
    return NULL;
  }

  fclose(fp);

  *file_size = size;

  return file_data;
}
123
/*
 * Read a binary file into a caller-provided buffer.
 *
 * filename : path of the file to read.
 * data     : destination buffer, at least max_size bytes.
 * max_size : size the file is required to have, in bytes.
 *
 * The file is read only when its size equals max_size exactly.
 *
 * Returns the number of bytes read, or -1 when the file cannot be opened,
 * its size cannot be determined or does not match max_size, or the file
 * cannot be rewound. A message is printed to stdout for each failure.
 */
static int load_bin(const char *filename, void *data, int max_size)
{
  FILE *fp = fopen(filename, "rb");
  if (NULL == fp)
  {
    printf("Open file %s failed.\n", filename);
    return -1;
  }

  // Measure the file by seeking to its end; both calls can fail.
  long size = -1;
  if (fseek(fp, 0, SEEK_END) == 0) {
    size = ftell(fp);
  }
  if (size < 0) {
    printf("Get size of file %s failed.\n", filename);
    fclose(fp);
    return -1;
  }

  if (size != (long)max_size) {
    printf("file size not match: %ld vs %d!\n", size, max_size);
    fclose(fp);
    return -1;
  }

  if (fseek(fp, 0, SEEK_SET) != 0) {
    printf("Rewind file %s failed.\n", filename);
    fclose(fp);
    return -1;
  }

  size_t bytes_read = fread(data, 1, (size_t)size, fp);

  fclose(fp);

  return (int)bytes_read;
}
153
154
/*
 * Load an image file and make it match the spatial size of an input tensor.
 *
 * image_path : path of the image, decoded with stbi_load().
 * input_attr : tensor attribute; height, width and channel count are taken
 *              from dims according to fmt (NHWC or NCHW only).
 *
 * The image is decoded with the tensor's channel count as the requested
 * component count and is resized with stbir_resize_uint8() when its width or
 * height differs from the tensor's.
 *
 * Returns a buffer to be released with STBI_FREE(), or NULL when the layout
 * is unsupported, decoding fails, allocation fails or resizing fails.
 */
static unsigned char* load_image(const char* image_path, rknn_tensor_attr* input_attr)
{
  int req_height  = 0;
  int req_width   = 0;
  int req_channel = 0;

  switch (input_attr->fmt) {
  case RKNN_TENSOR_NHWC:
    req_height  = input_attr->dims[1];
    req_width   = input_attr->dims[2];
    req_channel = input_attr->dims[3];
    break;
  case RKNN_TENSOR_NCHW:
    req_height  = input_attr->dims[2];
    req_width   = input_attr->dims[3];
    req_channel = input_attr->dims[1];
    break;
  default:
    printf("meet unsupported layout\n");
    return NULL;
  }

  int height  = 0;
  int width   = 0;
  int channel = 0;

  unsigned char* image_data = stbi_load(image_path, &width, &height, &channel, req_channel);
  if (image_data == NULL) {
    printf("load image failed!\n");
    return NULL;
  }

  // `channel` reports what the file contained. When a non-zero component
  // count is requested, the decoded buffer holds that many components per
  // pixel instead, so that is the count every later step must use.
  int pixel_channels = (req_channel != 0) ? req_channel : channel;

  if (width != req_width || height != req_height) {
    unsigned char* image_resized = (unsigned char*)STBI_MALLOC(req_width * req_height * pixel_channels);
    if (!image_resized) {
      printf("malloc image failed!\n");
      STBI_FREE(image_data);
      return NULL;
    }
    if (stbir_resize_uint8(image_data, width, height, 0, image_resized, req_width, req_height, 0, pixel_channels) !=
        1) {
      printf("resize image failed!\n");
      STBI_FREE(image_resized);
      STBI_FREE(image_data);
      return NULL;
    }
    STBI_FREE(image_data);
    image_data = image_resized;
  }

  return image_data;
}
205
206 #if NPY_SUPPORT
/*
 * Load the payload of a .npy file into a freshly allocated buffer.
 *
 * input_path : path of the .npy file.
 * input_attr : tensor attribute the data is meant for. For NHWC / NCHW the
 *              npy shape must match its height, width and channel count;
 *              RKNN_TENSOR_UNDEFINED skips the shape check.
 * input_type : receives the RKNN tensor type for int8, uint8 and float32
 *              arrays; left untouched for any other dtype.
 * input_size : receives the payload size in bytes; written only on success.
 *
 * The npy shape is interpreted as NHWC. Arrays with fewer than four
 * dimensions are treated as if the leading batch dimension were missing.
 *
 * Returns a malloc()'d copy of the payload, or NULL on failure.
 */
static unsigned char* load_npy(const char* input_path, rknn_tensor_attr* input_attr, int* input_type, int* input_size)
{
  int req_height  = 0;
  int req_width   = 0;
  int req_channel = 0;

  printf("Loading %s\n", input_path);

  switch (input_attr->fmt) {
  case RKNN_TENSOR_NHWC:
    req_height  = input_attr->dims[1];
    req_width   = input_attr->dims[2];
    req_channel = input_attr->dims[3];
    break;
  case RKNN_TENSOR_NCHW:
    req_height  = input_attr->dims[2];
    req_width   = input_attr->dims[3];
    req_channel = input_attr->dims[1];
    break;
  case RKNN_TENSOR_UNDEFINED:
    break;
  default:
    printf("meet unsupported layout\n");
    return NULL;
  }

  cnpy_array npy_data;

  bool writable = false;
  if (cnpy_open(input_path, writable, &npy_data) != CNPY_SUCCESS) {
    printf("Unable to load file %s\n", input_path);
    return NULL;
  }

  // From here on every exit path closes npy_data again.

  // The shape below has room for four dimensions only.
  if (npy_data.n_dim > 4) {
    printf("npy with more than 4 dims is not supported\n");
    cnpy_close(&npy_data);
    return NULL;
  }

  int        data_bytes = npy_data.raw_data_size - npy_data.data_begin;
  cnpy_dtype dtype      = npy_data.dtype;

  if (dtype == CNPY_I8) {
    *input_type = RKNN_TENSOR_INT8;
  } else if (dtype == CNPY_U8) {
    *input_type = RKNN_TENSOR_UINT8;
  } else if (dtype == CNPY_F4) {
    *input_type = RKNN_TENSOR_FLOAT32;
  }

  // npy shape = NHWC
  int npy_shape[4] = {1, 1, 1, 1};

  // A 4-D array fills all four slots; anything smaller starts at the height
  // slot and leaves the batch slot at 1.
  size_t start = npy_data.n_dim == 4 ? 0 : 1;
  for (size_t i = 0; i < npy_data.n_dim && start + i < 4; ++i) {
    npy_shape[start + i] = npy_data.dims[i];
  }

  int height  = npy_shape[1];
  int width   = npy_shape[2];
  int channel = npy_shape[3];

  if ((input_attr->fmt != RKNN_TENSOR_UNDEFINED) &&
      (width != req_width || height != req_height || channel != req_channel)) {
    printf("npy shape match failed!, (%d, %d, %d) != (%d, %d, %d)\n", height, width, channel, req_height, req_width,
           req_channel);
    cnpy_close(&npy_data);
    return NULL;
  }

  unsigned char* data = (unsigned char*)malloc(data_bytes);
  if (!data) {
    cnpy_close(&npy_data);
    return NULL;
  }

  // Copy the payload out so the caller does not depend on npy_data.
  memcpy(data, npy_data.raw_data + npy_data.data_begin, data_bytes);
  cnpy_close(&npy_data);

  *input_size = data_bytes;

  return data;
}
283
/*
 * Write a float tensor to a .npy file (little endian, float32, C order).
 *
 * output_path : path of the file to create.
 * output_data : tensor values; the product of the dimensions gives the count.
 * output_attr : tensor attribute supplying n_dims and dims for the npy shape.
 *
 * Returns 0 on success, -1 when n_dims exceeds the capacity of dims or when
 * the file cannot be created or closed.
 */
static int save_npy(const char* output_path, float* output_data, rknn_tensor_attr* output_attr)
{
  // cnpy_create() takes the shape as an array of size_t. Convert element by
  // element: reinterpreting the attribute's dims array through a size_t
  // pointer reads wrong values whenever its elements are narrower than size_t.
  size_t shape[sizeof(output_attr->dims) / sizeof(output_attr->dims[0])];
  size_t elem_count = 1;

  if (output_attr->n_dims > sizeof(shape) / sizeof(shape[0])) {
    printf("invalid n_dims: %d\n", (int)output_attr->n_dims);
    return -1;
  }

  for (uint32_t i = 0; i < output_attr->n_dims; ++i) {
    shape[i] = output_attr->dims[i];
    elem_count *= shape[i];
  }

  cnpy_array      npy_data;
  cnpy_byte_order byte_order = CNPY_LE;      /* little endian */
  cnpy_dtype      dtype      = CNPY_F4;      /* float */
  cnpy_flat_order order      = CNPY_C_ORDER; /* C (row major) order */

  if (cnpy_create(output_path, byte_order, dtype, order, output_attr->n_dims, shape, &npy_data) != CNPY_SUCCESS) {
    cnpy_perror("Unable to create file: ");
    return -1;
  }

  memcpy(npy_data.raw_data + npy_data.data_begin, (uint8_t*)output_data, sizeof(float) * elem_count);

  /* optional: */
  if (cnpy_close(&npy_data) != CNPY_SUCCESS) {
    cnpy_perror("Unable to close file: ");
    return -1;
  }
  return 0;
}
312 #endif
313
314 #define MAX_OUTPUT_NUM 4
315 #define TOTAL_RKNN_MODEL_NUM 2
316
317 /*-------------------------------------------
318 Main Functions
319 -------------------------------------------*/
main(int argc,char * argv[])320 int main(int argc, char* argv[])
321 {
322 if (argc < 5) {
323 printf("Usage:%s model_path_a input_path_a model_path_b input_path_b [loop_count] \n", argv[0]);
324 return -1;
325 }
326
327 char* model_path_a = argv[1];
328 char* input_path_a = argv[2];
329 char* model_path_b = argv[3];
330 char* input_path_b = argv[4];
331
332 int loop_count = 1;
333 if (argc > 5) {
334 loop_count = atoi(argv[5]);
335 }
336
337 char *model_path[TOTAL_RKNN_MODEL_NUM];
338 char *input_path[TOTAL_RKNN_MODEL_NUM];
339 rknn_context ctx[TOTAL_RKNN_MODEL_NUM];
340 rknn_mem_size mem_size[TOTAL_RKNN_MODEL_NUM];
341 rknn_input_output_num io_num[TOTAL_RKNN_MODEL_NUM];
342 rknn_tensor_mem* internal_mem[TOTAL_RKNN_MODEL_NUM];
343 rknn_tensor_mem *weight_mems[TOTAL_RKNN_MODEL_NUM];
344 rknn_tensor_attr input_attrs[TOTAL_RKNN_MODEL_NUM][1]; // this demo only support one input
345 rknn_tensor_attr output_attrs[TOTAL_RKNN_MODEL_NUM][MAX_OUTPUT_NUM];
346 rknn_tensor_mem* input_mems[TOTAL_RKNN_MODEL_NUM][1]; // this demo only support one input
347 rknn_tensor_mem* output_mems[TOTAL_RKNN_MODEL_NUM][MAX_OUTPUT_NUM];
348 rknn_tensor_mem* internal_mem_max = NULL;
349 uint32_t max_internal_size = 0;
350 unsigned char* input_data = NULL;
351 rknn_tensor_type input_type = RKNN_TENSOR_UINT8;
352 rknn_tensor_format input_layout = RKNN_TENSOR_NHWC;
353 int ret = 0;
354
355 memset(ctx, 0x00, sizeof(ctx));
356 memset(internal_mem, 0x00, sizeof(internal_mem));
357 memset(input_mems, 0x00, sizeof(input_mems));
358 memset(output_mems, 0x00, sizeof(output_mems));
359
360 model_path[0] = model_path_a;
361 model_path[1] = model_path_b;
362 input_path[0] = input_path_a;
363 input_path[1] = input_path_b;
364
365 for (int n=0; n<TOTAL_RKNN_MODEL_NUM; n++) {
366
367 printf("\033[0;32mLoading %s ... \033[0;0m\n", model_path[n]);
368
369 // Load RKNN Model
370 // Init rknn from model path
371 ret = rknn_init(&ctx[n], model_path[n], 0, RKNN_FLAG_MEM_ALLOC_OUTSIDE, NULL);
372 // ret = rknn_init(&ctx[n], model_path[n], 0, 0, NULL);
373
374 if (ret < 0) {
375 printf("rknn_init fail! ret=%d\n", ret);
376 return -1;
377 }
378
379 // Get sdk and driver version
380 rknn_sdk_version sdk_ver;
381 ret = rknn_query(ctx[n], RKNN_QUERY_SDK_VERSION, &sdk_ver, sizeof(sdk_ver));
382 if (ret != RKNN_SUCC) {
383 printf("rknn_query fail! ret=%d\n", ret);
384 rknn_destroy(ctx[n]);
385 goto out;
386 }
387 printf("rknn_api/rknnrt version: %s, driver version: %s\n", sdk_ver.api_version, sdk_ver.drv_version);
388
389 // Get weight and internal mem size, dma used size
390 ret = rknn_query(ctx[n], RKNN_QUERY_MEM_SIZE, &mem_size[n], sizeof(mem_size[n]));
391 if (ret != RKNN_SUCC) {
392 printf("rknn_query fail! ret=%d\n", ret);
393 return -1;
394 }
395 printf("total weight size: %d, total internal size: %d\n", mem_size[n].total_weight_size, mem_size[n].total_internal_size);
396 printf("total dma used size: %zu\n", (size_t)mem_size[n].total_dma_allocated_size);
397
398
399 // Get Model Input Output Info
400 ret = rknn_query(ctx[n], RKNN_QUERY_IN_OUT_NUM, &io_num[n], sizeof(io_num[n]));
401 if (ret != RKNN_SUCC) {
402 printf("rknn_query fail! ret=%d\n", ret);
403 goto out;
404 }
405 printf("model input num: %d, output num: %d\n", io_num[n].n_input, io_num[n].n_output);
406
407 if (io_num[n].n_output > MAX_OUTPUT_NUM) {
408 printf("Please adjust the value of MAX_OUTPUT_NUM, it is too small for this model\n");
409 return -1;
410 };
411
412 printf("input tensors:\n");
413 memset(input_attrs[n], 0, io_num[n].n_input * sizeof(rknn_tensor_attr));
414 for (uint32_t i = 0; i < io_num[n].n_input; i++) {
415 input_attrs[n][i].index = i;
416 // query info
417 ret = rknn_query(ctx[n], RKNN_QUERY_INPUT_ATTR, &(input_attrs[n][i]), sizeof(rknn_tensor_attr));
418 if (ret < 0) {
419 printf("rknn_init error! ret=%d\n", ret);
420 goto out;
421 }
422 dump_tensor_attr(&input_attrs[n][i]);
423 }
424
425 printf("output tensors:\n");
426 memset(output_attrs[n], 0, io_num[n].n_output * sizeof(rknn_tensor_attr));
427 for (uint32_t i = 0; i < io_num[n].n_output; i++) {
428 output_attrs[n][i].index = i;
429 // query info
430 ret = rknn_query(ctx[n], RKNN_QUERY_OUTPUT_ATTR, &(output_attrs[n][i]), sizeof(rknn_tensor_attr));
431 if (ret != RKNN_SUCC) {
432 printf("rknn_query fail! ret=%d\n", ret);
433 goto out;
434 }
435 dump_tensor_attr(&output_attrs[n][i]);
436 }
437
438 // Get custom string
439 rknn_custom_string custom_string;
440 ret = rknn_query(ctx[n], RKNN_QUERY_CUSTOM_STRING, &custom_string, sizeof(custom_string));
441 if (ret != RKNN_SUCC) {
442 printf("rknn_query fail! ret=%d\n", ret);
443 goto out;
444 }
445 printf("custom string: %s\n", custom_string.string);
446
447 // check max max_internal_size
448
449 if (max_internal_size < mem_size[n].total_internal_size) {
450 max_internal_size = mem_size[n].total_internal_size;
451 }
452 }
453
454 printf("\033[0;32mMax internal size %d \033[0;0m\n", max_internal_size);
455
456 // Allocate internal memory in outside
457 internal_mem_max = rknn_create_mem(ctx[0], max_internal_size);
458
459 for (int n=0; n<TOTAL_RKNN_MODEL_NUM; n++) {
460 internal_mem[n] = rknn_create_mem_from_fd(ctx[n], internal_mem_max->fd,
461 internal_mem_max->virt_addr, mem_size[n].total_internal_size, 0);
462 ret = rknn_set_internal_mem(ctx[n], internal_mem[n]);
463 if (ret < 0)
464 {
465 printf("rknn_set_internal_mem fail! ret=%d\n", ret);
466 goto out;
467 }
468
469 printf("internal cma info: virt = %p, phy=0x%lx, fd =%d, size=%d\n", internal_mem[n]->virt_addr, internal_mem[n]->phys_addr, internal_mem[n]->fd, internal_mem[n]->size);
470
471 // 使用rknn_create_mem作为分配器, 分配和设置每个模型的外部weight内存
472 weight_mems[n] = rknn_create_mem(ctx[n], mem_size[n].total_weight_size);
473 rknn_set_weight_mem(ctx[n], weight_mems[n]);
474 }
475
476 for (int n = 0; n < TOTAL_RKNN_MODEL_NUM; n++)
477 {
478 // Create input tensor memory
479 // default input type is int8 (normalize and quantize need compute in outside)
480 // if set uint8, will fuse normalize and quantize to npu
481 input_attrs[n][0].type = input_type;
482 // default fmt is NHWC, npu only support NHWC in zero copy mode
483 input_attrs[n][0].fmt = input_layout;
484
485 input_mems[n][0] = rknn_create_mem(ctx[n], input_attrs[n][0].size_with_stride);
486
487 // Set input tensor memory
488 ret = rknn_set_io_mem(ctx[n], input_mems[n][0], &input_attrs[n][0]);
489 if (ret < 0)
490 {
491 printf("rknn_set_io_mem fail! ret=%d\n", ret);
492 goto out;
493 }
494
495 // Create output tensor memory
496 for (uint32_t i = 0; i < io_num[n].n_output; ++i) {
497 output_mems[n][i] = rknn_create_mem(ctx[n], output_attrs[n][i].n_elems * sizeof(float));
498 }
499
500 // Set output tensor memory
501 for (uint32_t i = 0; i < io_num[n].n_output; ++i) {
502 // set output memory and attribute
503 output_attrs[n][i].type = RKNN_TENSOR_FLOAT32;
504 output_attrs[n][i].fmt = RKNN_TENSOR_NCHW;
505 ret = rknn_set_io_mem(ctx[n], output_mems[n][i], &output_attrs[n][i]);
506 if (ret < 0) {
507 printf("rknn_set_io_mem fail! ret=%d\n", ret);
508 goto out;
509 }
510 }
511 }
512
513 // Copy input data to input tensor memory
514 for (int n=0; n<TOTAL_RKNN_MODEL_NUM; n++) {
515 // Load image
516 if (strstr(input_path[n], ".npy")) {
517 #if NPY_SUPPORT
518 int input_size = 0;
519 input_data = load_npy(input_path[n], &input_attrs[n][0], (int*)&input_type, &input_size);
520 #else
521 return -1;
522 #endif
523 } else {
524 input_data = load_image(input_path[n], &input_attrs[n][0]);
525 }
526 if (!input_data) {
527 printf("Load %s fail!\n", input_path[n]);
528 goto out;
529 }
530
531 int height = input_attrs[n][0].dims[1];
532 int width = input_attrs[n][0].dims[2];
533 int channel = input_attrs[n][0].dims[3];
534 int stride = input_attrs[n][0].w_stride;
535
536 // TODO, you must resize the image if the size of input image don't match the input shape
537 if (width == stride)
538 {
539 memcpy((char *)(input_mems[n][0]->virt_addr) + input_mems[n][0]->offset, input_data, input_attrs[n][0].dims[2] * input_attrs[n][0].dims[1] * input_attrs[n][0].dims[3]);
540 }
541 else
542 {
543 // copy from src to dst with stride
544 uint8_t* src_ptr = input_data;
545 uint8_t* dst_ptr = (uint8_t*)input_mems[n][0]->virt_addr+input_mems[n][0]->offset;
546 // width-channel elements
547 int src_wc_elems = width * channel;
548 int dst_wc_elems = stride * channel;
549 for (int h = 0; h < height; ++h) {
550 memcpy(dst_ptr, src_ptr, src_wc_elems);
551 src_ptr += src_wc_elems;
552 dst_ptr += dst_wc_elems;
553 }
554 }
555
556 STBI_FREE(input_data);
557 }
558
559 // Run
560 printf("Begin perf ...\n");
561 for (int n=0; n<TOTAL_RKNN_MODEL_NUM; n++) {
562 printf("==== %s ====\n", model_path[n]);
563 for (int i = 0; i < loop_count; ++i) {
564 int64_t start_us = getCurrentTimeUs();
565 ret = rknn_run(ctx[n], NULL);
566 int64_t elapse_us = getCurrentTimeUs() - start_us;
567 if (ret < 0) {
568 printf("rknn run error %d\n", ret);
569 goto out;
570 }
571
572 printf("%4d: Elapse Time = %.2fms, FPS = %.2f\n", i, elapse_us / 1000.f, 1000.f * 1000.f / elapse_us);
573 }
574
575 // Get top 5
576 uint32_t topNum = 5;
577 for (uint32_t i = 0; i < io_num[n].n_output; i++) {
578 uint32_t MaxClass[topNum];
579 float fMaxProb[topNum];
580
581 float* buffer = (float*)output_mems[n][i]->virt_addr;
582 uint32_t sz = output_attrs[n][i].n_elems;
583 int top_count = sz > topNum ? topNum : sz;
584
585 rknn_GetTopN(buffer, fMaxProb, MaxClass, sz, topNum);
586
587 printf("---- Top%d ----\n", top_count);
588 for (int j = 0; j < top_count; j++) {
589 printf("%8.6f - %d\n", fMaxProb[j], MaxClass[j]);
590 }
591 }
592 }
593
594
595 out:
596
597 // free all objects
598 if (internal_mem_max) {
599 rknn_destroy_mem(ctx[0], internal_mem_max);
600 }
601
602 for (int n=0; n<TOTAL_RKNN_MODEL_NUM; n++) {
603 // Destroy rknn memory
604
605 if (ctx[n]) {
606 if (input_mems[n][0]) {
607 rknn_destroy_mem(ctx[n], input_mems[n][0]);
608 }
609
610 for (uint32_t i = 0; i < io_num[n].n_output; ++i) {
611 if (output_mems[n][i]) {
612 rknn_destroy_mem(ctx[n], output_mems[n][i]);
613 }
614 }
615
616 if (internal_mem[n]) {
617 rknn_destroy_mem(ctx[n], internal_mem[n]);
618 }
619
620 if (weight_mems[n])
621 {
622 rknn_destroy_mem(ctx[n], weight_mems[n]);
623 }
624
625 // destroy
626 rknn_destroy(ctx[n]);
627 }
628 }
629
630
631 return 0;
632
633 }
634