Vitis AI provides both C++ and Python interfaces, and the two share similar function names. The material below uses C++ throughout.
The Four APIs
The Vitis AI Library provides the following four APIs:
- Vitis AI Library API_0 based on VART
- Vitis AI Library API_1 based on AI Library
- Vitis AI Library API_2 based on DpuTask
- Vitis AI Library API_3 based on Graph_runner
Below are demos of each of the four APIs.
VART
If you write your code with VART (the Vitis AI Runtime Library), the workflow is as follows.
Taking ResNet50 image classification as an example, the main code is:
```cpp
// origin: Vitis-AI/demo/VART/resnet50/src/main.cc

/**
 * @brief Run DPU Task for ResNet50
 *
 * @param runner - pointer to the ResNet50 runner
 *
 * @return none
 */
void runResnet50(vart::Runner* runner) {
  /* Mean value for ResNet50 specified in Caffe prototxt */
  vector<string> kinds, images;

  /* Load all image names. */
  ListImages(baseImagePath, images);
  if (images.size() == 0) {
    cerr << "\nError: No images existing under " << baseImagePath << endl;
    return;
  }

  /* Load all kinds words. */
  LoadWords(wordsPath + "words.txt", kinds);
  if (kinds.size() == 0) {
    cerr << "\nError: No words exist in file words.txt." << endl;
    return;
  }

  float mean[3] = {104, 107, 123};

  /* get in/out tensors and dims */
  auto outputTensors = runner->get_output_tensors();
  auto inputTensors = runner->get_input_tensors();
  auto out_dims = outputTensors[0]->get_shape();
  auto in_dims = inputTensors[0]->get_shape();

  auto input_scale = get_input_scale(inputTensors[0]);
  auto output_scale = get_output_scale(outputTensors[0]);

  /* get shape info */
  int outSize = shapes.outTensorList[0].size;
  int inSize = shapes.inTensorList[0].size;
  int inHeight = shapes.inTensorList[0].height;
  int inWidth = shapes.inTensorList[0].width;
  int batchSize = in_dims[0];

  std::vector<std::unique_ptr<vart::TensorBuffer>> inputs, outputs;

  vector<Mat> imageList;
  int8_t* imageInputs = new int8_t[inSize * batchSize];

  float* softmax = new float[outSize];
  int8_t* FCResult = new int8_t[batchSize * outSize];
  std::vector<vart::TensorBuffer*> inputsPtr, outputsPtr;
  std::vector<std::shared_ptr<xir::Tensor>> batchTensors;

  /* run with batch */
  for (unsigned int n = 0; n < images.size(); n += batchSize) {
    unsigned int runSize =
        (images.size() < (n + batchSize)) ? (images.size() - n) : batchSize;
    in_dims[0] = runSize;
    out_dims[0] = batchSize;
    for (unsigned int i = 0; i < runSize; i++) {
      Mat image = imread(baseImagePath + images[n + i]);

      /* image pre-process */
      Mat image2;  //= cv::Mat(inHeight, inWidth, CV_8SC3);
      resize(image, image2, Size(inHeight, inWidth), 0, 0);
      for (int h = 0; h < inHeight; h++) {
        for (int w = 0; w < inWidth; w++) {
          for (int c = 0; c < 3; c++) {
            imageInputs[i * inSize + h * inWidth * 3 + w * 3 + c] =
                (int8_t)((image2.at<Vec3b>(h, w)[c] - mean[c]) * input_scale);
          }
        }
      }
      imageList.push_back(image);
    }

    /* in/out tensor refactory for batch input/output */
    batchTensors.push_back(std::shared_ptr<xir::Tensor>(
        xir::Tensor::create(inputTensors[0]->get_name(), in_dims,
                            xir::DataType{xir::DataType::XINT, 8u})));
    inputs.push_back(std::make_unique<CpuFlatTensorBuffer>(
        imageInputs, batchTensors.back().get()));
    batchTensors.push_back(std::shared_ptr<xir::Tensor>(
        xir::Tensor::create(outputTensors[0]->get_name(), out_dims,
                            xir::DataType{xir::DataType::XINT, 8u})));
    outputs.push_back(std::make_unique<CpuFlatTensorBuffer>(
        FCResult, batchTensors.back().get()));

    /* tensor buffer input/output */
    inputsPtr.clear();
    outputsPtr.clear();
    inputsPtr.push_back(inputs[0].get());
    outputsPtr.push_back(outputs[0].get());

    /* run */
    auto job_id = runner->execute_async(inputsPtr, outputsPtr);
    runner->wait(job_id.first, -1);
    for (unsigned int i = 0; i < runSize; i++) {
      cout << "\nImage : " << images[n + i] << endl;
      /* Calculate softmax on CPU and display TOP-5 classification results */
      CPUCalcSoftmax(&FCResult[i * outSize], outSize, softmax, output_scale);
      TopK(softmax, outSize, 5, kinds);
      /* Display the image */
      bool quiet = (getenv("QUIET_RUN") != nullptr);
      if (!quiet) {
        cv::imshow("Classification of ResNet50", imageList[i]);
        cv::waitKey(10000);
      }
    }
    imageList.clear();
    inputs.clear();
    outputs.clear();
  }
  delete[] FCResult;
  delete[] imageInputs;
  delete[] softmax;
}

/**
 * @brief Entry for running ResNet50 neural network
 *
 * @note Runner APIs prefixed with "dpu" are used to easily program &
 *       deploy ResNet50 on DPU platform.
 */
int main(int argc, char* argv[]) {
  // Check args
  if (argc != 2) {
    cout << "Usage of resnet50 demo: ./resnet50 [model_file]" << endl;
    return -1;
  }
  auto graph = xir::Graph::deserialize(argv[1]);
  auto subgraph = get_dpu_subgraph(graph.get());
  CHECK_EQ(subgraph.size(), 1u)
      << "resnet50 should have one and only one dpu subgraph.";
  LOG(INFO) << "create running for subgraph: " << subgraph[0]->get_name();

  /* create runner */
  auto runner = vart::Runner::create_runner(subgraph[0], "run");
  // ai::XdpuRunner* runner = new ai::XdpuRunner("./");

  /* get in/out tensor */
  auto inputTensors = runner->get_input_tensors();
  auto outputTensors = runner->get_output_tensors();

  /* get in/out tensor shape */
  int inputCnt = inputTensors.size();
  int outputCnt = outputTensors.size();
  TensorShape inshapes[inputCnt];
  TensorShape outshapes[outputCnt];
  shapes.inTensorList = inshapes;
  shapes.outTensorList = outshapes;
  getTensorShape(runner.get(), &shapes, inputCnt, outputCnt);

  /* run with batch */
  runResnet50(runner.get());
  return 0;
}
```
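The demo leans on helpers defined elsewhere in its source, notably CPUCalcSoftmax, which first dequantizes the int8 logits with output_scale and then applies the usual softmax. A lightly commented sketch based on the demo's source:

```cpp
#include <cmath>
#include <cstddef>
#include <cstdint>

// Dequantize int8 logits with the output scale, then compute softmax on the CPU.
void CPUCalcSoftmax(const int8_t* data, size_t size, float* result,
                    float scale) {
  double sum = 0.0;
  for (size_t i = 0; i < size; i++) {
    // data[i] * scale recovers the float logit from the fixed-point value.
    result[i] = std::exp(data[i] * scale);
    sum += result[i];
  }
  for (size_t i = 0; i < size; i++) {
    result[i] /= sum;  // normalize so the outputs sum to 1
  }
}
```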
AI Library
When the model you want is in the Vitis AI Model Zoo, you can directly reuse the corresponding model demo. Take YOLOv3 as an example:
```cpp
int main(int argc, char* argv[]) {
  if (argc < 3) {
    cerr << "usage: " << argv[0] << " <model_name> <image_file_url>" << endl;
    abort();
  }
  Mat img = cv::imread(argv[2]);
  if (img.empty()) {
    cerr << "cannot load " << argv[2] << endl;
    abort();
  }

  auto yolo = vitis::ai::YOLOv3::create(argv[1], true);
  // auto yolo =
  //     vitis::ai::YOLOv3::create(xilinx::ai::YOLOV3_VOC_416x416_TF, true);

  auto results = yolo->run(img);

  for (auto& box : results.bboxes) {
    int label = box.label;
    float xmin = box.x * img.cols + 1;
    float ymin = box.y * img.rows + 1;
    float xmax = xmin + box.width * img.cols;
    float ymax = ymin + box.height * img.rows;
    if (xmin < 0.) xmin = 1.;
    if (ymin < 0.) ymin = 1.;
    if (xmax > img.cols) xmax = img.cols;
    if (ymax > img.rows) ymax = img.rows;
    float confidence = box.score;

    cout << "RESULT: " << label << "\t" << xmin << "\t" << ymin << "\t"
         << xmax << "\t" << ymax << "\t" << confidence << "\n";
    rectangle(img, Point(xmin, ymin), Point(xmax, ymax), Scalar(0, 255, 0), 1,
              1, 0);
  }
  // imshow("", img);
  // waitKey(0);
  imwrite("result.jpg", img);
  return 0;
}
```
For the full list of official models, see: https://github.com/Xilinx/Vitis-AI/tree/master/models/AI-Model-Zoo
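All Model Zoo models follow this same create/run factory pattern. A minimal sketch for a classification model (assuming the resnet50 model package is installed under the default model directory; verify the header and result fields against your library version):

```cpp
#include <iostream>
#include <opencv2/imgcodecs.hpp>
#include <vitis/ai/classification.hpp>

int main() {
  // Create the model by its Model Zoo name; true enables the built-in
  // pre-processing.
  auto model = vitis::ai::Classification::create("resnet50", true);
  auto img = cv::imread("sample.jpg");
  auto result = model->run(img);
  // Print the top-k class indices and scores returned by the library.
  for (const auto& s : result.scores) {
    std::cout << "index=" << s.index << " score=" << s.score << std::endl;
  }
  return 0;
}
```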
DPU Task
If you use DpuTask, you can refer directly to this YOLOv3 example:
```cpp
// origin: Vitis-AI/demo/Vitis-AI-Library/samples/dpu_task/yolov3/demo_yolov3.cpp

// The parameters of yolov3_voc; each value can be set as actually needed.
// They could also be kept in a text file to avoid hard-coding.
const string yolov3_config = {
    " name: \"yolov3_voc_416\" \n"
    " model_type : YOLOv3 \n"
    " yolo_v3_param { \n"
    " num_classes: 20 \n"
    " anchorCnt: 3 \n"
    " conf_threshold: 0.3 \n"
    " nms_threshold: 0.45 \n"
    " layer_name: \"81\" \n"
    " layer_name: \"93\" \n"
    " layer_name: \"105\" \n"
    " biases: 10 \n"
    " biases: 13 \n"
    " biases: 16 \n"
    " biases: 30 \n"
    " biases: 33 \n"
    " biases: 23 \n"
    " biases: 30 \n"
    " biases: 61 \n"
    " biases: 62 \n"
    " biases: 45 \n"
    " biases: 59 \n"
    " biases: 119 \n"
    " biases: 116 \n"
    " biases: 90 \n"
    " biases: 156 \n"
    " biases: 198 \n"
    " biases: 373 \n"
    " biases: 326 \n"
    " test_mAP: false \n"
    " } \n"};

int main(int argc, char* argv[]) {
  // argv[1] is the path of the xmodel.
  auto kernel_name = argv[1];

  // Read images from the given paths.
  vector<Mat> imgs;
  vector<string> imgs_names;
  for (int i = 2; i < argc; i++) {
    // image file names.
    auto img = cv::imread(argv[i]);
    if (img.empty()) {
      std::cout << "Cannot load " << argv[i] << std::endl;
      continue;
    }
    imgs.push_back(img);
    imgs_names.push_back(argv[i]);
  }
  if (imgs.empty()) {
    std::cerr << "No image load success!" << std::endl;
    abort();
  }

  // Create a dpu task object.
  auto task = vitis::ai::DpuTask::create(kernel_name);
  auto batch = task->get_input_batch(0, 0);
  // Set the mean values and scale values.
  task->setMeanScaleBGR({0.0f, 0.0f, 0.0f},
                        {0.00390625f, 0.00390625f, 0.00390625f});
  auto input_tensor = task->getInputTensor(0u);
  CHECK_EQ((int)input_tensor.size(), 1)
      << " the dpu model must have only one input";
  auto width = input_tensor[0].width;
  auto height = input_tensor[0].height;
  auto size = cv::Size(width, height);

  // Create a config and set the correlating data to control post-process.
  vitis::ai::proto::DpuModelParam config;
  // Fill all the parameters.
  auto ok =
      google::protobuf::TextFormat::ParseFromString(yolov3_config, &config);
  if (!ok) {
    cerr << "Set parameters failed!" << endl;
    abort();
  }

  vector<Mat> inputs;
  vector<int> input_cols, input_rows;
  for (long unsigned int i = 0, j = -1; i < imgs.size(); i++) {
    /* Pre-process part */
    // Resize it if its size does not match.
    cv::Mat image;
    input_cols.push_back(imgs[i].cols);
    input_rows.push_back(imgs[i].rows);
    if (size != imgs[i].size()) {
      cv::resize(imgs[i], image, size);
    } else {
      image = imgs[i];
    }
    inputs.push_back(image);
    j++;
    if (j < batch - 1 && i < imgs.size() - 1) {
      continue;
    }

    // Set the input images into dpu.
    task->setImageRGB(inputs);

    /* DPU runtime */
    // Run the dpu.
    task->run(0u);

    /* Post-process part */
    // Get output.
    auto output_tensor = task->getOutputTensor(0u);
    // Execute the yolov3 post-processing.
    auto results = vitis::ai::yolov3_post_process(
        input_tensor, output_tensor, config, input_cols, input_rows);

    /* Print the results */
    // Convert coordinates and draw boxes on the original images.
    for (int k = 0; k < static_cast<int>(inputs.size()); k++) {
      cout << "batch_index " << k << " "  //
           << "image_name " << imgs_names[i - j + k] << endl;
      for (auto& box : results[k].bboxes) {
        int label = box.label;
        float xmin = box.x * input_cols[k] + 1;
        float ymin = box.y * input_rows[k] + 1;
        float xmax = xmin + box.width * input_cols[k];
        float ymax = ymin + box.height * input_rows[k];
        if (xmin < 0.) xmin = 1.;
        if (ymin < 0.) ymin = 1.;
        if (xmax > input_cols[k]) xmax = input_cols[k];
        if (ymax > input_rows[k]) ymax = input_rows[k];
        float confidence = box.score;

        cout << "RESULT: " << label << "\t" << xmin << "\t" << ymin << "\t"
             << xmax << "\t" << ymax << "\t" << confidence << "\n";
        rectangle(imgs[i - j + k], Point(xmin, ymin), Point(xmax, ymax),
                  Scalar(0, 255, 0), 1, 1, 0);
      }
      imwrite(imgs_names[i - j + k] + "_result.jpg", imgs[i - j + k]);
    }
    inputs.clear();
    input_cols.clear();
    input_rows.clear();
    j = -1;
  }
  return 0;
}
```
Graph Runner
If the model gets split into multiple subgraphs when the xmodel is compiled, the three approaches above no longer work, because each of them can only run a single subgraph.
If you do end up with multiple subgraphs, first try upgrading to Vitis AI 1.4; version 1.3 has a bug here. If 1.4 still produces multiple subgraphs, check operator compatibility: the operators supported by the CPU runner are listed under Vitis-AI/tools/Vitis-AI-Library/cpu_task/ops.
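To see how an xmodel was actually partitioned, you can walk the XIR graph yourself. A minimal sketch of a standalone checker (a hypothetical tool, linked against libxir) that lists each device-level subgraph and counts the DPU ones:

```cpp
#include <iostream>
#include <xir/graph/graph.hpp>

int main(int argc, char* argv[]) {
  if (argc != 2) {
    std::cerr << "usage: " << argv[0] << " <xmodel>" << std::endl;
    return -1;
  }
  auto graph = xir::Graph::deserialize(argv[1]);
  int dpu_count = 0;
  // The root subgraph's children are the device-level partitions (DPU/CPU/...).
  for (auto* sg : graph->get_root_subgraph()->children_topological_sort()) {
    if (!sg->has_attr("device")) continue;
    auto device = sg->get_attr<std::string>("device");
    std::cout << "subgraph " << sg->get_name() << " device=" << device
              << std::endl;
    if (device == "DPU") dpu_count++;
  }
  std::cout << "DPU subgraph count: " << dpu_count << std::endl;
  return 0;
}
```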
In that case, use Graph Runner. The workflow is as follows, taking plate number recognition as the example:
```cpp
// origin: Vitis-AI/demo/Vitis-AI-Library/samples/graph_runner/platenum_graph_runner/platenum_graph_runner.cpp

const std::vector<std::string> charactor_0 = {
    "unknown", "jing", "hu",   "jin",   "yu",  "ji",  "jin",   "meng",
    "liao",    "ji",   "hei",  "su",    "zhe", "wan", "min",   "gan",
    "lu",      "yu",   "e",    "xiang", "yue", "gui", "qiong", "chuan",
    "gui",     "yun",  "zang", "shan",  "gan", "qing", "ning", "xin"};
const std::vector<std::string> charactor_1 = {
    "unknown", "A", "B", "C", "D", "E", "F", "G", "H", "J", "K", "L", "M",
    "N",       "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"};
const std::vector<std::string> charactor_2 = {
    "unknown", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "A",
    "B",       "C", "D", "E", "F", "G", "H", "J", "K", "L", "M", "N",
    "P",       "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"};
const std::vector<std::string> color = {"Blue", "Yellow"};

static int get_fix_point(const xir::Tensor* tensor);
static std::vector<std::int32_t> get_index_zeros(const xir::Tensor* tensor);
static std::vector<cv::Mat> read_images(const std::vector<std::string>& files,
                                        size_t batch);
static void set_input_image(const cv::Mat& image, void* data1, float scale);
static std::vector<std::pair<int, float>> topk(void* data1, size_t size,
                                               int K);
static size_t find_tensor_index(
    const char* tensor_name, const std::vector<vart::TensorBuffer*>& outputs);

// platenum preprocess
static void preprocess_platenum(
    const std::vector<std::string>& files,
    const std::vector<vart::TensorBuffer*>& input_tensor_buffers) {
  auto input_tensor = input_tensor_buffers[0]->get_tensor();
  auto batch = input_tensor->get_shape().at(0);
  auto height = input_tensor->get_shape().at(1);
  auto width = input_tensor->get_shape().at(2);

  int fixpos = get_fix_point(input_tensor);
  float input_fixed_scale = std::exp2f(1.0f * (float)fixpos);

  auto size = cv::Size(width, height);
  auto images = read_images(files, batch);
  CHECK_EQ(images.size(), batch)
      << "images number be read into input buffer must be equal to batch";

  for (int index = 0; index < batch; ++index) {
    cv::Mat resize_image;
    if (size != images[index].size()) {
      cv::resize(images[index], resize_image, size, 0);
    } else {
      images[index].copyTo(resize_image);
    }
    uint64_t data_in = 0u;
    size_t size_in = 0u;
    auto idx = get_index_zeros(input_tensor);
    idx[0] = (int)index;
    std::tie(data_in, size_in) = input_tensor_buffers[0]->data(idx);
    set_input_image(resize_image, (void*)data_in, input_fixed_scale);
  }
}

// platenum postprocess
static void postprocess_platenum(
    const std::vector<vart::TensorBuffer*>& output_tensor_buffers) {
  auto output_tensor = output_tensor_buffers[0]->get_tensor();
  auto batch = output_tensor->get_shape().at(0);
  auto size = output_tensor_buffers.size();
  CHECK_EQ(size, 8) << "output_tensor_buffers.size() must be 8";
  for (auto i = 1u; i < size; ++i) {
    CHECK_EQ(output_tensor_buffers[i]->get_tensor()->get_shape().at(0), batch)
        << "all output_tensor_buffer batch number must be equal";
  }

  std::vector<std::pair<int, float>> ret;
  for (int batch_index = 0; batch_index < batch; ++batch_index) {
    for (auto tb_index = 0u; tb_index < size; ++tb_index) {
      uint64_t data_out = 0u;
      size_t size_out = 0u;
      auto idx =
          get_index_zeros(output_tensor_buffers[tb_index]->get_tensor());
      idx[0] = (int)batch_index;
      std::tie(data_out, size_out) =
          output_tensor_buffers[tb_index]->data(idx);
      auto elem_num =
          output_tensor_buffers[tb_index]->get_tensor()->get_element_num() /
          batch;
      auto tb_top1 = topk((void*)data_out, elem_num, 1)[0];
      ret.push_back(tb_top1);
    }
  }

  for (int batch_index = 0; batch_index < batch; ++batch_index) {
    std::string plate_number = "";
    std::string plate_color = "";
    // output_tensor_buffers maybe out of order, need find correct
    // output_tensor_buffer result by tensor name
    plate_number += charactor_0[ret[batch_index * size +
        find_tensor_index("prob1", output_tensor_buffers)].first];
    plate_number += charactor_1[ret[batch_index * size +
        find_tensor_index("prob2", output_tensor_buffers)].first];
    plate_number += charactor_2[ret[batch_index * size +
        find_tensor_index("prob3", output_tensor_buffers)].first];
    plate_number += charactor_2[ret[batch_index * size +
        find_tensor_index("prob4", output_tensor_buffers)].first];
    plate_number += charactor_2[ret[batch_index * size +
        find_tensor_index("prob5", output_tensor_buffers)].first];
    plate_number += charactor_2[ret[batch_index * size +
        find_tensor_index("prob6", output_tensor_buffers)].first];
    plate_number += charactor_2[ret[batch_index * size +
        find_tensor_index("prob7", output_tensor_buffers)].first];
    plate_color = color[ret[batch_index * size +
        find_tensor_index("prob8", output_tensor_buffers)].first];
    std::cout << "batch_index: " << batch_index << std::endl;
    std::cout << "plate_color: " << plate_color << std::endl;
    std::cout << "plate_number: " << plate_number << std::endl;
  }
}

int main(int argc, char* argv[]) {
  if (argc < 3) {
    std::cerr << "usage :" << argv[0] << " <model_name>"
              << " <image_url> [<image_url> ...]" << std::endl;
    abort();
  }
  std::string g_xmodel_file = std::string(argv[1]);
  std::vector<std::string> g_image_files;
  for (auto i = 2; i < argc; i++) {
    g_image_files.push_back(std::string(argv[i]));
  }

  // create graph runner
  auto graph = xir::Graph::deserialize(g_xmodel_file);
  auto attrs = xir::Attrs::create();
  auto runner =
      vitis::ai::GraphRunner::create_graph_runner(graph.get(), attrs.get());
  CHECK(runner != nullptr);

  // get input/output tensor buffers
  auto input_tensor_buffers = runner->get_inputs();
  auto output_tensor_buffers = runner->get_outputs();

  // preprocess and fill input
  preprocess_platenum(g_image_files, input_tensor_buffers);

  // sync input tensor buffers
  for (auto& input : input_tensor_buffers) {
    input->sync_for_write(0, input->get_tensor()->get_data_size() /
                                 input->get_tensor()->get_shape()[0]);
  }

  // run graph runner
  auto v = runner->execute_async(input_tensor_buffers, output_tensor_buffers);
  auto status = runner->wait((int)v.first, -1);
  CHECK_EQ(status, 0) << "failed to run the graph";

  // sync output tensor buffers
  for (auto output : output_tensor_buffers) {
    output->sync_for_read(0, output->get_tensor()->get_data_size() /
                                 output->get_tensor()->get_shape()[0]);
  }

  // postprocess and print platenum result
  postprocess_platenum(output_tensor_buffers);
  return 0;
}

static int get_fix_point(const xir::Tensor* tensor) {
  CHECK(tensor->has_attr("fix_point"))
      << "get tensor fix_point error! has no fix_point attr, tensor name is "
      << tensor->get_name();
  return tensor->template get_attr<int>("fix_point");
}

static std::vector<std::int32_t> get_index_zeros(const xir::Tensor* tensor) {
  auto ret = tensor->get_shape();
  std::fill(ret.begin(), ret.end(), 0);
  return ret;
}

static std::vector<cv::Mat> read_images(const std::vector<std::string>& files,
                                        size_t batch) {
  std::vector<cv::Mat> images(batch);
  for (auto index = 0u; index < batch; ++index) {
    const auto& file = files[index % files.size()];
    images[index] = cv::imread(file);
    CHECK(!images[index].empty()) << "cannot read image from " << file;
  }
  return images;
}

static void set_input_image(const cv::Mat& image, void* data1, float scale) {
  float mean[3] = {128.0, 128.0, 128.0};
  signed char* data = (signed char*)data1;
  for (int h = 0; h < image.rows; h++) {
    for (int w = 0; w < image.cols; w++) {
      for (int c = 0; c < 3; c++) {
        auto image_data = (image.at<cv::Vec3b>(h, w)[c] - mean[c]) * scale;
        image_data = std::max(std::min(image_data, 127.0f), -128.0f);
        data[h * image.cols * 3 + w * 3 + c] = (int)image_data;
      }
    }
  }
}

static std::vector<std::pair<int, float>> topk(void* data1, size_t size,
                                               int K) {
  const float* score = (const float*)data1;
  auto indices = std::vector<int>(size);
  std::iota(indices.begin(), indices.end(), 0);
  std::partial_sort(indices.begin(), indices.begin() + K, indices.end(),
                    [&score](int a, int b) { return score[a] > score[b]; });
  auto ret = std::vector<std::pair<int, float>>(K);
  std::transform(
      indices.begin(), indices.begin() + K, ret.begin(),
      [&score](int index) { return std::make_pair(index, score[index]); });
  return ret;
}

static size_t find_tensor_index(
    const char* tensor_name, const std::vector<vart::TensorBuffer*>& outputs) {
  auto it = std::find_if(outputs.begin(), outputs.end(),
                         [&tensor_name](const vart::TensorBuffer* tb) {
                           return tb->get_tensor()->get_name() == tensor_name;
                         });
  CHECK(it != outputs.end())
      << "cannot find tensorbuffer. tensor_name=" << tensor_name;
  return it - outputs.begin();
}
```
Reading the Configuration File
Take YOLOv3 as an example:
```protobuf
# origin: /usr/share/vitis_ai_library/models/yolov3/Yolov3.prototxt
model {
  name: "yolov3_voc"
  kernel {
    name: "yolov3_voc"
    mean: 0.0
    mean: 0.0
    mean: 0.0
    scale: 0.00390625
    scale: 0.00390625
    scale: 0.00390625
  }
  model_type : YOLOv3
  yolo_v3_param {
    num_classes: 20
    anchorCnt: 3
    layer_name: "59"
    layer_name: "67"
    layer_name: "75"
    conf_threshold: 0.3
    nms_threshold: 0.45
    biases: 10
    biases: 13
    biases: 16
    biases: 30
    biases: 33
    biases: 23
    biases: 30
    biases: 61
    biases: 62
    biases: 45
    biases: 59
    biases: 119
    biases: 116
    biases: 90
    biases: 156
    biases: 198
    biases: 373
    biases: 326
    test_mAP: false
  }
  is_tf: false
}
```
The relevant fields are as follows.
pre-process
Network pre-processing mainly involves per-channel mean subtraction (mean), scaling (scale), and the quantization ratio of the input tensor.
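As a minimal sketch of how these values combine per pixel (a hypothetical helper: mean and scale come from the prototxt, fix_point from the input tensor, mirroring set_input_image in the graph_runner demo above):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

// Quantize one pixel channel into the DPU's int8 input format.
int8_t quantize_pixel(uint8_t pixel, float mean, float scale, int fix_point) {
  // The tensor's fixed-point scale is 2^fix_point.
  float input_fixed_scale = std::exp2f(static_cast<float>(fix_point));
  float v = (pixel - mean) * scale * input_fixed_scale;
  // Clamp to the int8 range before the cast.
  v = std::max(std::min(v, 127.0f), -128.0f);
  return static_cast<int8_t>(v);
}
```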
post-processing
Post-processing can be handled in two ways: either use the official post-processing APIs directly, or write your own post-processing module and invoke it through dpu_task.
The official post-processing APIs cover:
• Classification
• Face detection
• Face landmark detection
• SSD detection
• Pose detection
• Semantic segmentation
• Road line detection
• YOLOv3 detection
• YOLOv2 detection
• Openpose detection
• RefineDet detection
• ReID detection
• Multi-task
• Face recognition
• Plate detection
• Plate recognition
• Medical segmentation
• Medical detection
• Face quality
• Hourglass
• Retinaface
• Centerpoint
• Multitaskv3
• Pointpillars_nuscenes
• Rcan
Refer to the corresponding code in Vitis-AI/tools/Vitis-AI-Library/xnnpp.
If you use dpu_task, your post-processing function is invoked once the network model has finished running:
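For the second approach, here is a minimal sketch of a hand-written dequantization step on one raw DpuTask output, whose result you would feed into your own decoder (sigmoid/NMS/argmax, ...). The OutputTensor field names follow the library samples; treat them as assumptions and verify against your Vitis AI version:

```cpp
#include <cmath>
#include <cstdint>
#include <vector>
#include <vitis/ai/library/tensor.hpp>

// Dequantize one int8 output tensor of a DpuTask back to float.
// Assumes the fields height/width/channel/fixpos and the get_data(batch_index)
// accessor used in the library samples.
std::vector<float> dequantize_output(
    const vitis::ai::library::OutputTensor& out, size_t batch_index) {
  // The DPU emits int8 values; scale back to float with 2^(-fixpos).
  float output_scale = std::exp2f(-1.0f * out.fixpos);
  size_t elem_num = out.height * out.width * out.channel;
  auto* data = reinterpret_cast<int8_t*>(out.get_data(batch_index));
  std::vector<float> result(elem_num);
  for (size_t i = 0; i < elem_num; ++i) {
    result[i] = data[i] * output_scale;
  }
  return result;
}
```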