YOLOv5-Lite 树莓派实时 | 更少的参数、更高的精度、更快的检测速度(C++部署分享)（二）-阿里云开发者社区

4Tengine部署YOLOv5-Lite

依照顺序调用Tengine核心API如下：

1. init_tengine

初始化Tengine，该函数在程序中只要调用一次即可。

2. create_graph

创建Tengine计算图。

3. prerun_graph

预运行，准备计算图推理所需资源。设置大小核，核个数、核亲和性、数据精度都在这里。

struct options
{
  int num_thread;//核个数设置，
  int cluster;//大小核设置，可选TENGINE_CLUSTER_[ALL,BIG，MEDIUM，LITTLE]
  int precision;//精度设置，TENGINE_MODE_[FP32,FP16,HYBRID_INT8,UINT8,INT8]
  uint64_t affinity;//核亲和性掩码，绑定具体核，
};

4. run_graph

启动Tengine计算图推理。

5. postrun_graph

停止运行graph，并释放graph占据的资源。

6. destroy_graph

销毁graph。

1、图像自适应缩放

在训练阶段，比如网络输入的尺寸608×608，但我数据的尺寸是大小不一的，一般方法是直接同一缩放到标准尺寸，然后填充黑边，如下图所示：

但如果填充的比较多，则存在信息冗余，影响推理速度。Yolov5在推理阶段，采用缩减黑边的方式，来提高推理的速度。在代码datasets.py的letterbox函数中进行了修改，对原始图像自适应的添加最少的黑边。eg：“比如我1000×800的图片不是直接缩放到608×608的大小，而是计算608/1000=0.608 然后缩放至608×486的大小，然后计算608-486=122 然后np.mod(122，32)取余数得到26，再平均成13填充到图片高度两端，最后是608×512。”

def letterbox(img, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True, stride=32):
    # Resize and pad image while meeting stride-multiple constraints
    shape = img.shape[:2]  # current shape [height, width]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)
    # Scale ratio (new / old)
    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
    if not scaleup:  # only scale down, do not scale up (for better test mAP)
        r = min(r, 1.0)
    # Compute padding
    ratio = r, r  # width, height ratios
    new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
    dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
    if auto:  # minimum rectangle
        dw, dh = np.mod(dw, stride), np.mod(dh, stride)  # wh padding
    elif scaleFill:  # stretch
        dw, dh = 0.0, 0.0
        new_unpad = (new_shape[1], new_shape[0])
        ratio = new_shape[1] / shape[1], new_shape[0] / shape[0]  # width, height ratios
    dw /= 2  # divide padding into 2 sides
    dh /= 2
    if shape[::-1] != new_unpad:  # resize
        img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
    img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border
    return img, ratio, (dw, dh)

C++版本如下：

void get_input_data_focus(const char* image_file, float* input_data, int img_h, int img_w, const float* mean, const float* scale)
{
    cv::Mat sample = cv::imread(image_file, 1);
    cv::Mat img;
    const int target_size = 640;
    int imge_w = img.cols;
    int imge_h = img.rows;
    int w = imge_w;
    int h = imge_h;
    float scale_im = 1.f;
    if (w > h)
    {
        scale_im = (float)target_size / w;
        w = target_size;
        h = h * scale_im;
    }
    else
    {
        scale_im = (float)target_size / h;
        h = target_size;
        w = w * scale_im;
    }
    cv::cvtColor(sample, img, cv::COLOR_BGR2RGB);
    cv::resize(img, img, cv::Size(w, h));
    // pad to target_size rectangle
    int wpad = (w + 31) / 32 * 32 - w;
    int hpad = (h + 31) / 32 * 32 - h;
    cv::Mat in_pad;
    cv::copy_make_border(img, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, cv::BORDER_CONSTANT, 114.f);
    img.convertTo(img, CV_32FC3);
    float* img_data = (float*)img.data;
    /* nhwc to nchw */
    for (int h = 0; h < img_h; h++)
    {
        for (int w = 0; w < img_w; w++)
        {
            for (int c = 0; c < 3; c++)
            {
                int in_index = h * img_w * 3 + w * 3 + c;
                int out_index = c * img_h * img_w + h * img_w + w;
                input_data[out_index] = (img_data[in_index] - mean[c]) * scale[c];
            }
        }
    }
}

2、模型加载和推理

/* set runtime options */
    struct options opt;
    opt.num_thread = num_thread;
    opt.cluster = TENGINE_CLUSTER_ALL;
    opt.precision = TENGINE_MODE_FP32;
    opt.affinity = 0;
    /* inital tengine */
    if (init_tengine() != 0)
    {
        fprintf(stderr, "Initial tengine failed.\n");
        return -1;
    }
    fprintf(stderr, "tengine-lite library version: %s\n", get_tengine_version());
    /* create graph, load tengine model xxx.tmfile */
    graph_t graph = create_graph(nullptr, "tengine", model_file);
    if (graph == nullptr)
    {
        fprintf(stderr, "Create graph failed.\n");
        return -1;
    }

3 获取推理结果

/* yolov5 postprocess */
    // 0: 1, 3, 20, 20, 85
    // 1: 1, 3, 40, 40, 85
    // 2: 1, 3, 80, 80, 85
    tensor_t p8_output = get_graph_output_tensor(graph, 0, 0);
    tensor_t p16_output = get_graph_output_tensor(graph, 1, 0);
    tensor_t p32_output = get_graph_output_tensor(graph, 2, 0);
    float* p8_data = (float*)get_tensor_buffer(p8_output);
    float* p16_data = (float*)get_tensor_buffer(p16_output);
    float* p32_data = (float*)get_tensor_buffer(p32_output);
    /* postprocess */
    const float prob_threshold = 0.55;
    const float nms_threshold = 0.5;
    std::vector<Object> proposals;
    std::vector<Object> objects8;
    std::vector<Object> objects16;
    std::vector<Object> objects32;
    std::vector<Object> objects;
    generate_proposals(32, p32_data, prob_threshold, objects32, letterbox_cols, letterbox_rows);
    proposals.insert(proposals.end(), objects32.begin(), objects32.end());
    generate_proposals(16, p16_data, prob_threshold, objects16, letterbox_cols, letterbox_rows);
    proposals.insert(proposals.end(), objects16.begin(), objects16.end());
    generate_proposals(8, p8_data, prob_threshold, objects8, letterbox_cols, letterbox_rows);
    proposals.insert(proposals.end(), objects8.begin(), objects8.end());
    qsort_descent_inplace(proposals);
    std::vector<int> picked;
    nms_sorted_bboxes(proposals, picked, nms_threshold);

4 后处理

static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold)
{
    picked.clear();
    const int n = faceobjects.size();
    std::vector<float> areas(n);
    for (int i = 0; i < n; i++)
    {
        areas[i] = faceobjects[i].rect.area();
    }
    for (int i = 0; i < n; i++)
    {
        const Object& a = faceobjects[i];
        int keep = 1;
        for (int j = 0; j < (int)picked.size(); j++)
        {
            const Object& b = faceobjects[picked[j]];
            // intersection over union
            float inter_area = intersection_area(a, b);
            float union_area = areas[i] + areas[picked[j]] - inter_area;
            // float IoU = inter_area / union_area
            if (inter_area / union_area > nms_threshold)
                keep = 0;
        }
        if (keep)
            picked.push_back(i);
    }
}
static void generate_proposals(int stride, const float* feat, float prob_threshold, std::vector<Object>& objects,
                               int letterbox_cols, int letterbox_rows)
{
    //static float anchors[18] = {10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326};
    static float anchors[18] = {10, 13, 16, 30, 33, 23,
                                30, 61, 62, 45, 59, 119,
                                116, 90, 156, 198, 373, 326};
    int anchor_num = 3;
    int feat_w = letterbox_cols / stride;
    int feat_h = letterbox_rows / stride;
    int cls_num = 80;
    int anchor_group;
    if (stride == 8)
        anchor_group = 1;
    if (stride == 16)
        anchor_group = 2;
    if (stride == 32)
        anchor_group = 3;
    for (int h = 0; h <= feat_h - 1; h++)
    {
        for (int w = 0; w <= feat_w - 1; w++)
        {
            for (int a = 0; a <= anchor_num - 1; a++)
            {
                //process cls score
                int class_index = 0;
                float class_score = -FLT_MAX;
                for (int s = 0; s <= cls_num - 1; s++)
                {
                    float score = feat[a * feat_w * feat_h * (cls_num + 5) + h * feat_w * (cls_num + 5) + w * (cls_num + 5) + s + 5];
                    if (score > class_score)
                    {
                        class_index = s;
                        class_score = score;
                    }
                }
                //process box score
                float box_score = feat[a * feat_w * feat_h * (cls_num + 5) + (h * feat_w) * (cls_num + 5) + w * (cls_num + 5) + 4];
                float final_score = sigmoid(box_score) * sigmoid(class_score);
                if (final_score >= prob_threshold)
                {
                    int loc_idx = a * feat_h * feat_w * (cls_num + 5) + h * feat_w * (cls_num + 5) + w * (cls_num + 5);
                    float dx = sigmoid(feat[loc_idx + 0]);
                    float dy = sigmoid(feat[loc_idx + 1]);
                    float dw = sigmoid(feat[loc_idx + 2]);
                    float dh = sigmoid(feat[loc_idx + 3]);
                    float pred_cx = (dx * 2.0f - 0.5f + w) * stride;
                    float pred_cy = (dy * 2.0f - 0.5f + h) * stride;
                    float anchor_w = anchors[(anchor_group - 1) * 6 + a * 2 + 0];
                    float anchor_h = anchors[(anchor_group - 1) * 6 + a * 2 + 1];
                    float pred_w = dw * dw * 4.0f * anchor_w;
                    float pred_h = dh * dh * 4.0f * anchor_h;
                    float x0 = pred_cx - pred_w * 0.5f;
                    float y0 = pred_cy - pred_h * 0.5f;
                    float x1 = pred_cx + pred_w * 0.5f;
                    float y1 = pred_cy + pred_h * 0.5f;
                    Object obj;
                    obj.rect.x = x0;
                    obj.rect.y = y0;
                    obj.rect.width = x1 - x0;
                    obj.rect.height = y1 - y0;
                    obj.label = class_index;
                    obj.prob = final_score;
                    objects.push_back(obj);
                }
            }
        }
    }
}

5 绘图

static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
{
    static const char* class_names[] = {
            "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
            "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
            "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
            "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
            "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
            "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
            "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
            "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
            "hair drier", "toothbrush"};
    cv::Mat image = bgr.clone();
    for (size_t i = 0; i < objects.size(); i++)
    {
        const Object& obj = objects[i];
        fprintf(stderr, "%2d: %3.0f%%, [%4.0f, %4.0f, %4.0f, %4.0f], %s\n", obj.label, obj.prob * 100, obj.rect.x,
                obj.rect.y, obj.rect.x + obj.rect.width, obj.rect.y + obj.rect.height, class_names[obj.label]);
        cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));
        char text[256];
        sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100);
        int baseLine = 0;
        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 2, &baseLine);
        int x = obj.rect.x;
        int y = obj.rect.y - label_size.height - baseLine;
        if (y < 0)
            y = 0;
        if (x + label_size.width > image.cols)
            x = image.cols - label_size.width;
        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
                      cv::Scalar(255, 255, 255), -1);
        cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5,
                    cv::Scalar(0, 0, 0));
    }
    cv::imwrite("yolov5s_out.jpg", image);
}
/* yolov5 draw the result */
    float scale_letterbox;
    int resize_rows;
    int resize_cols;
    if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols))
    {
        scale_letterbox = letterbox_rows * 1.0 / img.rows;
    }
    else
    {
        scale_letterbox = letterbox_cols * 1.0 / img.cols;
    }
    resize_cols = int(scale_letterbox * img.cols);
    resize_rows = int(scale_letterbox * img.rows);
    int tmp_h = (letterbox_rows - resize_rows) / 2;
    int tmp_w = (letterbox_cols - resize_cols) / 2;
    float ratio_x = (float)img.rows / resize_rows;
    float ratio_y = (float)img.cols / resize_cols;
    int count = picked.size();
    fprintf(stderr, "detection num: %d\n", count);
    objects.resize(count);
    for (int i = 0; i < count; i++)
    {
        objects[i] = proposals[picked[i]];
        float x0 = (objects[i].rect.x);
        float y0 = (objects[i].rect.y);
        float x1 = (objects[i].rect.x + objects[i].rect.width);
        float y1 = (objects[i].rect.y + objects[i].rect.height);
        x0 = (x0 - tmp_w) * ratio_x;
        y0 = (y0 - tmp_h) * ratio_y;
        x1 = (x1 - tmp_w) * ratio_x;
        y1 = (y1 - tmp_h) * ratio_y;
        x0 = std::max(std::min(x0, (float)(img.cols - 1)), 0.f);
        y0 = std::max(std::min(y0, (float)(img.rows - 1)), 0.f);
        x1 = std::max(std::min(x1, (float)(img.cols - 1)), 0.f);
        y1 = std::max(std::min(y1, (float)(img.rows - 1)), 0.f);
        objects[i].rect.x = x0;
        objects[i].rect.y = y0;
        objects[i].rect.width = x1 - x0;
        objects[i].rect.height = y1 - y0;
    }
    draw_objects(img, objects);
    /* release tengine */
    postrun_graph(graph);
    destroy_graph(graph);
    release_tengine();

6 可视化推理结果

5YOLOv5-Lite对比结果

6参考

[1].https://blog.csdn.net/weixin_45829462/article/details/119767896

[2].https://github.com/OAID/Tengine

[3].https://github.com/ppogg/YOLOv5-Lite

YOLOv5-Lite 树莓派实时 | 更少的参数、更高的精度、更快的检测速度(C++部署分享)（二）

4Tengine部署YOLOv5-Lite

1. init_tengine

2. create_graph

3. prerun_graph

4. run_graph

5. postrun_graph

6. destroy_graph

1、图像自适应缩放

2、模型加载和推理

3 获取推理结果

4 后处理

5 绘图

6 可视化推理结果

5YOLOv5-Lite对比结果

6参考

热门文章

最新文章

相关课程

相关电子书

相关实验场景

热门

活动广场

任务中心

开发者评测

高校计划

乘风者计划

训练营

阿里云MVP

话题

直播

下载

镜像站

技术资料

插件

YOLOv5-Lite 树莓派实时 | 更少的参数、更高的精度、更快的检测速度(C++部署分享)（二）

4Tengine部署YOLOv5-Lite

1. init_tengine

2. create_graph

3. prerun_graph

4. run_graph

5. postrun_graph

6. destroy_graph

1、图像自适应缩放

2、模型加载和推理

3 获取推理结果

4 后处理

5 绘图

6 可视化推理结果

5YOLOv5-Lite对比结果

6参考

热门文章

最新文章

相关课程

相关电子书

相关实验场景