تحسينات OpenCV عبر الأنظمة الأساسية

تتكون مكتبة OpenCV من عدة آلاف من الوظائف والخوارزميات. في هذه المقالة ، نريد أن نخبرك عن كيفية توفير مرونة تحسين خوارزميات رؤية الكمبيوتر في OpenCV لمختلف البنى وأنظمة التشغيل وبيئات العمل المختلفة.

سنلقي نظرة على التجريد parallel_for_ وعلى آلية Universal Intrinsics (الدوال الجوهرية الموحّدة) وكيفية إعادة استخدامهما في مشروعك.

تستند المقالة إلى محاضرة قدمها فريق OpenCV كجزء من معسكر التحسين الشتوي لشركة Intel في عام 2020. يتوفر أيضًا فيديو لهذه المحاضرة.

OpenCV

يصادف هذا العام الذكرى السنوية العشرين لـ OpenCV. من الممكن أن تكون بعض أجزاء شيفرة المكتبة أقدم من بعض قرّاء هذه المقالة :) ولكن على الرغم من عمرها، فإن المكتبة تتحسن باستمرار عن طريق إضافة خوارزميات جديدة وتحسين الخوارزميات الموجودة للأجهزة الحديثة والبنى الجديدة وأساليب البناء والنشر الجديدة.

بعض الحقائق:

يتم تنزيل OpenCV أكثر من 3 ملايين مرة في السنة (مع مراعاة PyPI + SourceForge فقط).
C++, Python, Java, JavaScript, MATLAB, PHP, Go, C#,… , , , JavaScript, C++, JS ( ).
. core , , OpenCV, , .

OpenCV , , #ifdef - . , ?

, , OpenCV , . ( ) — , , , .. cv::Mat — :

// Construct a matrix from (rows, cols, element type) and read back its layout.
cv::Mat mat(480, 640, CV_8UC3);
int rows     = mat.rows;        // 480
int cols     = mat.cols;        // 640
int channels = mat.channels();  // 3
uint8_t* data = mat.ptr<uint8_t>();  // raw byte pointer obtained from the matrix

// or: build the Mat from a buffer the caller already holds

// 10 * 11 floats live in myData; its data() pointer is handed to the Mat constructor.
// NOTE(review): presumably myMat refers to this storage rather than copying it,
// so myData would have to outlive myMat — confirm against the cv::Mat documentation.
std::vector<float> myData(10*11);
cv::Mat myMat(10, 11, CV_32FC1, myData.data());

cv::Mat:

//      | -1  0  +1 |
// Gx = | -1  0  +1 | * A
//      | -1  0  +1 |
void prewitt_x(const Mat& src, Mat& dst) {
    CV_Assert(src.type() == CV_8UC1);
    Mat bsrc;
    copyMakeBorder(src, bsrc, 1, 1, 1, 1, BORDER_REPLICATE);
    dst.create(src.size(), CV_8UC1);
    for (int y = 0; y < dst.rows; ++y)
        for (int x = 0; x < dst.cols; ++x) {
            dst.at<uchar>(y, x) = bsrc.at<uchar>(y  , x+2) - bsrc.at<uchar>(y  , x) +
                                  bsrc.at<uchar>(y+1, x+2) - bsrc.at<uchar>(y+1, x) +
                                  bsrc.at<uchar>(y+2, x+2) - bsrc.at<uchar>(y+2, x);
        }
}

3x3 . , , 1 .

على صورة بحجم 1920x1080 يستغرق تنفيذ هذه الخوارزمية 6.13 ms.

. , , , , , , . , OpenCV , .

parallel_for_

OpenCV- parallel_for_ — , , OS . :

Intel Threading Building Blocks (TBB)
OpenMP
Apple GCD
Windows RT concurrency
Windows concurrency
Pthreads

— , :

// Same per-pixel computation, with the row loop handed to parallel_for_.
// The lambda is invoked with a sub-range of [0, src.rows) and processes only
// rows range.start .. range.end - 1; src, bsrc and dst come from the
// enclosing scope and are captured by reference.
parallel_for_(Range(0, src.rows), [&](const Range& range) {
  for (int y = range.start; y < range.end; ++y)
      for (int x = 0; x < dst.cols; ++x) {
          // Column x+2 minus column x of bsrc, summed over rows y..y+2;
          // the int result is narrowed to uchar on assignment.
          dst.at<uchar>(y, x) = bsrc.at<uchar>(y  , x+2) - bsrc.at<uchar>(y  , x) +
                                bsrc.at<uchar>(y+1, x+2) - bsrc.at<uchar>(y+1, x) +
                                bsrc.at<uchar>(y+2, x+2) - bsrc.at<uchar>(y+2, x);
      }
});

بعد هذا التغيير انخفض زمن التنفيذ إلى 2.20ms (تسريع x2.78).

Universal Intrinsics

— OpenCV . , . OpenCV, :

#include <opencv2/core/hal/intrin.hpp>

// Multiplies every element of data[0 .. len) by 2, in place.
//
// data: pointer to at least len ints (not dereferenced when len <= 0).
// len:  number of elements to process.
//
// The bulk of the array is processed four 32-bit lanes at a time with
// OpenCV universal intrinsics; the remaining len % 4 elements are handled by
// a scalar loop so that no element is left unmodified.
void process(int* data, int len) {
    const cv::v_int32x4 twos = cv::v_setall_s32(2);
    int i = 0;
    // Vector body: runs while a full group of four elements is available.
    for (; i <= len - 4; i += 4) {
        cv::v_int32x4 b0 = cv::v_load(&data[i]);
        b0 *= twos;
        cv::v_store(&data[i], b0);
    }
    // Scalar tail for the last (len % 4) elements. The multiply is done in
    // unsigned arithmetic so an overflowing value wraps instead of hitting
    // signed-overflow undefined behaviour.
    for (; i < len; ++i) {
        data[i] = static_cast<int>(static_cast<unsigned>(data[i]) * 2u);
    }
}

يضرب هذا المثال عناصر المصفوفة في 2. لاحظ ما يلي:
, , , — , .
*, +, -. , .

, vx_load : v_uint8, v_int32 . nlanes , v_uint8::nlanes.

, :

AVX / SSE (x86)
NEON (ARM)
VSX (PowerPC)
MSA (MIPS)
WASM (JavaScript)

, . , . OpenCV API .

, — . , . , OpenCV . , VSX (OpenCV 3.3.1), MSA WASM (OpenCV 4.1.2).

Universal Intrinsics:

// Row-parallel version written with universal intrinsics (v_uint8, vx_load,
// v_add_wrap, v_sub_wrap, v_store). src, bsrc and dst come from the enclosing
// scope and are captured by reference.
parallel_for_(Range(0, src.rows), [&](const Range& range) {
    for (int y = range.start; y < range.end; ++y) {
        // The three bsrc rows feeding output row y, and the output row itself.
        const uint8_t* psrc0 = bsrc.ptr(y);
        const uint8_t* psrc1 = bsrc.ptr(y + 1);
        const uint8_t* psrc2 = bsrc.ptr(y + 2);
        uint8_t* pdst = dst.ptr(y);
        int x = 0;
        // Vector body: advances v_uint8::nlanes pixels per iteration while a
        // full vector still fits inside the output row.
        for (; x <= dst.cols - v_uint8::nlanes; x += v_uint8::nlanes) {
            // (row0[x+2] - row0[x]) + (row1[x+2] - row1[x]) + (row2[x+2] - row2[x]),
            // combined with the *_wrap add/sub variants.
            v_uint8 res = v_add_wrap(v_sub_wrap(vx_load(psrc0+x+2), vx_load(psrc0+x)),
                          v_add_wrap(v_sub_wrap(vx_load(psrc1+x+2), vx_load(psrc1+x)),
                                     v_sub_wrap(vx_load(psrc2+x+2), vx_load(psrc2+x)) ));

            v_store(pdst + x, res);
        }
        // Scalar tail: the pixels left over after the last full vector.
        for (; x < dst.cols; ++x) {
            pdst[x] = psrc0[x + 2] - psrc0[x] +
                      psrc1[x + 2] - psrc1[x] +
                      psrc2[x + 2] - psrc2[x];
        }
    }
});

, , .

, , , . , Halide — :

<*  *  *  *> *  *  *  *  *  *

 *  *  *  * <*  *  *  *> *  *

 *  *  *  *  *  * <*  *  *  *>

, , -, — , in-place .

على صورة بحجم 1920x1080 يستغرق التنفيذ الآن 0.2ms، أي تسريع بمقدار 23.5 مرة مقارنةً بالزمن الأصلي البالغ 6.13 ms.

, , :

// Hand-written, per-instruction-set version of the row kernel.
// For each output row y it computes, for every x,
//   (row0[x+2] - row0[x]) + (row1[x+2] - row1[x]) + (row2[x+2] - row2[x])
// with 8-bit add/sub intrinsics. Every SIMD section advances the shared index
// x as far as it can; whatever is left is finished by the following section
// and finally by the scalar loop at the bottom.
for (int y = range.start; y < range.end; ++y) {
    const uint8_t* psrc0 = bsrc.ptr(y);
    const uint8_t* psrc1 = bsrc.ptr(y + 1);
    const uint8_t* psrc2 = bsrc.ptr(y + 2);
    uint8_t* pdst = dst.ptr(y);
    int x = 0;
#if CV_AVX512_SKX
    // 64 bytes per iteration, only when the CPU support check passes.
    if (CV_CPU_HAS_SUPPORT_AVX512_SKX) {
        for (; x <= dst.cols - 64; x += 64) {
            __m512i vsrc0 = _mm512_sub_epi8(_mm512_loadu_epi8(psrc0 + x + 2), _mm512_loadu_epi8(psrc0 + x));
            __m512i vsrc1 = _mm512_sub_epi8(_mm512_loadu_epi8(psrc1 + x + 2), _mm512_loadu_epi8(psrc1 + x));
            __m512i vsrc2 = _mm512_sub_epi8(_mm512_loadu_epi8(psrc2 + x + 2), _mm512_loadu_epi8(psrc2 + x));
            _mm512_storeu_epi8(pdst + x, _mm512_add_epi8(vsrc0, _mm512_add_epi8(vsrc1, vsrc2)));
        }
    }
#endif
#if CV_AVX2
    // 32 bytes per iteration. The 256-bit load/store intrinsics take
    // __m256i pointers, so the byte pointers must be cast explicitly.
    if (CV_CPU_HAS_SUPPORT_AVX2) {
        for (; x <= dst.cols - 32; x += 32) {
            __m256i vsrc0 = _mm256_sub_epi8(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(psrc0 + x + 2)),
                                            _mm256_loadu_si256(reinterpret_cast<const __m256i*>(psrc0 + x)));
            __m256i vsrc1 = _mm256_sub_epi8(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(psrc1 + x + 2)),
                                            _mm256_loadu_si256(reinterpret_cast<const __m256i*>(psrc1 + x)));
            __m256i vsrc2 = _mm256_sub_epi8(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(psrc2 + x + 2)),
                                            _mm256_loadu_si256(reinterpret_cast<const __m256i*>(psrc2 + x)));
            _mm256_storeu_si256(reinterpret_cast<__m256i*>(pdst + x),
                                _mm256_add_epi8(vsrc0, _mm256_add_epi8(vsrc1, vsrc2)));
        }
    }
#endif
#if CV_SSE2
    // 16 bytes per iteration; loads and the store both need __m128i pointers.
    for (; x <= dst.cols - 16; x += 16) {
        __m128i vsrc0 = _mm_sub_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i*>(psrc0 + x + 2)),
                                     _mm_loadu_si128(reinterpret_cast<const __m128i*>(psrc0 + x)));
        __m128i vsrc1 = _mm_sub_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i*>(psrc1 + x + 2)),
                                     _mm_loadu_si128(reinterpret_cast<const __m128i*>(psrc1 + x)));
        __m128i vsrc2 = _mm_sub_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i*>(psrc2 + x + 2)),
                                     _mm_loadu_si128(reinterpret_cast<const __m128i*>(psrc2 + x)));
        _mm_storeu_si128(reinterpret_cast<__m128i*>(pdst + x),
                         _mm_add_epi8(vsrc0, _mm_add_epi8(vsrc1, vsrc2)));
    }
#elif CV_NEON
    for (; x <= dst.cols - 16; x += 16) {
        uint8x16_t vsrc0 = vsubq_u8(vld1q_u8(psrc0 + x + 2), vld1q_u8(psrc0 + x));
        uint8x16_t vsrc1 = vsubq_u8(vld1q_u8(psrc1 + x + 2), vld1q_u8(psrc1 + x));
        uint8x16_t vsrc2 = vsubq_u8(vld1q_u8(psrc2 + x + 2), vld1q_u8(psrc2 + x));
        vst1q_u8(pdst + x, vaddq_u8(vsrc0, vaddq_u8(vsrc1, vsrc2)));
    }
#elif CV_VSX
    // NOTE(review): ld/st are used here as in the original snippet; confirm
    // they are the intended VSX load/store helpers for the target toolchain.
    for (; x <= dst.cols - 16; x += 16) {
        vec_uchar16 vsrc0 = vec_sub(ld(0, psrc0 + x + 2), ld(0, psrc0 + x));
        vec_uchar16 vsrc1 = vec_sub(ld(0, psrc1 + x + 2), ld(0, psrc1 + x));
        vec_uchar16 vsrc2 = vec_sub(ld(0, psrc2 + x + 2), ld(0, psrc2 + x));
        st(vec_add(vsrc0, vec_add(vsrc1, vsrc2)), 0, pdst + x);
    }
#elif CV_MSA
    for (; x <= dst.cols - 16; x += 16) {
        v16u8 vsrc0 = msa_subq_u8(msa_ld1q_u8(psrc0 + x + 2), msa_ld1q_u8(psrc0 + x));
        v16u8 vsrc1 = msa_subq_u8(msa_ld1q_u8(psrc1 + x + 2), msa_ld1q_u8(psrc1 + x));
        v16u8 vsrc2 = msa_subq_u8(msa_ld1q_u8(psrc2 + x + 2), msa_ld1q_u8(psrc2 + x));
        msa_st1q_u8(pdst + x, msa_addq_u8(vsrc0, msa_addq_u8(vsrc1, vsrc2)));
    }
#elif CV_WASM
    // Wrapping 8-bit add/sub are spelled with the i8x16 prefix in
    // wasm_simd128.h (the u8x16 prefix exists only for the saturating forms).
    for (; x <= dst.cols - 16; x += 16) {
        v128_t vsrc0 = wasm_i8x16_sub(wasm_v128_load(psrc0 + x + 2), wasm_v128_load(psrc0 + x));
        v128_t vsrc1 = wasm_i8x16_sub(wasm_v128_load(psrc1 + x + 2), wasm_v128_load(psrc1 + x));
        v128_t vsrc2 = wasm_i8x16_sub(wasm_v128_load(psrc2 + x + 2), wasm_v128_load(psrc2 + x));
        wasm_v128_store(pdst + x, wasm_i8x16_add(vsrc0, wasm_i8x16_add(vsrc1, vsrc2)));
    }
#endif
    // Scalar tail: finishes the pixels no SIMD section could cover.
    for (; x < dst.cols; ++x) {
        pdst[x] = psrc0[x + 2] - psrc0[x] +
                  psrc1[x + 2] - psrc1[x] +
                  psrc2[x + 2] - psrc2[x];
    }
}

( parallel_for_ ):

input:  cv::Mat (single channel, uint8_t)
output: cv::Mat (single channel, int32_t)

out = (Gx)^2 + (Gy)^2, where

      | +1   0  |             |  0  +1 |
Gx =  |  0  -1  | * A,   Gy = | -1   0 | * A

: https://github.com/dkurt/cv_winter_camp_2020

: https://gitpitch.com/dkurt/cv_winter_camp_2020

x2 , .

تحسينات OpenCV عبر الأنظمة الأساسية

OpenCV

parallel_for_

Universal Intrinsics

More articles: