static check

fix: mismatch faction sign
add cmd
2025-05-08 17:56:22 +08:00 · 2025-03-27 13:43:37 +08:00 · 2025-03-25 17:56:02 +08:00 · 2025-03-10 09:10:32 +08:00 · 2025-03-02 12:47:11 +08:00 · 2025-02-27 21:04:58 +08:00
10 changed files with 450 additions and 294 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,7 +1,7 @@
 cmake_minimum_required(VERSION 3.12)

 project(match)
-find_package(OpenCV REQUIRED)
+find_package(OpenCV 4.8 REQUIRED)

 option(ENABLE_OPENMP "enable openmp" OFF)
 if(ENABLE_OPENMP)
@ -24,6 +24,8 @@ add_library(algo SHARED
    serialize.cpp
    privateType.h
    apiExport.h
+    integral.h
+    integral.cpp
 )

 target_include_directories(algo PRIVATE ${OpenCV_INCLUDE_DIRS})
@ -32,6 +34,7 @@ target_compile_options(algo PRIVATE
        $<$<CXX_COMPILER_ID:MSVC>:/W4 /WX /external:W0>
        $<$<STREQUAL:${CMAKE_SYSTEM_NAME},Linux>: -fPIC -fvisibility=hidden -Wl,--exclude-libs,ALL -Wall -Wextra -Wpedantic -Wmisleading-indentation -Wunused -Wuninitialized -Wshadow -Wconversion -Werror>
        $<$<AND:$<CXX_COMPILER_ID:Clang>,$<STREQUAL:${CMAKE_SYSTEM_NAME},Windows>>:/W4 /WX /external:W0>
+        #$<$<AND:$<CXX_COMPILER_ID:Clang>,$<STREQUAL:${CMAKE_SYSTEM_NAME},Darwin>>: -fPIC -fvisibility=hidden -Wall -Wextra -Wpedantic -Wmisleading-indentation -Wunused -Wuninitialized -Wshadow -Wconversion -Werror>
        $<$<STREQUAL:${CMAKE_SYSTEM_PROCESSOR},loongarch64>:-mlsx>
 )
 target_compile_definitions(algo PUBLIC API_EXPORTS
--- a/README.md
+++ b/README.md
@ -1,11 +1,14 @@
 # Template match with gray model(ncc)

+## Note: branch [feature-rotate-model](https://github.com/SurfaceMan/gray_match/tree/feature-rotate-model) method 2x faster at matching!
+
 ## highlights:
- 1. original code based [Fastest_Image_Pattern_Matching](https://github.com/DennisLiu1993/Fastest_Image_Pattern_Matching), you can checkout tag [v1.0](https://github.com/SurfaceMan/gray_match/releases/tag/v1.0) for more details.
+ 1. original code based [Fastest_Image_Pattern_Matching](https://github.com/DennisLiu1993/Fastest_Image_Pattern_Matching), you can check out tag [v1.0](https://github.com/SurfaceMan/gray_match/releases/tag/v1.0) for more details.
 2. refactor simd match process with opencv [Universal intrinsics](https://docs.opencv.org/4.x/df/d91/group__core__hal__intrin.html), have be tested on x86_64(sse),arm(neon),LoongArch(lsx).
 3. support model save/load as binary file
 4. provide pure c interface
 5. support openmp
+ 6. position with 3x3 subpixel interpolation
 
 ## usage:
 all you need can be found in [main.cpp](main.cpp)
--- a/TODO.md
+++ b/TODO.md
@ -0,0 +1 @@
+1. opencv intrinsics api [changed](https://github.com/opencv/opencv/pull/24371) since 4.9, need refactor 
--- a/grayMatch.cpp
+++ b/grayMatch.cpp
@ -211,8 +211,10 @@ cv::Size computeRotationSize(const cv::Size &dstSize, const cv::Size &templateSi
 }

 #ifdef CV_SIMD
+#include "integral.h"
+
 void matchTemplateSimd(const cv::Mat &src, const cv::Mat &templateImg, cv::Mat &result) {
-    result = cv::Mat(src.size() - templateImg.size() + cv::Size(1, 1), CV_32FC1);
+    result.create(src.size() - templateImg.size() + cv::Size(1, 1), CV_32FC1);

    auto       *resultStart = result.ptr<float>();
    const auto  resultStep  = result.step[ 0 ] / result.step[ 1 ];
@ -220,195 +222,50 @@ void matchTemplateSimd(const cv::Mat &src, const cv::Mat &templateImg, cv::Mat &
    const auto  srcStep     = src.step[ 0 ];
    const auto *temStart    = templateImg.ptr<uchar>();
    const auto  temStep     = templateImg.step[ 0 ];
+
+    std::vector<cv::v_uint32> tmp(result.cols, cv::v_setzero_u32());
    for (int y = 0; y < result.rows; y++) {
        auto *resultPtr = resultStart + resultStep * y;
-        for (int x = 0; x < result.cols; x++) {
-            auto vSum = cv::vx_setall_u32(0);
-            for (int templateRow = 0; templateRow < templateImg.rows; templateRow++) {
-                auto *srcPtr = srcStart + srcStep * (y + templateRow) + x;
-                auto *temPtr = temStart + temStep * templateRow;

-                for (int i = 0; i < templateImg.cols; i += cv::v_uint8::nlanes) {
-                    auto vTem = cv::v_load_aligned(temPtr + i);
-                    auto vSrc = cv::v_load(srcPtr + i);
+        for (int templateRow = 0; templateRow < templateImg.rows; templateRow++) {
+            auto *temPtr = temStart + temStep * templateRow;

-#ifdef __aarch64__
-                    cv::v_uint16 vDot1;
-                    cv::v_uint16 vDot2;
-                    cv::v_uint32 v1;
-                    cv::v_uint32 v2;
-                    cv::v_uint32 v3;
-                    cv::v_uint32 v4;
-                    cv::v_mul_expand(vTem, vSrc, vDot1, vDot2);
-                    cv::v_expand(vDot1, v1, v2);
-                    cv::v_expand(vDot2, v3, v4);
-                    vSum += v1 + v2 + v3 + v4;
-#else
-                    vSum += cv::v_dotprod_expand(vSrc, vTem);
-#endif
+            for (int i = 0; i < templateImg.cols; i += simdSize(cv::v_uint8)) {
+                auto  vTem   = cv::v_load_aligned(temPtr + i);
+                auto *srcPtr = srcStart + srcStep * (y + templateRow) + i;
+
+                int x = 0;
+                for (int n = 0; n < result.cols / 4; n++) {
+                    auto vSrc = cv::v_load(srcPtr + x);
+                    tmp[ x ]  = cv::v_add(tmp[ x ], cv::v_dotprod_expand_fast(vSrc, vTem));
+                    x++;
+
+                    vSrc     = cv::v_load(srcPtr + x);
+                    tmp[ x ] = cv::v_add(tmp[ x ], cv::v_dotprod_expand_fast(vSrc, vTem));
+                    x++;
+
+                    vSrc     = cv::v_load(srcPtr + x);
+                    tmp[ x ] = cv::v_add(tmp[ x ], cv::v_dotprod_expand_fast(vSrc, vTem));
+                    x++;
+
+                    vSrc     = cv::v_load(srcPtr + x);
+                    tmp[ x ] = cv::v_add(tmp[ x ], cv::v_dotprod_expand_fast(vSrc, vTem));
+                    x++;
+                }
+                for (; x < result.cols; x++) {
+                    auto vSrc = cv::v_load(srcPtr + x);
+                    tmp[ x ]  = cv::v_add(tmp[ x ], cv::v_dotprod_expand_fast(vSrc, vTem));
                }
            }
+        }

-            auto sum       = cv::v_reduce_sum(vSum);
+        for (int x = 0; x < result.cols; x++) {
+            const auto sum = cv::v_reduce_sum(tmp[ x ]);
+            tmp[ x ]       = cv::v_setzero_u32();
            resultPtr[ x ] = static_cast<float>(sum);
        }
    }
 }
-
-inline void integralSum(const cv::v_uint16 &src, double *dst, double *prevDst, cv::v_uint32 &pre) {
-    auto sum  = src + cv::v_rotate_left<1>(src);
-    sum      += cv::v_rotate_left<2>(sum);
-    sum      += cv::v_rotate_left<4>(sum);
-
-    cv::v_uint32 v1;
-    cv::v_uint32 v2;
-    cv::v_expand(sum, v1, v2);
-    v1  += pre;
-    v2  += pre;
-    pre  = cv::v_setall_u32(cv::v_extract_n<cv::v_uint32::nlanes - 1>(v2));
-
-    cv::v_uint64 v3;
-    cv::v_uint64 v4;
-    cv::v_expand(v1, v3, v4);
-    cv::v_store(dst, cv::v_cvt_f64(cv::v_reinterpret_as_s64(v3)) + cv::v_load(prevDst));
-    cv::v_store(dst + cv::v_float64::nlanes, cv::v_cvt_f64(cv::v_reinterpret_as_s64(v4)) +
-                                                 cv::v_load(prevDst + cv::v_float64::nlanes));
-
-    cv::v_expand(v2, v3, v4);
-    cv::v_store(dst + cv::v_float64::nlanes * 2,
-                cv::v_cvt_f64(cv::v_reinterpret_as_s64(v3)) +
-                    cv::v_load(prevDst + cv::v_float64::nlanes * 2));
-    cv::v_store(dst + cv::v_float64::nlanes * 3,
-                cv::v_cvt_f64(cv::v_reinterpret_as_s64(v4)) +
-                    cv::v_load(prevDst + cv::v_float64::nlanes * 3));
-}
-
-inline void integralSqSum(cv::v_uint16 &src, double *dst, double *prevDst, cv::v_uint32 &pre) {
-    cv::v_uint32 v1;
-    cv::v_uint32 v2;
-    cv::v_expand(src, v1, v2);
-
-    {
-        auto         shift1 = cv::v_rotate_left<1>(src);
-        cv::v_uint32 v3;
-        cv::v_uint32 v4;
-        cv::v_expand(shift1, v3, v4);
-
-        v1 += v3;
-        v2 += v4;
-
-        v4  = cv::v_extract<2>(v1, v2);
-        v2 += v4;
-
-        v3  = cv::v_rotate_left<2>(v1);
-        v1 += v3;
-
-        v1 += pre;
-        v2 += v1;
-
-        pre = cv::v_setall_u32(cv::v_extract_n<cv::v_uint32::nlanes - 1>(v2));
-    }
-
-    cv::v_uint64 v3;
-    cv::v_uint64 v4;
-    cv::v_expand(v1, v3, v4);
-    cv::v_store(dst, cv::v_cvt_f64(cv::v_reinterpret_as_s64(v3)) + cv::v_load(prevDst));
-    cv::v_store(dst + cv::v_float64::nlanes, cv::v_cvt_f64(cv::v_reinterpret_as_s64(v4)) +
-                                                 cv::v_load(prevDst + cv::v_float64::nlanes));
-
-    cv::v_expand(v2, v3, v4);
-    cv::v_store(dst + cv::v_float64::nlanes * 2,
-                cv::v_cvt_f64(cv::v_reinterpret_as_s64(v3)) +
-                    cv::v_load(prevDst + cv::v_float64::nlanes * 2));
-    cv::v_store(dst + cv::v_float64::nlanes * 3,
-                cv::v_cvt_f64(cv::v_reinterpret_as_s64(v4)) +
-                    cv::v_load(prevDst + cv::v_float64::nlanes * 3));
-}
-
-/*
-inline void integralSqSum(cv::v_uint32 &src, double *dst, double *prevDst, cv::v_uint32 &pre) {
-    src += cv::v_rotate_left<1>(src);
-    src += cv::v_rotate_left<2>(src);
-    src += pre;
-    pre  = cv::v_setall_u32(cv::v_extract_n<cv::v_uint32::nlanes - 1>(src));
-
-    cv::v_uint64 v1;
-    cv::v_uint64 v2;
-    cv::v_expand(src, v1, v2);
-
-    cv::v_store(dst, cv::v_cvt_f64(cv::v_reinterpret_as_s64(v1)) + cv::v_load(prevDst));
-    cv::v_store(dst + cv::v_float64::nlanes, cv::v_cvt_f64(cv::v_reinterpret_as_s64(v2)) +
-                                                 cv::v_load(prevDst + cv::v_float64::nlanes));
-}
-
-inline void integralSqSum(cv::v_uint16 &src, double *dst, double *prevDst, cv::v_uint32 &pre) {
-    cv::v_uint32 v1;
-    cv::v_uint32 v2;
-    cv::v_expand(src, v1, v2);
-    integralSqSum(v1, dst, prevDst, pre);
-    integralSqSum(v2, dst + cv::v_uint32::nlanes, prevDst + cv::v_uint32::nlanes, pre);
-}
-*/
-
-inline void integralSum(cv::v_uint16 &v1, cv::v_uint16 &v2, double *dst, double *prevDst,
-                        cv::v_uint32 &pre) {
-    integralSum(v1, dst, prevDst, pre);
-    integralSum(v2, dst + cv::v_uint16::nlanes, prevDst + cv::v_uint16::nlanes, pre);
-}
-
-inline void integralSqSum(cv::v_uint16 &v1, cv::v_uint16 &v2, double *dst, double *prevDst,
-                          cv::v_uint32 &pre) {
-    v1 = cv::v_mul_wrap(v1, v1);
-    v2 = cv::v_mul_wrap(v2, v2);
-
-    integralSqSum(v1, dst, prevDst, pre);
-    integralSqSum(v2, dst + cv::v_uint16::nlanes, prevDst + cv::v_uint16::nlanes, pre);
-}
-
-void integralSimd(const cv::Mat &src, cv::Mat &sum, cv::Mat &sqSum) {
-    const auto size = src.size() + cv::Size(1, 1);
-    sum.create(size, CV_64FC1);
-    sqSum.create(size, CV_64FC1);
-    memset(sum.data, 0, sum.rows * sum.step[ 0 ]);
-    memset(sqSum.data, 0, sqSum.rows * sqSum.step[ 0 ]);
-
-    const auto *srcStart   = src.ptr<uchar>();
-    const auto  srcStep    = src.step[ 0 ];
-    auto       *sumStart   = sum.ptr<double>(1) + 1;
-    const auto  sumStep    = sum.step[ 0 ] / sum.step[ 1 ];
-    auto       *sqSumStart = sqSum.ptr<double>(1) + 1;
-    const auto  sqSumStep  = sqSum.step[ 0 ] / sqSum.step[ 1 ];
-
-    for (int y = 0; y < src.rows; y++) {
-        auto *srcPtr      = srcStart + srcStep * y;
-        auto *sumPtr      = sumStart + sumStep * y;
-        auto *sqSumPtr    = sqSumStart + sqSumStep * y;
-        auto *preSumPtr   = sumStart + sumStep * (y - 1);
-        auto *preSqSumPtr = sqSumStart + sqSumStep * (y - 1);
-
-        cv::v_uint32 prevSum   = cv::vx_setzero_u32();
-        cv::v_uint32 prevSqSum = cv::vx_setzero_u32();
-        int          x         = 0;
-        for (; x < src.cols - cv::v_uint8::nlanes; x += cv::v_uint8::nlanes) {
-            cv::v_uint16 v1;
-            cv::v_uint16 v2;
-            cv::v_expand(cv::v_load(srcPtr + x), v1, v2);
-
-            integralSum(v1, v2, sumPtr + x, preSumPtr + x, prevSum);
-            integralSqSum(v1, v2, sqSumPtr + x, preSqSumPtr + x, prevSqSum);
-        }
-        auto prevSum2   = cv::v_extract_n<cv::v_uint32::nlanes - 1>(prevSum);
-        auto prevSqSum2 = cv::v_extract_n<cv::v_uint32::nlanes - 1>(prevSqSum);
-        for (; x < src.cols; x++) {
-            auto val       = srcPtr[ x ];
-            auto sqVal     = val * val;
-            prevSum2      += val;
-            sumPtr[ x ]    = prevSum2 + preSumPtr[ x ];
-            prevSqSum2    += sqVal;
-            sqSumPtr[ x ]  = prevSqSum2 + preSqSumPtr[ x ];
-        }
-    }
-}
 #endif

 void ccoeffDenominator(const cv::Mat &src, const cv::Size &templateSize, cv::Mat &result,
@ -427,32 +284,33 @@ void ccoeffDenominator(const cv::Mat &src, const cv::Size &templateSize, cv::Mat
    cv::integral(src, sum, sqSum, CV_64F);
 #endif

-    const auto *q0 = sqSum.ptr<double>(0);
+    const auto *q0 = reinterpret_cast<double *>(sqSum.data);
    const auto *q1 = q0 + templateSize.width;
-    const auto *q2 = sqSum.ptr<double>(templateSize.height);
+    const auto *q2 = reinterpret_cast<double *>(sqSum.data) + sqSum.step1() * templateSize.height;
    const auto *q3 = q2 + templateSize.width;

-    const auto *p0 = sum.ptr<double>(0);
+    const auto *p0 = reinterpret_cast<double *>(sum.data);
    const auto *p1 = p0 + templateSize.width;
-    const auto *p2 = sum.ptr<double>(templateSize.height);
+    const auto *p2 = reinterpret_cast<double *>(sum.data) + sum.step1() * templateSize.height;
    const auto *p3 = p2 + templateSize.width;

-    const auto step = sum.step / sizeof(double);
+    const auto step           = sum.step / sizeof(double);
+    auto      *resultStartPtr = reinterpret_cast<float *>(result.data);

    for (int y = 0; y < result.rows; y++) {
-        auto *scorePtr = result.ptr<float>(y);
+        auto *scorePtr = resultStartPtr + y * result.step1();
        auto  idx      = y * step;
        for (int x = 0; x < result.cols; x++, idx++) {
            auto      &score     = scorePtr[ x ];
            const auto partSum   = p0[ idx ] - p1[ idx ] - p2[ idx ] + p3[ idx ];
            const auto numerator = score - partSum * mean;

-            auto partSqSum    = q0[ idx ] - q1[ idx ] - q2[ idx ] + q3[ idx ];
-            auto partSqNormal = partSqSum - partSum * partSum * invArea;
+            const auto partSqSum    = q0[ idx ] - q1[ idx ] - q2[ idx ] + q3[ idx ];
+            auto       partSqNormal = partSqSum - partSum * partSum * invArea;

-            const auto diff = std::max(partSqNormal, 0.);
-            double     denominator =
-                (diff <= std::min(0.5, 10 * FLT_EPSILON * partSqSum)) ? 0 : sqrt(diff) * normal;
+            const auto   diff = std::max(partSqNormal, 0.);
+            const double denominator =
+                diff <= std::min(0.5, 10 * FLT_EPSILON * partSqSum) ? 0 : sqrt(diff) * normal;

            if (abs(numerator) < denominator) {
                score = static_cast<float>(numerator / denominator);
@ -507,23 +365,24 @@ void nextMaxLoc(cv::Mat &score, const cv::Point &pos, const cv::Size templateSiz
 }

 inline cv::Mat getRotationMatrix2D(const cv::Point2f &center, double angle) {
-    angle        *= CV_PI / 180;
-    double alpha  = std::cos(angle);
-    double beta   = std::sin(angle);
+    angle              *= CV_PI / 180;
+    const double alpha  = std::cos(angle);
+    const double beta   = std::sin(angle);

-    cv::Mat rotate(2, 3, CV_64FC1);
-    auto    ptr = rotate.ptr<double>();
-    ptr[ 0 ]    = alpha;
-    ptr[ 1 ]    = beta;
-    ptr[ 2 ]    = (1 - alpha) * center.x - beta * center.y;
-    ptr[ 3 ]    = -beta;
-    ptr[ 4 ]    = alpha;
-    ptr[ 5 ]    = beta * center.x + (1 - alpha) * center.y;
+    cv::Mat    rotate(2, 3, CV_64FC1);
+    const auto ptr = rotate.ptr<double>();
+    ptr[ 0 ]       = alpha;
+    ptr[ 1 ]       = beta;
+    ptr[ 2 ]       = (1 - alpha) * center.x - beta * center.y;
+    ptr[ 3 ]       = -beta;
+    ptr[ 4 ]       = alpha;
+    ptr[ 5 ]       = beta * center.x + (1 - alpha) * center.y;

    return rotate;
 }

-inline cv::Point2d transform(const cv::Point2d &point, const cv::Point &center, double angle) {
+inline cv::Point2d transform(const cv::Point2d &point, const cv::Point &center,
+                             const double angle) {
    const auto rotate = getRotationMatrix2D(center, angle);

    return transform(point, rotate);
@ -617,7 +476,7 @@ Model *trainModel(const cv::Mat &src, int level) {
    model.reserve(pyramids.size());

    for (const auto &pyramid : pyramids) {
-        int     alignedWidth = static_cast<int>(cv::alignSize(pyramid.cols, cv::v_uint8::nlanes));
+        int     alignedWidth = static_cast<int>(cv::alignSize(pyramid.cols, simdSize(cv::v_uint8)));
        auto    img          = cv::Mat::zeros(pyramid.rows, alignedWidth, CV_8UC1);
        cv::Mat sub          = img(cv::Rect(0, 0, pyramid.cols, pyramid.rows));
        for (int y = 0; y < pyramid.rows; y++) {
@ -664,7 +523,7 @@ std::vector<Candidate> matchTopLevel(const cv::Mat &dstTop, double startAngle, d
    auto        angleStep         = sizeAngleStep(templateTop.size());
    cv::Point2d center            = sizeCenter(dstTop.size());
    const auto  topScoreThreshold = minScore * pow(0.9, level);
-    bool calMaxByBlock = (dstTop.size().area() / templateTop.size().area() > 500) && maxCount > 10;
+    bool calMaxByBlock = dstTop.size().area() / templateTop.size().area() > 500 && maxCount > 10;

    const auto count = static_cast<int>(spanAngle / angleStep) + 1;
 #pragma omp parallel for reduction(combine : candidates)
@ -728,13 +587,55 @@ std::vector<Candidate> matchTopLevel(const cv::Mat &dstTop, double startAngle, d
    return candidates;
 }

+cv::Point2f computeSubpixel(const cv::Mat &score) {
+    cv::Point2f result(0, 0);
+    auto       *mag = score.ptr<float>();
+
+    const auto gx  = (-mag[ 0 ] + mag[ 2 ] - mag[ 3 ] + mag[ 5 ] - mag[ 6 ] + mag[ 8 ]) / 3.0f;
+    const auto gy  = (mag[ 6 ] + mag[ 7 ] + mag[ 8 ] - mag[ 0 ] - mag[ 1 ] - mag[ 2 ]) / 3.0f;
+    const auto gxx = (mag[ 0 ] - 2.0f * mag[ 1 ] + mag[ 2 ] + mag[ 3 ] - 2.0f * mag[ 4 ] +
+                      mag[ 5 ] + mag[ 6 ] - 2.0f * mag[ 7 ] + mag[ 8 ]) /
+                     6.0f;
+    const auto gxy = (-mag[ 0 ] + mag[ 2 ] + mag[ 6 ] - mag[ 8 ]) / 2.0f;
+    const auto gyy = (mag[ 0 ] + mag[ 1 ] + mag[ 2 ] - 2.0f * (mag[ 3 ] + mag[ 4 ] + mag[ 5 ]) +
+                      mag[ 6 ] + mag[ 7 ] + mag[ 8 ]) /
+                     6.0f;
+
+    cv::Mat hessian(2, 2, CV_32FC1);
+    hessian.at<float>(0, 0) = gxx;
+    hessian.at<float>(0, 1) = gxy;
+    hessian.at<float>(1, 0) = gxy;
+    hessian.at<float>(1, 1) = gyy;
+
+    cv::Mat eigenvalue;
+    cv::Mat eigenvector;
+    cv::eigen(hessian, eigenvalue, eigenvector);
+    float nx, ny;
+    if (fabs(eigenvalue.at<float>(0, 0)) >= fabs(eigenvalue.at<float>(1, 0))) {
+        nx = eigenvector.at<float>(0, 0);
+        ny = eigenvector.at<float>(0, 1);
+    } else {
+        nx = eigenvector.at<float>(1, 0);
+        ny = eigenvector.at<float>(1, 1);
+    }
+
+    const auto denominator = gxx * nx * nx + 2 * gxy * nx * ny + gyy * ny * ny;
+    if (denominator != 0.0) {
+        const auto t = -(gx * nx + gy * ny) / denominator;
+        result.x     = t * nx;
+        result.y     = t * ny;
+    }
+
+    return result;
+}
+
 std::vector<Candidate> matchDownLevel(const std::vector<cv::Mat>   &pyramids,
                                      const std::vector<Candidate> &candidates, double minScore,
                                      int subpixel, const Model *model, int level) {
    std::vector<Candidate> levelMatched;
    auto                   count = static_cast<int>(candidates.size());

-#pragma omp parallel for reduction(combine : levelMatched)
+#pragma omp parallel for reduction(combine : levelMatched) schedule(dynamic)
    for (int index = 0; index < count; index++) {
        auto pose    = candidates[ index ];
        bool matched = true;
@ -790,7 +691,9 @@ std::vector<Candidate> matchDownLevel(const std::vector<cv::Mat>   &pyramids,
            }

            if (!newScoreRect.empty()) {
-                // TODO subpixel
+                auto offset         = computeSubpixel(newScoreRect);
+                newCandidate.pos.x += offset.x;
+                newCandidate.pos.y += offset.y;
            }

            // back to
@ -899,8 +802,9 @@ Model_t trainModel(const unsigned char *data, int width, int height, int channel
        return nullptr;
    }

-    auto    type = channels == 1 ? CV_8UC1 : (channels == 3 ? CV_8UC3 : CV_8UC4);
-    cv::Mat img(cv::Size(width, height), type, const_cast<unsigned char *>(data), bytesPerLine);
+    const auto    type = channels == 1 ? CV_8UC1 : channels == 3 ? CV_8UC3 : CV_8UC4;
+    const cv::Mat img(cv::Size(width, height), type, const_cast<unsigned char *>(data),
+                      bytesPerLine);

    cv::Mat src;
    if (1 == channels) {
@ -909,9 +813,9 @@ Model_t trainModel(const unsigned char *data, int width, int height, int channel
        cv::cvtColor(img, src, channels == 3 ? cv::COLOR_RGB2GRAY : cv::COLOR_RGBA2GRAY);
    }

-    cv::Rect rect(roiLeft, roiTop, roiWidth, roiHeight);
-    cv::Rect imageRect(0, 0, width, height);
-    auto     roi = rect & imageRect;
+    const cv::Rect rect(roiLeft, roiTop, roiWidth, roiHeight);
+    const cv::Rect imageRect(0, 0, width, height);
+    const auto     roi = rect & imageRect;
    if (roi.empty()) {
        return nullptr;
    }
@ -920,7 +824,7 @@ Model_t trainModel(const unsigned char *data, int width, int height, int channel
 }

 void matchModel(const unsigned char *data, int width, int height, int channels, int bytesPerLine,
-                int roiLeft, int roiTop, int roiWidth, int roiHeight, const Model_t model,
+                int roiLeft, int roiTop, int roiWidth, int roiHeight, Model *const model,
                int *count, Pose *poses, int level, double startAngle, double spanAngle,
                double maxOverlap, double minScore, int subpixel) {
    if (nullptr == count) {
@ -937,8 +841,9 @@ void matchModel(const unsigned char *data, int width, int height, int channels,
        return;
    }

-    auto    type = channels == 1 ? CV_8UC1 : (channels == 3 ? CV_8UC3 : CV_8UC4);
-    cv::Mat img(cv::Size(width, height), type, const_cast<unsigned char *>(data), bytesPerLine);
+    const auto    type = channels == 1 ? CV_8UC1 : channels == 3 ? CV_8UC3 : CV_8UC4;
+    const cv::Mat img(cv::Size(width, height), type, const_cast<unsigned char *>(data),
+                      bytesPerLine);

    cv::Mat dst;
    if (1 == channels) {
@ -947,18 +852,18 @@ void matchModel(const unsigned char *data, int width, int height, int channels,
        cv::cvtColor(img, dst, channels == 3 ? cv::COLOR_RGB2GRAY : cv::COLOR_RGBA2GRAY);
    }

-    cv::Rect rect(roiLeft, roiTop, roiWidth, roiHeight);
-    cv::Rect imageRect(0, 0, width, height);
-    auto     roi = rect & imageRect;
+    const cv::Rect rect(roiLeft, roiTop, roiWidth, roiHeight);
+    const cv::Rect imageRect(0, 0, width, height);
+    const auto     roi = rect & imageRect;
    if (roi.empty()) {
        *count = 0;
        return;
    }

-    auto result = matchModel(dst(roi), model, level, startAngle, spanAngle, maxOverlap, minScore,
-                             *count, subpixel);
+    const auto result = matchModel(dst(roi), model, level, startAngle, spanAngle, maxOverlap,
+                                   minScore, *count, subpixel);

-    auto size = std::min(*count, static_cast<int>(result.size()));
+    const auto size = std::min(*count, static_cast<int>(result.size()));
    for (int i = 0; i < size; i++) {
        const auto &pose = result[ i ];
        poses[ i ]       = {pose.x + static_cast<float>(roi.x), pose.y + static_cast<float>(roi.y),
@ -977,7 +882,7 @@ void freeModel(Model_t *model) {
    *model = nullptr;
 }

-int modelLevel(const Model *model) {
+int modelLevel(Model *const model) {
    if (nullptr == model) {
        return 0;
    }
@ -985,7 +890,7 @@ int modelLevel(const Model *model) {
    return static_cast<int>(model->pyramids.size());
 }

-void modelImage(const Model *model, int level, unsigned char *data, int length, int *width,
+void modelImage(Model *const model, int level, unsigned char *data, int length, int *width,
                int *height, int *channels) {
    if (nullptr == model) {
        return;
--- a/grayMatch.h
+++ b/grayMatch.h
@ -55,16 +55,15 @@ API_PUBLIC Model_t trainModel(const unsigned char *data, int width, int height,
 */
 API_PUBLIC void matchModel(const unsigned char *data, int width, int height, int channels,
                           int bytesPerLine, int roiLeft, int roiTop, int roiWidth, int roiHeight,
-                           const Model_t model, int *count, Pose *poses, int level,
-                           double startAngle, double spanAngle, double maxOverlap, double minScore,
-                           int subpixel);
+                           Model_t model, int *count, Pose *poses, int level, double startAngle,
+                           double spanAngle, double maxOverlap, double minScore, int subpixel);

 /**
 * @brief get trained model levels
 * @param model
 * @return pyramid level
 */
-API_PUBLIC int modelLevel(const Model_t model);
+API_PUBLIC int modelLevel(Model_t model);

 /**
 * @brief get trained model image
@ -77,8 +76,8 @@ API_PUBLIC int modelLevel(const Model_t model);
 * @param channels image channels, can input nullptr
 * @return
 */
-API_PUBLIC void modelImage(const Model_t model, int level, unsigned char *data, int length,
-                           int *width, int *height, int *channels);
+API_PUBLIC void modelImage(Model_t model, int level, unsigned char *data, int length, int *width,
+                           int *height, int *channels);

 /**
 * @brief free model
@ -94,7 +93,7 @@ API_PUBLIC void freeModel(Model_t *model);
 * @param size in(buffer size)/out(written size)
 * @return true(success)false(failed)
 */
-API_PUBLIC bool serialize(const Model_t model, unsigned char *buffer, int *size);
+API_PUBLIC bool serialize(Model_t model, unsigned char *buffer, int *size);

 /**
 * @brief deserialize model
--- a/integral.cpp
+++ b/integral.cpp
@ -0,0 +1,180 @@
+#include "integral.h"
+#include "privateType.h"
+
+#include <opencv2/core/hal/intrin.hpp>
+
+inline void expand(const cv::v_int32 &src, cv::v_float64 &low, cv::v_float64 &high) {
+    low  = cv::v_cvt_f64(src);
+    high = cv::v_cvt_f64_high(src);
+}
+
+inline void integralSum(const cv::v_uint16 &src, double *dst, const double *prevDst,
+                        cv::v_uint32 &pre) {
+    auto sum = cv::v_add(src, cv::v_rotate_left<1>(src));
+    sum      = cv::v_add(sum, cv::v_rotate_left<2>(sum));
+    sum      = cv::v_add(sum, cv::v_rotate_left<4>(sum));
+
+    cv::v_uint32 v1;
+    cv::v_uint32 v2;
+    cv::v_expand(sum, v1, v2);
+    v1  = cv::v_add(v1, pre);
+    v2  = cv::v_add(v2, pre);
+    pre = cv::v_setall_u32(cv::v_extract_n<simdSize(cv::v_uint32) - 1>(v2));
+
+    cv::v_float64 v3;
+    cv::v_float64 v4;
+    expand(cv::v_reinterpret_as_s32(v1), v3, v4);
+    cv::v_store(dst, cv::v_add(v3, cv::v_load(prevDst)));
+    cv::v_store(dst + simdSize(cv::v_float64),
+                cv::v_add(v4, cv::v_load(prevDst + simdSize(cv::v_float64))));
+
+    expand(cv::v_reinterpret_as_s32(v2), v3, v4);
+    cv::v_store(dst + simdSize(cv::v_float64) * 2,
+                cv::v_add(v3, cv::v_load(prevDst + simdSize(cv::v_float64) * 2)));
+    cv::v_store(dst + simdSize(cv::v_float64) * 3,
+                cv::v_add(v4, cv::v_load(prevDst + simdSize(cv::v_float64) * 3)));
+}
+
+inline void integralSqSum(cv::v_uint16 &src, double *dst, double *prevDst, cv::v_uint32 &pre) {
+    cv::v_uint32 v1;
+    cv::v_uint32 v2;
+    cv::v_expand(src, v1, v2);
+
+    {
+        auto         shift1 = cv::v_rotate_left<1>(src);
+        cv::v_uint32 v3;
+        cv::v_uint32 v4;
+        cv::v_expand(shift1, v3, v4);
+
+        v1 = cv::v_add(v1, v3);
+        v2 = cv::v_add(v2, v4);
+
+        v4 = cv::v_extract<2>(v1, v2);
+        v2 = cv::v_add(v2, v4);
+
+        v3 = cv::v_rotate_left<2>(v1);
+        v1 = cv::v_add(v1, v3);
+
+        v1 = cv::v_add(v1, pre);
+        v2 = cv::v_add(v2, v1);
+
+        pre = cv::v_setall_u32(cv::v_extract_n<simdSize(cv::v_uint32) - 1>(v2));
+    }
+
+    cv::v_float64 v3;
+    cv::v_float64 v4;
+    expand(cv::v_reinterpret_as_s32(v1), v3, v4);
+    cv::v_store(dst, cv::v_add(v3, cv::v_load(prevDst)));
+    cv::v_store(dst + simdSize(cv::v_float64),
+                cv::v_add(v4, cv::v_load(prevDst + simdSize(cv::v_float64))));
+
+    expand(cv::v_reinterpret_as_s32(v2), v3, v4);
+    cv::v_store(dst + simdSize(cv::v_float64) * 2,
+                cv::v_add(v3, cv::v_load(prevDst + simdSize(cv::v_float64) * 2)));
+    cv::v_store(dst + simdSize(cv::v_float64) * 3,
+                cv::v_add(v4, cv::v_load(prevDst + simdSize(cv::v_float64) * 3)));
+}
+
+/*
+inline void integralSqSum(cv::v_uint32 &src, double *dst, double *prevDst, cv::v_uint32 &pre) {
+    src += cv::v_rotate_left<1>(src);
+    src += cv::v_rotate_left<2>(src);
+    src += pre;
+    pre  = cv::v_setall_u32(cv::v_extract_n<simdSize(cv::v_uint32) - 1>(src));
+
+    cv::v_float64 v1;
+    cv::v_float64 v2;
+    expand(cv::v_reinterpret_as_s32(src), v1, v2);
+
+    cv::v_store(dst, v1 + cv::v_load(prevDst));
+    cv::v_store(dst + simdSize(cv::v_float64), v2 + cv::v_load(prevDst +
+simdSize(cv::v_float64)));
+}
+
+inline void integralSqSum(cv::v_uint16 &src, double *dst, double *prevDst, cv::v_uint32 &pre) {
+    cv::v_uint32 v1;
+    cv::v_uint32 v2;
+    cv::v_expand(src, v1, v2);
+    integralSqSum(v1, dst, prevDst, pre);
+    integralSqSum(v2, dst + simdSize(cv::v_uint32), prevDst + simdSize(cv::v_uint32),
+pre);
+}
+*/
+
+inline void integralSum(const cv::v_uint16 &v1, const cv::v_uint16 &v2, double *dst,
+                        const double *prevDst, cv::v_uint32 &pre) {
+    integralSum(v1, dst, prevDst, pre);
+    integralSum(v2, dst + simdSize(cv::v_uint16), prevDst + simdSize(cv::v_uint16), pre);
+}
+
+inline void integralSqSum(cv::v_uint16 &v1, cv::v_uint16 &v2, double *dst, double *prevDst,
+                          cv::v_uint32 &pre) {
+    v1 = cv::v_mul_wrap(v1, v1);
+    v2 = cv::v_mul_wrap(v2, v2);
+
+    integralSqSum(v1, dst, prevDst, pre);
+    integralSqSum(v2, dst + simdSize(cv::v_uint16), prevDst + simdSize(cv::v_uint16), pre);
+}
+
+void integralSimd(const cv::Mat &src, cv::Mat &sum, cv::Mat &sqSum) {
+    const auto size = src.size() + cv::Size(1, 1);
+    sum.create(size, CV_64FC1);
+    sqSum.create(size, CV_64FC1);
+    memset(sum.data, 0, sum.step[ 0 ]);
+    memset(sqSum.data, 0, sqSum.step[ 0 ]);
+
+    const auto *srcStart   = src.data;
+    const auto  srcStep    = src.step[ 0 ];
+    auto       *sumStart   = reinterpret_cast<double *>(sum.data) + sum.step1() + 1;
+    const auto  sumStep    = sum.step[ 0 ] / sum.step[ 1 ];
+    auto       *sqSumStart = reinterpret_cast<double *>(sqSum.data) + sqSum.step1() + 1;
+    const auto  sqSumStep  = sqSum.step[ 0 ] / sqSum.step[ 1 ];
+    const auto  end        = size.width - simdSize(cv::v_uint8);
+    for (int y = 0; y < src.rows; y++) {
+        auto       *srcPtr    = srcStart + srcStep * y;
+        auto       *sumPtr    = sumStart + sumStep * y;
+        const auto *preSumPtr = sumStart + sumStep * (y - 1);
+        sumPtr[ -1 ]          = 0;
+
+        cv::v_uint32 prevSum = cv::vx_setzero_u32();
+        for (int x = 0; x < end; x += simdSize(cv::v_uint8)) {
+            cv::v_uint16 v1;
+            cv::v_uint16 v2;
+            cv::v_expand(cv::v_load(srcPtr + x), v1, v2);
+
+            integralSum(v1, v2, sumPtr + x, preSumPtr + x, prevSum);
+        }
+    }
+
+    for (int y = 0; y < src.rows; y++) {
+        auto *srcPtr      = srcStart + srcStep * y;
+        auto *sqSumPtr    = sqSumStart + sqSumStep * y;
+        auto *preSqSumPtr = sqSumStart + sqSumStep * (y - 1);
+        sqSumPtr[ -1 ]    = 0;
+
+        cv::v_uint32 prevSqSum = cv::vx_setzero_u32();
+        for (int x = 0; x < end; x += simdSize(cv::v_uint8)) {
+            cv::v_uint16 v1;
+            cv::v_uint16 v2;
+            cv::v_expand(cv::v_load(srcPtr + x), v1, v2);
+
+            integralSqSum(v1, v2, sqSumPtr + x, preSqSumPtr + x, prevSqSum);
+        }
+    }
+
+    const auto start = src.cols - src.cols % simdSize(cv::v_uint8);
+    for (int y = 0; y < src.rows; y++) {
+        auto       *srcPtr      = srcStart + srcStep * y;
+        auto       *sumPtr      = sumStart + sumStep * y;
+        auto       *sqSumPtr    = sqSumStart + sqSumStep * y;
+        const auto *preSumPtr   = sumStart + sumStep * (y - 1);
+        const auto *preSqSumPtr = sqSumStart + sqSumStep * (y - 1);
+        for (int x = start; x < src.cols; x++) {
+            const auto val   = srcPtr[ x ];
+            const auto sqVal = val * val;
+
+            sumPtr[ x ]   = sumPtr[ x - 1 ] + val + preSumPtr[ x ] - preSumPtr[ x - 1 ];
+            sqSumPtr[ x ] = sqSumPtr[ x - 1 ] + sqVal + preSqSumPtr[ x ] - preSqSumPtr[ x - 1 ];
+        }
+    }
+}
--- a/integral.h
+++ b/integral.h
@ -0,0 +1,8 @@
+#ifndef INTEGRAL_H
+#define INTEGRAL_H
+
+#include <opencv2/opencv.hpp>
+
+void integralSimd(const cv::Mat &src, cv::Mat &sum, cv::Mat &sqSum);
+
+#endif // INTEGRAL_H
--- a/main.cpp
+++ b/main.cpp
@ -2,17 +2,46 @@

 #include <fstream>
 #include <iostream>
+#include <opencv2/core/utility.hpp>
 #include <opencv2/opencv.hpp>

-int main() {
-    auto src = cv::imread(std::string(IMG_DIR) + "/3.bmp", cv::IMREAD_GRAYSCALE);
-    auto dst = cv::imread(std::string(IMG_DIR) + "/h.bmp", cv::IMREAD_GRAYSCALE);
+int main(int argc, const char *argv[]) {
+    const std::string keys = "{model m || model image}"
+                             "{scene s || scene image}"
+                             "{view v || view result}"
+                             "{threshold t | 0.7 | match minium score}"
+                             "{bench b || match benchmark}"
+                             "{help h || print this help}";
+
+    cv::CommandLineParser cmd(argc, argv, keys);
+    if (!cmd.check()) {
+        cmd.printErrors();
+        return -1;
+    }
+
+    if (cmd.has("help")) {
+        cmd.printMessage();
+        return 0;
+    }
+
+    auto srcFile = std::string(IMG_DIR) + "/3.bmp";
+    auto dstFile = std::string(IMG_DIR) + "/h.bmp";
+    if (cmd.has("model"))
+        srcFile = cmd.get<std::string>("model");
+    if (cmd.has("scene"))
+        dstFile = cmd.get<std::string>("scene");
+
+    auto src = cv::imread(srcFile, cv::IMREAD_GRAYSCALE);
+    auto dst = cv::imread(dstFile, cv::IMREAD_GRAYSCALE);
+    if (src.empty() || dst.empty()) {
+        return -1;
+    }

    const std::string modelName("model.bin");
    {
        auto t0    = cv::getTickCount();
-        auto model = trainModel(src.data, src.cols, src.rows, src.channels(), int(src.step), 0, 0,
-                                src.cols, src.rows, -1);
+        auto model = trainModel(src.data, src.cols, src.rows, src.channels(),
+                                static_cast<int>(src.step), 0, 0, src.cols, src.rows, -1);
        auto t1    = cv::getTickCount();

        // get size
@ -28,16 +57,18 @@ int main() {
        if (!ofs.is_open()) {
            return -1;
        }
-        ofs.write((const char *)buffer.data(), size);
+        ofs.write(reinterpret_cast<const char *>(buffer.data()), size);

        freeModel(&model);

-        auto trainCost = double(t1 - t0) / cv::getTickFrequency();
+        auto trainCost = static_cast<double>(t1 - t0) / cv::getTickFrequency();
        std::cout << "train(s):" << trainCost << std::endl;
    }

-    int               num = 70;
-    std::vector<Pose> poses(num);
+    int               count = 70;
+    std::vector<Pose> poses(count);
+    Model_t           model;
+    auto              score = cmd.get<float>("threshold");
    {
        // open file
        std::ifstream ifs(modelName, std::ios::binary | std::ios::in);
@ -52,39 +83,59 @@ int main() {

        // read to buffer
        std::vector<uchar> buffer(size);
-        ifs.read((char *)buffer.data(), size);
+        ifs.read(reinterpret_cast<char *>(buffer.data()), size);

        // deserialize from buffer
-        auto model = deserialize(buffer.data(), int(buffer.size()));
+        model = deserialize(buffer.data(), static_cast<int>(buffer.size()));

        auto t2 = cv::getTickCount();
-        matchModel(dst.data, dst.cols, dst.rows, dst.channels(), int(dst.step), 0, 0, dst.cols,
-                   dst.rows, model, &num, poses.data(), -1, 0, 360, 0, 0.5, 1);
+        matchModel(dst.data, dst.cols, dst.rows, dst.channels(), static_cast<int>(dst.step), 0, 0,
+                   dst.cols, dst.rows, model, &count, poses.data(), -1, 0, 360, 0, score, 1);
        auto t3 = cv::getTickCount();

-        auto matchCost = double(t3 - t2) / cv::getTickFrequency();
+        auto matchCost = static_cast<double>(t3 - t2) / cv::getTickFrequency();
        std::cout << "match(s):" << matchCost << std::endl;
    }

-    cv::Mat color;
-    cv::cvtColor(dst, color, cv::COLOR_GRAY2RGB);
-    for (int i = 0; i < num; i++) {
-        auto           &pose = poses[ i ];
-        cv::RotatedRect rect(cv::Point2f(pose.x, pose.y), src.size(), -pose.angle);
-
-        cv::Point2f pts[ 4 ];
-        rect.points(pts);
-
-        cv::line(color, pts[ 0 ], pts[ 1 ], cv::Scalar(255, 0, 0), 1, cv::LINE_AA);
-        cv::line(color, pts[ 1 ], pts[ 2 ], cv::Scalar(255, 0, 0), 1, cv::LINE_AA);
-        cv::line(color, pts[ 2 ], pts[ 3 ], cv::Scalar(255, 0, 0), 1, cv::LINE_AA);
-        cv::line(color, pts[ 3 ], pts[ 0 ], cv::Scalar(255, 0, 0), 1, cv::LINE_AA);
-
+    for (int i = 0; i < count; i++) {
+        const auto &pose = poses[ i ];
        std::cout << pose.x << "," << pose.y << "," << pose.angle << "," << pose.score << std::endl;
    }

-    cv::imshow("img", color);
-    cv::waitKey();
+    if (cmd.has("bench")) {
+        constexpr int times = 100;
+
+        auto start = cv::getTickCount();
+        for (int i = 0; i < times; i++) {
+            matchModel(dst.data, dst.cols, dst.rows, dst.channels(), static_cast<int>(dst.step), 0,
+                       0, dst.cols, dst.rows, model, &count, poses.data(), -1, 0, 360, 0, score, 1);
+            count = 70;
+        }
+        auto end = cv::getTickCount();
+
+        const auto cost = static_cast<double>(end - start) / cv::getTickFrequency() / times;
+        std::cout << "match bench avg(s):" << cost << std::endl;
+    }
+
+    if (cmd.has("view")) {
+        cv::Mat color;
+        cv::cvtColor(dst, color, cv::COLOR_GRAY2RGB);
+        for (int i = 0; i < count; i++) {
+            const auto     &pose = poses[ i ];
+            cv::RotatedRect rect(cv::Point2f(pose.x, pose.y), src.size(), -pose.angle);
+
+            cv::Point2f pts[ 4 ];
+            rect.points(pts);
+
+            cv::line(color, pts[ 0 ], pts[ 1 ], cv::Scalar(255, 0, 0), 1, cv::LINE_AA);
+            cv::line(color, pts[ 1 ], pts[ 2 ], cv::Scalar(255, 0, 0), 1, cv::LINE_AA);
+            cv::line(color, pts[ 2 ], pts[ 3 ], cv::Scalar(255, 0, 0), 1, cv::LINE_AA);
+            cv::line(color, pts[ 3 ], pts[ 0 ], cv::Scalar(255, 0, 0), 1, cv::LINE_AA);
+        }
+
+        cv::imshow("img", color);
+        cv::waitKey();
+    }

    return 0;
 }
--- a/privateType.h
+++ b/privateType.h
@ -32,4 +32,10 @@ struct Model {
        mean.reserve(size);
        equal1.reserve(size);
    }
-};
+};
+
+#if CV_VERSION_MAJOR >= 4 && CV_VERSION_MINOR >= 8
+#define simdSize(type) cv::VTraits<type>::nlanes
+#else
+#define simdSize(type) type::nlanes
+#endif
--- a/serialize.cpp
+++ b/serialize.cpp
@ -5,7 +5,7 @@

 class Buffer {
 public:
-    Buffer(int size_, unsigned char *data_)
+    Buffer(const int size_, unsigned char *data_)
        : m_size(size_)
        , m_data(data_) {}

@ -35,29 +35,29 @@ protected:
    unsigned char *m_data = nullptr;
 };

-void binWrite(void *dst, void *src, int size) {
+void binWrite(void *const dst, const void *src, const int size) {
    memcpy(dst, src, size);
 }

-void fakeWrite(void *dst, void *src, int size) {
-    (void)(dst);
-    (void)(src);
-    (void)(size);
+void fakeWrite(void *const dst, const void *src, const int size) {
+    (void)dst;
+    (void)src;
+    (void)size;
 }

-using Write = void (*)(void *, void *, int);
+using Write = void (*)(void *, const void *, int);

-template <Write write> class OutBuffer : public Buffer {
+template <Write write> class OutBuffer final : public Buffer {
 public:
-    explicit OutBuffer(unsigned char *data_)
+    explicit OutBuffer(unsigned char *const data_)
        : Buffer(0, data_) {}

-    void operator&(uchar &val) final {
+    void operator&(uchar &val) override {
        write(m_data + m_size, &val, sizeof(val));
        m_size += static_cast<int>(sizeof(val));
    }
-    void operator&(std::vector<cv::Mat> &val) final {
-        int size = static_cast<int>(val.size());
+    void operator&(std::vector<cv::Mat> &val) override {
+        const int size = static_cast<int>(val.size());
        write(m_data + m_size, &size, sizeof(size));
        m_size += static_cast<int>(sizeof(size));

@ -77,8 +77,8 @@ public:
            m_size += val.cols;
        }
    }
-    void operator&(std::vector<cv::Scalar> &val) final {
-        int size = static_cast<int>(val.size());
+    void operator&(std::vector<cv::Scalar> &val) override {
+        const int size = static_cast<int>(val.size());
        write(m_data + m_size, &size, sizeof(size));
        m_size += static_cast<int>(sizeof(size));

@ -86,20 +86,20 @@ public:
            writeElement(element);
        }
    }
-    void writeElement(cv::Scalar &val) {
+    void writeElement(const cv::Scalar &val) {
        write(m_data + m_size, val.val, sizeof(double) * 4);
        m_size += static_cast<int>(sizeof(double)) * 4;
    }
-    void operator&(std::vector<double> &val) final {
-        int size = static_cast<int>(val.size());
+    void operator&(std::vector<double> &val) override {
+        const int size = static_cast<int>(val.size());
        write(m_data + m_size, &size, sizeof(size));
        m_size += static_cast<int>(sizeof(size));

        write(m_data + m_size, val.data(), static_cast<int>(sizeof(double)) * size);
        m_size += static_cast<int>(sizeof(double)) * size;
    }
-    void operator&(std::vector<uchar> &val) final {
-        int size = static_cast<int>(val.size());
+    void operator&(std::vector<uchar> &val) override {
+        const int size = static_cast<int>(val.size());
        write(m_data + m_size, &size, sizeof(size));
        m_size += static_cast<int>(sizeof(size));

@ -111,16 +111,16 @@ public:
 using SizeCountBuffer = OutBuffer<fakeWrite>;
 using WriteBuffer     = OutBuffer<binWrite>;

-class ReadBuffer : public Buffer {
+class ReadBuffer final : public Buffer {
 public:
    explicit ReadBuffer(unsigned char *data_)
        : Buffer(0, data_) {}

-    void operator&(uchar &val) final {
+    void operator&(uchar &val) override {
        memcpy(&val, m_data + m_size, sizeof(uchar));
        m_size += static_cast<int>(sizeof(uchar));
    }
-    void operator&(std::vector<cv::Mat> &val) final {
+    void operator&(std::vector<cv::Mat> &val) override {
        int count = 0;
        memcpy(&count, m_data + m_size, sizeof(int));
        val.resize(count);
@ -139,9 +139,9 @@ public:
        memcpy(&height, m_data + m_size, sizeof(int));
        m_size += static_cast<int>(sizeof(int));

-        int  alignedWidth = static_cast<int>(cv::alignSize(width, cv::v_uint8::nlanes));
-        auto img          = cv::Mat::zeros(height, alignedWidth, CV_8UC1);
-        val               = img(cv::Rect(0, 0, width, height));
+        const int  alignedWidth = static_cast<int>(cv::alignSize(width, simdSize(cv::v_uint8)));
+        const auto img          = cv::Mat::zeros(height, alignedWidth, CV_8UC1);
+        val                     = img(cv::Rect(0, 0, width, height));

        for (int y = 0; y < height; y++) {
            auto *ptr = val.ptr<uchar>(y);
@ -149,7 +149,7 @@ public:
            m_size += width;
        }
    }
-    void operator&(std::vector<cv::Scalar> &val) final {
+    void operator&(std::vector<cv::Scalar> &val) override {
        int count = 0;
        memcpy(&count, m_data + m_size, sizeof(int));
        val.resize(count);
@ -163,7 +163,7 @@ public:
        memcpy(val.val, m_data + m_size, sizeof(double) * 4);
        m_size += static_cast<int>(sizeof(double)) * 4;
    }
-    void operator&(std::vector<double> &val) final {
+    void operator&(std::vector<double> &val) override {
        int count = 0;
        memcpy(&count, m_data + m_size, sizeof(int));
        val.resize(count);
@ -172,7 +172,7 @@ public:
        memcpy(val.data(), m_data + m_size, sizeof(double) * count);
        m_size += static_cast<int>(sizeof(double)) * count;
    }
-    void operator&(std::vector<uchar> &val) final {
+    void operator&(std::vector<uchar> &val) override {
        int count = 0;
        memcpy(&count, m_data + m_size, sizeof(int));
        val.resize(count);
@ -184,10 +184,10 @@ public:
 };

 void operation(Buffer *buf, Model &model) {
-    (*buf) & (model);
+    *buf &model;
 }

-bool serialize(const Model_t model, unsigned char *buffer, int *size) {
+bool serialize(Model *const model, unsigned char *buffer, int *size) {
    if (nullptr == size) {
        return false;
    }
@ -199,9 +199,9 @@ bool serialize(const Model_t model, unsigned char *buffer, int *size) {

    SizeCountBuffer counter(buffer);
    operation(&counter, *model);
-    *size = counter.count();

    if (nullptr == buffer) {
+        *size = counter.count();
        return true;
    }

@ -215,7 +215,7 @@ bool serialize(const Model_t model, unsigned char *buffer, int *size) {
    return true;
 }

-Model_t deserialize(unsigned char *buffer, int size) {
+Model_t deserialize(unsigned char *buffer, const int size) {
    if (size < 1 || nullptr == buffer) {
        return nullptr;
    }
Author	SHA1	Message	Date
y.qiu	64298e45eb	static check	2025-05-08 17:56:22 +08:00
y.qiu	93bef37422	fix: mismatch faction sign	2025-03-27 13:43:37 +08:00
y.qiu	58cdd2facf	add cmd	2025-03-25 17:56:02 +08:00
y.qiu	a756fc1bf2	unroll	2025-03-10 09:10:32 +08:00
y.qiu	0f2391b255	omp optim	2025-03-02 12:47:11 +08:00
y.qiu	8448b07e17	update readme	2025-02-27 21:04:58 +08:00
y.qiu	b2652072ef	misc	2025-02-26 17:35:52 +08:00
y.qiu	972dcf7cb3	small fix	2025-02-22 17:21:42 +08:00
y.qiu	d0b8a6f3a2	refactor intrinsics api	2025-02-10 21:33:38 +08:00
y.qiu	80e63e7637	set valid opencv version	2025-01-21 21:02:02 +08:00
y.qiu	8a975e19c9	add todo	2025-01-21 20:48:01 +08:00
y.qiu	45971ed070	update readme	2024-12-27 22:05:26 +08:00
y.qiu	094ada9337	misc	2024-12-23 09:05:27 +08:00
y.qiu	d07f070668	add pos subpixel	2024-12-22 21:01:38 +08:00
y.qiu	a3b1b64e9a	fix: image size	2024-12-01 17:57:15 +08:00
y.qiu	d1e4abfc71	misc	2024-11-03 21:12:39 +08:00
y.qiu	cff37c2f08	fix: invalid size write	2024-11-03 20:51:33 +08:00
y.qiu	440c938b22	last miss fix	2024-10-08 21:08:15 +08:00
y.qiu	b6e2d9aba5	unified simd size	2024-10-08 20:39:30 +08:00
y.qiu	4a2f97e354	fix arm64	2024-09-27 21:11:19 +08:00
y.qiu	4b8ec4359c	optimal cache miss	2024-09-22 17:20:24 +08:00
y.qiu	d451b46535	misc	2024-09-21 09:16:15 +08:00
y.qiu	09017f0bc0	let integral as standalone	2024-09-20 22:50:25 +08:00
y.qiu	eea1b6320b	spilt vector and scalar code	2024-09-20 22:42:46 +08:00
y.qiu	b4cdaff8d1	misc	2024-09-20 17:55:29 +08:00
				`@ -0,0 +1 @@`
				`1. opencv intrinsics api [changed](https://github.com/opencv/opencv/pull/24371) since 4.9, need refactor`