// kouryu/laplacian.cpp

#include "laplacian.h"

#include <immintrin.h>

// Computes the 4-neighbour Laplacian (up + left + right + down - 4 * centre)
// of a 16-bit single-channel image and returns the population variance
// (M2 / N) of the responses.
//
// Pixels are processed eight at a time with AVX2, starting at column 1.
// Only complete 8-pixel vectors whose right-hand neighbour is still inside
// the row are evaluated, so trailing columns that do not fill a vector are
// skipped and contribute neither to `out` nor to the variance.
//
// img    : row-major image, `width` * `height` samples, stride == width.
// out    : optional (may be null). When non-null, the response for the pixel
//          centred at (y, x) is stored at out[(y - 1) * width + x]; entries
//          that are not computed are left untouched.
//          NOTE(review): the store uses the offset of the row *above* the
//          centre, i.e. the output is shifted up by one row relative to the
//          input. Kept as-is for compatibility; confirm callers expect this.
// width  : image width in pixels.
// height : image height in pixels.
//
// Returns 0.0 when `img` is null or the image is too small to hold a single
// vector (height < 3 or width < 10).
double laplacian(const uint16_t *img, int32_t *out, uint32_t width, uint32_t height)
{
    // Unsigned `height - 1` / `width - 8` would wrap around for small images,
    // and an empty sample set would make the final reduction divide by zero.
    if(img == nullptr || height < 3 || width < 10)
    {
        return 0.0;
    }

    // Per-lane Welford accumulators: every lane tracks its own running mean
    // and sum of squared deviations over `count` samples.
    __m256 mean = _mm256_setzero_ps();
    __m256 M2 = _mm256_setzero_ps();
    uint32_t count = 0;

    // 64-bit offsets so that large images cannot overflow the index math.
    const uint64_t stride = width;
    for(uint64_t y = 1; y + 1 < height; y++)
    {
        const uint64_t above = (y - 1) * stride;
        const uint64_t centre = above + stride;
        const uint64_t below = centre + stride;

        // `x + 8 < stride` keeps the right-neighbour load (x + 1 .. x + 8) in the row.
        for(uint64_t x = 1; x + 8 < stride; x += 8)
        {
            const __m128i p_up = _mm_loadu_si128(reinterpret_cast<const __m128i*>(img + above + x));
            const __m128i p_left = _mm_loadu_si128(reinterpret_cast<const __m128i*>(img + centre + x - 1));
            const __m128i p_mid = _mm_loadu_si128(reinterpret_cast<const __m128i*>(img + centre + x));
            const __m128i p_right = _mm_loadu_si128(reinterpret_cast<const __m128i*>(img + centre + x + 1));
            const __m128i p_down = _mm_loadu_si128(reinterpret_cast<const __m128i*>(img + below + x));

            // Widen to 32 bits before summing; 4 * 65535 does not fit in 16 bits.
            __m256i sum = _mm256_add_epi32(_mm256_cvtepu16_epi32(p_up), _mm256_cvtepu16_epi32(p_left));
            sum = _mm256_add_epi32(sum, _mm256_cvtepu16_epi32(p_right));
            sum = _mm256_add_epi32(sum, _mm256_cvtepu16_epi32(p_down));
            // Subtract 4 * centre (shift left by 2).
            sum = _mm256_sub_epi32(sum, _mm256_slli_epi32(_mm256_cvtepu16_epi32(p_mid), 2));

            if(out)
            {
                _mm256_storeu_si256(reinterpret_cast<__m256i*>(out + above + x), sum);
            }

            const __m256 value = _mm256_cvtepi32_ps(sum);

            // Welford update, applied independently to each of the 8 lanes.
            count++;
            const __m256 delta = _mm256_sub_ps(value, mean);
            mean = _mm256_add_ps(mean, _mm256_div_ps(delta, _mm256_set1_ps(static_cast<float>(count))));
            const __m256 delta2 = _mm256_sub_ps(value, mean);
            M2 = _mm256_add_ps(M2, _mm256_mul_ps(delta, delta2));
        }
    }

    float lane_mean[8];
    float lane_M2[8];
    _mm256_storeu_ps(lane_mean, mean);
    _mm256_storeu_ps(lane_M2, M2);

    // Merges two equally sized partitions (n samples each) into the first one
    // using the pairwise mean / M2 combination formula.
    auto welford_merge = [](uint32_t n, float &mean_a, float mean_b, float &M2_a, float M2_b)
    {
        const uint32_t total = 2 * n;
        const float delta = mean_b - mean_a;
        const float merged_mean = mean_a + delta * (static_cast<float>(n) / total);
        const float merged_M2 = M2_a + M2_b + delta * delta * n * n / total;
        mean_a = merged_mean;
        M2_a = merged_M2;
    };

    // Tree reduction of the 8 lanes: (0,1)(2,3)(4,5)(6,7) -> (0,2)(4,6) -> (0,4).
    // After each level the surviving lanes hold twice as many samples.
    for(int step = 1; step < 8; step *= 2)
    {
        for(int lane = 0; lane < 8; lane += 2 * step)
        {
            welford_merge(count * step, lane_mean[lane], lane_mean[lane + step], lane_M2[lane], lane_M2[lane + step]);
        }
    }

    // Population variance over all count * 8 samples; divide in double so the
    // sample count cannot overflow 32 bits.
    return static_cast<double>(lane_M2[0]) / (static_cast<double>(count) * 8.0);
}

// Warps `img` using the dense Farneback optical flow estimated between `ref`
// and `img`, and stores the result in `warped`.
//
// The flow is turned into absolute sampling coordinates (pixel position plus
// flow vector) and passed to cv::remap with Lanczos interpolation.
//
// Returns true on success and false if any step throws.
bool reflow(const cv::Mat &ref, const cv::Mat &img, cv::Mat &warped)
{
    try
    {
        // Dense flow field: two float channels (dx, dy) per pixel.
        cv::Mat flow(ref.size(), CV_32FC2);
        cv::calcOpticalFlowFarneback(ref, img, flow, 0.5, 3, 40, 3, 5, 1.2, 0);

        std::vector<cv::Mat> channels(2);
        cv::split(flow, channels);
        const cv::Mat &dx = channels[0];
        const cv::Mat &dy = channels[1];

        // Absolute lookup coordinates for cv::remap: position + displacement.
        cv::Mat map_x(ref.size(), CV_32FC1);
        cv::Mat map_y(ref.size(), CV_32FC1);

        const int rows = ref.rows;
        const int cols = ref.cols;
        for (int r = 0; r < rows; ++r) {
            const float *dx_row = dx.ptr<float>(r);
            const float *dy_row = dy.ptr<float>(r);
            float *mx_row = map_x.ptr<float>(r);
            float *my_row = map_y.ptr<float>(r);
            for (int c = 0; c < cols; ++c) {
                mx_row[c] = c + dx_row[c];
                my_row[c] = r + dy_row[c];
            }
        }

        // Resample `img` at the displaced coordinates.
        cv::remap(img, warped, map_x, map_y, cv::INTER_LANCZOS4);
        return true;
    }
    catch (...)
    {
        // Any failure is reported to the caller as `false`.
        return false;
    }
}