#include "laplacian.h"

#include <immintrin.h>

#include <cstddef>
#include <cstdint>

namespace {

/// Loads eight consecutive unsigned 16-bit pixels (unaligned) and
/// zero-extends them to eight 32-bit lanes.
inline __m256i load_u16x8_as_i32(const uint16_t *p)
{
    return _mm256_cvtepu16_epi32(
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(p)));
}

} // namespace

/**
 * @brief Computes the variance of a 4-neighbour Laplacian over a 16-bit image.
 *
 * For every processed pixel the response is
 * `up + left + right + down - 4 * centre`. The variance of these responses
 * is accumulated with Welford's online algorithm, independently in each of
 * the eight AVX lanes, and the eight partial results are merged at the end.
 *
 * Pixels are processed in groups of eight starting at column 1. Border
 * rows/columns and any trailing columns that do not fill a complete group
 * of eight are skipped, so they contribute neither to the statistic nor to
 * @p out.
 *
 * @param img    Row-major image of `width * height` unsigned 16-bit samples.
 * @param out    Optional (may be null). When non-null, the eight responses
 *               computed for image row `y`, columns `x..x+7`, are stored at
 *               `out[(y - 1) * width + x]`.
 *               NOTE(review): the result for image row y lands in output row
 *               y - 1. That matches an output buffer without the top border
 *               row; confirm this is what callers expect.
 * @param width  Image width in pixels.
 * @param height Image height in pixels.
 * @return Population variance of the Laplacian responses, or 0.0 when the
 *         image is too small (`height < 3` or `width < 10`) for a single
 *         group of eight pixels to be processed.
 */
double laplacian(const uint16_t *img, int32_t *out, uint32_t width, uint32_t height)
{
    // The loop bounds below are unsigned: `height - 1` and `width - 8` would
    // wrap around for tiny images and run far out of bounds. Images with
    // width 8 or 9 would process nothing and end in a 0/0 division.
    if (height < 3 || width < 10) {
        return 0.0;
    }

    __m256 mean = _mm256_setzero_ps();
    __m256 M2 = _mm256_setzero_ps();
    uint32_t count = 0; // samples accumulated so far in EACH lane

    // Offsets are computed in size_t so that row * width cannot overflow
    // 32 bits on very large images.
    const std::size_t stride = width;

    for (uint32_t y = 1; y < height - 1; y++) {
        const std::size_t row_above = static_cast<std::size_t>(y - 1) * stride;
        const uint16_t *above = img + row_above;
        const uint16_t *centre = above + stride;
        const uint16_t *below = centre + stride;

        for (uint32_t x = 1; x < width - 8; x += 8) {
            const __m256i up = load_u16x8_as_i32(above + x);
            const __m256i left = load_u16x8_as_i32(centre + x - 1);
            const __m256i mid = load_u16x8_as_i32(centre + x);
            const __m256i right = load_u16x8_as_i32(centre + x + 1);
            const __m256i down = load_u16x8_as_i32(below + x);

            // up + left + right + down - 4 * centre
            __m256i response = _mm256_add_epi32(_mm256_add_epi32(up, left),
                                                _mm256_add_epi32(right, down));
            response = _mm256_sub_epi32(response, _mm256_slli_epi32(mid, 2));

            if (out) {
                _mm256_storeu_si256(
                    reinterpret_cast<__m256i *>(out + row_above + x), response);
            }

            // Welford update, performed per lane.
            const __m256 sample = _mm256_cvtepi32_ps(response);
            count++;
            const __m256 delta = _mm256_sub_ps(sample, mean);
            mean = _mm256_add_ps(
                mean,
                _mm256_div_ps(delta, _mm256_set1_ps(static_cast<float>(count))));
            const __m256 delta2 = _mm256_sub_ps(sample, mean);
            M2 = _mm256_add_ps(M2, _mm256_mul_ps(delta, delta2));
        }
    }

    float lane_mean[8];
    float lane_M2[8];
    _mm256_storeu_ps(lane_mean, mean);
    _mm256_storeu_ps(lane_M2, M2);

    // Merges two Welford accumulators that each hold n samples; the result
    // (2 * n samples) is written back into the first accumulator.
    auto welford_merge = [](uint64_t n, float &mean_a, float mean_b,
                            float &M2_a, float M2_b) {
        const float nf = static_cast<float>(n);
        const float total = 2.0f * nf;
        const float delta = mean_b - mean_a;
        const float merged_mean = mean_a + delta * (nf / total);
        const float merged_M2 = M2_a + M2_b + delta * delta * nf * nf / total;
        mean_a = merged_mean;
        M2_a = merged_M2;
    };

    // Pairwise tree reduction: (0,1) (2,3) (4,5) (6,7), then (0,2) (4,6),
    // then (0,4). Each level doubles the number of samples per accumulator.
    for (int step = 1; step < 8; step *= 2) {
        const uint64_t per_side = static_cast<uint64_t>(count) * static_cast<uint64_t>(step);
        for (int i = 0; i < 8; i += 2 * step) {
            welford_merge(per_side, lane_mean[i], lane_mean[i + step],
                          lane_M2[i], lane_M2[i + step]);
        }
    }

    // Population variance over all 8 * count samples (divisor computed in
    // double so it cannot overflow 32 bits).
    return static_cast<double>(lane_M2[0]) / (8.0 * static_cast<double>(count));
}

/**
 * @brief Warps @p img onto @p ref using dense Farneback optical flow.
 *
 * The flow is estimated from @p ref to @p img, then @p img is resampled
 * with Lanczos interpolation at `(x + flow_x, y + flow_y)` for every pixel
 * of @p ref.
 *
 * @param ref    Reference image; the output has its size.
 * @param img    Image to be aligned with @p ref.
 * @param warped Receives the resampled image.
 * @return true on success; false if any exception was thrown while
 *         computing the flow or remapping (the exception is swallowed).
 */
bool reflow(const cv::Mat &ref, const cv::Mat &img, cv::Mat &warped)
{
    try {
        cv::Mat flow(ref.size(), CV_32FC2);
        // pyr_scale = 0.5, levels = 3, winsize = 40, iterations = 3,
        // poly_n = 5, poly_sigma = 1.2, flags = 0
        cv::calcOpticalFlowFarneback(ref, img, flow, 0.5, 3, 40, 3, 5, 1.2, 0);

        // --- Build map_x and map_y for remapping: each destination pixel
        // samples the source at its own position displaced by the flow.
        cv::Mat map_x(ref.size(), CV_32FC1);
        cv::Mat map_y(ref.size(), CV_32FC1);
        for (int y = 0; y < ref.rows; y++) {
            const cv::Vec2f *flow_row = flow.ptr<cv::Vec2f>(y);
            float *map_x_row = map_x.ptr<float>(y);
            float *map_y_row = map_y.ptr<float>(y);
            for (int x = 0; x < ref.cols; x++) {
                map_x_row[x] = x + flow_row[x][0];
                map_y_row[x] = y + flow_row[x][1];
            }
        }

        // --- Warp img so that it aligns with ref using the optical flow
        cv::remap(img, warped, map_x, map_y, cv::INTER_LANCZOS4);
    } catch (...) {
        // Best effort: any failure is reported to the caller as `false`.
        return false;
    }

    return true;
}