Files
kouryu/laplacian.cpp
T
2025-10-14 18:14:01 +02:00

109 lines
3.8 KiB
C++

#include "laplacian.h"
#include <immintrin.h>
/**
 * @brief Variance of a 4-neighbour Laplacian response over a 16-bit image.
 *
 * For every processed pixel the response is
 *     up + left + right + down - 4 * centre
 * computed for eight neighbouring pixels at once with AVX2. Each of the eight
 * SIMD lanes keeps its own running mean / M2 (Welford update); the eight
 * partial results are merged pairwise at the end and the population variance
 * (M2 / N) is returned.
 *
 * Coverage: rows 1 .. height-2, and within each row whole groups of eight
 * columns starting at column 1 while x < width - 8. Columns on the right that
 * do not fill a complete group are not processed.
 *
 * The statistics are accumulated in single precision.
 *
 * @param img    Image samples addressed as img[y * width + x] (no row padding).
 * @param out    Optional. When non-null, receives the raw responses. The
 *               response whose centre is (x, y) is stored at
 *               out[(y - 1) * width + x]; elements that are not visited are
 *               left untouched.
 * @param width  Image width in pixels.
 * @param height Image height in pixels.
 * @return Variance of the processed responses, or 0.0 when the image is too
 *         small to hold a single 8-pixel group (height < 3 or width < 10).
 */
double laplacian(const uint16_t *img, int32_t *out, uint32_t width, uint32_t height)
{
    // The loop bounds below are `height - 1` and `width - 8` on unsigned
    // values: without this guard a tiny image wraps them around (reading far
    // out of bounds) or leaves zero samples (0/0 -> NaN in the merge step).
    if (height < 3 || width < 10)
    {
        return 0.0;
    }

    __m256 mean = _mm256_setzero_ps();
    __m256 M2 = _mm256_setzero_ps();
    uint32_t count = 0; // samples accumulated so far in EACH lane

    for (uint32_t y = 1; y < height - 1; y++)
    {
        // 64-bit offset so that very large images cannot overflow the index.
        const uint64_t row = static_cast<uint64_t>(y - 1) * width;
        const uint16_t *above = img + row;
        const uint16_t *centre = above + width;
        const uint16_t *below = centre + width;

        for (uint32_t x = 1; x < width - 8; x += 8)
        {
            // Load eight 16-bit samples per neighbour and widen them to 32 bit.
            const __m256i up = _mm256_cvtepu16_epi32(
                _mm_loadu_si128(reinterpret_cast<const __m128i *>(above + x)));
            const __m256i left = _mm256_cvtepu16_epi32(
                _mm_loadu_si128(reinterpret_cast<const __m128i *>(centre + x - 1)));
            const __m256i mid = _mm256_cvtepu16_epi32(
                _mm_loadu_si128(reinterpret_cast<const __m128i *>(centre + x)));
            const __m256i right = _mm256_cvtepu16_epi32(
                _mm_loadu_si128(reinterpret_cast<const __m128i *>(centre + x + 1)));
            const __m256i down = _mm256_cvtepu16_epi32(
                _mm_loadu_si128(reinterpret_cast<const __m128i *>(below + x)));

            // up + left + right + down - (centre << 2)
            __m256i sum = _mm256_add_epi32(_mm256_add_epi32(up, left),
                                           _mm256_add_epi32(right, down));
            sum = _mm256_sub_epi32(sum, _mm256_slli_epi32(mid, 2));

            if (out)
            {
                // NOTE(review): this writes to the row ABOVE the centre pixel
                // (row index y - 1). Kept as is because callers may rely on
                // it - confirm whether out + y * width + x was intended.
                _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + row + x), sum);
            }

            // Welford update, independently per lane.
            const __m256 af = _mm256_cvtepi32_ps(sum);
            count++;
            const __m256 delta = _mm256_sub_ps(af, mean);
            mean = _mm256_add_ps(
                mean, _mm256_div_ps(delta, _mm256_set1_ps(static_cast<float>(count))));
            const __m256 delta2 = _mm256_sub_ps(af, mean);
            M2 = _mm256_add_ps(M2, _mm256_mul_ps(delta, delta2));
        }
    }

    float lane_mean[8];
    float lane_M2[8];
    _mm256_storeu_ps(lane_mean, mean);
    _mm256_storeu_ps(lane_M2, M2);

    // Merges two partial results that each cover n samples; the combined
    // mean / M2 are written back into the first pair of arguments.
    auto welford_merge = [](uint32_t n, float &mean_1, float mean_2, float &M2_1, float M2_2)
    {
        const uint32_t total = 2 * n;
        const float delta = mean_2 - mean_1;
        const float merged_mean = mean_1 + delta * (static_cast<float>(n) / total);
        const float merged_M2 = M2_1 + M2_2 + delta * delta * n * n / total;
        mean_1 = merged_mean;
        M2_1 = merged_M2;
    };

    // Tree reduction: (0,1)(2,3)(4,5)(6,7) -> (0,2)(4,6) -> (0,4).
    // The number of samples per partial result doubles at every level.
    uint32_t per_side = count;
    for (int stride = 1; stride < 8; stride *= 2)
    {
        for (int i = 0; i < 8; i += 2 * stride)
        {
            welford_merge(per_side, lane_mean[i], lane_mean[i + stride],
                          lane_M2[i], lane_M2[i + stride]);
        }
        per_side *= 2;
    }

    // Population variance over all 8 * count samples.
    return static_cast<double>(lane_M2[0]) / (8.0 * count);
}
/**
 * @brief Warps @p img by the dense optical flow estimated between @p ref and @p img.
 *
 * The flow is computed with cv::calcOpticalFlowFarneback (pyr_scale 0.5,
 * 3 levels, window 40, 3 iterations, poly_n 5, poly_sigma 1.2, no flags),
 * turned into absolute sampling coordinates and applied with cv::remap using
 * Lanczos interpolation.
 *
 * @param ref    Reference frame; also defines the size of the flow field.
 * @param img    Frame that is resampled.
 * @param warped Receives the remapped image.
 * @return true on success, false if any exception was thrown along the way.
 */
bool reflow(const cv::Mat &ref, const cv::Mat &img, cv::Mat &warped)
{
    try
    {
        // Dense per-pixel displacement (dx, dy).
        cv::Mat flow(ref.size(), CV_32FC2);
        cv::calcOpticalFlowFarneback(ref, img, flow, 0.5, 3, 40, 3, 5, 1.2, 0);

        // Separate the horizontal and vertical components.
        std::vector<cv::Mat> channels(2);
        cv::split(flow, channels);

        // remap() wants absolute source coordinates: pixel position + displacement.
        cv::Mat map_x(ref.size(), CV_32FC1);
        cv::Mat map_y(ref.size(), CV_32FC1);
        for (int r = 0; r < ref.rows; ++r)
        {
            const float *dx = channels[0].ptr<float>(r);
            const float *dy = channels[1].ptr<float>(r);
            float *src_x = map_x.ptr<float>(r);
            float *src_y = map_y.ptr<float>(r);
            for (int c = 0; c < ref.cols; ++c)
            {
                src_x[c] = c + dx[c];
                src_y[c] = r + dy[c];
            }
        }

        // Resample img through the coordinate maps.
        cv::remap(img, warped, map_x, map_y, cv::INTER_LANCZOS4);
        return true;
    }
    catch (...)
    {
        // Any failure is reported to the caller as false.
        return false;
    }
}