109 lines
3.8 KiB
C++
109 lines
3.8 KiB
C++
#include "laplacian.h"
|
|
|
|
#include <immintrin.h>
|
|
|
|
/**
 * Computes a 4-neighbour Laplacian of a 16-bit image with AVX2 and returns the
 * population variance of the computed responses.
 *
 * For every processed pixel (x, y) the response is
 *     img(x, y-1) + img(x-1, y) + img(x+1, y) + img(x, y+1) - 4 * img(x, y).
 *
 * Rows 1 .. height-2 are processed. Within a row, columns are handled eight at
 * a time starting at x = 1 while x < width - 8, so trailing columns that do
 * not fill a whole 8-lane vector are skipped.
 *
 * @param img    Row-major image, `width` elements per row, `height` rows.
 * @param out    Optional (may be null). When non-null, each 8-lane result is
 *               stored at index (y - 1) * width + x, i.e. one row above the
 *               pixel the stencil is centred on.
 *               NOTE(review): that one-row offset is kept from the original
 *               code; confirm whether callers rely on it.
 * @param width  Row length in pixels.
 * @param height Number of rows.
 * @return Sum of squared deviations divided by the number of responses.
 *         Returns 0.0 when the image is too small to produce any response
 *         (height < 3 or width < 10); previously such input either produced
 *         NaN or, through unsigned wrap-around of the loop bounds, read far
 *         outside the image.
 *
 * Mean and variance are accumulated in single precision.
 */
double laplacian(const uint16_t *img, int32_t *out, uint32_t width, uint32_t height)
{
    // `height - 1` and `width - 8` are unsigned: without this guard a tiny
    // image would wrap the loop bounds around to ~4 billion.
    if(height < 3 || width < 10)
    {
        return 0.0;
    }

    // One running Welford accumulator per SIMD lane.
    __m256 mean = _mm256_setzero_ps();
    __m256 M2 = _mm256_setzero_ps();
    uint32_t count = 0; // number of 8-lane vectors folded into each accumulator

    for(uint32_t y = 1; y < height - 1; y++)
    {
        // Element offset of the row above the stencil centre; 64-bit so that
        // very large images do not wrap the index.
        const uint64_t row = static_cast<uint64_t>(y - 1) * width;

        for(uint32_t x = 1; x < width - 8; x += 8)
        {
            const uint16_t *above = img + row + x;
            const uint16_t *centre = above + width;
            const uint16_t *below = centre + width;

            const __m128i up = _mm_loadu_si128(reinterpret_cast<__m128i const*>(above));
            const __m128i left = _mm_loadu_si128(reinterpret_cast<__m128i const*>(centre - 1));
            const __m128i mid = _mm_loadu_si128(reinterpret_cast<__m128i const*>(centre));
            const __m128i right = _mm_loadu_si128(reinterpret_cast<__m128i const*>(centre + 1));
            const __m128i down = _mm_loadu_si128(reinterpret_cast<__m128i const*>(below));

            // Widen to 32 bit before summing so the arithmetic cannot overflow.
            __m256i sum = _mm256_add_epi32(_mm256_cvtepu16_epi32(up), _mm256_cvtepu16_epi32(left));
            sum = _mm256_add_epi32(sum, _mm256_cvtepu16_epi32(right));
            sum = _mm256_add_epi32(sum, _mm256_cvtepu16_epi32(down));
            // Shifting left by two multiplies the centre pixel by four.
            sum = _mm256_sub_epi32(sum, _mm256_slli_epi32(_mm256_cvtepu16_epi32(mid), 2));

            if(out)
            {
                _mm256_storeu_si256(reinterpret_cast<__m256i*>(out + row + x), sum);
            }

            // Welford update, performed independently in each of the 8 lanes.
            const __m256 value = _mm256_cvtepi32_ps(sum);
            count++;
            const __m256 delta = _mm256_sub_ps(value, mean);
            mean = _mm256_add_ps(mean, _mm256_div_ps(delta, _mm256_set1_ps(static_cast<float>(count))));
            const __m256 delta2 = _mm256_sub_ps(value, mean);
            M2 = _mm256_add_ps(M2, _mm256_mul_ps(delta, delta2));
        }
    }

    float lane_mean[8];
    float lane_M2[8];
    _mm256_storeu_ps(lane_mean, mean);
    _mm256_storeu_ps(lane_M2, M2);

    // Merges accumulator B into accumulator A; both hold exactly n samples.
    auto welford_merge = [](uint64_t n, float &mean_a, float mean_b, float &M2_a, float M2_b)
    {
        const uint64_t total = 2 * n;
        const float delta = mean_b - mean_a;
        const float merged_mean = mean_a + delta * (static_cast<float>(n) / total);
        const float merged_M2 = M2_a + M2_b + delta * delta * n * n / total;
        mean_a = merged_mean;
        M2_a = merged_M2;
    };

    // Tree reduction of the 8 lanes: (0,1)(2,3)(4,5)(6,7), then (0,2)(4,6),
    // then (0,4). The per-accumulator sample count doubles at every level.
    uint64_t samples = count;
    for(int stride = 1; stride < 8; stride *= 2)
    {
        for(int i = 0; i + stride < 8; i += 2 * stride)
        {
            welford_merge(samples, lane_mean[i], lane_mean[i + stride], lane_M2[i], lane_M2[i + stride]);
        }
        samples *= 2;
    }

    // Divide in double so the total sample count cannot wrap in 32 bits.
    return static_cast<double>(lane_M2[0]) / (static_cast<double>(count) * 8.0);
}
|
|
|
|
/**
 * Warps `img` towards `ref` using dense Farneback optical flow.
 *
 * The flow from `ref` to `img` is estimated, turned into absolute sampling
 * coordinates (pixel position plus flow vector) and used to resample `img`
 * with Lanczos interpolation.
 *
 * @param ref    Reference image the flow is computed from.
 * @param img    Image to be resampled.
 * @param warped Receives the resampled image.
 * @return true on success, false if any step threw an exception.
 */
bool reflow(const cv::Mat &ref, const cv::Mat &img, cv::Mat &warped)
{
    try
    {
        // Farneback settings, named in the order the OpenCV call expects them.
        const double pyramid_scale = 0.5;
        const int pyramid_levels = 3;
        const int window_size = 40;
        const int iterations = 3;
        const int poly_n = 5;
        const double poly_sigma = 1.2;
        const int flags = 0;

        cv::Mat flow(ref.size(), CV_32FC2);
        cv::calcOpticalFlowFarneback(ref, img, flow,
                                     pyramid_scale, pyramid_levels, window_size,
                                     iterations, poly_n, poly_sigma, flags);

        // Separate the two-channel flow field into its x and y displacements.
        std::vector<cv::Mat> channels(2);
        cv::split(flow, channels);
        const cv::Mat &dx = channels[0];
        const cv::Mat &dy = channels[1];

        // Absolute sampling coordinates for cv::remap: position + displacement.
        cv::Mat map_x(ref.size(), CV_32FC1);
        cv::Mat map_y(ref.size(), CV_32FC1);

        for (int r = 0; r < ref.rows; ++r)
        {
            const float *dx_row = dx.ptr<float>(r);
            const float *dy_row = dy.ptr<float>(r);
            float *map_x_row = map_x.ptr<float>(r);
            float *map_y_row = map_y.ptr<float>(r);

            for (int c = 0; c < ref.cols; ++c)
            {
                map_x_row[c] = c + dx_row[c];
                map_y_row[c] = r + dy_row[c];
            }
        }

        // Resample img at the flow-displaced coordinates.
        cv::remap(img, warped, map_x, map_y, cv::INTER_LANCZOS4);
        return true;
    }
    catch (...)
    {
        // Any failure in the steps above is reported as a plain failure flag.
        return false;
    }
}
|