AVX laplacian operator
This commit is contained in:
@@ -0,0 +1,410 @@
|
||||
#include "mainwindow.h"
|
||||
|
||||
#include "serfile.h"
|
||||
#include <QApplication>
|
||||
#include <opencv2/opencv.hpp>
|
||||
#include <iostream>
|
||||
#include <complex>
|
||||
#include <cmath>
|
||||
|
||||
#include <immintrin.h>
|
||||
#include <vector>
|
||||
#include <complex>
|
||||
#include <cassert>
|
||||
|
||||
// Twiddle factor struct
|
||||
struct Twiddle {
|
||||
float re;
|
||||
float im;
|
||||
};
|
||||
|
||||
// Multiply (a + i b) * (c + i d) = (a*c - b*d) + i(a*d + b*c)
|
||||
inline __m256 cmul_avx2(__m256 are, __m256 aim, __m256 bre, __m256 bim) {
|
||||
// (are + i aim) * (bre + i bim)
|
||||
__m256 ac = _mm256_mul_ps(are, bre);
|
||||
__m256 bd = _mm256_mul_ps(aim, bim);
|
||||
__m256 ad = _mm256_mul_ps(are, bim);
|
||||
__m256 bc = _mm256_mul_ps(aim, bre);
|
||||
// real = ac - bd
|
||||
__m256 real = _mm256_sub_ps(ac, bd);
|
||||
// imag = ad + bc
|
||||
__m256 imag = _mm256_add_ps(ad, bc);
|
||||
|
||||
// We pack real, imag as [r0, i0, r1, i1, ..., r3, i3]
|
||||
// But because we process 4 complex numbers in the vector, we need shuffling
|
||||
// to interleave real and imag properly. But for simplicity, assume the calling
|
||||
// code expects separate real & imag vectors, or we’ll implement interleaving.
|
||||
// Here, return real in lower 128 bits & imag in upper, or some scheme.
|
||||
// For clarity: return real in output lane 0‑127, imag in 128‑255.
|
||||
// But better is to use separate vectors for real & imag, or AoS with shuffles.
|
||||
|
||||
// Pack: in low half real, in high half imag
|
||||
return _mm256_blend_ps(real, imag, 0xF0);
|
||||
// 0xF0 = upper 4 lanes from imag
|
||||
}
|
||||
|
||||
// Precompute twiddles
|
||||
static std::vector<Twiddle> make_twiddles(int N) {
|
||||
std::vector<Twiddle> W(N/2);
|
||||
const float PI = std::acos(-1.0f);
|
||||
for(int k = 0; k < N/2; ++k) {
|
||||
float angle = -2.0f * PI * k / N;
|
||||
W[k].re = std::cos(angle);
|
||||
W[k].im = std::sin(angle);
|
||||
}
|
||||
return W;
|
||||
}
|
||||
|
||||
// Stockham FFT with AVX2 for complex<float> (AoS: interleaved real, imag)
|
||||
void stockham_fft_avx2(std::complex<float>* data, std::complex<float>* temp,
|
||||
int N, bool inverse = false)
|
||||
{
|
||||
assert((N & (N - 1)) == 0); // power of two
|
||||
auto W = make_twiddles(N);
|
||||
const float inv_sign = inverse ? +1.0f : -1.0f;
|
||||
|
||||
std::complex<float>* in = data;
|
||||
std::complex<float>* out = temp;
|
||||
|
||||
int logN = 0;
|
||||
while ((1 << logN) < N) ++logN;
|
||||
|
||||
for(int stage = 0; stage < logN; ++stage) {
|
||||
int m = 1 << (stage + 1);
|
||||
int half_m = m >> 1;
|
||||
// stride between groups
|
||||
int group_stride = N / m;
|
||||
|
||||
for(int k = 0; k < N; k += m) {
|
||||
for(int j = 0; j < half_m; ++j) {
|
||||
// twiddle W_index:
|
||||
int w_index = j * group_stride;
|
||||
float w_re = W[w_index].re;
|
||||
float w_im = inv_sign * W[w_index].im; // invert sign for inverse
|
||||
|
||||
// Load w_re, w_im (we can broadcast them)
|
||||
__m256 w_re_b = _mm256_set1_ps(w_re);
|
||||
__m256 w_im_b = _mm256_set1_ps(w_im);
|
||||
|
||||
// Process 4 complex numbers at once in the j position strides
|
||||
// The 4 complex numbers are from positions:
|
||||
// in[k + j + 0*half_m], in[k + j + 1*half_m], in[k + j + 2*half_m], in[k + j + 3*half_m]
|
||||
// But that depends on how many half_m, whether half_m >=4 etc.
|
||||
// For simplicity, require half_m >=4 in vectorized branch.
|
||||
|
||||
if (half_m >= 4 && (j + 3*half_m) < N) {
|
||||
// Load real parts
|
||||
float *ptr_u = reinterpret_cast<float*>(&in[k + j]);
|
||||
float *ptr_t0 = reinterpret_cast<float*>(&in[k + j + half_m]);
|
||||
// Assuming interleaved: data layout: [Re0, Im0, Re1, Im1, ...]
|
||||
// We need gather 4 complex u's and t's with step half_m.
|
||||
|
||||
// Load u (4 complex): u0, u1, u2, u3
|
||||
__m256 u0 = _mm256_loadu_ps(reinterpret_cast<float*>(&in[k + j]));
|
||||
__m256 t0 = _mm256_loadu_ps(reinterpret_cast<float*>(&in[k + j + half_m]));
|
||||
|
||||
// Complex multiply t0 * w
|
||||
// Split t0 into re, im
|
||||
__m256 t0_re = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(2, 0, 2, 0)); // pick re lanes
|
||||
__m256 t0_im = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(3, 1, 3, 1)); // pick im
|
||||
|
||||
__m256 mul = cmul_avx2(t0_re, t0_im, w_re_b, w_im_b);
|
||||
|
||||
// Now compute:
|
||||
// out[k/2 + j + 0] = u + t * w
|
||||
// out[k/2 + j + N/2 + j] = u - t * w
|
||||
// But with vector, we do elementwise addition/subtraction
|
||||
|
||||
__m256 sum = _mm256_add_ps(u0, mul);
|
||||
__m256 diff = _mm256_sub_ps(u0, mul);
|
||||
|
||||
// Store sum and diff to their respective locations in out
|
||||
// Need to compute positions:
|
||||
|
||||
// Position for “sum”:
|
||||
std::complex<float>* out_sum = &out[k/2 + j];
|
||||
std::complex<float>* out_diff = &out[k/2 + j + N/2];
|
||||
|
||||
// Store
|
||||
_mm256_storeu_ps(reinterpret_cast<float*>(out_sum), sum);
|
||||
_mm256_storeu_ps(reinterpret_cast<float*>(out_diff), diff);
|
||||
|
||||
} else {
|
||||
// Fallback scalar for j's not fitting vectorization
|
||||
auto u = in[k + j];
|
||||
auto t = in[k + j + half_m] * std::complex<float>(w_re, w_im);
|
||||
|
||||
out[k/2 + j] = u + t;
|
||||
out[k/2 + j + N/2] = u - t;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Swap in/out buffers
|
||||
std::swap(in, out);
|
||||
}
|
||||
|
||||
// If number of stages is odd, data is currently in temp
|
||||
if (logN & 1) {
|
||||
for(int i = 0; i < N; ++i)
|
||||
data[i] = in[i];
|
||||
}
|
||||
|
||||
// Normalize for inverse
|
||||
if (inverse) {
|
||||
float invN = 1.0f / N;
|
||||
for(int i = 0; i < N; ++i) {
|
||||
data[i] *= invN;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void fft(std::vector<std::complex<float>> &x, bool inv = false)
|
||||
{
|
||||
const size_t N = x.size();
|
||||
if (N <= 1) return;
|
||||
|
||||
// Bit-reversed addressing permutation
|
||||
size_t j = 0;
|
||||
for(size_t i = 1; i < N; ++i)
|
||||
{
|
||||
size_t bit = N >> 1;
|
||||
while(j & bit)
|
||||
{
|
||||
j ^= bit;
|
||||
bit >>= 1;
|
||||
}
|
||||
j ^= bit;
|
||||
|
||||
if (i < j)
|
||||
{
|
||||
std::swap(x[i], x[j]);
|
||||
}
|
||||
}
|
||||
|
||||
// Iterative FFT
|
||||
for(size_t len = 2; len <= N; len <<= 1)
|
||||
{
|
||||
double angle = inv ? (2 * M_PI / len) : (-2 * M_PI / len);
|
||||
std::complex<float> wlen(std::cos(angle), std::sin(angle));
|
||||
for(size_t i = 0; i < N; i += len)
|
||||
{
|
||||
std::complex<float> w(1);
|
||||
for(size_t j = 0; j < len / 2; ++j)
|
||||
{
|
||||
std::complex<float> u = x[i + j];
|
||||
std::complex<float> v = x[i + j + len / 2] * w;
|
||||
x[i + j] = u + v;
|
||||
x[i + j + len / 2] = u - v;
|
||||
w *= wlen;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(inv)
|
||||
{
|
||||
for(size_t i = 0; i < N; i++)
|
||||
x[i] /= N;
|
||||
}
|
||||
}
|
||||
|
||||
double laplacian(const uint16_t *img, int32_t *out, uint32_t width, uint32_t height)
|
||||
{
|
||||
__m256 mean = _mm256_setzero_ps();
|
||||
__m256 M2 = _mm256_setzero_ps();
|
||||
uint32_t count = 0;
|
||||
for(uint32_t y = 1; y < height - 1; y++)
|
||||
{
|
||||
uint32_t row = (y - 1) * width;
|
||||
for(uint32_t x = 1; x < width - 17; x += 16)
|
||||
{
|
||||
__m256i p0 = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(img + row + x));
|
||||
__m256i p1 = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(img + (row + width) + x - 1));
|
||||
__m256i p2 = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(img + (row + width) + x));
|
||||
__m256i p3 = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(img + (row + width) + x + 1));
|
||||
__m256i p4 = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(img + (row + width * 2) + x));
|
||||
|
||||
__m256i sumA = _mm256_setzero_si256();
|
||||
__m256i sumB = _mm256_setzero_si256();
|
||||
|
||||
__m256i a,b;
|
||||
a = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(p0, 0));
|
||||
b = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(p0, 1));
|
||||
sumA = _mm256_add_epi32(sumA, a);
|
||||
sumB = _mm256_add_epi32(sumB, b);
|
||||
|
||||
a = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(p1, 0));
|
||||
b = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(p1, 1));
|
||||
sumA = _mm256_add_epi32(sumA, a);
|
||||
sumB = _mm256_add_epi32(sumB, b);
|
||||
|
||||
a = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(p2, 0));
|
||||
b = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(p2, 1));
|
||||
a = _mm256_sll_epi32(a, _mm_set1_epi64x(2));
|
||||
b = _mm256_sll_epi32(b, _mm_set1_epi64x(2));
|
||||
sumA = _mm256_sub_epi32(sumA, a);
|
||||
sumB = _mm256_sub_epi32(sumB, b);
|
||||
|
||||
a = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(p3, 0));
|
||||
b = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(p3, 1));
|
||||
sumA = _mm256_add_epi32(sumA, a);
|
||||
sumB = _mm256_add_epi32(sumB, b);
|
||||
|
||||
a = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(p4, 0));
|
||||
b = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(p4, 1));
|
||||
sumA = _mm256_add_epi32(sumA, a);
|
||||
sumB = _mm256_add_epi32(sumB, b);
|
||||
|
||||
if(out)
|
||||
{
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(out + row + x), sumA);
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(out + row + x + 8), sumB);
|
||||
}
|
||||
|
||||
__m256 af = _mm256_cvtepi32_ps(sumA);
|
||||
__m256 bf = _mm256_cvtepi32_ps(sumB);
|
||||
|
||||
count++;
|
||||
__m256 delta = _mm256_sub_ps(af, mean);
|
||||
mean = _mm256_add_ps(mean, _mm256_div_ps(delta, _mm256_set1_ps(static_cast<float>(count))));
|
||||
__m256 delta2 = _mm256_sub_ps(af, mean);
|
||||
M2 = _mm256_add_ps(M2, _mm256_mul_ps(delta, delta2));
|
||||
|
||||
count++;
|
||||
delta = _mm256_sub_ps(bf, mean);
|
||||
mean = _mm256_add_ps(mean, _mm256_div_ps(delta, _mm256_set1_ps(static_cast<float>(count))));
|
||||
delta2 = _mm256_sub_ps(bf, mean);
|
||||
M2 = _mm256_add_ps(M2, _mm256_mul_ps(delta, delta2));
|
||||
|
||||
//count += 1
|
||||
//delta = new_value - mean
|
||||
//mean += delta / count
|
||||
//delta2 = new_value - mean
|
||||
//M2 += delta * delta2
|
||||
}
|
||||
}
|
||||
float mean_2[8];
|
||||
float M2_2[8];
|
||||
_mm256_storeu_ps(mean_2, mean);
|
||||
_mm256_storeu_ps(M2_2, M2);
|
||||
|
||||
auto welford_merge = [](uint32_t n, float &mean_1, float mean_2, float &M2_1, float M2_2)
|
||||
{
|
||||
uint32_t count = 2 * n;
|
||||
float delta = mean_2 - mean_1;
|
||||
float mean = mean_1 + delta * ((float)n / count);
|
||||
float M2 = M2_1 + M2_2 + delta * delta * n * n / count;
|
||||
mean_1 = mean;
|
||||
M2_1 = M2;
|
||||
};
|
||||
|
||||
for(int i = 0; i < 8; i++)
|
||||
qDebug() << M2_2[i] / count;
|
||||
|
||||
welford_merge(count, mean_2[0], mean_2[1], M2_2[0], M2_2[1]);
|
||||
welford_merge(count, mean_2[2], mean_2[3], M2_2[2], M2_2[3]);
|
||||
welford_merge(count, mean_2[4], mean_2[5], M2_2[4], M2_2[5]);
|
||||
welford_merge(count, mean_2[6], mean_2[7], M2_2[6], M2_2[7]);
|
||||
|
||||
welford_merge(count * 2, mean_2[0], mean_2[2], M2_2[0], M2_2[2]);
|
||||
welford_merge(count * 2, mean_2[4], mean_2[6], M2_2[4], M2_2[6]);
|
||||
|
||||
welford_merge(count * 4, mean_2[0], mean_2[4], M2_2[0], M2_2[4]);
|
||||
|
||||
return (double)M2_2[0] / (count * 8);
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
SERFileReader ser;
|
||||
ser.open("/home/nou/.wine/drive_c/indi_2025-10-03/indi_record_2025-10-03@18-24-37.ser");
|
||||
|
||||
cv::Rect rect(1024, 1024, 128, 128);
|
||||
double maxQ = 0;
|
||||
cv::Mat best;
|
||||
cv::Mat lap;
|
||||
cv::Mat first(ser.height(), ser.width(), CV_16U);
|
||||
cv::Mat img(ser.height(), ser.width(), CV_16U);
|
||||
cv::Mat out(ser.height(), ser.width(), CV_32S);
|
||||
cv::Mat imgf32;
|
||||
ser.getFrame(0, (char*)first.data);
|
||||
first.convertTo(first, CV_32F);
|
||||
for(uint32_t i = 0; i < ser.frameCount(); i++)
|
||||
{
|
||||
ser.getFrame(i, (char*)img.data);
|
||||
double var = laplacian((uint16_t*)img.data, (int32_t*)out.data, img.cols, img.rows);
|
||||
double minval, maxval;
|
||||
cv::minMaxLoc(out, &minval, &maxval);
|
||||
out.convertTo(out, CV_32F, 1.0 / (maxval - minval), -minval / (maxval - minval));
|
||||
qDebug() << "minmax" << minval << maxval;
|
||||
cv::imshow("lap", out);
|
||||
|
||||
img.convertTo(imgf32, CV_32F);
|
||||
cv::Laplacian(imgf32, lap, CV_32F, 1);
|
||||
cv::minMaxLoc(lap, &minval, &maxval);
|
||||
qDebug() << "minmax" << minval << maxval;
|
||||
cv::Mat stddev;
|
||||
cv::Mat mean;
|
||||
cv::meanStdDev(lap, mean, stddev);
|
||||
lap -= minval;
|
||||
lap /= (maxval - minval);
|
||||
cv::imshow("lapcv", lap);
|
||||
cv::waitKey();
|
||||
qDebug() << var << std::sqrt(var) << stddev.at<double>(0);
|
||||
//continue;
|
||||
return 0;
|
||||
|
||||
img.convertTo(imgf32, CV_32F);
|
||||
cv::Laplacian(imgf32, lap, CV_32F, 1);
|
||||
cv::Point2d off = cv::phaseCorrelate(first(rect), imgf32(rect));
|
||||
if(maxQ < stddev.at<double>(0))
|
||||
{
|
||||
maxQ = stddev.at<double>(0);
|
||||
img.copyTo(best);
|
||||
//qDebug() << "new best" << i;
|
||||
}
|
||||
}
|
||||
cv::imshow("lap", best);
|
||||
cv::waitKeyEx();
|
||||
|
||||
return 0;
|
||||
|
||||
|
||||
cv::Mat img1= cv::imread("/home/nou/Obrázky/astro/moon_2025-10-03/R.tif", cv::IMREAD_GRAYSCALE);
|
||||
cv::Mat img2= cv::imread("/home/nou/Obrázky/astro/moon_2025-10-03/G.tif", cv::IMREAD_GRAYSCALE);
|
||||
|
||||
img1.convertTo(img1, CV_32F);
|
||||
img2.convertTo(img2, CV_32F);
|
||||
|
||||
cv::Point2d point = cv::phaseCorrelate(img1, img2);
|
||||
|
||||
|
||||
cv::Mat img2dst;
|
||||
cv::Mat t(2, 3, CV_32F);
|
||||
t.at<float>(0, 0) = 1.0;
|
||||
t.at<float>(1, 1) = 1.0;
|
||||
t.at<float>(0, 2) = -point.x;
|
||||
t.at<float>(1, 2) = -point.y;
|
||||
cv::warpAffine(img2, img2dst, t, img1.size());
|
||||
|
||||
//cv::imshow("img1", img1 / 64.0);
|
||||
auto diff = img1 - img2dst;
|
||||
double min, max;
|
||||
cv::minMaxLoc(diff, &min, &max);
|
||||
qDebug() << min << max;
|
||||
cv::imshow("img2", (diff - min) / (32));
|
||||
cv::imwrite("diff.png", (diff - min) / (max - min) * 255);
|
||||
cv::imwrite("avg.png", (img1 + img2dst) * 0.5);
|
||||
|
||||
cv::minMaxLoc(img1, &min, &max);
|
||||
qDebug() << min << max;
|
||||
cv::waitKey();
|
||||
|
||||
|
||||
return 0;
|
||||
/*QApplication a(argc, argv);
|
||||
MainWindow w;
|
||||
w.show();
|
||||
return a.exec();*/
|
||||
}
|
||||
Reference in New Issue
Block a user