AVX laplacian operator

2025-10-12 13:35:22 +02:00
commit 15fbb4ae33
8 changed files with 829 additions and 0 deletions
@@ -0,0 +1,410 @@
+#include "mainwindow.h"
+
+#include "serfile.h"
+
+#include <immintrin.h>
+
+#include <cassert>
+#include <cmath>
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <iostream>
+#include <stdexcept>
+#include <vector>
+
+#include <QApplication>
+#include <opencv2/opencv.hpp>
+
+// One FFT twiddle factor W = re + i*im, kept as two single-precision floats.
+struct Twiddle {
+    float re;  // real part (cosine term)
+    float im;  // imaginary part (sine term)
+};
+
+// Lane-wise complex product (are + i*aim) * (bre + i*bim) over eight float lanes.
+//
+// The operands are in split format: `are`/`bre` carry the real parts and
+// `aim`/`bim` carry the imaginary parts of the same eight complex numbers.
+//
+// NOTE: only half of every result survives in the returned vector.
+//   lanes 0-3 : REAL part of products 0-3
+//   lanes 4-7 : IMAGINARY part of products 4-7
+// The imaginary parts of products 0-3 and the real parts of products 4-7 are
+// discarded, so the return value must not be treated as interleaved
+// [re, im, re, im, ...] data.
+inline __m256 cmul_avx2(__m256 are, __m256 aim, __m256 bre, __m256 bim) {
+    // real = are*bre - aim*bim
+    const __m256 real_part = _mm256_sub_ps(_mm256_mul_ps(are, bre),
+                                           _mm256_mul_ps(aim, bim));
+    // imag = are*bim + aim*bre
+    const __m256 imag_part = _mm256_add_ps(_mm256_mul_ps(are, bim),
+                                           _mm256_mul_ps(aim, bre));
+
+    // Mask 0xF0 keeps lanes 0-3 of real_part and takes lanes 4-7 from imag_part.
+    return _mm256_blend_ps(real_part, imag_part, 0xF0);
+}
+
+// Build the forward-transform twiddle table for a length-N FFT:
+//
+//     W[k] = exp(-2*pi*i*k/N) = (cos(2*pi*k/N), -sin(2*pi*k/N)),  k = 0 .. N/2-1
+//
+// The angle is evaluated in double precision and rounded to float only when it
+// is stored; computing it in float loses accuracy as k grows (for example
+// cos(-pi/2) comes out as -4.4e-8 instead of ~0).
+//
+// Returns an empty table for N < 2. This also covers negative N, which would
+// otherwise be converted into an enormous unsigned vector size.
+static std::vector<Twiddle> make_twiddles(int N) {
+    if (N < 2)
+        return {};
+
+    std::vector<Twiddle> W(static_cast<size_t>(N / 2));
+    const double step = -2.0 * std::acos(-1.0) / static_cast<double>(N);
+    for (size_t k = 0; k < W.size(); ++k) {
+        const double angle = step * static_cast<double>(k);
+        W[k].re = static_cast<float>(std::cos(angle));
+        W[k].im = static_cast<float>(std::sin(angle));
+    }
+    return W;
+}
+
+// Radix-2 Stockham autosort FFT on interleaved complex<float> data
+// (memory layout [re0, im0, re1, im1, ...]).
+//
+//   data    : N input samples; receives the N output bins in natural order.
+//   temp    : scratch buffer of at least N elements; must not overlap `data`.
+//   N       : transform length, must be a power of two (checked with assert).
+//   inverse : false -> X[k] = sum_n x[n] * exp(-2*pi*i*n*k/N), unscaled;
+//             true  -> conjugated twiddles, result scaled by 1/N.
+//
+// Every pass works on sub-transforms of length n with stride s (n * s == N),
+// starting at n = N, s = 1. For p in [0, n/2) and q in [0, s):
+//
+//     a = in[q + s*p]             b = in[q + s*(p + n/2)]
+//     out[q + s*(2p)]     = a + b
+//     out[q + s*(2p + 1)] = (a - b) * W_n^p
+//
+// then the buffers swap roles, n halves and s doubles. No bit-reversal step is
+// needed. The twiddle depends only on p, and the q-run is contiguous in both
+// buffers, so once s >= 4 four complex values are processed per 256-bit vector.
+//
+// Despite the name only AVX (not AVX2) instructions are used.
+void stockham_fft_avx2(std::complex<float>* data, std::complex<float>* temp,
+                       int N, bool inverse = false)
+{
+    assert(N >= 0 && (N & (N - 1)) == 0); // power of two
+    if (N < 2)
+        return; // a 0- or 1-point transform is the identity
+
+    const auto W = make_twiddles(N);
+    // The table holds forward factors exp(-i*theta); the inverse transform
+    // needs their complex conjugates.
+    const float im_sign = inverse ? -1.0f : 1.0f;
+
+    std::complex<float>* in = data;
+    std::complex<float>* out = temp;
+
+    for (int n = N, s = 1; n > 1; n >>= 1, s <<= 1) {
+        const int m = n >> 1;
+
+        for (int p = 0; p < m; ++p) {
+            // W_n^p == W_N^(p*s) because s == N/n; p*s < N/2 always holds.
+            const Twiddle& tw = W[static_cast<size_t>(p) * static_cast<size_t>(s)];
+            const float w_re = tw.re;
+            const float w_im = im_sign * tw.im;
+
+            // std::complex<float> arrays may be accessed as float pairs.
+            const float* a_ptr = reinterpret_cast<const float*>(in + s * p);
+            const float* b_ptr = reinterpret_cast<const float*>(in + s * (p + m));
+            float* sum_ptr = reinterpret_cast<float*>(out + s * (2 * p));
+            float* dif_ptr = reinterpret_cast<float*>(out + s * (2 * p + 1));
+
+            // w      = [wr, wi, wr, wi, ...]
+            // w_swap = [wi, wr, wi, wr, ...]
+            const __m256 w = _mm256_setr_ps(w_re, w_im, w_re, w_im,
+                                            w_re, w_im, w_re, w_im);
+            const __m256 w_swap = _mm256_permute_ps(w, 0xB1);
+
+            int q = 0;
+            for (; q + 4 <= s; q += 4) {
+                const __m256 a = _mm256_loadu_ps(a_ptr + 2 * q);
+                const __m256 b = _mm256_loadu_ps(b_ptr + 2 * q);
+                const __m256 d = _mm256_sub_ps(a, b);
+
+                // Interleaved complex multiply d * w:
+                //   even lanes: dr*wr - di*wi   odd lanes: dr*wi + di*wr
+                const __m256 rotated = _mm256_addsub_ps(
+                    _mm256_mul_ps(_mm256_moveldup_ps(d), w),
+                    _mm256_mul_ps(_mm256_movehdup_ps(d), w_swap));
+
+                _mm256_storeu_ps(sum_ptr + 2 * q, _mm256_add_ps(a, b));
+                _mm256_storeu_ps(dif_ptr + 2 * q, rotated);
+            }
+
+            // Scalar path for the early passes where s < 4.
+            for (; q < s; ++q) {
+                const float ar = a_ptr[2 * q];
+                const float ai = a_ptr[2 * q + 1];
+                const float br = b_ptr[2 * q];
+                const float bi = b_ptr[2 * q + 1];
+                const float dr = ar - br;
+                const float di = ai - bi;
+
+                sum_ptr[2 * q] = ar + br;
+                sum_ptr[2 * q + 1] = ai + bi;
+                dif_ptr[2 * q] = dr * w_re - di * w_im;
+                dif_ptr[2 * q + 1] = dr * w_im + di * w_re;
+            }
+        }
+
+        // The buffer just written becomes the input of the next pass.
+        std::swap(in, out);
+    }
+
+    // After an odd number of passes the result sits in `temp`.
+    if (in != data) {
+        for (int i = 0; i < N; ++i)
+            data[i] = in[i];
+    }
+
+    // Normalize for inverse
+    if (inverse) {
+        const float invN = 1.0f / static_cast<float>(N);
+        for (int i = 0; i < N; ++i)
+            data[i] *= invN;
+    }
+}
+
+
+// In-place iterative radix-2 Cooley-Tukey FFT.
+//
+//   x   : samples on entry, spectrum on return. x.size() must be a power of
+//         two; sizes 0 and 1 are returned unchanged.
+//   inv : false -> forward transform with exp(-2*pi*i*n*k/N), unscaled;
+//         true  -> inverse transform with exp(+2*pi*i*n*k/N), scaled by 1/N.
+//
+// Throws std::invalid_argument when the size is not a power of two (the
+// butterfly loops would otherwise index past the end of the vector).
+//
+// The running twiddle factor is advanced in double precision and rounded to
+// float once per butterfly, so rounding error does not build up along long
+// stages the way a float recurrence does.
+void fft(std::vector<std::complex<float>> &x, bool inv = false)
+{
+    const size_t N = x.size();
+    if (N <= 1) return;
+    if ((N & (N - 1)) != 0)
+        throw std::invalid_argument("fft: input length must be a power of two");
+
+    // Bit-reversed addressing permutation: j tracks the bit-reversed value of i.
+    for(size_t i = 1, j = 0; i < N; ++i)
+    {
+        size_t bit = N >> 1;
+        for(; j & bit; bit >>= 1)
+            j ^= bit;
+        j ^= bit;
+
+        if (i < j)
+            std::swap(x[i], x[j]);
+    }
+
+    // Iterative butterflies, doubling the sub-transform length each stage.
+    const double two_pi = 2.0 * std::acos(-1.0);
+    for(size_t len = 2; len <= N; len <<= 1)
+    {
+        const size_t half = len / 2;
+        const double angle = (inv ? two_pi : -two_pi) / static_cast<double>(len);
+        const std::complex<double> wlen(std::cos(angle), std::sin(angle));
+        for(size_t i = 0; i < N; i += len)
+        {
+            std::complex<double> w(1.0, 0.0);
+            for(size_t k = 0; k < half; ++k)
+            {
+                const std::complex<float> wf(static_cast<float>(w.real()),
+                                             static_cast<float>(w.imag()));
+                const std::complex<float> u = x[i + k];
+                const std::complex<float> v = x[i + k + half] * wf;
+                x[i + k] = u + v;
+                x[i + k + half] = u - v;
+                w *= wlen;
+            }
+        }
+    }
+
+    if(inv)
+    {
+        // N is a power of two, so 1/N is exact in float.
+        const float scale = 1.0f / static_cast<float>(N);
+        for(size_t i = 0; i < N; i++)
+            x[i] *= scale;
+    }
+}
+
+// 4-neighbour Laplacian of a 16-bit greyscale image plus its variance.
+//
+// For every interior pixel (1 <= x <= width-2, 1 <= y <= height-2):
+//
+//     L(x, y) = I(x, y-1) + I(x-1, y) + I(x+1, y) + I(x, y+1) - 4 * I(x, y)
+//
+//   img    : width*height pixels, row-major, no row padding.
+//   out    : optional (may be null). When given it must hold width*height
+//            int32 values; L(x, y) is written to out[y*width + x]. Border
+//            elements are left untouched.
+//   width, height : image dimensions in pixels.
+//
+// Returns the population variance of L over all interior pixels, or 0.0 when
+// the image has no interior pixel (width < 3 or height < 3) or img is null.
+//
+// Eight pixels are processed per AVX2 vector and the leftover columns of each
+// row are handled by a scalar loop. The variance is accumulated with
+// Welford's algorithm (one accumulator per vector lane plus one for the
+// scalar tail); the partial results are merged with the pairwise update for
+// sample sets of different sizes.
+double laplacian(const uint16_t *img, int32_t *out, uint32_t width, uint32_t height)
+{
+    // Guards the unsigned "size - 1" arithmetic below against wrap-around.
+    if (img == nullptr || width < 3 || height < 3)
+        return 0.0;
+
+    // Widen 8 consecutive 16-bit pixels to eight 32-bit lanes.
+    const auto load8 = [](const uint16_t *p) {
+        return _mm256_cvtepu16_epi32(
+            _mm_loadu_si128(reinterpret_cast<const __m128i*>(p)));
+    };
+
+    const size_t w = width;
+    const size_t h = height;
+
+    // Per-lane Welford state; every lane has seen lane_count samples.
+    __m256 lane_mean = _mm256_setzero_ps();
+    __m256 lane_m2 = _mm256_setzero_ps();
+    uint32_t lane_count = 0;
+
+    // Welford state of the scalar tail columns.
+    double tail_mean = 0.0;
+    double tail_m2 = 0.0;
+    uint64_t tail_count = 0;
+
+    for(size_t y = 1; y + 1 < h; y++)
+    {
+        const uint16_t *above = img + (y - 1) * w;
+        const uint16_t *centre = above + w;
+        const uint16_t *below = centre + w;
+        int32_t *dst = out ? out + y * w : nullptr;
+
+        size_t x = 1;
+
+        // The right-neighbour load reads columns x+1 .. x+8, so x+8 must be a
+        // valid column index.
+        for(; x + 8 < w; x += 8)
+        {
+            const __m256i vertical = _mm256_add_epi32(load8(above + x), load8(below + x));
+            const __m256i horizontal = _mm256_add_epi32(load8(centre + x - 1), load8(centre + x + 1));
+            const __m256i centre4 = _mm256_slli_epi32(load8(centre + x), 2);
+            const __m256i lap = _mm256_sub_epi32(_mm256_add_epi32(vertical, horizontal), centre4);
+
+            if(dst)
+                _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst + x), lap);
+
+            // Welford update, one independent accumulator per lane:
+            //   count += 1; delta = v - mean; mean += delta / count;
+            //   M2 += delta * (v - mean)
+            const __m256 v = _mm256_cvtepi32_ps(lap);
+            lane_count++;
+            const __m256 delta = _mm256_sub_ps(v, lane_mean);
+            lane_mean = _mm256_add_ps(lane_mean,
+                _mm256_div_ps(delta, _mm256_set1_ps(static_cast<float>(lane_count))));
+            lane_m2 = _mm256_add_ps(lane_m2, _mm256_mul_ps(delta, _mm256_sub_ps(v, lane_mean)));
+        }
+
+        // Remaining interior columns of this row.
+        for(; x + 1 < w; x++)
+        {
+            const int32_t lap = static_cast<int32_t>(above[x])
+                              + static_cast<int32_t>(centre[x - 1])
+                              + static_cast<int32_t>(centre[x + 1])
+                              + static_cast<int32_t>(below[x])
+                              - 4 * static_cast<int32_t>(centre[x]);
+            if(dst)
+                dst[x] = lap;
+
+            tail_count++;
+            const double delta = lap - tail_mean;
+            tail_mean += delta / static_cast<double>(tail_count);
+            tail_m2 += delta * (lap - tail_mean);
+        }
+    }
+
+    float mean_lanes[8];
+    float m2_lanes[8];
+    _mm256_storeu_ps(mean_lanes, lane_mean);
+    _mm256_storeu_ps(m2_lanes, lane_m2);
+
+    // Fold all partial results into one (count, mean, M2) triple:
+    //   delta = mean_b - mean_a;  n = n_a + n_b
+    //   mean  = mean_a + delta * n_b / n
+    //   M2    = M2_a + M2_b + delta^2 * n_a * n_b / n
+    double total_mean = tail_mean;
+    double total_m2 = tail_m2;
+    uint64_t total_count = tail_count;
+    const auto merge = [&](double mean_b, double m2_b, uint64_t n_b)
+    {
+        if(n_b == 0)
+            return;
+        const uint64_t n = total_count + n_b;
+        const double delta = mean_b - total_mean;
+        total_mean += delta * static_cast<double>(n_b) / static_cast<double>(n);
+        total_m2 += m2_b + delta * delta
+                  * (static_cast<double>(total_count) * static_cast<double>(n_b) / static_cast<double>(n));
+        total_count = n;
+    };
+
+    for(int i = 0; i < 8; i++)
+        merge(mean_lanes[i], m2_lanes[i], lane_count);
+
+    if(total_count == 0)
+        return 0.0;
+    return total_m2 / static_cast<double>(total_count);
+}
+
+// Scratch test harness: reads the first frame of a SER recording, runs the AVX
+// laplacian() on it and shows the result next to OpenCV's cv::Laplacian so the
+// two images and their standard deviations can be compared.
+//
+// NOTE: the frame loop returns after its first iteration, so the "best frame"
+// selection below it and the whole R/G alignment experiment at the end are
+// currently unreachable; they are kept for reference. The Qt MainWindow
+// start-up is commented out.
+int main(int argc, char *argv[])
+{
+    SERFileReader ser;
+    ser.open("/home/nou/.wine/drive_c/indi_2025-10-03/indi_record_2025-10-03@18-24-37.ser");
+
+    cv::Rect rect(1024, 1024, 128, 128);
+    double maxQ = 0;
+    cv::Mat best;
+    cv::Mat lap;
+    cv::Mat first(ser.height(), ser.width(), CV_16U);
+    cv::Mat img(ser.height(), ser.width(), CV_16U);
+    // Zero-initialised: laplacian() does not write every element, and
+    // cv::minMaxLoc below scans the whole matrix.
+    cv::Mat out = cv::Mat::zeros(ser.height(), ser.width(), CV_32S);
+    cv::Mat imgf32;
+    ser.getFrame(0, reinterpret_cast<char*>(first.data));
+    first.convertTo(first, CV_32F);
+    for(uint32_t i = 0; i < ser.frameCount(); i++)
+    {
+        ser.getFrame(i, reinterpret_cast<char*>(img.data));
+
+        // Hand-written AVX Laplacian, rescaled to [0, 1] for display.
+        double var = laplacian(reinterpret_cast<uint16_t*>(img.data),
+                               reinterpret_cast<int32_t*>(out.data), img.cols, img.rows);
+        double minval, maxval;
+        cv::minMaxLoc(out, &minval, &maxval);
+        out.convertTo(out, CV_32F, 1.0 / (maxval - minval), -minval / (maxval - minval));
+        qDebug() << "minmax" << minval << maxval;
+        cv::imshow("lap", out);
+
+        // Reference result from OpenCV, rescaled the same way.
+        img.convertTo(imgf32, CV_32F);
+        cv::Laplacian(imgf32, lap, CV_32F, 1);
+        cv::minMaxLoc(lap, &minval, &maxval);
+        qDebug() << "minmax" << minval << maxval;
+        cv::Mat stddev;
+        cv::Mat mean;
+        cv::meanStdDev(lap, mean, stddev);
+        lap -= minval;
+        lap /= (maxval - minval);
+        cv::imshow("lapcv", lap);
+        cv::waitKey();
+        qDebug() << var << std::sqrt(var) << stddev.at<double>(0);
+        //continue;
+        return 0;
+
+        // Unreachable while the return above is in place: keep the frame with
+        // the largest Laplacian standard deviation.
+        img.convertTo(imgf32, CV_32F);
+        cv::Laplacian(imgf32, lap, CV_32F, 1);
+        cv::Point2d off = cv::phaseCorrelate(first(rect), imgf32(rect));
+        if(maxQ < stddev.at<double>(0))
+        {
+            maxQ = stddev.at<double>(0);
+            img.copyTo(best);
+            //qDebug() << "new best" << i;
+        }
+    }
+    // Only reached when the recording has no frames; `best` is then empty and
+    // cv::imshow would throw on it.
+    if(!best.empty())
+    {
+        cv::imshow("lap", best);
+        cv::waitKeyEx();
+    }
+
+    return 0;
+
+
+    // Unreachable experiment: align the G channel to the R channel by phase
+    // correlation and write the difference and average images.
+    cv::Mat img1= cv::imread("/home/nou/Obrázky/astro/moon_2025-10-03/R.tif", cv::IMREAD_GRAYSCALE);
+    cv::Mat img2= cv::imread("/home/nou/Obrázky/astro/moon_2025-10-03/G.tif", cv::IMREAD_GRAYSCALE);
+
+    img1.convertTo(img1, CV_32F);
+    img2.convertTo(img2, CV_32F);
+
+    cv::Point2d point = cv::phaseCorrelate(img1, img2);
+
+
+    // 2x3 affine matrix; only the four entries below are assigned.
+    cv::Mat img2dst;
+    cv::Mat t(2, 3, CV_32F);
+    t.at<float>(0, 0) = 1.0;
+    t.at<float>(1, 1) = 1.0;
+    t.at<float>(0, 2) = -point.x;
+    t.at<float>(1, 2) = -point.y;
+    cv::warpAffine(img2, img2dst, t, img1.size());
+
+    //cv::imshow("img1", img1 / 64.0);
+    auto diff = img1 - img2dst;
+    double min, max;
+    cv::minMaxLoc(diff, &min, &max);
+    qDebug()  << min << max;
+    cv::imshow("img2", (diff - min) / (32));
+    cv::imwrite("diff.png", (diff - min) / (max - min) * 255);
+    cv::imwrite("avg.png", (img1 + img2dst) * 0.5);
+
+    cv::minMaxLoc(img1, &min, &max);
+    qDebug()  << min << max;
+    cv::waitKey();
+
+
+    return 0;
+    /*QApplication a(argc, argv);
+    MainWindow w;
+    w.show();
+    return a.exec();*/
+}