// kouryu/main.cpp

#include "mainwindow.h"

#include "serfile.h"
#include <QApplication>
#include <opencv2/opencv.hpp>

#include <immintrin.h>

#include <cassert>
#include <cmath>
#include <complex>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <utility>
#include <vector>

// Twiddle factor: one complex FFT rotation coefficient, stored as separate
// single-precision real and imaginary parts (filled in by make_twiddles()).
struct Twiddle {
    float re;  // real part (cosine of the twiddle angle)
    float im;  // imaginary part (sine of the twiddle angle)
};

// Lane-wise complex product of split (real / imaginary) operands:
//   (a + i b) * (c + i d) = (a*c - b*d) + i(a*d + b*c)
//
// are/aim hold the real and imaginary parts of the left operands, bre/bim
// those of the right operands, eight lanes each.
//
// Return layout: lanes 0-3 carry the REAL parts of products 0-3 and lanes 4-7
// carry the IMAGINARY parts of products 4-7. The imaginary parts of products
// 0-3 and the real parts of products 4-7 are computed but dropped.
// NOTE(review): the result is therefore only a complete complex product when
// the caller has replicated the same four operands into both 128-bit halves.
inline __m256 cmul_avx2(__m256 are, __m256 aim, __m256 bre, __m256 bim) {
    // Real part of every lane: are*bre - aim*bim.
    const __m256 prod_re = _mm256_sub_ps(_mm256_mul_ps(are, bre),
                                         _mm256_mul_ps(aim, bim));
    // Imaginary part of every lane: are*bim + aim*bre.
    const __m256 prod_im = _mm256_add_ps(_mm256_mul_ps(are, bim),
                                         _mm256_mul_ps(aim, bre));

    // Mask 0xF0: lanes 4-7 are taken from prod_im, lanes 0-3 from prod_re.
    return _mm256_blend_ps(prod_re, prod_im, 0xF0);
}

// Precompute twiddles
static std::vector<Twiddle> make_twiddles(int N) {
    std::vector<Twiddle> W(N/2);
    const float PI = std::acos(-1.0f);
    for(int k = 0; k < N/2; ++k) {
        float angle = -2.0f * PI * k / N;
        W[k].re = std::cos(angle);
        W[k].im = std::sin(angle);
    }
    return W;
}

// Radix-2 Stockham autosort FFT for interleaved std::complex<float> data
// (memory layout [Re0, Im0, Re1, Im1, ...]).
//
//   data    : N input samples; receives the N output bins in natural order.
//   temp    : scratch buffer of at least N elements; must not overlap data.
//   N       : transform length, a power of two.
//   inverse : false -> forward transform using exp(-2*pi*i*k/N);
//             true  -> inverse transform, including the 1/N normalisation.
//
// Each stage treats the buffer as `s` interleaved sequences of length `n`
// (n halves and s doubles per stage). For butterfly index p and sequence
// index q:
//     out[q + s*(2p)]   =  in[q + s*p] + in[q + s*(p + n/2)]
//     out[q + s*(2p+1)] = (in[q + s*p] - in[q + s*(p + n/2)]) * W_n^p
// The q-run is contiguous in both buffers and shares one twiddle, so it is
// processed four complex values at a time with AVX once s >= 4; shorter runs
// (the first two stages) use the scalar path. No bit-reversal pass is needed
// because the output indexing sorts the result as it goes.
void stockham_fft_avx2(std::complex<float>* data, std::complex<float>* temp,
                       int N, bool inverse = false)
{
    assert(N >= 0 && (N & (N - 1)) == 0); // power of two
    if (N <= 1) return;                   // length 0/1: the transform is the identity

    const auto W = make_twiddles(N);
    // make_twiddles() already stores the forward twiddles exp(-2*pi*i*k/N);
    // the inverse transform needs their complex conjugates.
    const float im_sign = inverse ? -1.0f : +1.0f;

    std::complex<float>* in = data;
    std::complex<float>* out = temp;

    for (int n = N, s = 1; n > 1; n >>= 1, s <<= 1) {
        const int m = n >> 1;
        for (int p = 0; p < m; ++p) {
            // W_n^p == W_N^(p*s) because s == N / n.
            const float w_re = W[p * s].re;
            const float w_im = im_sign * W[p * s].im;

            const std::complex<float>* a_ptr = in + s * p;
            const std::complex<float>* b_ptr = in + s * (p + m);
            std::complex<float>* sum_ptr = out + s * (2 * p);
            std::complex<float>* dif_ptr = out + s * (2 * p + 1);

            const __m256 wr = _mm256_set1_ps(w_re);
            const __m256 wi = _mm256_set1_ps(w_im);

            int q = 0;
            for (; q + 4 <= s; q += 4) {
                const __m256 a = _mm256_loadu_ps(reinterpret_cast<const float*>(a_ptr + q));
                const __m256 b = _mm256_loadu_ps(reinterpret_cast<const float*>(b_ptr + q));

                const __m256 sum = _mm256_add_ps(a, b);
                const __m256 d = _mm256_sub_ps(a, b);

                // (dr + i di) * (wr + i wi) on interleaved lanes:
                //   even lanes: dr*wr - di*wi, odd lanes: di*wr + dr*wi.
                // Swapping each (re, im) pair lines di up under dr so that a
                // single addsub yields both components.
                const __m256 d_swapped = _mm256_permute_ps(d, _MM_SHUFFLE(2, 3, 0, 1));
                const __m256 rotated = _mm256_addsub_ps(_mm256_mul_ps(d, wr),
                                                        _mm256_mul_ps(d_swapped, wi));

                _mm256_storeu_ps(reinterpret_cast<float*>(sum_ptr + q), sum);
                _mm256_storeu_ps(reinterpret_cast<float*>(dif_ptr + q), rotated);
            }

            // Scalar path: whole run when s < 4 (s is a power of two, so no
            // remainder is left after the vector loop otherwise).
            const std::complex<float> w(w_re, w_im);
            for (; q < s; ++q) {
                const std::complex<float> a = a_ptr[q];
                const std::complex<float> b = b_ptr[q];
                sum_ptr[q] = a + b;
                dif_ptr[q] = (a - b) * w;
            }
        }

        // The buffer just written becomes the input of the next stage.
        std::swap(in, out);
    }

    // After the final swap `in` points at the buffer holding the result.
    if (in != data) {
        for (int i = 0; i < N; ++i)
            data[i] = in[i];
    }

    // Normalize for inverse
    if (inverse) {
        const float invN = 1.0f / static_cast<float>(N);
        for (int i = 0; i < N; ++i)
            data[i] *= invN;
    }
}


// In-place iterative radix-2 Cooley-Tukey FFT.
//
//   x   : samples on input, spectrum on output (natural order). The length
//         must be a power of two; lengths 0 and 1 are returned unchanged.
//   inv : false -> forward transform using exp(-2*pi*i*k/N);
//         true  -> inverse transform, including the 1/N normalisation.
//
// Throws std::invalid_argument when the length is not a power of two, since
// the butterflies would otherwise index past the end of the vector.
void fft(std::vector<std::complex<float>> &x, bool inv = false)
{
    const size_t N = x.size();
    if (N <= 1) return;
    if ((N & (N - 1)) != 0)
        throw std::invalid_argument("fft: input length must be a power of two");

    // Bit-reversed addressing permutation: `rev` is the bit-reversed
    // counterpart of i, advanced incrementally from the most significant bit.
    size_t rev = 0;
    for (size_t i = 1; i < N; ++i)
    {
        size_t bit = N >> 1;
        while (rev & bit)
        {
            rev ^= bit;
            bit >>= 1;
        }
        rev ^= bit;

        if (i < rev)
            std::swap(x[i], x[rev]);
    }

    const double two_pi = 2.0 * std::acos(-1.0);
    const double direction = inv ? 1.0 : -1.0;

    // Iterative FFT: combine sub-transforms of length len/2 into length len.
    for (size_t len = 2; len <= N; len <<= 1)
    {
        const size_t half = len / 2;
        const double angle = direction * two_pi / static_cast<double>(len);
        const std::complex<double> wlen(std::cos(angle), std::sin(angle));

        for (size_t base = 0; base < N; base += len)
        {
            // The twiddle recurrence runs in double so rounding error does not
            // build up over long butterflies; it is narrowed per multiply.
            std::complex<double> w(1.0, 0.0);
            for (size_t k = 0; k < half; ++k)
            {
                const std::complex<float> wf(static_cast<float>(w.real()),
                                             static_cast<float>(w.imag()));
                const std::complex<float> u = x[base + k];
                const std::complex<float> v = x[base + k + half] * wf;
                x[base + k] = u + v;
                x[base + k + half] = u - v;
                w *= wlen;
            }
        }
    }

    if (inv)
    {
        const float n = static_cast<float>(N);
        for (auto &value : x)
            value /= n;
    }
}

// 4-neighbour Laplacian of a 16-bit image plus the variance of the response.
//
//   img    : row-major image, `width` pixels per row, `height` rows.
//   out    : optional (may be null) row-major int32 buffer of the same
//            dimensions. The response up + down + left + right - 4*centre is
//            written at the centre pixel's position. Pixels that are not
//            processed (see below) are left untouched.
//   width, height : image dimensions in pixels.
//
// Returns the population variance of the response over all processed pixels,
// or 0.0 when the image is too small for a single block.
//
// Processing is done in blocks of 16 pixels starting at column 1, for rows
// 1 .. height-2. A block at column x reads columns x-1 .. x+16, so columns
// right of the last complete block (and the one-pixel border) are skipped.
//
// The variance is accumulated with Welford's update, independently in each
// of the eight float lanes; the per-lane results are merged pairwise at the
// end. Every lane receives exactly `count` samples, which is what allows the
// equal-size merge formula used below.
double laplacian(const uint16_t *img, int32_t *out, uint32_t width, uint32_t height)
{
    __m256 mean = _mm256_setzero_ps();
    __m256 M2 = _mm256_setzero_ps();
    uint32_t count = 0; // samples accumulated so far in each lane

    const auto load16 = [](const uint16_t *p) {
        return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
    };
    // Zero-extend the lower / upper eight uint16 values to int32.
    const auto widen_lo = [](__m256i v) {
        return _mm256_cvtepu16_epi32(_mm256_castsi256_si128(v));
    };
    const auto widen_hi = [](__m256i v) {
        return _mm256_cvtepu16_epi32(_mm256_extracti128_si256(v, 1));
    };
    const auto lap8 = [](__m256i up, __m256i left, __m256i mid, __m256i right, __m256i down) {
        const __m256i ring = _mm256_add_epi32(_mm256_add_epi32(up, down),
                                              _mm256_add_epi32(left, right));
        return _mm256_sub_epi32(ring, _mm256_slli_epi32(mid, 2));
    };
    // Welford update, one new sample per lane:
    //   count += 1; delta = v - mean; mean += delta / count;
    //   M2 += delta * (v - mean)
    const auto welford_push = [&mean, &M2, &count](__m256i sample) {
        const __m256 v = _mm256_cvtepi32_ps(sample);
        ++count;
        const __m256 delta = _mm256_sub_ps(v, mean);
        mean = _mm256_add_ps(mean, _mm256_div_ps(delta, _mm256_set1_ps(static_cast<float>(count))));
        M2 = _mm256_add_ps(M2, _mm256_mul_ps(delta, _mm256_sub_ps(v, mean)));
    };

    // Loop bounds are written as additions on size_t so that tiny images
    // (height < 3, width < 18) simply yield no iterations instead of wrapping
    // around as `height - 1` / `width - 17` would on unsigned operands.
    const size_t stride = width;
    for (size_t y = 1; y + 1 < height; ++y)
    {
        for (size_t x = 1; x + 17 <= stride; x += 16)
        {
            const size_t center = y * stride + x;
            const uint16_t *c = img + center;

            const __m256i up = load16(c - stride);
            const __m256i left = load16(c - 1);
            const __m256i mid = load16(c);
            const __m256i right = load16(c + 1);
            const __m256i down = load16(c + stride);

            const __m256i lapA = lap8(widen_lo(up), widen_lo(left), widen_lo(mid),
                                      widen_lo(right), widen_lo(down));
            const __m256i lapB = lap8(widen_hi(up), widen_hi(left), widen_hi(mid),
                                      widen_hi(right), widen_hi(down));

            if (out)
            {
                // Store at the stencil centre so `out` lines up with `img`.
                _mm256_storeu_si256(reinterpret_cast<__m256i*>(out + center), lapA);
                _mm256_storeu_si256(reinterpret_cast<__m256i*>(out + center + 8), lapB);
            }

            welford_push(lapA);
            welford_push(lapB);
        }
    }

    // Nothing processed: avoid the 0/0 below.
    if (count == 0)
        return 0.0;

    float lane_mean[8];
    float lane_M2[8];
    _mm256_storeu_ps(lane_mean, mean);
    _mm256_storeu_ps(lane_M2, M2);

    // NOTE(review): per-lane variance debug output, emitted on every call.
    for(int i = 0; i < 8; i++)
        qDebug() << lane_M2[i] / count;

    // Merge the eight lanes pairwise in double precision. For two sets of
    // equal size n: mean = mean_a + delta/2 and
    // M2 = M2_a + M2_b + delta^2 * n/2, with delta = mean_b - mean_a.
    double merged_mean[8];
    double merged_M2[8];
    for (int i = 0; i < 8; ++i)
    {
        merged_mean[i] = lane_mean[i];
        merged_M2[i] = lane_M2[i];
    }

    double n = count; // samples currently represented by each partial result
    for (int step = 1; step < 8; step *= 2)
    {
        for (int i = 0; i < 8; i += 2 * step)
        {
            const double delta = merged_mean[i + step] - merged_mean[i];
            merged_mean[i] += 0.5 * delta;
            merged_M2[i] += merged_M2[i + step] + delta * delta * n * 0.5;
        }
        n *= 2.0;
    }

    return merged_M2[0] / (8.0 * static_cast<double>(count));
}

// Experimental driver. As written it opens a hard-coded SER recording,
// computes the Laplacian of frame 0 twice (with laplacian() above and with
// cv::Laplacian), shows both results, prints their spread, and exits.
// Everything after the first `return 0;` inside the frame loop is currently
// unreachable; it is kept as work-in-progress code (best-frame selection and
// a channel-alignment experiment). argc/argv are only used by the
// commented-out Qt start-up at the end.
int main(int argc, char *argv[])
{
    SERFileReader ser;
    // NOTE(review): the result of open() is not checked here — confirm how
    // SERFileReader reports a missing/unreadable file.
    ser.open("/home/nou/.wine/drive_c/indi_2025-10-03/indi_record_2025-10-03@18-24-37.ser");

    // Region of interest used for phase correlation further down.
    cv::Rect rect(1024, 1024, 128, 128);
    double maxQ = 0;
    cv::Mat best;
    cv::Mat lap;
    cv::Mat first(ser.height(), ser.width(), CV_16U);
    cv::Mat img(ser.height(), ser.width(), CV_16U);
    // NOTE(review): `out` is allocated without an initial value and
    // laplacian() in this file only writes pixels it processes, so untouched
    // pixels presumably hold arbitrary data when minMaxLoc runs — confirm.
    cv::Mat out(ser.height(), ser.width(), CV_32S);
    cv::Mat imgf32;
    // Frame 0 is kept (converted to float) as the reference for phaseCorrelate.
    ser.getFrame(0, (char*)first.data);
    first.convertTo(first, CV_32F);
    for(uint32_t i = 0; i < ser.frameCount(); i++)
    {
        ser.getFrame(i, (char*)img.data);
        // Hand-written AVX2 Laplacian: response goes to `out`, variance is returned.
        double var = laplacian((uint16_t*)img.data, (int32_t*)out.data, img.cols, img.rows);
        double minval, maxval;
        cv::minMaxLoc(out, &minval, &maxval);
        // Rescale to [0, 1] for display: (v - min) / (max - min).
        // NOTE(review): divides by zero when maxval == minval; `out` is also
        // converted in place to CV_32F here, which matters if the loop is
        // ever allowed to run a second iteration — confirm.
        out.convertTo(out, CV_32F, 1.0 / (maxval - minval), -minval / (maxval - minval));
        qDebug() << "minmax" << minval << maxval;
        cv::imshow("lap", out);

        // Reference result from OpenCV for comparison.
        img.convertTo(imgf32, CV_32F);
        cv::Laplacian(imgf32, lap, CV_32F, 1);
        cv::minMaxLoc(lap, &minval, &maxval);
        qDebug() << "minmax" << minval << maxval;
        cv::Mat stddev;
        cv::Mat mean;
        cv::meanStdDev(lap, mean, stddev);
        // Same [0, 1] rescale as above, applied after the statistics were taken.
        lap -= minval;
        lap /= (maxval - minval);
        cv::imshow("lapcv", lap);
        cv::waitKey();
        // Prints: own variance, its square root, and OpenCV's standard deviation.
        qDebug() << var << std::sqrt(var) << stddev.at<double>(0);
        //continue;
        // Exits after the first frame; the rest of this function is unreachable.
        return 0;

        // --- unreachable: keep the frame with the largest Laplacian stddev ---
        img.convertTo(imgf32, CV_32F);
        cv::Laplacian(imgf32, lap, CV_32F, 1);
        // NOTE(review): `off` is computed but not used below.
        cv::Point2d off = cv::phaseCorrelate(first(rect), imgf32(rect));
        if(maxQ < stddev.at<double>(0))
        {
            maxQ = stddev.at<double>(0);
            img.copyTo(best);
            //qDebug() << "new best" << i;
        }
    }
    cv::imshow("lap", best);
    cv::waitKeyEx();

    return 0;


    // --- unreachable: align the G channel image to R and write diff/average ---
    cv::Mat img1= cv::imread("/home/nou/Obrázky/astro/moon_2025-10-03/R.tif", cv::IMREAD_GRAYSCALE);
    cv::Mat img2= cv::imread("/home/nou/Obrázky/astro/moon_2025-10-03/G.tif", cv::IMREAD_GRAYSCALE);

    img1.convertTo(img1, CV_32F);
    img2.convertTo(img2, CV_32F);

    // Translation between the two images as estimated by phase correlation.
    cv::Point2d point = cv::phaseCorrelate(img1, img2);


    cv::Mat img2dst;
    // 2x3 affine matrix intended as a pure translation by (-point.x, -point.y).
    // NOTE(review): only four of the six elements are assigned; (0,1) and
    // (1,0) are never set here and are presumably uninitialized — confirm.
    cv::Mat t(2, 3, CV_32F);
    t.at<float>(0, 0) = 1.0;
    t.at<float>(1, 1) = 1.0;
    t.at<float>(0, 2) = -point.x;
    t.at<float>(1, 2) = -point.y;
    cv::warpAffine(img2, img2dst, t, img1.size());

    //cv::imshow("img1", img1 / 64.0);
    auto diff = img1 - img2dst;
    double min, max;
    cv::minMaxLoc(diff, &min, &max);
    qDebug()  << min << max;
    cv::imshow("img2", (diff - min) / (32));
    // Difference image stretched to 0..255, and the plain average of both.
    cv::imwrite("diff.png", (diff - min) / (max - min) * 255);
    cv::imwrite("avg.png", (img1 + img2dst) * 0.5);

    cv::minMaxLoc(img1, &min, &max);
    qDebug()  << min << max;
    cv::waitKey();


    return 0;
    /*QApplication a(argc, argv);
    MainWindow w;
    w.show();
    return a.exec();*/
}