kouryu/main.cpp

#include "mainwindow.h"

#include "serfile.h"
#include <QApplication>
#include <QElapsedTimer>
#include <QThread>
#include <QMutex>
#include <QWaitCondition>
#include <QQueue>
#include <opencv2/opencv.hpp>
#include <complex>
#include <cmath>

#include <immintrin.h>
#include <vector>
#include <complex>
#include <cassert>

// Twiddle factor struct
struct Twiddle {
    float re;
    float im;
};

// Multiply (a + i b) * (c + i d) = (a*c - b*d) + i(a*d + b*c)
inline __m256 cmul_avx2(__m256 are, __m256 aim, __m256 bre, __m256 bim) {
    // (are + i aim) * (bre + i bim)
    __m256 ac = _mm256_mul_ps(are, bre);
    __m256 bd = _mm256_mul_ps(aim, bim);
    __m256 ad = _mm256_mul_ps(are, bim);
    __m256 bc = _mm256_mul_ps(aim, bre);
    // real = ac - bd
    __m256 real = _mm256_sub_ps(ac, bd);
    // imag = ad + bc
    __m256 imag = _mm256_add_ps(ad, bc);

    // We pack real, imag as [r0, i0, r1, i1, ..., r3, i3]
    // But because we process 4 complex numbers in the vector, we need shuffling
    // to interleave real and imag properly. But for simplicity, assume the calling
    // code expects separate real & imag vectors, or we’ll implement interleaving.
    // Here, return real in lower 128 bits & imag in upper, or some scheme.
    // For clarity: return real in output lane 0‑127, imag in 128‑255.
    // But better is to use separate vectors for real & imag, or AoS with shuffles.

    // Pack: in low half real, in high half imag
    return _mm256_blend_ps(real, imag, 0xF0);
    // 0xF0 = upper 4 lanes from imag
}

// Precompute twiddles
static std::vector<Twiddle> make_twiddles(int N) {
    std::vector<Twiddle> W(N/2);
    const float PI = std::acos(-1.0f);
    for(int k = 0; k < N/2; ++k) {
        float angle = -2.0f * PI * k / N;
        W[k].re = std::cos(angle);
        W[k].im = std::sin(angle);
    }
    return W;
}

// Stockham FFT with AVX2 for complex<float> (AoS: interleaved real, imag)
void stockham_fft_avx2(std::complex<float>* data, std::complex<float>* temp,
                       int N, bool inverse = false)
{
    assert((N & (N - 1)) == 0); // power of two
    auto W = make_twiddles(N);
    const float inv_sign = inverse ? +1.0f : -1.0f;

    std::complex<float>* in = data;
    std::complex<float>* out = temp;

    int logN = 0;
    while ((1 << logN) < N) ++logN;

    for(int stage = 0; stage < logN; ++stage) {
        int m = 1 << (stage + 1);
        int half_m = m >> 1;
        // stride between groups
        int group_stride = N / m;

        for(int k = 0; k < N; k += m) {
            for(int j = 0; j < half_m; ++j) {
                // twiddle W_index:
                int w_index = j * group_stride;
                float w_re = W[w_index].re;
                float w_im = inv_sign * W[w_index].im;  // invert sign for inverse

                // Load w_re, w_im (we can broadcast them)
                __m256 w_re_b = _mm256_set1_ps(w_re);
                __m256 w_im_b = _mm256_set1_ps(w_im);

                // Process 4 complex numbers at once in the j position strides
                // The 4 complex numbers are from positions:
                // in[k + j + 0*half_m], in[k + j + 1*half_m], in[k + j + 2*half_m], in[k + j + 3*half_m]
                // But that depends on how many half_m, whether half_m >=4 etc.
                // For simplicity, require half_m >=4 in vectorized branch.

                if (half_m >= 4 && (j + 3*half_m) < N) {
                    // Load real parts
                    float *ptr_u = reinterpret_cast<float*>(&in[k + j]);
                    float *ptr_t0 = reinterpret_cast<float*>(&in[k + j + half_m]);
                    // Assuming interleaved: data layout: [Re0, Im0, Re1, Im1, ...]
                    // We need gather 4 complex u's and t's with step half_m.

                    // Load u (4 complex): u0, u1, u2, u3
                    __m256 u0 = _mm256_loadu_ps(reinterpret_cast<float*>(&in[k + j]));
                    __m256 t0 = _mm256_loadu_ps(reinterpret_cast<float*>(&in[k + j + half_m]));

                    // Complex multiply t0 * w
                    // Split t0 into re, im
                    __m256 t0_re = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(2, 0, 2, 0)); // pick re lanes
                    __m256 t0_im = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(3, 1, 3, 1)); // pick im

                    __m256 mul = cmul_avx2(t0_re, t0_im, w_re_b, w_im_b);

                    // Now compute:
                    // out[k/2 + j + 0] = u + t * w
                    // out[k/2 + j + N/2 + j] = u - t * w
                    // But with vector, we do elementwise addition/subtraction

                    __m256 sum = _mm256_add_ps(u0, mul);
                    __m256 diff = _mm256_sub_ps(u0, mul);

                    // Store sum and diff to their respective locations in out
                    // Need to compute positions:

                    // Position for “sum”:
                    std::complex<float>* out_sum = &out[k/2 + j];
                    std::complex<float>* out_diff = &out[k/2 + j + N/2];

                    // Store
                    _mm256_storeu_ps(reinterpret_cast<float*>(out_sum), sum);
                    _mm256_storeu_ps(reinterpret_cast<float*>(out_diff), diff);

                } else {
                    // Fallback scalar for j's not fitting vectorization
                    auto u = in[k + j];
                    auto t = in[k + j + half_m] * std::complex<float>(w_re, w_im);

                    out[k/2 + j] = u + t;
                    out[k/2 + j + N/2] = u - t;
                }
            }
        }

        // Swap in/out buffers
        std::swap(in, out);
    }

    // If number of stages is odd, data is currently in temp
    if (logN & 1) {
        for(int i = 0; i < N; ++i)
            data[i] = in[i];
    }

    // Normalize for inverse
    if (inverse) {
        float invN = 1.0f / N;
        for(int i = 0; i < N; ++i) {
            data[i] *= invN;
        }
    }
}


void fft(std::vector<std::complex<float>> &x, bool inv = false)
{
    const size_t N = x.size();
    if (N <= 1) return;

    // Bit-reversed addressing permutation
    size_t j = 0;
    for(size_t i = 1; i < N; ++i)
    {
        size_t bit = N >> 1;
        while(j & bit)
        {
            j ^= bit;
            bit >>= 1;
        }
        j ^= bit;

        if (i < j)
        {
            std::swap(x[i], x[j]);
        }
    }

    // Iterative FFT
    for(size_t len = 2; len <= N; len <<= 1)
    {
        double angle = inv ? (2 * M_PI / len) : (-2 * M_PI / len);
        std::complex<float> wlen(std::cos(angle), std::sin(angle));
        for(size_t i = 0; i < N; i += len)
        {
            std::complex<float> w(1);
            for(size_t j = 0; j < len / 2; ++j)
            {
                std::complex<float> u = x[i + j];
                std::complex<float> v = x[i + j + len / 2] * w;
                x[i + j] = u + v;
                x[i + j + len / 2] = u - v;
                w *= wlen;
            }
        }
    }

    if(inv)
    {
        for(size_t i = 0; i < N; i++)
            x[i] /= N;
    }
}


//光流
void opticalflow()
{
    cv::Mat frame1, prvs;
    SERFileReader ser;
    ser.open("/media/data/indi_2025-09-29/indi_record_2025-09-29@17-29-08.ser");
    frame1 = cv::Mat(ser.height(), ser.width(), CV_16U);
    ser.getFrame(0, (char*)frame1.data);

    cv::Mat frame2(ser.height(), ser.width(), CV_16U);
    for(int i=0; i<ser.frameCount(); i++)
    {
        ser.getFrame(i, (char*)frame2.data);
        if (frame2.empty())
            break;

        cv::Mat flow(prvs.size(), CV_32FC2);
        cv::calcOpticalFlowFarneback(frame1, frame2, flow, 0.5, 3, 40, 3, 5, 1.2, 0);


        std::vector<cv::Mat> flow_xy(2);
        cv::split(flow, flow_xy);
        cv::Mat flow_x = flow_xy[0];
        cv::Mat flow_y = flow_xy[1];

        // --- Build map_x and map_y for remapping
        cv::Mat map_x(frame1.size(), CV_32FC1);
        cv::Mat map_y(frame1.size(), CV_32FC1);

        for (int y = 0; y < frame1.rows; y++) {
            for (int x = 0; x < frame1.cols; x++) {
                map_x.at<float>(y, x) = x + flow_x.at<float>(y, x);
                map_y.at<float>(y, x) = y + flow_y.at<float>(y, x);
            }
        }

        // --- Warp img1 to align it with img2 using the optical flow
        cv::Mat warped;
        cv::remap(frame2, warped, map_x, map_y, cv::INTER_LANCZOS4);
        cv::imshow("orig", frame2);
        cv::imshow("warp", warped);
        /*int key = cv::waitKey(3);
        if (key == 'q' || key == 27)
            break;
        continue;*/

        // visualization
        cv::Mat flow_parts[2];
        split(flow, flow_parts);
        cv::Mat magnitude, angle, magn_norm;
        cv::cartToPolar(flow_parts[0], flow_parts[1], magnitude, angle, true);
        cv::normalize(magnitude, magn_norm, 0.0f, 1.0f, cv::NORM_MINMAX);
        cv::imshow("mag", magn_norm);
        /*angle *= ((1.f / 360.f) * (180.f / 255.f));
        //build hsv image
        cv::Mat _hsv[3], hsv, hsv8, bgr;
        _hsv[0] = angle;
        _hsv[1] = cv::Mat::ones(angle.size(), CV_32F);
        _hsv[2] = magn_norm;
        merge(_hsv, 3, hsv);
        hsv.convertTo(hsv8, CV_8U, 255.0);
        cvtColor(hsv8, bgr, cv::COLOR_HSV2BGR);
        imshow("frame2", bgr);*/
        int keyboard = cv::waitKey(30);
        if (keyboard == 'q' || keyboard == 27)
            break;
    }
}

int main(int argc, char *argv[])
{
    QApplication a(argc, argv);
    MainWindow w;
    w.show();
    return a.exec();
}