Files
kouryu/main.cpp
T
2025-10-14 18:14:01 +02:00

294 lines
9.3 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#include "mainwindow.h"
#include "serfile.h"
#include <QApplication>
#include <QElapsedTimer>
#include <QThread>
#include <QMutex>
#include <QWaitCondition>
#include <QQueue>
#include <opencv2/opencv.hpp>
#include <complex>
#include <cmath>
#include <immintrin.h>
#include <vector>
#include <complex>
#include <cassert>
// Twiddle factor struct
struct Twiddle {
float re;
float im;
};
// Multiply (a + i b) * (c + i d) = (a*c - b*d) + i(a*d + b*c)
inline __m256 cmul_avx2(__m256 are, __m256 aim, __m256 bre, __m256 bim) {
// (are + i aim) * (bre + i bim)
__m256 ac = _mm256_mul_ps(are, bre);
__m256 bd = _mm256_mul_ps(aim, bim);
__m256 ad = _mm256_mul_ps(are, bim);
__m256 bc = _mm256_mul_ps(aim, bre);
// real = ac - bd
__m256 real = _mm256_sub_ps(ac, bd);
// imag = ad + bc
__m256 imag = _mm256_add_ps(ad, bc);
// We pack real, imag as [r0, i0, r1, i1, ..., r3, i3]
// But because we process 4 complex numbers in the vector, we need shuffling
// to interleave real and imag properly. But for simplicity, assume the calling
// code expects separate real & imag vectors, or well implement interleaving.
// Here, return real in lower 128 bits & imag in upper, or some scheme.
// For clarity: return real in output lane 0127, imag in 128255.
// But better is to use separate vectors for real & imag, or AoS with shuffles.
// Pack: in low half real, in high half imag
return _mm256_blend_ps(real, imag, 0xF0);
// 0xF0 = upper 4 lanes from imag
}
// Precompute twiddles
static std::vector<Twiddle> make_twiddles(int N) {
std::vector<Twiddle> W(N/2);
const float PI = std::acos(-1.0f);
for(int k = 0; k < N/2; ++k) {
float angle = -2.0f * PI * k / N;
W[k].re = std::cos(angle);
W[k].im = std::sin(angle);
}
return W;
}
// Stockham FFT with AVX2 for complex<float> (AoS: interleaved real, imag)
void stockham_fft_avx2(std::complex<float>* data, std::complex<float>* temp,
int N, bool inverse = false)
{
assert((N & (N - 1)) == 0); // power of two
auto W = make_twiddles(N);
const float inv_sign = inverse ? +1.0f : -1.0f;
std::complex<float>* in = data;
std::complex<float>* out = temp;
int logN = 0;
while ((1 << logN) < N) ++logN;
for(int stage = 0; stage < logN; ++stage) {
int m = 1 << (stage + 1);
int half_m = m >> 1;
// stride between groups
int group_stride = N / m;
for(int k = 0; k < N; k += m) {
for(int j = 0; j < half_m; ++j) {
// twiddle W_index:
int w_index = j * group_stride;
float w_re = W[w_index].re;
float w_im = inv_sign * W[w_index].im; // invert sign for inverse
// Load w_re, w_im (we can broadcast them)
__m256 w_re_b = _mm256_set1_ps(w_re);
__m256 w_im_b = _mm256_set1_ps(w_im);
// Process 4 complex numbers at once in the j position strides
// The 4 complex numbers are from positions:
// in[k + j + 0*half_m], in[k + j + 1*half_m], in[k + j + 2*half_m], in[k + j + 3*half_m]
// But that depends on how many half_m, whether half_m >=4 etc.
// For simplicity, require half_m >=4 in vectorized branch.
if (half_m >= 4 && (j + 3*half_m) < N) {
// Load real parts
float *ptr_u = reinterpret_cast<float*>(&in[k + j]);
float *ptr_t0 = reinterpret_cast<float*>(&in[k + j + half_m]);
// Assuming interleaved: data layout: [Re0, Im0, Re1, Im1, ...]
// We need gather 4 complex u's and t's with step half_m.
// Load u (4 complex): u0, u1, u2, u3
__m256 u0 = _mm256_loadu_ps(reinterpret_cast<float*>(&in[k + j]));
__m256 t0 = _mm256_loadu_ps(reinterpret_cast<float*>(&in[k + j + half_m]));
// Complex multiply t0 * w
// Split t0 into re, im
__m256 t0_re = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(2, 0, 2, 0)); // pick re lanes
__m256 t0_im = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(3, 1, 3, 1)); // pick im
__m256 mul = cmul_avx2(t0_re, t0_im, w_re_b, w_im_b);
// Now compute:
// out[k/2 + j + 0] = u + t * w
// out[k/2 + j + N/2 + j] = u - t * w
// But with vector, we do elementwise addition/subtraction
__m256 sum = _mm256_add_ps(u0, mul);
__m256 diff = _mm256_sub_ps(u0, mul);
// Store sum and diff to their respective locations in out
// Need to compute positions:
// Position for “sum”:
std::complex<float>* out_sum = &out[k/2 + j];
std::complex<float>* out_diff = &out[k/2 + j + N/2];
// Store
_mm256_storeu_ps(reinterpret_cast<float*>(out_sum), sum);
_mm256_storeu_ps(reinterpret_cast<float*>(out_diff), diff);
} else {
// Fallback scalar for j's not fitting vectorization
auto u = in[k + j];
auto t = in[k + j + half_m] * std::complex<float>(w_re, w_im);
out[k/2 + j] = u + t;
out[k/2 + j + N/2] = u - t;
}
}
}
// Swap in/out buffers
std::swap(in, out);
}
// If number of stages is odd, data is currently in temp
if (logN & 1) {
for(int i = 0; i < N; ++i)
data[i] = in[i];
}
// Normalize for inverse
if (inverse) {
float invN = 1.0f / N;
for(int i = 0; i < N; ++i) {
data[i] *= invN;
}
}
}
void fft(std::vector<std::complex<float>> &x, bool inv = false)
{
const size_t N = x.size();
if (N <= 1) return;
// Bit-reversed addressing permutation
size_t j = 0;
for(size_t i = 1; i < N; ++i)
{
size_t bit = N >> 1;
while(j & bit)
{
j ^= bit;
bit >>= 1;
}
j ^= bit;
if (i < j)
{
std::swap(x[i], x[j]);
}
}
// Iterative FFT
for(size_t len = 2; len <= N; len <<= 1)
{
double angle = inv ? (2 * M_PI / len) : (-2 * M_PI / len);
std::complex<float> wlen(std::cos(angle), std::sin(angle));
for(size_t i = 0; i < N; i += len)
{
std::complex<float> w(1);
for(size_t j = 0; j < len / 2; ++j)
{
std::complex<float> u = x[i + j];
std::complex<float> v = x[i + j + len / 2] * w;
x[i + j] = u + v;
x[i + j + len / 2] = u - v;
w *= wlen;
}
}
}
if(inv)
{
for(size_t i = 0; i < N; i++)
x[i] /= N;
}
}
//光流
void opticalflow()
{
cv::Mat frame1, prvs;
SERFileReader ser;
ser.open("/media/data/indi_2025-09-29/indi_record_2025-09-29@17-29-08.ser");
frame1 = cv::Mat(ser.height(), ser.width(), CV_16U);
ser.getFrame(0, (char*)frame1.data);
cv::Mat frame2(ser.height(), ser.width(), CV_16U);
for(int i=0; i<ser.frameCount(); i++)
{
ser.getFrame(i, (char*)frame2.data);
if (frame2.empty())
break;
cv::Mat flow(prvs.size(), CV_32FC2);
cv::calcOpticalFlowFarneback(frame1, frame2, flow, 0.5, 3, 40, 3, 5, 1.2, 0);
std::vector<cv::Mat> flow_xy(2);
cv::split(flow, flow_xy);
cv::Mat flow_x = flow_xy[0];
cv::Mat flow_y = flow_xy[1];
// --- Build map_x and map_y for remapping
cv::Mat map_x(frame1.size(), CV_32FC1);
cv::Mat map_y(frame1.size(), CV_32FC1);
for (int y = 0; y < frame1.rows; y++) {
for (int x = 0; x < frame1.cols; x++) {
map_x.at<float>(y, x) = x + flow_x.at<float>(y, x);
map_y.at<float>(y, x) = y + flow_y.at<float>(y, x);
}
}
// --- Warp img1 to align it with img2 using the optical flow
cv::Mat warped;
cv::remap(frame2, warped, map_x, map_y, cv::INTER_LANCZOS4);
cv::imshow("orig", frame2);
cv::imshow("warp", warped);
/*int key = cv::waitKey(3);
if (key == 'q' || key == 27)
break;
continue;*/
// visualization
cv::Mat flow_parts[2];
split(flow, flow_parts);
cv::Mat magnitude, angle, magn_norm;
cv::cartToPolar(flow_parts[0], flow_parts[1], magnitude, angle, true);
cv::normalize(magnitude, magn_norm, 0.0f, 1.0f, cv::NORM_MINMAX);
cv::imshow("mag", magn_norm);
/*angle *= ((1.f / 360.f) * (180.f / 255.f));
//build hsv image
cv::Mat _hsv[3], hsv, hsv8, bgr;
_hsv[0] = angle;
_hsv[1] = cv::Mat::ones(angle.size(), CV_32F);
_hsv[2] = magn_norm;
merge(_hsv, 3, hsv);
hsv.convertTo(hsv8, CV_8U, 255.0);
cvtColor(hsv8, bgr, cv::COLOR_HSV2BGR);
imshow("frame2", bgr);*/
int keyboard = cv::waitKey(30);
if (keyboard == 'q' || keyboard == 27)
break;
}
}
int main(int argc, char *argv[])
{
QApplication a(argc, argv);
MainWindow w;
w.show();
return a.exec();
}