// File-viewer listing header: 411 lines, 14 KiB, C++
#include "mainwindow.h"
#include "serfile.h"

#include <immintrin.h>

#include <cassert>
#include <cmath>
#include <complex>
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <utility>
#include <vector>

#include <QApplication>
#include <opencv2/opencv.hpp>

// One complex FFT twiddle factor, kept as separate single-precision real and
// imaginary parts. Plain aggregate: no invariants, no constructors.
struct Twiddle {
    float re;  // real part
    float im;  // imaginary part
};

// Multiply (a + i b) * (c + i d) = (a*c - b*d) + i(a*d + b*c)
|
||
inline __m256 cmul_avx2(__m256 are, __m256 aim, __m256 bre, __m256 bim) {
|
||
// (are + i aim) * (bre + i bim)
|
||
__m256 ac = _mm256_mul_ps(are, bre);
|
||
__m256 bd = _mm256_mul_ps(aim, bim);
|
||
__m256 ad = _mm256_mul_ps(are, bim);
|
||
__m256 bc = _mm256_mul_ps(aim, bre);
|
||
// real = ac - bd
|
||
__m256 real = _mm256_sub_ps(ac, bd);
|
||
// imag = ad + bc
|
||
__m256 imag = _mm256_add_ps(ad, bc);
|
||
|
||
// We pack real, imag as [r0, i0, r1, i1, ..., r3, i3]
|
||
// But because we process 4 complex numbers in the vector, we need shuffling
|
||
// to interleave real and imag properly. But for simplicity, assume the calling
|
||
// code expects separate real & imag vectors, or we’ll implement interleaving.
|
||
// Here, return real in lower 128 bits & imag in upper, or some scheme.
|
||
// For clarity: return real in output lane 0‑127, imag in 128‑255.
|
||
// But better is to use separate vectors for real & imag, or AoS with shuffles.
|
||
|
||
// Pack: in low half real, in high half imag
|
||
return _mm256_blend_ps(real, imag, 0xF0);
|
||
// 0xF0 = upper 4 lanes from imag
|
||
}
|
||
|
||
// Precompute twiddles
|
||
static std::vector<Twiddle> make_twiddles(int N) {
|
||
std::vector<Twiddle> W(N/2);
|
||
const float PI = std::acos(-1.0f);
|
||
for(int k = 0; k < N/2; ++k) {
|
||
float angle = -2.0f * PI * k / N;
|
||
W[k].re = std::cos(angle);
|
||
W[k].im = std::sin(angle);
|
||
}
|
||
return W;
|
||
}
|
||
|
||
// Stockham FFT with AVX2 for complex<float> (AoS: interleaved real, imag)
|
||
void stockham_fft_avx2(std::complex<float>* data, std::complex<float>* temp,
|
||
int N, bool inverse = false)
|
||
{
|
||
assert((N & (N - 1)) == 0); // power of two
|
||
auto W = make_twiddles(N);
|
||
const float inv_sign = inverse ? +1.0f : -1.0f;
|
||
|
||
std::complex<float>* in = data;
|
||
std::complex<float>* out = temp;
|
||
|
||
int logN = 0;
|
||
while ((1 << logN) < N) ++logN;
|
||
|
||
for(int stage = 0; stage < logN; ++stage) {
|
||
int m = 1 << (stage + 1);
|
||
int half_m = m >> 1;
|
||
// stride between groups
|
||
int group_stride = N / m;
|
||
|
||
for(int k = 0; k < N; k += m) {
|
||
for(int j = 0; j < half_m; ++j) {
|
||
// twiddle W_index:
|
||
int w_index = j * group_stride;
|
||
float w_re = W[w_index].re;
|
||
float w_im = inv_sign * W[w_index].im; // invert sign for inverse
|
||
|
||
// Load w_re, w_im (we can broadcast them)
|
||
__m256 w_re_b = _mm256_set1_ps(w_re);
|
||
__m256 w_im_b = _mm256_set1_ps(w_im);
|
||
|
||
// Process 4 complex numbers at once in the j position strides
|
||
// The 4 complex numbers are from positions:
|
||
// in[k + j + 0*half_m], in[k + j + 1*half_m], in[k + j + 2*half_m], in[k + j + 3*half_m]
|
||
// But that depends on how many half_m, whether half_m >=4 etc.
|
||
// For simplicity, require half_m >=4 in vectorized branch.
|
||
|
||
if (half_m >= 4 && (j + 3*half_m) < N) {
|
||
// Load real parts
|
||
float *ptr_u = reinterpret_cast<float*>(&in[k + j]);
|
||
float *ptr_t0 = reinterpret_cast<float*>(&in[k + j + half_m]);
|
||
// Assuming interleaved: data layout: [Re0, Im0, Re1, Im1, ...]
|
||
// We need gather 4 complex u's and t's with step half_m.
|
||
|
||
// Load u (4 complex): u0, u1, u2, u3
|
||
__m256 u0 = _mm256_loadu_ps(reinterpret_cast<float*>(&in[k + j]));
|
||
__m256 t0 = _mm256_loadu_ps(reinterpret_cast<float*>(&in[k + j + half_m]));
|
||
|
||
// Complex multiply t0 * w
|
||
// Split t0 into re, im
|
||
__m256 t0_re = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(2, 0, 2, 0)); // pick re lanes
|
||
__m256 t0_im = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(3, 1, 3, 1)); // pick im
|
||
|
||
__m256 mul = cmul_avx2(t0_re, t0_im, w_re_b, w_im_b);
|
||
|
||
// Now compute:
|
||
// out[k/2 + j + 0] = u + t * w
|
||
// out[k/2 + j + N/2 + j] = u - t * w
|
||
// But with vector, we do elementwise addition/subtraction
|
||
|
||
__m256 sum = _mm256_add_ps(u0, mul);
|
||
__m256 diff = _mm256_sub_ps(u0, mul);
|
||
|
||
// Store sum and diff to their respective locations in out
|
||
// Need to compute positions:
|
||
|
||
// Position for “sum”:
|
||
std::complex<float>* out_sum = &out[k/2 + j];
|
||
std::complex<float>* out_diff = &out[k/2 + j + N/2];
|
||
|
||
// Store
|
||
_mm256_storeu_ps(reinterpret_cast<float*>(out_sum), sum);
|
||
_mm256_storeu_ps(reinterpret_cast<float*>(out_diff), diff);
|
||
|
||
} else {
|
||
// Fallback scalar for j's not fitting vectorization
|
||
auto u = in[k + j];
|
||
auto t = in[k + j + half_m] * std::complex<float>(w_re, w_im);
|
||
|
||
out[k/2 + j] = u + t;
|
||
out[k/2 + j + N/2] = u - t;
|
||
}
|
||
}
|
||
}
|
||
|
||
// Swap in/out buffers
|
||
std::swap(in, out);
|
||
}
|
||
|
||
// If number of stages is odd, data is currently in temp
|
||
if (logN & 1) {
|
||
for(int i = 0; i < N; ++i)
|
||
data[i] = in[i];
|
||
}
|
||
|
||
// Normalize for inverse
|
||
if (inverse) {
|
||
float invN = 1.0f / N;
|
||
for(int i = 0; i < N; ++i) {
|
||
data[i] *= invN;
|
||
}
|
||
}
|
||
}
|
||
|
||
|
||
// In-place iterative radix-2 Cooley-Tukey FFT.
//
//   x   : input samples on entry, spectrum on return. The size must be a
//         power of two; sizes 0 and 1 are returned unchanged.
//   inv : false -> forward transform, kernel exp(-2*pi*i*k*n/N)
//         true  -> inverse transform, kernel exp(+2*pi*i*k*n/N), scaled by 1/N
//
// Throws std::invalid_argument when the size is not a power of two (the
// butterfly indexing would otherwise run past the end of the vector).
void fft(std::vector<std::complex<float>> &x, bool inv = false)
{
    const size_t N = x.size();
    if (N <= 1) return;
    if ((N & (N - 1)) != 0) {
        throw std::invalid_argument("fft: input length must be a power of two");
    }

    // Bit-reversed addressing permutation. `rev` is the bit-reversed value of
    // `i`, maintained incrementally by adding one from the top bit downwards.
    for (size_t i = 1, rev = 0; i < N; ++i) {
        size_t bit = N >> 1;
        for (; rev & bit; bit >>= 1) {
            rev ^= bit;
        }
        rev ^= bit;

        if (i < rev) {
            std::swap(x[i], x[rev]);
        }
    }

    // Iterative butterflies, doubling the sub-transform length each pass.
    const double two_pi = 2.0 * std::acos(-1.0);
    for (size_t len = 2; len <= N; len <<= 1) {
        const size_t half = len / 2;
        const double angle = (inv ? two_pi : -two_pi) / static_cast<double>(len);
        const std::complex<double> wlen(std::cos(angle), std::sin(angle));

        for (size_t i = 0; i < N; i += len) {
            // The running twiddle is kept in double: it is the product of up
            // to len/2 factors, and accumulating that in float loses accuracy
            // on long transforms.
            std::complex<double> w(1.0, 0.0);
            for (size_t k = 0; k < half; ++k) {
                const std::complex<float> wf(static_cast<float>(w.real()),
                                             static_cast<float>(w.imag()));
                const std::complex<float> u = x[i + k];
                const std::complex<float> v = x[i + k + half] * wf;
                x[i + k] = u + v;
                x[i + k + half] = u - v;
                w *= wlen;
            }
        }
    }

    if (inv) {
        // N is a power of two, so 1/N is exact and this equals dividing by N.
        const float scale = 1.0f / static_cast<float>(N);
        for (auto &c : x) {
            c *= scale;
        }
    }
}

double laplacian(const uint16_t *img, int32_t *out, uint32_t width, uint32_t height)
|
||
{
|
||
__m256 mean = _mm256_setzero_ps();
|
||
__m256 M2 = _mm256_setzero_ps();
|
||
uint32_t count = 0;
|
||
for(uint32_t y = 1; y < height - 1; y++)
|
||
{
|
||
uint32_t row = (y - 1) * width;
|
||
for(uint32_t x = 1; x < width - 17; x += 16)
|
||
{
|
||
__m256i p0 = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(img + row + x));
|
||
__m256i p1 = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(img + (row + width) + x - 1));
|
||
__m256i p2 = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(img + (row + width) + x));
|
||
__m256i p3 = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(img + (row + width) + x + 1));
|
||
__m256i p4 = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(img + (row + width * 2) + x));
|
||
|
||
__m256i sumA = _mm256_setzero_si256();
|
||
__m256i sumB = _mm256_setzero_si256();
|
||
|
||
__m256i a,b;
|
||
a = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(p0, 0));
|
||
b = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(p0, 1));
|
||
sumA = _mm256_add_epi32(sumA, a);
|
||
sumB = _mm256_add_epi32(sumB, b);
|
||
|
||
a = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(p1, 0));
|
||
b = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(p1, 1));
|
||
sumA = _mm256_add_epi32(sumA, a);
|
||
sumB = _mm256_add_epi32(sumB, b);
|
||
|
||
a = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(p2, 0));
|
||
b = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(p2, 1));
|
||
a = _mm256_sll_epi32(a, _mm_set1_epi64x(2));
|
||
b = _mm256_sll_epi32(b, _mm_set1_epi64x(2));
|
||
sumA = _mm256_sub_epi32(sumA, a);
|
||
sumB = _mm256_sub_epi32(sumB, b);
|
||
|
||
a = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(p3, 0));
|
||
b = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(p3, 1));
|
||
sumA = _mm256_add_epi32(sumA, a);
|
||
sumB = _mm256_add_epi32(sumB, b);
|
||
|
||
a = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(p4, 0));
|
||
b = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(p4, 1));
|
||
sumA = _mm256_add_epi32(sumA, a);
|
||
sumB = _mm256_add_epi32(sumB, b);
|
||
|
||
if(out)
|
||
{
|
||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(out + row + x), sumA);
|
||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(out + row + x + 8), sumB);
|
||
}
|
||
|
||
__m256 af = _mm256_cvtepi32_ps(sumA);
|
||
__m256 bf = _mm256_cvtepi32_ps(sumB);
|
||
|
||
count++;
|
||
__m256 delta = _mm256_sub_ps(af, mean);
|
||
mean = _mm256_add_ps(mean, _mm256_div_ps(delta, _mm256_set1_ps(static_cast<float>(count))));
|
||
__m256 delta2 = _mm256_sub_ps(af, mean);
|
||
M2 = _mm256_add_ps(M2, _mm256_mul_ps(delta, delta2));
|
||
|
||
count++;
|
||
delta = _mm256_sub_ps(bf, mean);
|
||
mean = _mm256_add_ps(mean, _mm256_div_ps(delta, _mm256_set1_ps(static_cast<float>(count))));
|
||
delta2 = _mm256_sub_ps(bf, mean);
|
||
M2 = _mm256_add_ps(M2, _mm256_mul_ps(delta, delta2));
|
||
|
||
//count += 1
|
||
//delta = new_value - mean
|
||
//mean += delta / count
|
||
//delta2 = new_value - mean
|
||
//M2 += delta * delta2
|
||
}
|
||
}
|
||
float mean_2[8];
|
||
float M2_2[8];
|
||
_mm256_storeu_ps(mean_2, mean);
|
||
_mm256_storeu_ps(M2_2, M2);
|
||
|
||
auto welford_merge = [](uint32_t n, float &mean_1, float mean_2, float &M2_1, float M2_2)
|
||
{
|
||
uint32_t count = 2 * n;
|
||
float delta = mean_2 - mean_1;
|
||
float mean = mean_1 + delta * ((float)n / count);
|
||
float M2 = M2_1 + M2_2 + delta * delta * n * n / count;
|
||
mean_1 = mean;
|
||
M2_1 = M2;
|
||
};
|
||
|
||
for(int i = 0; i < 8; i++)
|
||
qDebug() << M2_2[i] / count;
|
||
|
||
welford_merge(count, mean_2[0], mean_2[1], M2_2[0], M2_2[1]);
|
||
welford_merge(count, mean_2[2], mean_2[3], M2_2[2], M2_2[3]);
|
||
welford_merge(count, mean_2[4], mean_2[5], M2_2[4], M2_2[5]);
|
||
welford_merge(count, mean_2[6], mean_2[7], M2_2[6], M2_2[7]);
|
||
|
||
welford_merge(count * 2, mean_2[0], mean_2[2], M2_2[0], M2_2[2]);
|
||
welford_merge(count * 2, mean_2[4], mean_2[6], M2_2[4], M2_2[6]);
|
||
|
||
welford_merge(count * 4, mean_2[0], mean_2[4], M2_2[0], M2_2[4]);
|
||
|
||
return (double)M2_2[0] / (count * 8);
|
||
}
|
||
|
||
int main(int argc, char *argv[])
|
||
{
|
||
SERFileReader ser;
|
||
ser.open("/home/nou/.wine/drive_c/indi_2025-10-03/indi_record_2025-10-03@18-24-37.ser");
|
||
|
||
cv::Rect rect(1024, 1024, 128, 128);
|
||
double maxQ = 0;
|
||
cv::Mat best;
|
||
cv::Mat lap;
|
||
cv::Mat first(ser.height(), ser.width(), CV_16U);
|
||
cv::Mat img(ser.height(), ser.width(), CV_16U);
|
||
cv::Mat out(ser.height(), ser.width(), CV_32S);
|
||
cv::Mat imgf32;
|
||
ser.getFrame(0, (char*)first.data);
|
||
first.convertTo(first, CV_32F);
|
||
for(uint32_t i = 0; i < ser.frameCount(); i++)
|
||
{
|
||
ser.getFrame(i, (char*)img.data);
|
||
double var = laplacian((uint16_t*)img.data, (int32_t*)out.data, img.cols, img.rows);
|
||
double minval, maxval;
|
||
cv::minMaxLoc(out, &minval, &maxval);
|
||
out.convertTo(out, CV_32F, 1.0 / (maxval - minval), -minval / (maxval - minval));
|
||
qDebug() << "minmax" << minval << maxval;
|
||
cv::imshow("lap", out);
|
||
|
||
img.convertTo(imgf32, CV_32F);
|
||
cv::Laplacian(imgf32, lap, CV_32F, 1);
|
||
cv::minMaxLoc(lap, &minval, &maxval);
|
||
qDebug() << "minmax" << minval << maxval;
|
||
cv::Mat stddev;
|
||
cv::Mat mean;
|
||
cv::meanStdDev(lap, mean, stddev);
|
||
lap -= minval;
|
||
lap /= (maxval - minval);
|
||
cv::imshow("lapcv", lap);
|
||
cv::waitKey();
|
||
qDebug() << var << std::sqrt(var) << stddev.at<double>(0);
|
||
//continue;
|
||
return 0;
|
||
|
||
img.convertTo(imgf32, CV_32F);
|
||
cv::Laplacian(imgf32, lap, CV_32F, 1);
|
||
cv::Point2d off = cv::phaseCorrelate(first(rect), imgf32(rect));
|
||
if(maxQ < stddev.at<double>(0))
|
||
{
|
||
maxQ = stddev.at<double>(0);
|
||
img.copyTo(best);
|
||
//qDebug() << "new best" << i;
|
||
}
|
||
}
|
||
cv::imshow("lap", best);
|
||
cv::waitKeyEx();
|
||
|
||
return 0;
|
||
|
||
|
||
cv::Mat img1= cv::imread("/home/nou/Obrázky/astro/moon_2025-10-03/R.tif", cv::IMREAD_GRAYSCALE);
|
||
cv::Mat img2= cv::imread("/home/nou/Obrázky/astro/moon_2025-10-03/G.tif", cv::IMREAD_GRAYSCALE);
|
||
|
||
img1.convertTo(img1, CV_32F);
|
||
img2.convertTo(img2, CV_32F);
|
||
|
||
cv::Point2d point = cv::phaseCorrelate(img1, img2);
|
||
|
||
|
||
cv::Mat img2dst;
|
||
cv::Mat t(2, 3, CV_32F);
|
||
t.at<float>(0, 0) = 1.0;
|
||
t.at<float>(1, 1) = 1.0;
|
||
t.at<float>(0, 2) = -point.x;
|
||
t.at<float>(1, 2) = -point.y;
|
||
cv::warpAffine(img2, img2dst, t, img1.size());
|
||
|
||
//cv::imshow("img1", img1 / 64.0);
|
||
auto diff = img1 - img2dst;
|
||
double min, max;
|
||
cv::minMaxLoc(diff, &min, &max);
|
||
qDebug() << min << max;
|
||
cv::imshow("img2", (diff - min) / (32));
|
||
cv::imwrite("diff.png", (diff - min) / (max - min) * 255);
|
||
cv::imwrite("avg.png", (img1 + img2dst) * 0.5);
|
||
|
||
cv::minMaxLoc(img1, &min, &max);
|
||
qDebug() << min << max;
|
||
cv::waitKey();
|
||
|
||
|
||
return 0;
|
||
/*QApplication a(argc, argv);
|
||
MainWindow w;
|
||
w.show();
|
||
return a.exec();*/
|
||
}
|