#include "mainwindow.h" #include "serfile.h" #include #include #include #include #include #include #include #include #include // Twiddle factor struct struct Twiddle { float re; float im; }; // Multiply (a + i b) * (c + i d) = (a*c - b*d) + i(a*d + b*c) inline __m256 cmul_avx2(__m256 are, __m256 aim, __m256 bre, __m256 bim) { // (are + i aim) * (bre + i bim) __m256 ac = _mm256_mul_ps(are, bre); __m256 bd = _mm256_mul_ps(aim, bim); __m256 ad = _mm256_mul_ps(are, bim); __m256 bc = _mm256_mul_ps(aim, bre); // real = ac - bd __m256 real = _mm256_sub_ps(ac, bd); // imag = ad + bc __m256 imag = _mm256_add_ps(ad, bc); // We pack real, imag as [r0, i0, r1, i1, ..., r3, i3] // But because we process 4 complex numbers in the vector, we need shuffling // to interleave real and imag properly. But for simplicity, assume the calling // code expects separate real & imag vectors, or we’ll implement interleaving. // Here, return real in lower 128 bits & imag in upper, or some scheme. // For clarity: return real in output lane 0‑127, imag in 128‑255. // But better is to use separate vectors for real & imag, or AoS with shuffles. // Pack: in low half real, in high half imag return _mm256_blend_ps(real, imag, 0xF0); // 0xF0 = upper 4 lanes from imag } // Precompute twiddles static std::vector make_twiddles(int N) { std::vector W(N/2); const float PI = std::acos(-1.0f); for(int k = 0; k < N/2; ++k) { float angle = -2.0f * PI * k / N; W[k].re = std::cos(angle); W[k].im = std::sin(angle); } return W; } // Stockham FFT with AVX2 for complex (AoS: interleaved real, imag) void stockham_fft_avx2(std::complex* data, std::complex* temp, int N, bool inverse = false) { assert((N & (N - 1)) == 0); // power of two auto W = make_twiddles(N); const float inv_sign = inverse ? +1.0f : -1.0f; std::complex* in = data; std::complex* out = temp; int logN = 0; while ((1 << logN) < N) ++logN; for(int stage = 0; stage < logN; ++stage) { int m = 1 << (stage + 1); int half_m = m >> 1; // stride between groups int group_stride = N / m; for(int k = 0; k < N; k += m) { for(int j = 0; j < half_m; ++j) { // twiddle W_index: int w_index = j * group_stride; float w_re = W[w_index].re; float w_im = inv_sign * W[w_index].im; // invert sign for inverse // Load w_re, w_im (we can broadcast them) __m256 w_re_b = _mm256_set1_ps(w_re); __m256 w_im_b = _mm256_set1_ps(w_im); // Process 4 complex numbers at once in the j position strides // The 4 complex numbers are from positions: // in[k + j + 0*half_m], in[k + j + 1*half_m], in[k + j + 2*half_m], in[k + j + 3*half_m] // But that depends on how many half_m, whether half_m >=4 etc. // For simplicity, require half_m >=4 in vectorized branch. if (half_m >= 4 && (j + 3*half_m) < N) { // Load real parts float *ptr_u = reinterpret_cast(&in[k + j]); float *ptr_t0 = reinterpret_cast(&in[k + j + half_m]); // Assuming interleaved: data layout: [Re0, Im0, Re1, Im1, ...] // We need gather 4 complex u's and t's with step half_m. // Load u (4 complex): u0, u1, u2, u3 __m256 u0 = _mm256_loadu_ps(reinterpret_cast(&in[k + j])); __m256 t0 = _mm256_loadu_ps(reinterpret_cast(&in[k + j + half_m])); // Complex multiply t0 * w // Split t0 into re, im __m256 t0_re = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(2, 0, 2, 0)); // pick re lanes __m256 t0_im = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(3, 1, 3, 1)); // pick im __m256 mul = cmul_avx2(t0_re, t0_im, w_re_b, w_im_b); // Now compute: // out[k/2 + j + 0] = u + t * w // out[k/2 + j + N/2 + j] = u - t * w // But with vector, we do elementwise addition/subtraction __m256 sum = _mm256_add_ps(u0, mul); __m256 diff = _mm256_sub_ps(u0, mul); // Store sum and diff to their respective locations in out // Need to compute positions: // Position for “sum”: std::complex* out_sum = &out[k/2 + j]; std::complex* out_diff = &out[k/2 + j + N/2]; // Store _mm256_storeu_ps(reinterpret_cast(out_sum), sum); _mm256_storeu_ps(reinterpret_cast(out_diff), diff); } else { // Fallback scalar for j's not fitting vectorization auto u = in[k + j]; auto t = in[k + j + half_m] * std::complex(w_re, w_im); out[k/2 + j] = u + t; out[k/2 + j + N/2] = u - t; } } } // Swap in/out buffers std::swap(in, out); } // If number of stages is odd, data is currently in temp if (logN & 1) { for(int i = 0; i < N; ++i) data[i] = in[i]; } // Normalize for inverse if (inverse) { float invN = 1.0f / N; for(int i = 0; i < N; ++i) { data[i] *= invN; } } } void fft(std::vector> &x, bool inv = false) { const size_t N = x.size(); if (N <= 1) return; // Bit-reversed addressing permutation size_t j = 0; for(size_t i = 1; i < N; ++i) { size_t bit = N >> 1; while(j & bit) { j ^= bit; bit >>= 1; } j ^= bit; if (i < j) { std::swap(x[i], x[j]); } } // Iterative FFT for(size_t len = 2; len <= N; len <<= 1) { double angle = inv ? (2 * M_PI / len) : (-2 * M_PI / len); std::complex wlen(std::cos(angle), std::sin(angle)); for(size_t i = 0; i < N; i += len) { std::complex w(1); for(size_t j = 0; j < len / 2; ++j) { std::complex u = x[i + j]; std::complex v = x[i + j + len / 2] * w; x[i + j] = u + v; x[i + j + len / 2] = u - v; w *= wlen; } } } if(inv) { for(size_t i = 0; i < N; i++) x[i] /= N; } } double laplacian(const uint16_t *img, int32_t *out, uint32_t width, uint32_t height) { __m256 mean = _mm256_setzero_ps(); __m256 M2 = _mm256_setzero_ps(); uint32_t count = 0; for(uint32_t y = 1; y < height - 1; y++) { uint32_t row = (y - 1) * width; for(uint32_t x = 1; x < width - 17; x += 16) { __m256i p0 = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(img + row + x)); __m256i p1 = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(img + (row + width) + x - 1)); __m256i p2 = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(img + (row + width) + x)); __m256i p3 = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(img + (row + width) + x + 1)); __m256i p4 = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(img + (row + width * 2) + x)); __m256i sumA = _mm256_setzero_si256(); __m256i sumB = _mm256_setzero_si256(); __m256i a,b; a = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(p0, 0)); b = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(p0, 1)); sumA = _mm256_add_epi32(sumA, a); sumB = _mm256_add_epi32(sumB, b); a = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(p1, 0)); b = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(p1, 1)); sumA = _mm256_add_epi32(sumA, a); sumB = _mm256_add_epi32(sumB, b); a = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(p2, 0)); b = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(p2, 1)); a = _mm256_sll_epi32(a, _mm_set1_epi64x(2)); b = _mm256_sll_epi32(b, _mm_set1_epi64x(2)); sumA = _mm256_sub_epi32(sumA, a); sumB = _mm256_sub_epi32(sumB, b); a = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(p3, 0)); b = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(p3, 1)); sumA = _mm256_add_epi32(sumA, a); sumB = _mm256_add_epi32(sumB, b); a = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(p4, 0)); b = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(p4, 1)); sumA = _mm256_add_epi32(sumA, a); sumB = _mm256_add_epi32(sumB, b); if(out) { _mm256_storeu_si256(reinterpret_cast<__m256i*>(out + row + x), sumA); _mm256_storeu_si256(reinterpret_cast<__m256i*>(out + row + x + 8), sumB); } __m256 af = _mm256_cvtepi32_ps(sumA); __m256 bf = _mm256_cvtepi32_ps(sumB); count++; __m256 delta = _mm256_sub_ps(af, mean); mean = _mm256_add_ps(mean, _mm256_div_ps(delta, _mm256_set1_ps(static_cast(count)))); __m256 delta2 = _mm256_sub_ps(af, mean); M2 = _mm256_add_ps(M2, _mm256_mul_ps(delta, delta2)); count++; delta = _mm256_sub_ps(bf, mean); mean = _mm256_add_ps(mean, _mm256_div_ps(delta, _mm256_set1_ps(static_cast(count)))); delta2 = _mm256_sub_ps(bf, mean); M2 = _mm256_add_ps(M2, _mm256_mul_ps(delta, delta2)); //count += 1 //delta = new_value - mean //mean += delta / count //delta2 = new_value - mean //M2 += delta * delta2 } } float mean_2[8]; float M2_2[8]; _mm256_storeu_ps(mean_2, mean); _mm256_storeu_ps(M2_2, M2); auto welford_merge = [](uint32_t n, float &mean_1, float mean_2, float &M2_1, float M2_2) { uint32_t count = 2 * n; float delta = mean_2 - mean_1; float mean = mean_1 + delta * ((float)n / count); float M2 = M2_1 + M2_2 + delta * delta * n * n / count; mean_1 = mean; M2_1 = M2; }; for(int i = 0; i < 8; i++) qDebug() << M2_2[i] / count; welford_merge(count, mean_2[0], mean_2[1], M2_2[0], M2_2[1]); welford_merge(count, mean_2[2], mean_2[3], M2_2[2], M2_2[3]); welford_merge(count, mean_2[4], mean_2[5], M2_2[4], M2_2[5]); welford_merge(count, mean_2[6], mean_2[7], M2_2[6], M2_2[7]); welford_merge(count * 2, mean_2[0], mean_2[2], M2_2[0], M2_2[2]); welford_merge(count * 2, mean_2[4], mean_2[6], M2_2[4], M2_2[6]); welford_merge(count * 4, mean_2[0], mean_2[4], M2_2[0], M2_2[4]); return (double)M2_2[0] / (count * 8); } int main(int argc, char *argv[]) { SERFileReader ser; ser.open("/home/nou/.wine/drive_c/indi_2025-10-03/indi_record_2025-10-03@18-24-37.ser"); cv::Rect rect(1024, 1024, 128, 128); double maxQ = 0; cv::Mat best; cv::Mat lap; cv::Mat first(ser.height(), ser.width(), CV_16U); cv::Mat img(ser.height(), ser.width(), CV_16U); cv::Mat out(ser.height(), ser.width(), CV_32S); cv::Mat imgf32; ser.getFrame(0, (char*)first.data); first.convertTo(first, CV_32F); for(uint32_t i = 0; i < ser.frameCount(); i++) { ser.getFrame(i, (char*)img.data); double var = laplacian((uint16_t*)img.data, (int32_t*)out.data, img.cols, img.rows); double minval, maxval; cv::minMaxLoc(out, &minval, &maxval); out.convertTo(out, CV_32F, 1.0 / (maxval - minval), -minval / (maxval - minval)); qDebug() << "minmax" << minval << maxval; cv::imshow("lap", out); img.convertTo(imgf32, CV_32F); cv::Laplacian(imgf32, lap, CV_32F, 1); cv::minMaxLoc(lap, &minval, &maxval); qDebug() << "minmax" << minval << maxval; cv::Mat stddev; cv::Mat mean; cv::meanStdDev(lap, mean, stddev); lap -= minval; lap /= (maxval - minval); cv::imshow("lapcv", lap); cv::waitKey(); qDebug() << var << std::sqrt(var) << stddev.at(0); //continue; return 0; img.convertTo(imgf32, CV_32F); cv::Laplacian(imgf32, lap, CV_32F, 1); cv::Point2d off = cv::phaseCorrelate(first(rect), imgf32(rect)); if(maxQ < stddev.at(0)) { maxQ = stddev.at(0); img.copyTo(best); //qDebug() << "new best" << i; } } cv::imshow("lap", best); cv::waitKeyEx(); return 0; cv::Mat img1= cv::imread("/home/nou/Obrázky/astro/moon_2025-10-03/R.tif", cv::IMREAD_GRAYSCALE); cv::Mat img2= cv::imread("/home/nou/Obrázky/astro/moon_2025-10-03/G.tif", cv::IMREAD_GRAYSCALE); img1.convertTo(img1, CV_32F); img2.convertTo(img2, CV_32F); cv::Point2d point = cv::phaseCorrelate(img1, img2); cv::Mat img2dst; cv::Mat t(2, 3, CV_32F); t.at(0, 0) = 1.0; t.at(1, 1) = 1.0; t.at(0, 2) = -point.x; t.at(1, 2) = -point.y; cv::warpAffine(img2, img2dst, t, img1.size()); //cv::imshow("img1", img1 / 64.0); auto diff = img1 - img2dst; double min, max; cv::minMaxLoc(diff, &min, &max); qDebug() << min << max; cv::imshow("img2", (diff - min) / (32)); cv::imwrite("diff.png", (diff - min) / (max - min) * 255); cv::imwrite("avg.png", (img1 + img2dst) * 0.5); cv::minMaxLoc(img1, &min, &max); qDebug() << min << max; cv::waitKey(); return 0; /*QApplication a(argc, argv); MainWindow w; w.show(); return a.exec();*/ }