tenmon/rawimage_sse.cpp
2025-02-16 15:19:20 +01:00

116 lines
4.8 KiB
C++

#ifdef __SSE2__
#include <x86intrin.h>
#include <cstdint>
#include <limits>
/**
 * Interleave planar channel data into 4-channel pixels using SSE2.
 *
 * The input holds `ch` planes stored back to back, each `count` elements of
 * type T long (plane c starts at element c*count). The output receives
 * `count` pixels of four T elements each, laid out as
 * plane0, plane1, plane2, plane3 (referred to as r, g, b, a below).
 *
 * When ch == 3 there is no fourth plane and the fourth element of every pixel
 * is filled with a constant: 1.0f for 4-byte non-integer T, all bits set
 * otherwise.
 *
 * Full 16-byte groups are processed with unaligned SSE2 loads/stores, so
 * neither buffer needs any particular alignment; the remaining pixels are
 * copied one by one.
 *
 * @tparam T   element type; must be 1, 2 or 4 bytes wide
 * @tparam ch  number of input planes; must be 3 or 4
 * @param in    planar input, ch*count elements of T
 * @param out   interleaved output, must have room for count*4 elements of T
 * @param count number of pixels (elements per plane)
 */
template<typename T, int ch>
void fromPlanarSSE(const void *in, void *out, size_t count)
{
    // Anything else would silently leave the output untouched or read the wrong planes.
    static_assert(ch == 3 || ch == 4, "fromPlanarSSE expects 3 or 4 input planes");
    static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4,
                  "fromPlanarSSE supports 1, 2 and 4 byte elements only");

    // Only 4-byte non-integer elements use 1.0 as the fill value; everything else uses all-ones.
    constexpr bool floatAlpha = sizeof(T) == 4 && !std::numeric_limits<T>::is_integer;
    // Number of T elements held by one 128-bit register.
    constexpr size_t lanes = sizeof(__m128i) / sizeof(T);

    const T *src = static_cast<const T*>(in);
    T *dst = static_cast<T*>(out);

    const __m128i *planeR = reinterpret_cast<const __m128i*>(src);
    const __m128i *planeG = reinterpret_cast<const __m128i*>(src + count);
    const __m128i *planeB = reinterpret_cast<const __m128i*>(src + count*2);
    __m128i *dstVec = reinterpret_cast<__m128i*>(out);

    // Element-width specific interleave of the low / high halves of two registers.
    const auto unpackLo = [](__m128i x, __m128i y) -> __m128i
    {
        if constexpr(sizeof(T) == 1)return _mm_unpacklo_epi8(x, y);
        else if constexpr(sizeof(T) == 2)return _mm_unpacklo_epi16(x, y);
        else return _mm_unpacklo_epi32(x, y);
    };
    const auto unpackHi = [](__m128i x, __m128i y) -> __m128i
    {
        if constexpr(sizeof(T) == 1)return _mm_unpackhi_epi8(x, y);
        else if constexpr(sizeof(T) == 2)return _mm_unpackhi_epi16(x, y);
        else return _mm_unpackhi_epi32(x, y);
    };

    // Constant fourth channel, used as-is when ch == 3 and overwritten per block when ch == 4.
    __m128i a = _mm_set1_epi32(-1);
    if constexpr(floatAlpha)a = _mm_castps_si128(_mm_set1_ps(1.0f));

    // Each iteration consumes one register per plane and emits 4 registers (64 bytes) of pixels.
    const size_t blocks = count / lanes;
    for(size_t i = 0; i < blocks; i++)
    {
        const __m128i r = _mm_loadu_si128(planeR + i);
        const __m128i g = _mm_loadu_si128(planeG + i);
        const __m128i b = _mm_loadu_si128(planeB + i);
        if constexpr(ch == 4)a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + count*3) + i);

        // First pass pairs r with b and g with a, so that the second pass yields r,g,b,a order.
        const __m128i rbLo = unpackLo(r, b);
        const __m128i gaLo = unpackLo(g, a);
        const __m128i rbHi = unpackHi(r, b);
        const __m128i gaHi = unpackHi(g, a);

        __m128i *pixels = dstVec + i*4;
        _mm_storeu_si128(pixels,     unpackLo(rbLo, gaLo));
        _mm_storeu_si128(pixels + 1, unpackHi(rbLo, gaLo));
        _mm_storeu_si128(pixels + 2, unpackLo(rbHi, gaHi));
        _mm_storeu_si128(pixels + 3, unpackHi(rbHi, gaHi));
    }

    // Scalar tail for the pixels that do not fill a whole register.
    // Elements are accessed as T so that no buffer is read or written through an unrelated type.
    for(size_t i = blocks * lanes; i < count; i++)
    {
        for(int c = 0; c < ch; c++)dst[i*4 + c] = src[i + c*count];
        if constexpr(ch == 3)
        {
            if constexpr(floatAlpha)dst[i*4 + 3] = static_cast<T>(1.0f);
            else dst[i*4 + 3] = static_cast<T>(-1);
        }
    }
}
// Explicit instantiations: every element type is emitted for both 3 and 4 input planes.

// 8-bit unsigned integer elements
template void fromPlanarSSE<uint8_t, 3>(const void*, void*, size_t);
template void fromPlanarSSE<uint8_t, 4>(const void*, void*, size_t);

// 16-bit unsigned integer elements
template void fromPlanarSSE<uint16_t, 3>(const void*, void*, size_t);
template void fromPlanarSSE<uint16_t, 4>(const void*, void*, size_t);

// 32-bit unsigned integer elements
template void fromPlanarSSE<uint32_t, 3>(const void*, void*, size_t);
template void fromPlanarSSE<uint32_t, 4>(const void*, void*, size_t);

// single-precision floating point elements
template void fromPlanarSSE<float, 3>(const void*, void*, size_t);
template void fromPlanarSSE<float, 4>(const void*, void*, size_t);
#endif