116 lines
4.8 KiB
C++
116 lines
4.8 KiB
C++
#ifdef __SSE2__
|
|
#include <x86intrin.h>
|
|
#include <cstdint>
|
|
#include <limits>
|
|
|
|
namespace
{

// Unsigned integer type that is `width` bytes wide. The scalar tail loop uses
// it to copy samples of any supported element type as raw bits.
template<size_t width> struct PlanarWord;
template<> struct PlanarWord<1> { using type = uint8_t; };
template<> struct PlanarWord<2> { using type = uint16_t; };
template<> struct PlanarWord<4> { using type = uint32_t; };

// Interleaves the lanes of the low halves of x and y; lanes are `width` bytes wide.
template<size_t width>
inline __m128i planarUnpackLo(__m128i x, __m128i y)
{
    if constexpr(width == 1) return _mm_unpacklo_epi8(x, y);
    else if constexpr(width == 2) return _mm_unpacklo_epi16(x, y);
    else return _mm_unpacklo_epi32(x, y);
}

// Interleaves the lanes of the high halves of x and y; lanes are `width` bytes wide.
template<size_t width>
inline __m128i planarUnpackHi(__m128i x, __m128i y)
{
    if constexpr(width == 1) return _mm_unpackhi_epi8(x, y);
    else if constexpr(width == 2) return _mm_unpackhi_epi16(x, y);
    else return _mm_unpackhi_epi32(x, y);
}

} // namespace

/// Converts planar samples into interleaved 4-component pixels.
///
/// The input is read as `ch` consecutive planes of `count` elements each
/// (plane k starts at element k*count). The output receives 4 elements per
/// pixel: plane 0, plane 1, plane 2 and plane 3 in that order. When ch == 3
/// the fourth component is filled with a constant: 1.0f for 4-byte
/// non-integer T, otherwise all bits set.
///
/// @tparam T     element type; must be 1, 2 or 4 bytes wide
/// @tparam ch    number of input planes; must be 3 or 4
/// @param in     planar input holding ch*count elements of type T
/// @param out    interleaved output with room for 4*count elements of type T
/// @param count  number of pixels (elements per plane)
///
/// Unaligned loads and stores are used, so neither buffer needs 16-byte alignment.
template<typename T, int ch>
void fromPlanarSSE(const void *in, void *out, size_t count)
{
    // The SIMD loop always loads planes 0..2 and the tail only handles these
    // element widths, so any other instantiation would read out of bounds or
    // silently write nothing. Reject it at compile time instead.
    static_assert(ch == 3 || ch == 4, "fromPlanarSSE supports only 3 or 4 input planes");
    static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4,
                  "fromPlanarSSE supports only 1, 2 or 4 byte element types");

    using Word = typename PlanarWord<sizeof(T)>::type;
    constexpr size_t width = sizeof(T);
    constexpr size_t lanes = sizeof(__m128i) / width;   // elements per 128-bit vector
    constexpr bool floatAlpha = width == 4 && !std::numeric_limits<T>::is_integer;

    const T *base = static_cast<const T*>(in);
    const __m128i *plane[4] = {reinterpret_cast<const __m128i*>(base),
                               reinterpret_cast<const __m128i*>(base + count),
                               reinterpret_cast<const __m128i*>(base + count*2),
                               reinterpret_cast<const __m128i*>(base + count*3)};
    __m128i *dst = reinterpret_cast<__m128i*>(out);

    // Constant fourth component used when there is no fourth plane. All bits
    // set is the same vector regardless of the integer lane width.
    __m128i a = _mm_set1_epi32(-1);
    if constexpr(floatAlpha) a = _mm_castps_si128(_mm_set1_ps(1.0f));

    // Vector part: each iteration consumes one vector per plane and emits
    // four vectors of interleaved pixels.
    const size_t blocks = count / lanes;
    for(size_t i = 0; i < blocks; i++)
    {
        const __m128i r = _mm_loadu_si128(plane[0] + i);
        const __m128i g = _mm_loadu_si128(plane[1] + i);
        const __m128i b = _mm_loadu_si128(plane[2] + i);
        if constexpr(ch == 4) a = _mm_loadu_si128(plane[3] + i);

        // Pairing (r,b) and (g,a) first means a second interleave of the two
        // results yields r,g,b,a order.
        const __m128i rbLo = planarUnpackLo<width>(r, b);
        const __m128i gaLo = planarUnpackLo<width>(g, a);
        _mm_storeu_si128(dst + i*4, planarUnpackLo<width>(rbLo, gaLo));
        _mm_storeu_si128(dst + i*4 + 1, planarUnpackHi<width>(rbLo, gaLo));

        const __m128i rbHi = planarUnpackHi<width>(r, b);
        const __m128i gaHi = planarUnpackHi<width>(g, a);
        _mm_storeu_si128(dst + i*4 + 2, planarUnpackLo<width>(rbHi, gaHi));
        _mm_storeu_si128(dst + i*4 + 3, planarUnpackHi<width>(rbHi, gaHi));
    }

    // Scalar tail: the pixels that did not fill a whole vector.
    const Word *src = static_cast<const Word*>(in);
    Word *pixels = static_cast<Word*>(out);
    for(size_t i = blocks * lanes; i < count; i++)
    {
        for(int c = 0; c < ch; c++)
            pixels[i*4 + c] = src[i + static_cast<size_t>(c)*count];

        if constexpr(ch == 3)
        {
            if constexpr(floatAlpha) static_cast<float*>(out)[i*4 + 3] = 1.0f;
            else pixels[i*4 + 3] = std::numeric_limits<Word>::max();
        }
    }
}
|
|
|
|
// Explicit instantiations: every element type is emitted for both the
// 3-plane and the 4-plane variant.

// 8-bit unsigned
template void fromPlanarSSE<uint8_t, 3>(const void*, void*, size_t);
template void fromPlanarSSE<uint8_t, 4>(const void*, void*, size_t);

// 16-bit unsigned
template void fromPlanarSSE<uint16_t, 3>(const void*, void*, size_t);
template void fromPlanarSSE<uint16_t, 4>(const void*, void*, size_t);

// 32-bit unsigned
template void fromPlanarSSE<uint32_t, 3>(const void*, void*, size_t);
template void fromPlanarSSE<uint32_t, 4>(const void*, void*, size_t);

// float
template void fromPlanarSSE<float, 3>(const void*, void*, size_t);
template void fromPlanarSSE<float, 4>(const void*, void*, size_t);
|
|
|
|
#endif
|