#ifndef GDALSSE_PRIV_H_INCLUDED
#define GDALSSE_PRIV_H_INCLUDED

/* We restrict to 64 bit processors because they are guaranteed to have SSE2;
   otherwise the software emulation further below is used. */
#if (defined(__x86_64) || defined(_M_X64)) && !defined(USE_SSE2_EMULATION)

/* Requires SSE2 */
#include <emmintrin.h>

#ifdef __SSE4_1__
#include <smmintrin.h>
#endif

#include "gdal_priv_templates.hpp"
static inline __m128i GDALCopyInt16ToXMM(const void* ptr)
{
#ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
    unsigned short s;
    memcpy(&s, ptr, 2);
    return _mm_cvtsi32_si128(s);
#else
    return _mm_cvtsi32_si128(*static_cast<const unsigned short*>(ptr));
#endif
}

static inline __m128i GDALCopyInt32ToXMM(const void* ptr)
{
#ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
    GInt32 i;
    memcpy(&i, ptr, 4);
    return _mm_cvtsi32_si128(i);
#else
    return _mm_cvtsi32_si128(*static_cast<const GInt32*>(ptr));
#endif
}

static inline __m128i GDALCopyInt64ToXMM(const void* ptr)
{
#ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
    GInt64 i;
    memcpy(&i, ptr, 8);
    return _mm_cvtsi64_si128(i);
#else
    return _mm_cvtsi64_si128(*static_cast<const GInt64*>(ptr));
#endif
}

static inline void GDALCopyXMMToInt16(const __m128i xmm, void* pDest)
{
#ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
    GInt16 i = static_cast<GInt16>(_mm_extract_epi16(xmm, 0));
    memcpy(pDest, &i, 2);
#else
    *static_cast<GInt16*>(pDest) = static_cast<GInt16>(_mm_extract_epi16(xmm, 0));
#endif
}
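
/* Illustrative sketch (not part of the original header): the helpers above
 * move small scalars into and out of the low lanes of an XMM register without
 * dereferencing a possibly misaligned pointer on strict-alignment CPUs. The
 * helper name below is hypothetical and only demonstrates a round-trip. */
static inline void GDALSSESketchRoundTripTwoBytes()
{
    unsigned char abyIn[2] = { 12, 250 };
    unsigned char abyOut[2] = { 0, 0 };
    __m128i v = GDALCopyInt16ToXMM(abyIn);  // 16 bits of data -> low lane of an XMM register
    GDALCopyXMMToInt16(v, abyOut);          // low 16 bits of the register -> memory
    // abyOut now holds { 12, 250 }
}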
class XMMReg2Double
{
  public:
    __m128d xmm;

#if defined(__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Weffc++"
#endif
    XMMReg2Double() = default;
#if defined(__GNUC__)
#pragma GCC diagnostic pop
#endif

    XMMReg2Double(double val) : xmm(_mm_load_sd(&val)) {}
    XMMReg2Double(const XMMReg2Double& other) : xmm(other.xmm) {}
    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg; reg.Zeroize(); return reg;
    }
    static inline XMMReg2Double Load1ValHighAndLow(const double* ptr)
    {
        XMMReg2Double reg; reg.nsLoad1ValHighAndLow(ptr); return reg;
    }
    static inline XMMReg2Double Load2Val(const double* ptr)
    {
        XMMReg2Double reg; reg.nsLoad2Val(ptr); return reg;
    }
    static inline XMMReg2Double Load2Val(const float* ptr)
    {
        XMMReg2Double reg; reg.nsLoad2Val(ptr); return reg;
    }
    static inline XMMReg2Double Load2ValAligned(const double* ptr)
    {
        XMMReg2Double reg; reg.nsLoad2ValAligned(ptr); return reg;
    }
    static inline XMMReg2Double Load2Val(const unsigned char* ptr)
    {
        XMMReg2Double reg; reg.nsLoad2Val(ptr); return reg;
    }
    static inline XMMReg2Double Load2Val(const short* ptr)
    {
        XMMReg2Double reg; reg.nsLoad2Val(ptr); return reg;
    }
    static inline XMMReg2Double Load2Val(const unsigned short* ptr)
    {
        XMMReg2Double reg; reg.nsLoad2Val(ptr); return reg;
    }

    static inline XMMReg2Double Equals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpeq_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double NotEquals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpneq_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double Greater(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpgt_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double And(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_and_pd(expr1.xmm, expr2.xmm);
        return reg;
    }
    static inline XMMReg2Double Ternary(const XMMReg2Double& cond,
                                        const XMMReg2Double& true_expr,
                                        const XMMReg2Double& false_expr)
    {
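        // cond is expected to come from a comparison such as Equals() or
        // Greater(), so each 64-bit lane is either all ones or all zeros:
        // (cond & true_expr) | (~cond & false_expr) selects true_expr where
        // the condition holds and false_expr elsewhere, without branching.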
        XMMReg2Double reg;
        reg.xmm = _mm_or_pd(_mm_and_pd(cond.xmm, true_expr.xmm),
                            _mm_andnot_pd(cond.xmm, false_expr.xmm));
        return reg;
    }
    static inline XMMReg2Double Min(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_min_pd(expr1.xmm, expr2.xmm);
        return reg;
    }
    inline void nsLoad1ValHighAndLow(const double* ptr)
    {
        xmm = _mm_load1_pd(ptr);
    }

    inline void nsLoad2Val(const double* ptr)
    {
        xmm = _mm_loadu_pd(ptr);
    }

    inline void nsLoad2ValAligned(const double* ptr)
    {
        xmm = _mm_load_pd(ptr);
    }

    inline void nsLoad2Val(const float* ptr)
    {
        xmm = _mm_cvtps_pd(_mm_castsi128_ps(GDALCopyInt64ToXMM(ptr)));
    }
    inline void nsLoad2Val(const unsigned char* ptr)
    {
        __m128i xmm_i = GDALCopyInt16ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const short* ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepi16_epi32(xmm_i);
#else
        /* Sign-extend the two int16 to int32: duplicate them into the high
           halves and arithmetic-shift back down. */
        xmm_i = _mm_unpacklo_epi16(xmm_i, xmm_i);
        xmm_i = _mm_srai_epi32(xmm_i, 16);
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const unsigned short* ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu16_epi32(xmm_i);
#else
        /* Zero-extend the two uint16 to int32 */
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    static inline void Load4Val(const unsigned char* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        low.xmm = _mm_cvtepi32_pd(xmm_i);
        high.xmm = _mm_cvtepi32_pd(_mm_shuffle_epi32(xmm_i, _MM_SHUFFLE(3, 2, 3, 2)));
    }
    static inline void Load4Val(const short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const unsigned short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const double* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const float* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        __m128 temp1 = _mm_loadu_ps(ptr);
        __m128 temp2 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3, 2, 3, 2));
        low.xmm = _mm_cvtps_pd(temp1);
        high.xmm = _mm_cvtps_pd(temp2);
    }
    inline void Zeroize()
    {
        xmm = _mm_setzero_pd();
    }

    inline XMMReg2Double& operator= (const XMMReg2Double& other)
    {
        xmm = other.xmm;
        return *this;
    }

    inline XMMReg2Double& operator+= (const XMMReg2Double& other)
    {
        xmm = _mm_add_pd(xmm, other.xmm);
        return *this;
    }

    inline XMMReg2Double& operator*= (const XMMReg2Double& other)
    {
        xmm = _mm_mul_pd(xmm, other.xmm);
        return *this;
    }
    inline XMMReg2Double operator+ (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_add_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator- (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_sub_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator* (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_mul_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator/ (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_div_pd(xmm, other.xmm);
        return ret;
    }

    inline double GetHorizSum() const
    {
        __m128d xmm2;
        xmm2 = _mm_shuffle_pd(xmm, xmm, _MM_SHUFFLE2(0, 1)); // swap the two doubles
        return _mm_cvtsd_f64(_mm_add_sd(xmm, xmm2));
    }
    inline void Store2Val(double* ptr) const
    {
        _mm_storeu_pd(ptr, xmm);
    }

    inline void Store2ValAligned(double* ptr) const
    {
        _mm_store_pd(ptr, xmm);
    }

    inline void Store2Val(float* ptr) const
    {
        __m128i xmm_i = _mm_castps_si128( _mm_cvtpd_ps(xmm) );
        GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64*>(ptr));
    }

    inline void Store2Val(unsigned char* ptr) const
    {
        __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(xmm, _mm_set1_pd(0.5))); // round the 2 doubles to 2 int32
        tmp = _mm_packs_epi32(tmp, tmp);
        tmp = _mm_packus_epi16(tmp, tmp);
        GDALCopyXMMToInt16(tmp, reinterpret_cast<GInt16*>(ptr));
    }
    inline void Store2Val(unsigned short* ptr) const
    {
        __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(xmm, _mm_set1_pd(0.5)));
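        // tmp holds the two rounded values as int32 in 32-bit lanes 0 and 1;
        // the shuffle below moves 16-bit words 0 and 2 (their low halves)
        // into the first two word positions so that a single 32-bit store
        // writes both uint16 values at once.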
        tmp = _mm_shufflelo_epi16(tmp, 0 | (2 << 2));
        GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32*>(ptr));
    }

    inline void StoreMask(unsigned char* ptr) const
    {
        _mm_storeu_si128( reinterpret_cast<__m128i*>(ptr), _mm_castpd_si128(xmm) );
    }
    inline operator double () const
    {
        return _mm_cvtsd_f64(xmm);
    }
};
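
/* Illustrative sketch (not part of the original header): typical use of
 * XMMReg2Double is to process two doubles per step. The helper name below is
 * hypothetical; it computes the dot product of two 2-element vectors. */
static inline double GDALSSESketchDotProduct2(const double* padfA, const double* padfB)
{
    XMMReg2Double a = XMMReg2Double::Load2Val(padfA);  // unaligned load of 2 doubles
    XMMReg2Double b = XMMReg2Double::Load2Val(padfB);
    a *= b;                                            // lane-wise multiply
    return a.GetHorizSum();                            // a[0]*b[0] + a[1]*b[1]
}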
#else  /* no SSE2: software emulation of the same interface */

#ifndef NO_WARN_USE_SSE2_EMULATION
#warning "Software emulation of SSE2 !"
#endif

class XMMReg2Double
{
  public:
    double low;
    double high;

    XMMReg2Double() = default;
    XMMReg2Double(double val) { low = val; high = 0.0; }
    XMMReg2Double(const XMMReg2Double& other) : low(other.low), high(other.high) {}

    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg; reg.Zeroize(); return reg;
    }

    static inline XMMReg2Double Load1ValHighAndLow(const double* ptr)
    {
        XMMReg2Double reg; reg.nsLoad1ValHighAndLow(ptr); return reg;
    }
    static inline XMMReg2Double Equals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;

        if (expr1.low == expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high == expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double NotEquals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;

        if (expr1.low != expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high != expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double Greater(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;

        if (expr1.low > expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high > expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }
    static inline XMMReg2Double And(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        int low1[2], high1[2];
        int low2[2], high2[2];
        memcpy(low1, &expr1.low, sizeof(double));
        memcpy(high1, &expr1.high, sizeof(double));
        memcpy(low2, &expr2.low, sizeof(double));
        memcpy(high2, &expr2.high, sizeof(double));
        low1[0] &= low2[0];
        low1[1] &= low2[1];
        high1[0] &= high2[0];
        high1[1] &= high2[1];
        memcpy(&reg.low, low1, sizeof(double));
        memcpy(&reg.high, high1, sizeof(double));
        return reg;
    }

    static inline XMMReg2Double Ternary(const XMMReg2Double& cond,
                                        const XMMReg2Double& true_expr,
                                        const XMMReg2Double& false_expr)
    {
        XMMReg2Double reg;
        if( cond.low != 0 )
            reg.low = true_expr.low;
        else
            reg.low = false_expr.low;
        if( cond.high != 0 )
            reg.high = true_expr.high;
        else
            reg.high = false_expr.high;
        return reg;
    }

    static inline XMMReg2Double Min(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.low = (expr1.low < expr2.low) ? expr1.low : expr2.low;
        reg.high = (expr1.high < expr2.high) ? expr1.high : expr2.high;
        return reg;
    }
    static inline XMMReg2Double Load2Val(const double* ptr)
    {
        XMMReg2Double reg; reg.nsLoad2Val(ptr); return reg;
    }
    static inline XMMReg2Double Load2ValAligned(const double* ptr)
    {
        XMMReg2Double reg; reg.nsLoad2ValAligned(ptr); return reg;
    }
    static inline XMMReg2Double Load2Val(const float* ptr)
    {
        XMMReg2Double reg; reg.nsLoad2Val(ptr); return reg;
    }
    static inline XMMReg2Double Load2Val(const unsigned char* ptr)
    {
        XMMReg2Double reg; reg.nsLoad2Val(ptr); return reg;
    }
    static inline XMMReg2Double Load2Val(const short* ptr)
    {
        XMMReg2Double reg; reg.nsLoad2Val(ptr); return reg;
    }
    static inline XMMReg2Double Load2Val(const unsigned short* ptr)
    {
        XMMReg2Double reg; reg.nsLoad2Val(ptr); return reg;
    }

    inline void nsLoad1ValHighAndLow(const double* ptr) { low = ptr[0]; high = ptr[0]; }
    inline void nsLoad2Val(const double* ptr)           { low = ptr[0]; high = ptr[1]; }
    inline void nsLoad2ValAligned(const double* ptr)    { low = ptr[0]; high = ptr[1]; }
    inline void nsLoad2Val(const float* ptr)            { low = ptr[0]; high = ptr[1]; }
    inline void nsLoad2Val(const unsigned char* ptr)    { low = ptr[0]; high = ptr[1]; }
    inline void nsLoad2Val(const short* ptr)            { low = ptr[0]; high = ptr[1]; }
    inline void nsLoad2Val(const unsigned short* ptr)   { low = ptr[0]; high = ptr[1]; }
    static inline void Load4Val(const unsigned char* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.low = ptr[0]; low.high = ptr[1];
        high.low = ptr[2]; high.high = ptr[3];
    }
    static inline void Load4Val(const short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr); high.nsLoad2Val(ptr+2);
    }
    static inline void Load4Val(const unsigned short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr); high.nsLoad2Val(ptr+2);
    }
    static inline void Load4Val(const double* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr); high.nsLoad2Val(ptr+2);
    }
    static inline void Load4Val(const float* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr); high.nsLoad2Val(ptr+2);
    }

    inline void Zeroize() { low = 0.0; high = 0.0; }

    inline XMMReg2Double& operator= (const XMMReg2Double& other)
    {
        low = other.low; high = other.high; return *this;
    }
    inline XMMReg2Double& operator+= (const XMMReg2Double& other)
    {
        low += other.low; high += other.high; return *this;
    }
    inline XMMReg2Double& operator*= (const XMMReg2Double& other)
    {
        low *= other.low; high *= other.high; return *this;
    }
    inline XMMReg2Double operator+ (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg2Double operator- (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg2Double operator* (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline XMMReg2Double operator/ (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.low = low / other.low;
        ret.high = high / other.high;
        return ret;
    }

    inline double GetHorizSum() const { return low + high; }

    inline void Store2Val(double* ptr) const { ptr[0] = low; ptr[1] = high; }

    inline void Store2ValAligned(double* ptr) const { ptr[0] = low; ptr[1] = high; }

    inline void Store2Val(float* ptr) const
    {
        ptr[0] = static_cast<float>(low);
        ptr[1] = static_cast<float>(high);
    }
    void Store2Val(unsigned char* ptr) const
    {
        ptr[0] = (unsigned char)(low + 0.5);
        ptr[1] = (unsigned char)(high + 0.5);
    }

    void Store2Val(unsigned short* ptr) const
    {
        ptr[0] = (GUInt16)(low + 0.5);
        ptr[1] = (GUInt16)(high + 0.5);
    }

    inline void StoreMask(unsigned char* ptr) const
    {
        memcpy(ptr, &low, 8);
        memcpy(ptr + 8, &high, 8);
    }

    inline operator double () const { return low; }
};

#endif /* defined(__x86_64) || defined(_M_X64) */
#ifdef __AVX__

#include <immintrin.h>

class XMMReg4Double
{
  public:
    __m256d ymm;

    XMMReg4Double() : ymm(_mm256_setzero_pd()) {}
    XMMReg4Double(const XMMReg4Double& other) : ymm(other.ymm) {}
    static inline XMMReg4Double Zero()
    {
        XMMReg4Double reg; reg.Zeroize(); return reg;
    }

    inline void Zeroize()
    {
        ymm = _mm256_setzero_pd();
    }

    static inline XMMReg4Double Load1ValHighAndLow(const double* ptr)
    {
        XMMReg4Double reg; reg.nsLoad1ValHighAndLow(ptr); return reg;
    }

    inline void nsLoad1ValHighAndLow(const double* ptr)
    {
        ymm = _mm256_set1_pd(*ptr);
    }

    static inline XMMReg4Double Load4Val(const unsigned char* ptr)
    {
        XMMReg4Double reg; reg.nsLoad4Val(ptr); return reg;
    }

    inline void nsLoad4Val(const unsigned char* ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    static inline XMMReg4Double Load4Val(const short* ptr)
    {
        XMMReg4Double reg; reg.nsLoad4Val(ptr); return reg;
    }

    inline void nsLoad4Val(const short* ptr)
    {
        __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm_i = _mm_cvtepi16_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    static inline XMMReg4Double Load4Val(const unsigned short* ptr)
    {
        XMMReg4Double reg; reg.nsLoad4Val(ptr); return reg;
    }

    inline void nsLoad4Val(const unsigned short* ptr)
    {
        __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm_i = _mm_cvtepu16_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    static inline XMMReg4Double Load4Val(const double* ptr)
    {
        XMMReg4Double reg; reg.nsLoad4Val(ptr); return reg;
    }

    inline void nsLoad4Val(const double* ptr)
    {
        ymm = _mm256_loadu_pd(ptr);
    }

    static inline XMMReg4Double Load4ValAligned(const double* ptr)
    {
        XMMReg4Double reg; reg.nsLoad4ValAligned(ptr); return reg;
    }

    inline void nsLoad4ValAligned(const double* ptr)
    {
        ymm = _mm256_load_pd(ptr);
    }

    static inline XMMReg4Double Load4Val(const float* ptr)
    {
        XMMReg4Double reg; reg.nsLoad4Val(ptr); return reg;
    }

    inline void nsLoad4Val(const float* ptr)
    {
        ymm = _mm256_cvtps_pd( _mm_loadu_ps(ptr) );
    }
    static inline XMMReg4Double Equals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_EQ_OQ);
        return reg;
    }

    static inline XMMReg4Double NotEquals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_NEQ_OQ);
        return reg;
    }

    static inline XMMReg4Double Greater(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_GT_OQ);
        return reg;
    }

    static inline XMMReg4Double And(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_and_pd(expr1.ymm, expr2.ymm);
        return reg;
    }

    static inline XMMReg4Double Ternary(const XMMReg4Double& cond,
                                        const XMMReg4Double& true_expr,
                                        const XMMReg4Double& false_expr)
    {
        XMMReg4Double reg;
        // Same branchless select as the SSE2 version, applied to 4 lanes.
        reg.ymm = _mm256_or_pd(_mm256_and_pd(cond.ymm, true_expr.ymm),
                               _mm256_andnot_pd(cond.ymm, false_expr.ymm));
        return reg;
    }

    static inline XMMReg4Double Min(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_min_pd(expr1.ymm, expr2.ymm);
        return reg;
    }
    inline XMMReg4Double& operator= (const XMMReg4Double& other)
    {
        ymm = other.ymm;
        return *this;
    }

    inline XMMReg4Double& operator+= (const XMMReg4Double& other)
    {
        ymm = _mm256_add_pd(ymm, other.ymm);
        return *this;
    }

    inline XMMReg4Double& operator*= (const XMMReg4Double& other)
    {
        ymm = _mm256_mul_pd(ymm, other.ymm);
        return *this;
    }

    inline XMMReg4Double operator+ (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_add_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator- (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_sub_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator* (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_mul_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator/ (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_div_pd(ymm, other.ymm);
        return ret;
    }

    void AddToLow( const XMMReg2Double& other )
    {
        __m256d ymm2 = _mm256_setzero_pd();
        ymm2 = _mm256_insertf128_pd( ymm2, other.xmm, 0);
        ymm = _mm256_add_pd(ymm, ymm2);
    }
    inline double GetHorizSum() const
    {
        __m256d ymm_tmp1, ymm_tmp2;
        ymm_tmp2 = _mm256_hadd_pd(ymm, ymm);                      // (a0+a1, a0+a1, a2+a3, a2+a3)
        ymm_tmp1 = _mm256_permute2f128_pd(ymm_tmp2, ymm_tmp2, 1); // swap the two 128-bit halves
        ymm_tmp1 = _mm256_add_pd(ymm_tmp1, ymm_tmp2);             // every lane now holds a0+a1+a2+a3
        return _mm_cvtsd_f64(_mm256_castpd256_pd128(ymm_tmp1));
    }
    inline void Store4Val(unsigned char* ptr) const
    {
        __m128i xmm_i = _mm256_cvttpd_epi32 (_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
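        // xmm_i now holds 4 rounded int32 values; the byte shuffle below
        // gathers the least significant byte of each of them (bytes 0, 4, 8
        // and 12) into the low 4 bytes, so a single 32-bit store writes all
        // four uint8 values (the doubles are assumed to fit in [0, 255]).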
        xmm_i = _mm_shuffle_epi8(xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) | (12 << 24)));
        GDALCopyXMMToInt32(xmm_i, reinterpret_cast<GInt32*>(ptr));
    }
    inline void Store4Val(unsigned short* ptr) const
    {
        __m128i xmm_i = _mm256_cvttpd_epi32 (_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
        xmm_i = _mm_packus_epi32(xmm_i, xmm_i);   // pack 4 int32 to 4 uint16 with unsigned saturation
        GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64*>(ptr));
    }

    inline void Store4Val(float* ptr) const
    {
        _mm_storeu_ps(ptr, _mm256_cvtpd_ps (ymm));
    }

    inline void Store4Val(double* ptr) const
    {
        _mm256_storeu_pd(ptr, ymm);
    }
    inline void StoreMask(unsigned char* ptr) const
    {
        _mm256_storeu_si256( reinterpret_cast<__m256i*>(ptr), _mm256_castpd_si256(ymm) );
    }
};

#else  /* no AVX: build XMMReg4Double from two XMMReg2Double halves */
class XMMReg4Double
{
  public:
    XMMReg2Double low, high;

#if defined(__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Weffc++"
#endif
    XMMReg4Double() = default;
#if defined(__GNUC__)
#pragma GCC diagnostic pop
#endif

    XMMReg4Double(const XMMReg4Double& other) : low(other.low), high(other.high) {}
    static inline XMMReg4Double Zero()
    {
        XMMReg4Double reg;
        reg.low.Zeroize();
        reg.high.Zeroize();
        return reg;
    }

    static inline XMMReg4Double Load1ValHighAndLow(const double* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad1ValHighAndLow(ptr);
        reg.high = reg.low;
        return reg;
    }

    static inline XMMReg4Double Load4Val(const unsigned char* ptr)
    {
        XMMReg4Double reg;
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const short* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr+2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const unsigned short* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr+2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const double* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr+2);
        return reg;
    }

    static inline XMMReg4Double Load4ValAligned(const double* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2ValAligned(ptr);
        reg.high.nsLoad2ValAligned(ptr+2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const float* ptr)
    {
        XMMReg4Double reg;
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
        return reg;
    }
    static inline XMMReg4Double Equals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Equals(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Equals(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double NotEquals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::NotEquals(expr1.low, expr2.low);
        reg.high = XMMReg2Double::NotEquals(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double Greater(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Greater(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Greater(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double And(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::And(expr1.low, expr2.low);
        reg.high = XMMReg2Double::And(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double Ternary(const XMMReg4Double& cond,
                                        const XMMReg4Double& true_expr,
                                        const XMMReg4Double& false_expr)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Ternary(cond.low, true_expr.low, false_expr.low);
        reg.high = XMMReg2Double::Ternary(cond.high, true_expr.high, false_expr.high);
        return reg;
    }

    static inline XMMReg4Double Min(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Min(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Min(expr1.high, expr2.high);
        return reg;
    }
    inline XMMReg4Double& operator= (const XMMReg4Double& other)
    {
        low = other.low; high = other.high; return *this;
    }
    inline XMMReg4Double& operator+= (const XMMReg4Double& other)
    {
        low += other.low; high += other.high; return *this;
    }
    inline XMMReg4Double& operator*= (const XMMReg4Double& other)
    {
        low *= other.low; high *= other.high; return *this;
    }

    inline XMMReg4Double operator+ (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg4Double operator- (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg4Double operator* (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline XMMReg4Double operator/ (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.low = low / other.low;
        ret.high = high / other.high;
        return ret;
    }

    void AddToLow( const XMMReg2Double& other )
    {
        low += other;
    }

    inline double GetHorizSum() const
    {
        return (low + high).GetHorizSum();
    }
    inline void Store4Val(unsigned char* ptr) const
    {
#ifdef USE_SSE2_EMULATION
        low.Store2Val(ptr);
        high.Store2Val(ptr+2);
#else
        __m128i tmpLow = _mm_cvttpd_epi32(_mm_add_pd(low.xmm, _mm_set1_pd(0.5)));
        __m128i tmpHigh = _mm_cvttpd_epi32(_mm_add_pd(high.xmm, _mm_set1_pd(0.5)));
        auto tmp = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmpLow),
                                                   _mm_castsi128_ps(tmpHigh),
                                                   _MM_SHUFFLE(1, 0, 1, 0)));
        tmp = _mm_packs_epi32(tmp, tmp);
        tmp = _mm_packus_epi16(tmp, tmp);
        GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32*>(ptr));
#endif
    }
    inline void Store4Val(unsigned short* ptr) const
    {
#ifdef USE_SSE2_EMULATION
        low.Store2Val(ptr);
        high.Store2Val(ptr+2);
#else
        __m128i xmm0 = _mm_cvtpd_epi32 (low.xmm);
        __m128i xmm1 = _mm_cvtpd_epi32 (high.xmm);
        xmm0 = _mm_or_si128(xmm0, _mm_slli_si128(xmm1, 8));
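        // xmm0 now holds 4 int32 values in its 4 lanes. With SSE4.1,
        // _mm_packus_epi32 packs them straight to uint16 with unsigned
        // saturation; otherwise they are biased into the signed 16-bit
        // range, packed with signed saturation, then un-biased.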
#ifdef __SSE4_1__
        xmm0 = _mm_packus_epi32(xmm0, xmm0);
#else
        xmm0 = _mm_add_epi32( xmm0, _mm_set1_epi32(-32768) );
        xmm0 = _mm_packs_epi32( xmm0, xmm0 );
        xmm0 = _mm_sub_epi16( xmm0, _mm_set1_epi16(-32768) );
#endif
        GDALCopyXMMToInt64(xmm0, (GInt64*)ptr);
#endif
    }
    inline void Store4Val(float* ptr) const
    {
        low.Store2Val(ptr);
        high.Store2Val(ptr+2);
    }

    inline void Store4Val(double* ptr) const
    {
        low.Store2Val(ptr);
        high.Store2Val(ptr+2);
    }

    inline void StoreMask(unsigned char* ptr) const
    {
        low.StoreMask(ptr);
        high.StoreMask(ptr+16);
    }
};

#endif /* __AVX__ */
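
/* Illustrative sketch (not part of the original header): XMMReg4Double lets
 * callers process 4 values per iteration regardless of whether the AVX, SSE2
 * or pure software implementation was compiled in. The helper below is
 * hypothetical; it accumulates a weighted sum of byte samples, with nCount
 * assumed to be a multiple of 4. */
static inline double GDALSSESketchWeightedSum(const unsigned char* pabySrc,
                                              const double* padfWeights,
                                              int nCount)
{
    XMMReg4Double sum = XMMReg4Double::Zero();
    for( int i = 0; i < nCount; i += 4 )
    {
        XMMReg4Double v = XMMReg4Double::Load4Val(pabySrc + i);      // 4 uint8 -> 4 doubles
        XMMReg4Double w = XMMReg4Double::Load4Val(padfWeights + i);  // 4 doubles
        sum += v * w;                                                // lane-wise multiply-accumulate
    }
    return sum.GetHorizSum();                                        // add the 4 lanes together
}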