GDAL
gdalsse_priv.h
1 /******************************************************************************
2  * $Id: gdalsse_priv.h 207e8bcf055703689d991e1278fe08478cfd7956 2020-12-19 17:03:27 +0100 Even Rouault $
3  *
4  * Project: GDAL
5  * Purpose: SSE2 helper
6  * Author: Even Rouault <even dot rouault at spatialys dot com>
7  *
8  ******************************************************************************
9  * Copyright (c) 2014, Even Rouault <even dot rouault at spatialys dot com>
10  *
11  * Permission is hereby granted, free of charge, to any person obtaining a
12  * copy of this software and associated documentation files (the "Software"),
13  * to deal in the Software without restriction, including without limitation
14  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
15  * and/or sell copies of the Software, and to permit persons to whom the
16  * Software is furnished to do so, subject to the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be included
19  * in all copies or substantial portions of the Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
22  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
24  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
27  * DEALINGS IN THE SOFTWARE.
28  ****************************************************************************/
29 
30 #ifndef GDALSSE_PRIV_H_INCLUDED
31 #define GDALSSE_PRIV_H_INCLUDED
32 
33 #ifndef DOXYGEN_SKIP
34 
35 #include "cpl_port.h"
36 
37 /* We restrict to 64 bit processors because they are guaranteed to have SSE2 */
38 /* This could also be used on 32 bit processors, but we would need to check for SSE2 support at runtime */
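/* A hedged sketch, not part of this header: on 32 bit x86 builds, GCC and
 * Clang expose __builtin_cpu_supports("sse2"), so a caller could in principle
 * test that once at runtime and dispatch between an SSE2 code path and the
 * emulated XMMReg2Double/XMMReg4Double classes defined further below. */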
39 #if (defined(__x86_64) || defined(_M_X64)) && !defined(USE_SSE2_EMULATION)
40 
41 /* Requires SSE2 */
42 #include <emmintrin.h>
43 #include <string.h>
44 
45 #ifdef __SSE4_1__
46 #include <smmintrin.h>
47 #endif
48 
49 #include "gdal_priv_templates.hpp"
50 
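/* The GDALCopyInt16ToXMM/GDALCopyInt32ToXMM/GDALCopyInt64ToXMM helpers below
 * load 2, 4 or 8 bytes from a possibly unaligned address into the low part of
 * an XMM register; on CPUs that require aligned accesses they first memcpy()
 * into a local variable of the right width. */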
51 static inline __m128i GDALCopyInt16ToXMM(const void* ptr)
52 {
53 #ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
54  unsigned short s;
55  memcpy(&s, ptr, 2);
56  return _mm_cvtsi32_si128(s);
57 #else
58  return _mm_cvtsi32_si128(*static_cast<const unsigned short*>(ptr));
59 #endif
60 }
61 
62 static inline __m128i GDALCopyInt32ToXMM(const void* ptr)
63 {
64 #ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
65  GInt32 i;
66  memcpy(&i, ptr, 4);
67  return _mm_cvtsi32_si128(i);
68 #else
69  return _mm_cvtsi32_si128(*static_cast<const GInt32*>(ptr));
70 #endif
71 }
72 
73 static inline __m128i GDALCopyInt64ToXMM(const void* ptr)
74 {
75 #ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
76  GInt64 i;
77  memcpy(&i, ptr, 8);
78  return _mm_cvtsi64_si128(i);
79 #else
80  return _mm_cvtsi64_si128(*static_cast<const GInt64*>(ptr));
81 #endif
82 }
83 
84 static inline void GDALCopyXMMToInt16(const __m128i xmm, void* pDest)
85 {
86 #ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
87  GInt16 i = static_cast<GInt16>(_mm_extract_epi16(xmm, 0));
88  memcpy(pDest, &i, 2);
89 #else
90  *static_cast<GInt16*>(pDest) = static_cast<GInt16>(_mm_extract_epi16(xmm, 0));
91 #endif
92 }
93 
94 class XMMReg2Double
95 {
96  public:
97  __m128d xmm;
98 
99 #if defined(__GNUC__)
100 #pragma GCC diagnostic push
101 #pragma GCC diagnostic ignored "-Weffc++"
102 #endif
103  /* coverity[uninit_member] */
104  XMMReg2Double() = default;
105 #if defined(__GNUC__)
106 #pragma GCC diagnostic pop
107 #endif
108 
109  XMMReg2Double(double val): xmm(_mm_load_sd (&val)) {}
110  XMMReg2Double(const XMMReg2Double& other) : xmm(other.xmm) {}
111 
112  static inline XMMReg2Double Zero()
113  {
114  XMMReg2Double reg;
115  reg.Zeroize();
116  return reg;
117  }
118 
119  static inline XMMReg2Double Load1ValHighAndLow(const double* ptr)
120  {
121  XMMReg2Double reg;
122  reg.nsLoad1ValHighAndLow(ptr);
123  return reg;
124  }
125 
126  static inline XMMReg2Double Load2Val(const double* ptr)
127  {
128  XMMReg2Double reg;
129  reg.nsLoad2Val(ptr);
130  return reg;
131  }
132 
133  static inline XMMReg2Double Load2Val(const float* ptr)
134  {
135  XMMReg2Double reg;
136  reg.nsLoad2Val(ptr);
137  return reg;
138  }
139 
140  static inline XMMReg2Double Load2ValAligned(const double* ptr)
141  {
142  XMMReg2Double reg;
143  reg.nsLoad2ValAligned(ptr);
144  return reg;
145  }
146 
147  static inline XMMReg2Double Load2Val(const unsigned char* ptr)
148  {
149  XMMReg2Double reg;
150  reg.nsLoad2Val(ptr);
151  return reg;
152  }
153 
154  static inline XMMReg2Double Load2Val(const short* ptr)
155  {
156  XMMReg2Double reg;
157  reg.nsLoad2Val(ptr);
158  return reg;
159  }
160 
161  static inline XMMReg2Double Load2Val(const unsigned short* ptr)
162  {
163  XMMReg2Double reg;
164  reg.nsLoad2Val(ptr);
165  return reg;
166  }
167 
168  static inline XMMReg2Double Equals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
169  {
170  XMMReg2Double reg;
171  reg.xmm = _mm_cmpeq_pd(expr1.xmm, expr2.xmm);
172  return reg;
173  }
174 
175  static inline XMMReg2Double NotEquals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
176  {
177  XMMReg2Double reg;
178  reg.xmm = _mm_cmpneq_pd(expr1.xmm, expr2.xmm);
179  return reg;
180  }
181 
182  static inline XMMReg2Double Greater(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
183  {
184  XMMReg2Double reg;
185  reg.xmm = _mm_cmpgt_pd(expr1.xmm, expr2.xmm);
186  return reg;
187  }
188 
189  static inline XMMReg2Double And(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
190  {
191  XMMReg2Double reg;
192  reg.xmm = _mm_and_pd(expr1.xmm, expr2.xmm);
193  return reg;
194  }
195 
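/* Branchless select: cond is expected to be a per-element all-ones/all-zeros
 * mask, as produced by Equals/NotEquals/Greater, so
 * (cond & true_expr) | (~cond & false_expr) yields true_expr where the mask
 * is set and false_expr elsewhere. */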
196  static inline XMMReg2Double Ternary(const XMMReg2Double& cond, const XMMReg2Double& true_expr, const XMMReg2Double& false_expr)
197  {
198  XMMReg2Double reg;
199  reg.xmm = _mm_or_pd(_mm_and_pd (cond.xmm, true_expr.xmm), _mm_andnot_pd(cond.xmm, false_expr.xmm));
200  return reg;
201  }
202 
203  static inline XMMReg2Double Min(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
204  {
205  XMMReg2Double reg;
206  reg.xmm = _mm_min_pd(expr1.xmm, expr2.xmm);
207  return reg;
208  }
209 
210  inline void nsLoad1ValHighAndLow(const double* ptr)
211  {
212  xmm = _mm_load1_pd(ptr);
213  }
214 
215  inline void nsLoad2Val(const double* ptr)
216  {
217  xmm = _mm_loadu_pd(ptr);
218  }
219 
220  inline void nsLoad2ValAligned(const double* ptr)
221  {
222  xmm = _mm_load_pd(ptr);
223  }
224 
225  inline void nsLoad2Val(const float* ptr)
226  {
227  xmm = _mm_cvtps_pd(_mm_castsi128_ps(GDALCopyInt64ToXMM(ptr)));
228  }
229 
230  inline void nsLoad2Val(const unsigned char* ptr)
231  {
232  __m128i xmm_i = GDALCopyInt16ToXMM(ptr);
233 #ifdef __SSE4_1__
234  xmm_i = _mm_cvtepu8_epi32(xmm_i);
235 #else
236  xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
237  xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
238 #endif
239  xmm = _mm_cvtepi32_pd(xmm_i);
240  }
241 
242  inline void nsLoad2Val(const short* ptr)
243  {
244  __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
245 #ifdef __SSE4_1__
246  xmm_i = _mm_cvtepi16_epi32(xmm_i);
247 #else
248  xmm_i = _mm_unpacklo_epi16(xmm_i,xmm_i); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
249  xmm_i = _mm_srai_epi32(xmm_i, 16); /* 0|0|0|0|b|b|a|a --> 0|0|0|0|sign(b)|b|sign(a)|a */
250 #endif
251  xmm = _mm_cvtepi32_pd(xmm_i);
252  }
253 
254  inline void nsLoad2Val(const unsigned short* ptr)
255  {
256  __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
257 #ifdef __SSE4_1__
258  xmm_i = _mm_cvtepu16_epi32(xmm_i);
259 #else
260  xmm_i = _mm_unpacklo_epi16(xmm_i,_mm_setzero_si128()); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|0|b|0|a */
261 #endif
262  xmm = _mm_cvtepi32_pd(xmm_i);
263  }
264 
265  static inline void Load4Val(const unsigned char* ptr, XMMReg2Double& low, XMMReg2Double& high)
266  {
267  __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
268 #ifdef __SSE4_1__
269  xmm_i = _mm_cvtepu8_epi32(xmm_i);
270 #else
271  xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
272  xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
273 #endif
274  low.xmm = _mm_cvtepi32_pd(xmm_i);
275  high.xmm = _mm_cvtepi32_pd(_mm_shuffle_epi32(xmm_i,_MM_SHUFFLE(3,2,3,2)));
276  }
277 
278  static inline void Load4Val(const short* ptr, XMMReg2Double& low, XMMReg2Double& high)
279  {
280  low.nsLoad2Val(ptr);
281  high.nsLoad2Val(ptr+2);
282  }
283 
284  static inline void Load4Val(const unsigned short* ptr, XMMReg2Double& low, XMMReg2Double& high)
285  {
286  low.nsLoad2Val(ptr);
287  high.nsLoad2Val(ptr+2);
288  }
289 
290  static inline void Load4Val(const double* ptr, XMMReg2Double& low, XMMReg2Double& high)
291  {
292  low.nsLoad2Val(ptr);
293  high.nsLoad2Val(ptr+2);
294  }
295 
296  static inline void Load4Val(const float* ptr, XMMReg2Double& low, XMMReg2Double& high)
297  {
298  __m128 temp1 = _mm_loadu_ps(ptr);
299  __m128 temp2 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3,2,3,2));
300  low.xmm = _mm_cvtps_pd(temp1);
301  high.xmm = _mm_cvtps_pd(temp2);
302  }
303 
304  inline void Zeroize()
305  {
306  xmm = _mm_setzero_pd();
307  }
308 
309  inline XMMReg2Double& operator= (const XMMReg2Double& other)
310  {
311  xmm = other.xmm;
312  return *this;
313  }
314 
315  inline XMMReg2Double& operator+= (const XMMReg2Double& other)
316  {
317  xmm = _mm_add_pd(xmm, other.xmm);
318  return *this;
319  }
320 
321  inline XMMReg2Double& operator*= (const XMMReg2Double& other)
322  {
323  xmm = _mm_mul_pd(xmm, other.xmm);
324  return *this;
325  }
326 
327  inline XMMReg2Double operator+ (const XMMReg2Double& other) const
328  {
329  XMMReg2Double ret;
330  ret.xmm = _mm_add_pd(xmm, other.xmm);
331  return ret;
332  }
333 
334  inline XMMReg2Double operator- (const XMMReg2Double& other) const
335  {
336  XMMReg2Double ret;
337  ret.xmm = _mm_sub_pd(xmm, other.xmm);
338  return ret;
339  }
340 
341  inline XMMReg2Double operator* (const XMMReg2Double& other) const
342  {
343  XMMReg2Double ret;
344  ret.xmm = _mm_mul_pd(xmm, other.xmm);
345  return ret;
346  }
347 
348  inline XMMReg2Double operator/ (const XMMReg2Double& other) const
349  {
350  XMMReg2Double ret;
351  ret.xmm = _mm_div_pd(xmm, other.xmm);
352  return ret;
353  }
354 
355  inline double GetHorizSum() const
356  {
357  __m128d xmm2;
358  xmm2 = _mm_shuffle_pd(xmm,xmm,_MM_SHUFFLE2(0,1)); /* move the high double into the low half of xmm2 */
359  return _mm_cvtsd_f64(_mm_add_sd(xmm, xmm2));
360  }
361 
362  inline void Store2Val(double* ptr) const
363  {
364  _mm_storeu_pd(ptr, xmm);
365  }
366 
367  inline void Store2ValAligned(double* ptr) const
368  {
369  _mm_store_pd(ptr, xmm);
370  }
371 
372  inline void Store2Val(float* ptr) const
373  {
374  __m128i xmm_i = _mm_castps_si128( _mm_cvtpd_ps(xmm) );
375  GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64*>(ptr));
376  }
377 
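/* The integer stores below add 0.5 before the truncating conversion, which
 * rounds non-negative values to nearest; the unsigned char variant also
 * clamps to 0..255 through the saturating pack instructions. */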
378  inline void Store2Val(unsigned char* ptr) const
379  {
380  __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(xmm, _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
381  tmp = _mm_packs_epi32(tmp, tmp);
382  tmp = _mm_packus_epi16(tmp, tmp);
383  GDALCopyXMMToInt16(tmp, reinterpret_cast<GInt16*>(ptr));
384  }
385 
386  inline void Store2Val(unsigned short* ptr) const
387  {
388  __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(xmm, _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
389  // X X X X 0 B 0 A --> X X X X A A B A
390  tmp = _mm_shufflelo_epi16(tmp, 0 | (2 << 2));
391  GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32*>(ptr));
392  }
393 
394  inline void StoreMask(unsigned char* ptr) const
395  {
396  _mm_storeu_si128( reinterpret_cast<__m128i*>(ptr), _mm_castpd_si128(xmm) );
397  }
398 
399  inline operator double () const
400  {
401  return _mm_cvtsd_f64(xmm);
402  }
403 };
404 
405 #else
406 
407 #ifndef NO_WARN_USE_SSE2_EMULATION
408 #warning "Software emulation of SSE2 !"
409 #endif
410 
411 class XMMReg2Double
412 {
413  public:
414  double low;
415  double high;
416 
417  XMMReg2Double() {}
418  XMMReg2Double(double val) { low = val; high = 0.0; }
419  XMMReg2Double(const XMMReg2Double& other) : low(other.low), high(other.high) {}
420 
421  static inline XMMReg2Double Zero()
422  {
423  XMMReg2Double reg;
424  reg.Zeroize();
425  return reg;
426  }
427 
428  static inline XMMReg2Double Load1ValHighAndLow(const double* ptr)
429  {
430  XMMReg2Double reg;
431  reg.nsLoad1ValHighAndLow(ptr);
432  return reg;
433  }
434 
435  static inline XMMReg2Double Equals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
436  {
437  XMMReg2Double reg;
438 
439  if (expr1.low == expr2.low)
440  memset(&(reg.low), 0xFF, sizeof(double));
441  else
442  reg.low = 0;
443 
444  if (expr1.high == expr2.high)
445  memset(&(reg.high), 0xFF, sizeof(double));
446  else
447  reg.high = 0;
448 
449  return reg;
450  }
451 
452  static inline XMMReg2Double NotEquals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
453  {
454  XMMReg2Double reg;
455 
456  if (expr1.low != expr2.low)
457  memset(&(reg.low), 0xFF, sizeof(double));
458  else
459  reg.low = 0;
460 
461  if (expr1.high != expr2.high)
462  memset(&(reg.high), 0xFF, sizeof(double));
463  else
464  reg.high = 0;
465 
466  return reg;
467  }
468 
469  static inline XMMReg2Double Greater(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
470  {
471  XMMReg2Double reg;
472 
473  if (expr1.low > expr2.low)
474  memset(&(reg.low), 0xFF, sizeof(double));
475  else
476  reg.low = 0;
477 
478  if (expr1.high > expr2.high)
479  memset(&(reg.high), 0xFF, sizeof(double));
480  else
481  reg.high = 0;
482 
483  return reg;
484  }
485 
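/* Emulation of _mm_and_pd: the doubles are reinterpreted as raw bit patterns
 * so that the bitwise AND behaves the same way on comparison masks as the
 * SSE2 instruction. */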
486  static inline XMMReg2Double And(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
487  {
488  XMMReg2Double reg;
489  int low1[2], high1[2];
490  int low2[2], high2[2];
491  memcpy(low1, &expr1.low, sizeof(double));
492  memcpy(high1, &expr1.high, sizeof(double));
493  memcpy(low2, &expr2.low, sizeof(double));
494  memcpy(high2, &expr2.high, sizeof(double));
495  low1[0] &= low2[0];
496  low1[1] &= low2[1];
497  high1[0] &= high2[0];
498  high1[1] &= high2[1];
499  memcpy(&reg.low, low1, sizeof(double));
500  memcpy(&reg.high, high1, sizeof(double));
501  return reg;
502  }
503 
504  static inline XMMReg2Double Ternary(const XMMReg2Double& cond, const XMMReg2Double& true_expr, const XMMReg2Double& false_expr)
505  {
506  XMMReg2Double reg;
507  if( cond.low )
508  reg.low = true_expr.low;
509  else
510  reg.low = false_expr.low;
511  if( cond.high )
512  reg.high = true_expr.high;
513  else
514  reg.high = false_expr.high;
515  return reg;
516  }
517 
518  static inline XMMReg2Double Min(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
519  {
520  XMMReg2Double reg;
521  reg.low = (expr1.low < expr2.low) ? expr1.low : expr2.low;
522  reg.high = (expr1.high < expr2.high) ? expr1.high : expr2.high;
523  return reg;
524  }
525 
526  static inline XMMReg2Double Load2Val(const double* ptr)
527  {
528  XMMReg2Double reg;
529  reg.nsLoad2Val(ptr);
530  return reg;
531  }
532 
533  static inline XMMReg2Double Load2ValAligned(const double* ptr)
534  {
535  XMMReg2Double reg;
536  reg.nsLoad2ValAligned(ptr);
537  return reg;
538  }
539 
540  static inline XMMReg2Double Load2Val(const float* ptr)
541  {
542  XMMReg2Double reg;
543  reg.nsLoad2Val(ptr);
544  return reg;
545  }
546 
547  static inline XMMReg2Double Load2Val(const unsigned char* ptr)
548  {
549  XMMReg2Double reg;
550  reg.nsLoad2Val(ptr);
551  return reg;
552  }
553 
554  static inline XMMReg2Double Load2Val(const short* ptr)
555  {
556  XMMReg2Double reg;
557  reg.nsLoad2Val(ptr);
558  return reg;
559  }
560 
561  static inline XMMReg2Double Load2Val(const unsigned short* ptr)
562  {
563  XMMReg2Double reg;
564  reg.nsLoad2Val(ptr);
565  return reg;
566  }
567 
568  inline void nsLoad1ValHighAndLow(const double* ptr)
569  {
570  low = ptr[0];
571  high = ptr[0];
572  }
573 
574  inline void nsLoad2Val(const double* ptr)
575  {
576  low = ptr[0];
577  high = ptr[1];
578  }
579 
580  inline void nsLoad2ValAligned(const double* ptr)
581  {
582  low = ptr[0];
583  high = ptr[1];
584  }
585 
586  inline void nsLoad2Val(const float* ptr)
587  {
588  low = ptr[0];
589  high = ptr[1];
590  }
591 
592  inline void nsLoad2Val(const unsigned char* ptr)
593  {
594  low = ptr[0];
595  high = ptr[1];
596  }
597 
598  inline void nsLoad2Val(const short* ptr)
599  {
600  low = ptr[0];
601  high = ptr[1];
602  }
603 
604  inline void nsLoad2Val(const unsigned short* ptr)
605  {
606  low = ptr[0];
607  high = ptr[1];
608  }
609 
610  static inline void Load4Val(const unsigned char* ptr, XMMReg2Double& low, XMMReg2Double& high)
611  {
612  low.low = ptr[0];
613  low.high = ptr[1];
614  high.low = ptr[2];
615  high.high = ptr[3];
616  }
617 
618  static inline void Load4Val(const short* ptr, XMMReg2Double& low, XMMReg2Double& high)
619  {
620  low.nsLoad2Val(ptr);
621  high.nsLoad2Val(ptr+2);
622  }
623 
624  static inline void Load4Val(const unsigned short* ptr, XMMReg2Double& low, XMMReg2Double& high)
625  {
626  low.nsLoad2Val(ptr);
627  high.nsLoad2Val(ptr+2);
628  }
629 
630  static inline void Load4Val(const double* ptr, XMMReg2Double& low, XMMReg2Double& high)
631  {
632  low.nsLoad2Val(ptr);
633  high.nsLoad2Val(ptr+2);
634  }
635 
636  static inline void Load4Val(const float* ptr, XMMReg2Double& low, XMMReg2Double& high)
637  {
638  low.nsLoad2Val(ptr);
639  high.nsLoad2Val(ptr+2);
640  }
641 
642  inline void Zeroize()
643  {
644  low = 0.0;
645  high = 0.0;
646  }
647 
648  inline XMMReg2Double& operator= (const XMMReg2Double& other)
649  {
650  low = other.low;
651  high = other.high;
652  return *this;
653  }
654 
655  inline XMMReg2Double& operator+= (const XMMReg2Double& other)
656  {
657  low += other.low;
658  high += other.high;
659  return *this;
660  }
661 
662  inline XMMReg2Double& operator*= (const XMMReg2Double& other)
663  {
664  low *= other.low;
665  high *= other.high;
666  return *this;
667  }
668 
669  inline XMMReg2Double operator+ (const XMMReg2Double& other) const
670  {
671  XMMReg2Double ret;
672  ret.low = low + other.low;
673  ret.high = high + other.high;
674  return ret;
675  }
676 
677  inline XMMReg2Double operator- (const XMMReg2Double& other) const
678  {
679  XMMReg2Double ret;
680  ret.low = low - other.low;
681  ret.high = high - other.high;
682  return ret;
683  }
684 
685  inline XMMReg2Double operator* (const XMMReg2Double& other) const
686  {
687  XMMReg2Double ret;
688  ret.low = low * other.low;
689  ret.high = high * other.high;
690  return ret;
691  }
692 
693  inline XMMReg2Double operator/ (const XMMReg2Double& other) const
694  {
695  XMMReg2Double ret;
696  ret.low = low / other.low;
697  ret.high = high / other.high;
698  return ret;
699  }
700 
701  inline double GetHorizSum() const
702  {
703  return low + high;
704  }
705 
706  inline void Store2Val(double* ptr) const
707  {
708  ptr[0] = low;
709  ptr[1] = high;
710  }
711 
712  inline void Store2ValAligned(double* ptr) const
713  {
714  ptr[0] = low;
715  ptr[1] = high;
716  }
717 
718  inline void Store2Val(float* ptr) const
719  {
720  ptr[0] = low;
721  ptr[1] = high;
722  }
723 
724  void Store2Val(unsigned char* ptr) const
725  {
726  ptr[0] = (unsigned char)(low + 0.5);
727  ptr[1] = (unsigned char)(high + 0.5);
728  }
729 
730  void Store2Val(unsigned short* ptr) const
731  {
732  ptr[0] = (GUInt16)(low + 0.5);
733  ptr[1] = (GUInt16)(high + 0.5);
734  }
735 
736  inline void StoreMask(unsigned char* ptr) const
737  {
738  memcpy(ptr, &low, 8);
739  memcpy(ptr + 8, &high, 8);
740  }
741 
742  inline operator double () const
743  {
744  return low;
745  }
746 };
747 
748 #endif /* defined(__x86_64) || defined(_M_X64) */
749 
750 #ifdef __AVX__
751 
752 #include <immintrin.h>
753 
754 class XMMReg4Double
755 {
756  public:
757  __m256d ymm;
758 
759  XMMReg4Double(): ymm(_mm256_setzero_pd()) {}
760  XMMReg4Double(const XMMReg4Double& other) : ymm(other.ymm) {}
761 
762  static inline XMMReg4Double Zero()
763  {
764  XMMReg4Double reg;
765  reg.Zeroize();
766  return reg;
767  }
768 
769  inline void Zeroize()
770  {
771  ymm = _mm256_setzero_pd();
772  }
773 
774  static inline XMMReg4Double Load1ValHighAndLow(const double* ptr)
775  {
776  XMMReg4Double reg;
777  reg.nsLoad1ValHighAndLow(ptr);
778  return reg;
779  }
780 
781  inline void nsLoad1ValHighAndLow(const double* ptr)
782  {
783  ymm = _mm256_set1_pd(*ptr);
784  }
785 
786  static inline XMMReg4Double Load4Val(const unsigned char* ptr)
787  {
788  XMMReg4Double reg;
789  reg.nsLoad4Val(ptr);
790  return reg;
791  }
792 
793  inline void nsLoad4Val(const unsigned char* ptr)
794  {
795  __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
796  xmm_i = _mm_cvtepu8_epi32(xmm_i);
797  ymm = _mm256_cvtepi32_pd(xmm_i);
798  }
799 
800  static inline XMMReg4Double Load4Val(const short* ptr)
801  {
802  XMMReg4Double reg;
803  reg.nsLoad4Val(ptr);
804  return reg;
805  }
806 
807  inline void nsLoad4Val(const short* ptr)
808  {
809  __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
810  xmm_i = _mm_cvtepi16_epi32(xmm_i);
811  ymm = _mm256_cvtepi32_pd(xmm_i);
812  }
813 
814  static inline XMMReg4Double Load4Val(const unsigned short* ptr)
815  {
816  XMMReg4Double reg;
817  reg.nsLoad4Val(ptr);
818  return reg;
819  }
820 
821  inline void nsLoad4Val(const unsigned short* ptr)
822  {
823  __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
824  xmm_i = _mm_cvtepu16_epi32(xmm_i);
825  ymm = _mm256_cvtepi32_pd(xmm_i); // ok to use signed conversion since we are in the ushort range, so cannot be interpreted as negative int32
826  }
827 
828  static inline XMMReg4Double Load4Val(const double* ptr)
829  {
830  XMMReg4Double reg;
831  reg.nsLoad4Val(ptr);
832  return reg;
833  }
834 
835  inline void nsLoad4Val(const double* ptr)
836  {
837  ymm = _mm256_loadu_pd(ptr);
838  }
839 
840  static inline XMMReg4Double Load4ValAligned(const double* ptr)
841  {
842  XMMReg4Double reg;
843  reg.nsLoad4ValAligned(ptr);
844  return reg;
845  }
846 
847  inline void nsLoad4ValAligned(const double* ptr)
848  {
849  ymm = _mm256_load_pd(ptr);
850  }
851 
852  static inline XMMReg4Double Load4Val(const float* ptr)
853  {
854  XMMReg4Double reg;
855  reg.nsLoad4Val(ptr);
856  return reg;
857  }
858 
859  inline void nsLoad4Val(const float* ptr)
860  {
861  ymm = _mm256_cvtps_pd( _mm_loadu_ps(ptr) );
862  }
863 
864  static inline XMMReg4Double Equals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
865  {
866  XMMReg4Double reg;
867  reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_EQ_OQ);
868  return reg;
869  }
870 
871  static inline XMMReg4Double NotEquals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
872  {
873  XMMReg4Double reg;
874  reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_NEQ_OQ);
875  return reg;
876  }
877 
878  static inline XMMReg4Double Greater(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
879  {
880  XMMReg4Double reg;
881  reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_GT_OQ);
882  return reg;
883  }
884 
885  static inline XMMReg4Double And(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
886  {
887  XMMReg4Double reg;
888  reg.ymm = _mm256_and_pd(expr1.ymm, expr2.ymm);
889  return reg;
890  }
891 
892  static inline XMMReg4Double Ternary(const XMMReg4Double& cond, const XMMReg4Double& true_expr, const XMMReg4Double& false_expr)
893  {
894  XMMReg4Double reg;
895  reg.ymm = _mm256_or_pd(_mm256_and_pd (cond.ymm, true_expr.ymm), _mm256_andnot_pd(cond.ymm, false_expr.ymm));
896  return reg;
897  }
898 
899  static inline XMMReg4Double Min(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
900  {
901  XMMReg4Double reg;
902  reg.ymm = _mm256_min_pd(expr1.ymm, expr2.ymm);
903  return reg;
904  }
905 
906  inline XMMReg4Double& operator= (const XMMReg4Double& other)
907  {
908  ymm = other.ymm;
909  return *this;
910  }
911 
912  inline XMMReg4Double& operator+= (const XMMReg4Double& other)
913  {
914  ymm = _mm256_add_pd(ymm, other.ymm);
915  return *this;
916  }
917 
918  inline XMMReg4Double& operator*= (const XMMReg4Double& other)
919  {
920  ymm = _mm256_mul_pd(ymm, other.ymm);
921  return *this;
922  }
923 
924  inline XMMReg4Double operator+ (const XMMReg4Double& other) const
925  {
926  XMMReg4Double ret;
927  ret.ymm = _mm256_add_pd(ymm, other.ymm);
928  return ret;
929  }
930 
931  inline XMMReg4Double operator- (const XMMReg4Double& other) const
932  {
933  XMMReg4Double ret;
934  ret.ymm = _mm256_sub_pd(ymm, other.ymm);
935  return ret;
936  }
937 
938  inline XMMReg4Double operator* (const XMMReg4Double& other) const
939  {
940  XMMReg4Double ret;
941  ret.ymm = _mm256_mul_pd(ymm, other.ymm);
942  return ret;
943  }
944 
945  inline XMMReg4Double operator/ (const XMMReg4Double& other) const
946  {
947  XMMReg4Double ret;
948  ret.ymm = _mm256_div_pd(ymm, other.ymm);
949  return ret;
950  }
951 
952  void AddToLow( const XMMReg2Double& other )
953  {
954  __m256d ymm2 = _mm256_setzero_pd();
955  ymm2 = _mm256_insertf128_pd( ymm2, other.xmm, 0);
956  ymm = _mm256_add_pd(ymm, ymm2);
957  }
958 
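/* _mm256_hadd_pd(ymm, ymm) on (a0,a1,a2,a3) gives (a0+a1, a0+a1, a2+a3, a2+a3);
 * swapping the two 128 bit lanes and adding leaves a0+a1+a2+a3 in every
 * element, so the low element of the result is the full horizontal sum. */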
959  inline double GetHorizSum() const
960  {
961  __m256d ymm_tmp1, ymm_tmp2;
962  ymm_tmp2 = _mm256_hadd_pd(ymm, ymm);
963  ymm_tmp1 = _mm256_permute2f128_pd(ymm_tmp2, ymm_tmp2, 1);
964  ymm_tmp1 = _mm256_add_pd(ymm_tmp1, ymm_tmp2);
965  return _mm_cvtsd_f64(_mm256_castpd256_pd128(ymm_tmp1));
966  }
967 
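/* The SSSE3 shuffle below gathers the low byte of each 32 bit element into
 * the 4 low bytes; unlike the commented-out pack based sequence it does not
 * saturate, so values are assumed to already fit in 0..255 after rounding. */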
968  inline void Store4Val(unsigned char* ptr) const
969  {
970  __m128i xmm_i = _mm256_cvttpd_epi32 (_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
971  //xmm_i = _mm_packs_epi32(xmm_i, xmm_i); // Pack int32 to int16
972  //xmm_i = _mm_packus_epi16(xmm_i, xmm_i); // Pack int16 to uint8
973  xmm_i = _mm_shuffle_epi8(xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) | (12 << 24))); // SSSE3
974  GDALCopyXMMToInt32(xmm_i, reinterpret_cast<GInt32*>(ptr));
975  }
976 
977  inline void Store4Val(unsigned short* ptr) const
978  {
979  __m128i xmm_i = _mm256_cvttpd_epi32 (_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
980  xmm_i = _mm_packus_epi32(xmm_i, xmm_i); // Pack uint32 to uint16
981  GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64*>(ptr));
982  }
983 
984  inline void Store4Val(float* ptr) const
985  {
986  _mm_storeu_ps(ptr, _mm256_cvtpd_ps (ymm));
987  }
988 
989  inline void Store4Val(double* ptr) const
990  {
991  _mm256_storeu_pd(ptr, ymm);
992  }
993 
994  inline void StoreMask(unsigned char* ptr) const
995  {
996  _mm256_storeu_si256( reinterpret_cast<__m256i*>(ptr), _mm256_castpd_si256(ymm) );
997  }
998 };
999 
1000 #else
1001 
1002 class XMMReg4Double
1003 {
1004  public:
1005  XMMReg2Double low, high;
1006 
1007 #if defined(__GNUC__)
1008 #pragma GCC diagnostic push
1009 #pragma GCC diagnostic ignored "-Weffc++"
1010 #endif
1011  /* coverity[uninit_member] */
1012  XMMReg4Double() = default;
1013 #if defined(__GNUC__)
1014 #pragma GCC diagnostic pop
1015 #endif
1016 
1017  XMMReg4Double(const XMMReg4Double& other) : low(other.low), high(other.high) {}
1018 
1019  static inline XMMReg4Double Zero()
1020  {
1021  XMMReg4Double reg;
1022  reg.low.Zeroize();
1023  reg.high.Zeroize();
1024  return reg;
1025  }
1026 
1027  static inline XMMReg4Double Load1ValHighAndLow(const double* ptr)
1028  {
1029  XMMReg4Double reg;
1030  reg.low.nsLoad1ValHighAndLow(ptr);
1031  reg.high = reg.low;
1032  return reg;
1033  }
1034 
1035  static inline XMMReg4Double Load4Val(const unsigned char* ptr)
1036  {
1037  XMMReg4Double reg;
1038  XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
1039  return reg;
1040  }
1041 
1042  static inline XMMReg4Double Load4Val(const short* ptr)
1043  {
1044  XMMReg4Double reg;
1045  reg.low.nsLoad2Val(ptr);
1046  reg.high.nsLoad2Val(ptr+2);
1047  return reg;
1048  }
1049 
1050  static inline XMMReg4Double Load4Val(const unsigned short* ptr)
1051  {
1052  XMMReg4Double reg;
1053  reg.low.nsLoad2Val(ptr);
1054  reg.high.nsLoad2Val(ptr+2);
1055  return reg;
1056  }
1057 
1058  static inline XMMReg4Double Load4Val(const double* ptr)
1059  {
1060  XMMReg4Double reg;
1061  reg.low.nsLoad2Val(ptr);
1062  reg.high.nsLoad2Val(ptr+2);
1063  return reg;
1064  }
1065 
1066  static inline XMMReg4Double Load4ValAligned(const double* ptr)
1067  {
1068  XMMReg4Double reg;
1069  reg.low.nsLoad2ValAligned(ptr);
1070  reg.high.nsLoad2ValAligned(ptr+2);
1071  return reg;
1072  }
1073 
1074  static inline XMMReg4Double Load4Val(const float* ptr)
1075  {
1076  XMMReg4Double reg;
1077  XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
1078  return reg;
1079  }
1080 
1081  static inline XMMReg4Double Equals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
1082  {
1083  XMMReg4Double reg;
1084  reg.low = XMMReg2Double::Equals(expr1.low, expr2.low);
1085  reg.high = XMMReg2Double::Equals(expr1.high, expr2.high);
1086  return reg;
1087  }
1088 
1089  static inline XMMReg4Double NotEquals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
1090  {
1091  XMMReg4Double reg;
1092  reg.low = XMMReg2Double::NotEquals(expr1.low, expr2.low);
1093  reg.high = XMMReg2Double::NotEquals(expr1.high, expr2.high);
1094  return reg;
1095  }
1096 
1097  static inline XMMReg4Double Greater(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
1098  {
1099  XMMReg4Double reg;
1100  reg.low = XMMReg2Double::Greater(expr1.low, expr2.low);
1101  reg.high = XMMReg2Double::Greater(expr1.high, expr2.high);
1102  return reg;
1103  }
1104 
1105  static inline XMMReg4Double And(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
1106  {
1107  XMMReg4Double reg;
1108  reg.low = XMMReg2Double::And(expr1.low, expr2.low);
1109  reg.high = XMMReg2Double::And(expr1.high, expr2.high);
1110  return reg;
1111  }
1112 
1113  static inline XMMReg4Double Ternary(const XMMReg4Double& cond, const XMMReg4Double& true_expr, const XMMReg4Double& false_expr)
1114  {
1115  XMMReg4Double reg;
1116  reg.low = XMMReg2Double::Ternary(cond.low, true_expr.low, false_expr.low);
1117  reg.high = XMMReg2Double::Ternary(cond.high, true_expr.high, false_expr.high);
1118  return reg;
1119  }
1120 
1121  static inline XMMReg4Double Min(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
1122  {
1123  XMMReg4Double reg;
1124  reg.low = XMMReg2Double::Min(expr1.low, expr2.low);
1125  reg.high = XMMReg2Double::Min(expr1.high, expr2.high);
1126  return reg;
1127  }
1128 
1129  inline XMMReg4Double& operator= (const XMMReg4Double& other)
1130  {
1131  low = other.low;
1132  high = other.high;
1133  return *this;
1134  }
1135 
1136  inline XMMReg4Double& operator+= (const XMMReg4Double& other)
1137  {
1138  low += other.low;
1139  high += other.high;
1140  return *this;
1141  }
1142 
1143  inline XMMReg4Double& operator*= (const XMMReg4Double& other)
1144  {
1145  low *= other.low;
1146  high *= other.high;
1147  return *this;
1148  }
1149 
1150  inline XMMReg4Double operator+ (const XMMReg4Double& other) const
1151  {
1152  XMMReg4Double ret;
1153  ret.low = low + other.low;
1154  ret.high = high + other.high;
1155  return ret;
1156  }
1157 
1158  inline XMMReg4Double operator- (const XMMReg4Double& other) const
1159  {
1160  XMMReg4Double ret;
1161  ret.low = low - other.low;
1162  ret.high = high - other.high;
1163  return ret;
1164  }
1165 
1166  inline XMMReg4Double operator* (const XMMReg4Double& other) const
1167  {
1168  XMMReg4Double ret;
1169  ret.low = low * other.low;
1170  ret.high = high * other.high;
1171  return ret;
1172  }
1173 
1174  inline XMMReg4Double operator/ (const XMMReg4Double& other) const
1175  {
1176  XMMReg4Double ret;
1177  ret.low = low / other.low;
1178  ret.high = high / other.high;
1179  return ret;
1180  }
1181 
1182  void AddToLow( const XMMReg2Double& other )
1183  {
1184  low += other;
1185  }
1186 
1187  inline double GetHorizSum() const
1188  {
1189  return (low + high).GetHorizSum();
1190  }
1191 
1192  inline void Store4Val(unsigned char* ptr) const
1193  {
1194 #ifdef USE_SSE2_EMULATION
1195  low.Store2Val(ptr);
1196  high.Store2Val(ptr+2);
1197 #else
1198  __m128i tmpLow = _mm_cvttpd_epi32(_mm_add_pd(low.xmm, _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
1199  __m128i tmpHigh = _mm_cvttpd_epi32(_mm_add_pd(high.xmm, _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
1200  auto tmp = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmpLow), _mm_castsi128_ps(tmpHigh), _MM_SHUFFLE(1, 0, 1, 0)));
1201  tmp = _mm_packs_epi32(tmp, tmp);
1202  tmp = _mm_packus_epi16(tmp, tmp);
1203  GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32*>(ptr));
1204 #endif
1205  }
1206 
1207  inline void Store4Val(unsigned short* ptr) const
1208  {
1209 #if 1
1210  low.Store2Val(ptr);
1211  high.Store2Val(ptr+2);
1212 #else
1213  __m128i xmm0 = _mm_cvtpd_epi32 (low.xmm);
1214  __m128i xmm1 = _mm_cvtpd_epi32 (high.xmm);
1215  xmm0 = _mm_or_si128(xmm0, _mm_slli_si128(xmm1, 8));
1216 #if __SSE4_1__
1217  xmm0 = _mm_packus_epi32(xmm0, xmm0); // Pack uint32 to uint16
1218 #else
1219  xmm0 = _mm_add_epi32( xmm0, _mm_set1_epi32(-32768) );
1220  xmm0 = _mm_packs_epi32( xmm0, xmm0 );
1221  xmm0 = _mm_sub_epi16( xmm0, _mm_set1_epi16(-32768) );
1222 #endif
1223  GDALCopyXMMToInt64(xmm0, (GInt64*)ptr);
1224 #endif
1225  }
1226 
1227  inline void Store4Val(float* ptr) const
1228  {
1229  low.Store2Val(ptr);
1230  high.Store2Val(ptr+2);
1231  }
1232 
1233  inline void Store4Val(double* ptr) const
1234  {
1235  low.Store2Val(ptr);
1236  high.Store2Val(ptr+2);
1237  }
1238 
1239  inline void StoreMask(unsigned char* ptr) const
1240  {
1241  low.StoreMask(ptr);
1242  high.StoreMask(ptr+16);
1243  }
1244 
1245 };
1246 
1247 #endif
1248 
1249 #endif /* #ifndef DOXYGEN_SKIP */
1250 
1251 #endif /* GDALSSE_PRIV_H_INCLUDED */
Referenced types from cpl_port.h (core portability definitions for CPL):
GInt16: short, signed 16 bit integer type (cpl_port.h:211)
GUInt16: unsigned short, unsigned 16 bit integer type (cpl_port.h:213)
GInt32: int, signed 32 bit integer type (cpl_port.h:205)
GInt64: GIntBig, signed 64 bit integer type (cpl_port.h:263)
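A minimal usage sketch (hypothetical caller code, not part of GDAL): assuming a buffer whose length is a multiple of 4, the 4-wide register class can accumulate a dot product and reduce it with GetHorizSum() at the end; the same source compiles against the AVX, SSE2 or emulated implementation depending on the macros above.

#include "gdalsse_priv.h"

static double GDALExampleDotProduct(const double* padfX, const double* padfY, int nCount)
{
    XMMReg4Double v_sum = XMMReg4Double::Zero();
    for( int i = 0; i < nCount; i += 4 )  // nCount assumed to be a multiple of 4 in this sketch
    {
        const XMMReg4Double v_x = XMMReg4Double::Load4Val(padfX + i);
        const XMMReg4Double v_y = XMMReg4Double::Load4Val(padfY + i);
        v_sum += v_x * v_y;
    }
    return v_sum.GetHorizSum();
}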