Mudbox/SSE.h Source File

SSE.h
Go to the documentation of this file.
1 //**************************************************************************/
2 // Copyright (c) 2008 Autodesk, Inc.
3 // All rights reserved.
4 //
5 // Use of this software is subject to the terms of the Autodesk license
6 // agreement provided at the time of installation or download, or which
7 // otherwise accompanies this software in either electronic or hard copy form.
8 //
9 //**************************************************************************/
10 // DESCRIPTION:
11 // CREATED: October 2008
12 //**************************************************************************/
13 
14 #ifndef __MUDBOXSDK_SSE_H__
15 #define __MUDBOXSDK_SSE_H__
16 
17 #if defined(JAMBUILD)
18 #include <Mudbox/mudbox.h>
19 #else
20 #include "mudbox.h"
21 #endif
22 
23 #if defined(__GNUC__)
24 
25 #ifndef __SSE3__
26 #error This file was intended to compiled with SSE3 instruction set enabled.
27 #endif
28 
29 //
30 // If you are using GCC instead of the Intel C Compiler, don't forget
31 // to specify -I/usr/lib/gcc/i686-apple-darwin9/4.0.1/include when compiling
32 // a file that use this header.
33 //
34 #include <xmmintrin.h>
35 #include <pmmintrin.h>
36 #endif
37 
38 #if defined(WIN32) || defined(WIN64)
39 #include <xmmintrin.h>
40 #include <intrin.h>
41 #endif
42 
// MB_SSE_ALIGN16_VAR(v) wraps the declaration v so that it is 16 byte aligned;
// MB_SSE_ALIGN16_CLASS is the bare 16 byte alignment specifier.
// The whole section is skipped when MB_SSE_ALIGN16_VAR is already defined.
43 #ifndef MB_SSE_ALIGN16_VAR
44 
45 #if defined(_MSC_VER)
// MSVC spelling of the alignment request.
46 #define MB_SSE_ALIGN16_VAR(v) __declspec(align(16)) v
47 #define MB_SSE_ALIGN16_CLASS __declspec(align(16))
// Unless __attribute__ is already a macro, define it away so that the
// GCC style __attribute__((...)) annotations in this header expand to nothing.
48 #ifndef __attribute__
49 #define __attribute__(_a)
50 #endif
51 #elif defined(__GNUC__)
// GCC spelling of the alignment request (the attribute follows the declaration).
52 #define MB_SSE_ALIGN16_VAR(v) v __attribute__ ((aligned(16)))
53 #define MB_SSE_ALIGN16_CLASS __attribute__ ((aligned(16)))
// Unless __forceinline is already a macro, map it to plain inline.
54 #ifndef __forceinline
55 #define __forceinline inline
56 #endif
57 #else
// Any other compiler has to provide its own alignment macros.
58 #error The MB_SSE_ALIGN16_VAR needs to be ported for this compiler.
59 #endif
60 
61 #endif
62 
65 {
66  inline HWVector( void ) __attribute__((always_inline)){};
67 
68  inline void setZero( void ) __attribute__((always_inline))
69  {
70  v = _mm_setzero_ps();
71  }
72 
73  inline HWVector( float f ) __attribute__((always_inline))
74  {
75  v = _mm_set_ps1( f );
76  }
77 
78  inline HWVector( const mudbox::Vector &h ) __attribute__((always_inline))
79  {
80  v = _mm_set_ps( h.x, h.y, h.z, 0);
81  };
82  inline HWVector( float x, float y, float z, float w = 0 ) __attribute__((always_inline))
83  {
84  v = _mm_set_ps( x,y,z,w );
85  };
86  void Fill( const mudbox::Vector &vVector );
87  inline HWVector( const float a[4] ) __attribute__((always_inline))
88  {
89  v = _mm_loadu_ps( a );
90  };
91  inline HWVector ShiftLeft( void ) const __attribute__((always_inline)){ HWVector r; r.v = _mm_shuffle_ps(v,v,_MM_SHUFFLE(2,1,3,0)); return r; };
92  inline void ShiftLeft(HWVector &result) __attribute__((always_inline)){ result.v = _mm_shuffle_ps(v,v,_MM_SHUFFLE(2,1,3,0)); }
93  inline void ShiftLeftInPlace() __attribute__((always_inline)){ v = _mm_shuffle_ps(v,v,_MM_SHUFFLE(2,1,3,0)); }
94  inline HWVector ShiftRight( void ) const __attribute__((always_inline)){ HWVector r; r.v = _mm_shuffle_ps(v,v,_MM_SHUFFLE(1,3,2,0)); return r; };
95  inline void ShiftRight( HWVector &result ) __attribute__((always_inline)){ result.v = _mm_shuffle_ps(v,v,_MM_SHUFFLE(1,3,2,0)); };
96  inline HWVector operator &( HWVector &o ) __attribute__((always_inline))
97  {
98  //HWVector al = (*this).ShiftLeft(), bl = o.ShiftLeft(), ar = (*this).ShiftRight(), br = o.ShiftRight();
99  HWVector al, bl, ar, br;
100  ShiftLeft(al);
101  o.ShiftLeft(bl);
102  ShiftRight(ar);
103  o.ShiftRight(br);
104 
105  return al*br-ar*bl;
106  };
107 
108  inline HWVector operator |( const HWVector &o ) const __attribute__((always_inline))
109  {
110  HWVector r;
111  r.v = _mm_mul_ps( v, o.v );
112  // Requires SSE3
113  r.v = _mm_hadd_ps( r.v, r.v );
114  r.v = _mm_hadd_ps( r.v, r.v );
115  return r;
116  };
117 
118  inline HWVector Length( void ) const __attribute__((always_inline))
119  {
120  HWVector r = operator |( *this );
121  r.v = _mm_sqrt_ss( r.v );
122  return r;
123  };
124 
125  inline HWVector LengthSquared( void ) const __attribute__((always_inline))
126  {
127  HWVector r = operator |( *this );
128  return r;
129  };
130 
131  inline float DistanceFromLine( const HWVector &vStart,const HWVector &vEnd ) const __attribute__((always_inline))
132  {
133  HWVector n = vEnd - vStart;
134  HWVector m = (*this)-vStart;
135  HWVector h = n&m;
136  HWVector d = n&h;
137  d.Normalize();
138  float fDistance = (d|(*this))-(d|vStart);
139  return fDistance>0?fDistance:(-fDistance);
140  }
141 
142  inline void Normalize( void ) __attribute__((always_inline))
143  {
144  HWVector f = operator |( *this );
145  f.v = _mm_rsqrt_ps( f.v );
146  v = _mm_mul_ps( f.v, v );
147  };
148 
149  inline HWVector Floor( void ) __attribute__((always_inline))
150  {
151  static const unsigned int a = (1 << 23);
152  static const float twoTo23AsFloat = (float)a;
153  static const __m128 twoTo23 = _mm_set_ps( twoTo23AsFloat, twoTo23AsFloat, twoTo23AsFloat, twoTo23AsFloat );
154  // b = fabs(v)
155  __m128 b = _mm_castsi128_ps(_mm_srli_epi32( _mm_slli_epi32( _mm_castps_si128(v),1 ), 1 ));
156  // The essence of the floor routine
157  __m128 d = _mm_sub_ps( _mm_add_ps( _mm_add_ps( _mm_sub_ps( v, twoTo23 ), twoTo23 ), twoTo23 ), twoTo23 );
158  // set mask to all 1s if v >= 2**23
159  __m128 largeMaskE = _mm_cmpgt_ps( b, twoTo23 );
160  // Check for possible off by one error
161  __m128 g = _mm_cmplt_ps( v, d );
162  // Convert positive check result to -1.0, negative to 0.0
163  __m128 h = _mm_cvtepi32_ps( _mm_castps_si128(g) );
164  // Add in the error if there is one
165  __m128 t = _mm_add_ps( d, h );
166  // Select between output result and input value based on v >= 2**23
167  __m128 w = _mm_and_ps( v, largeMaskE );
168  t = _mm_andnot_ps( largeMaskE, t );
169  HWVector vResult;
170  vResult.v = _mm_or_ps( t, w );
171  return vResult;
172  };
173 
174  inline HWVector Minimum( const HWVector &o ) const __attribute__((always_inline)){ HWVector r; r.v = _mm_min_ps( v, o.v ); return r; };
175  inline HWVector Maximum( const HWVector &o ) const __attribute__((always_inline)){ HWVector r; r.v = _mm_max_ps( v, o.v ); return r; };
176  inline HWVector operator +( const HWVector &o ) const __attribute__((always_inline)){ HWVector r; r.v = _mm_add_ps( v, o.v ); return r; };
177  inline HWVector operator -( const HWVector &o ) const __attribute__((always_inline)){ HWVector r; r.v = _mm_sub_ps( v, o.v ); return r; };
178  inline void operator +=( const HWVector &o ) __attribute__((always_inline)){ v = _mm_add_ps( v, o.v ); };
179  inline void operator -=( const HWVector &o ) __attribute__((always_inline)){ v = _mm_sub_ps( v, o.v ); };
180  inline void operator *=( const HWVector &o ) __attribute__((always_inline)){ v = _mm_mul_ps( v, o.v ); };
181  inline void operator /=( const HWVector &o ) __attribute__((always_inline)){ v = _mm_div_ps( v, o.v ); };
182  inline HWVector operator *( const HWVector &o ) const __attribute__((always_inline)){ HWVector r; r.v = _mm_mul_ps( v, o.v ); return r; };
183  inline HWVector operator *( float f ) const __attribute__((always_inline)){ HWVector r; r.v = _mm_mul_ps( v, _mm_set1_ps( f ) ); return r; };
184  inline HWVector operator /( const HWVector &o ) const __attribute__((always_inline)){ HWVector r; r.v = _mm_div_ps( v, o.v ); return r; };
185  inline void operator *=( float f ) __attribute__((always_inline)){ v = _mm_mul_ps( v, _mm_set1_ps(f ) ); };
186  inline void Store( float *p ) __attribute__((always_inline)){ _mm_storeu_ps( p, v ); };
187  inline void Load( float f ) __attribute__((always_inline)) { v = _mm_set_ps1( f ); };
188  inline void StoreNormalAsInt( int *pBuffer ) const __attribute__((always_inline))
189  {
190  //static __declspec(align(16)) float c[4] = { 32766.0f, 32766.0f, 32766.0f, 32766.0f };
191  // instead of the correct value, we use a little bit smaller number, because after
192  // normalization a component can be a littlebit bigger than 1.0. in that case storing it in
193  // a 16 bit integer would overflow, and artifacts on the surface would appear.
194  static const MB_SSE_ALIGN16_VAR(float c[4]) = { 32740.0f, 32740.0f, 32740.0f, 32740.0f };
195 
196  __m128 f = _mm_load_ps( c );
197  f = _mm_mul_ps( f, v );
198  __m128i i = _mm_cvtps_epi32( f );
199  _mm_storeu_si128( (__m128i *)pBuffer, i );
200  };
201  inline void StoreAsInt( int *pBuffer ) const __attribute__((always_inline))
202  {
203  __m128i i = _mm_cvtps_epi32( v );
204  _mm_storeu_si128( (__m128i *)pBuffer, i );
205  };
206  inline void StoreAsIntTruncate( int *pBuffer ) const __attribute__((always_inline))
207  {
208  __m128i i = _mm_cvttps_epi32( v );
209  _mm_storeu_si128( (__m128i *)pBuffer, i );
210  };
211  inline operator float( void ) const __attribute__((always_inline))
212  {
213  MB_SSE_ALIGN16_VAR(float f);
214  _mm_store_ss( &f, v );
215  return f;
216  };
217  inline operator mudbox::Vector( void ) const __attribute__((always_inline))
218  {
220  __m128 t = _mm_shuffle_ps( v, v, _MM_SHUFFLE(2,1,0,3) );
221  _mm_store_ss( &r.x, t );
222  t = _mm_shuffle_ps( t, t, _MM_SHUFFLE(2,1,0,3) );
223  _mm_store_ss( &r.y, t );
224  t = _mm_shuffle_ps( t, t, _MM_SHUFFLE(2,1,0,3) );
225  _mm_store_ss( &r.z, t );
226  return r;
227  };
228  inline operator mudbox::Vector4( void ) const __attribute__((always_inline))
229  {
231  __m128 t = _mm_shuffle_ps( v, v, _MM_SHUFFLE(2,1,0,3) );
232  _mm_store_ss( &r.x, t );
233  t = _mm_shuffle_ps( t, t, _MM_SHUFFLE(2,1,0,3) );
234  _mm_store_ss( &r.y, t );
235  t = _mm_shuffle_ps( t, t, _MM_SHUFFLE(2,1,0,3) );
236  _mm_store_ss( &r.z, t );
237  t = _mm_shuffle_ps( t, t, _MM_SHUFFLE(2,1,0,3) );
238  _mm_store_ss( &r.w, t );
239  return r;
240  };
241 
242  MB_SSE_ALIGN16_VAR(__m128 v);
243 };
244 
246 {
247  __m128 t = _mm_shuffle_ps( r.v, r.v, _MM_SHUFFLE(2,1,0,3) );
248  _mm_store_ss( &v.x, t );
249  t = _mm_shuffle_ps( t, t, _MM_SHUFFLE(2,1,0,3) );
250  _mm_store_ss( &v.y, t );
251  t = _mm_shuffle_ps( t, t, _MM_SHUFFLE(2,1,0,3) );
252  _mm_store_ss( &v.z, t );
253  return v;
254 
256  //__m128 t = _mm_shuffle_ps( r.v, r.v, _MM_SHUFFLE(0,1,2,3) );
257  //int a = ((int *)&v)[3];
258  //_mm_storeu_ps( &v.x, t );
259  //((int *)&v)[3] = a;
260  //return v;
261 };
262 
264 
265 struct HWMatrix
266 {
267  inline HWMatrix( void ) __attribute__((always_inline)){};
268  inline HWMatrix( const mudbox::Matrix &m ) __attribute__((always_inline))
269  {
270  r0 = _mm_loadu_ps( &m._11 );
271  r1 = _mm_loadu_ps( &m._21 );
272  r2 = _mm_loadu_ps( &m._31 );
273  r3 = _mm_loadu_ps( &m._41 );
274  };
275  void MirrorX( void ) __attribute__((always_inline))
276  {
277  r0 = _mm_shuffle_ps( r0, r0, _MM_SHUFFLE(0, 1, 2, 3) );
278  r1 = _mm_shuffle_ps( r1, r1, _MM_SHUFFLE(0, 1, 2, 3) );
279  r2 = _mm_shuffle_ps( r2, r2, _MM_SHUFFLE(0, 1, 2, 3) );
280  r3 = _mm_shuffle_ps( r3, r3, _MM_SHUFFLE(0, 1, 2, 3) );
281  };
282  inline HWVector Transform( const HWVector &v, float w = 1.0f ) const __attribute__((always_inline))
283  {
284  // Represents v * M
285  // v is a row vector.
286 
287  MB_SSE_ALIGN16_VAR(float c[4]) = { w, w, w, w };
288 
289  __m128 v0 = _mm_shuffle_ps( v.v, v.v, _MM_SHUFFLE(3,3,3,3) );
290  __m128 v1 = _mm_shuffle_ps( v.v, v.v, _MM_SHUFFLE(2,2,2,2) );
291  __m128 v2 = _mm_shuffle_ps( v.v, v.v, _MM_SHUFFLE(1,1,1,1) );
292  __m128 v3 = _mm_load_ps( c );
293 
294  __m128 a0 = _mm_mul_ps( v0, r0 );
295  __m128 a1 = _mm_mul_ps( v1, r1 );
296  __m128 a2 = _mm_mul_ps( v2, r2 );
297  __m128 a3 = _mm_mul_ps( v3, r3 );
298 
299  __m128 r = _mm_add_ps( a0, _mm_add_ps( a1, _mm_add_ps( a2, a3 ) ) );
300 
301  HWVector z;
302  z.v = r;
303  return z;
304  };
305  inline HWVector ProjectedTransform( const HWVector &v ) const __attribute__((always_inline))
306  {
307  static MB_SSE_ALIGN16_VAR(float c[4]) = { 1, 1, 1, 1 };
308 
309  __m128 v0 = _mm_shuffle_ps( v.v, v.v, _MM_SHUFFLE(3,3,3,3) );
310  __m128 v1 = _mm_shuffle_ps( v.v, v.v, _MM_SHUFFLE(2,2,2,2) );
311  __m128 v2 = _mm_shuffle_ps( v.v, v.v, _MM_SHUFFLE(1,1,1,1) );
312  __m128 v3 = _mm_load_ps( c );
313 
314  __m128 a0 = _mm_mul_ps( v0, r0 );
315  __m128 a1 = _mm_mul_ps( v1, r1 );
316  __m128 a2 = _mm_mul_ps( v2, r2 );
317  __m128 a3 = _mm_mul_ps( v3, r3 );
318 
319  __m128 r = _mm_add_ps( a0, _mm_add_ps( a1, _mm_add_ps( a2, a3 ) ) );
320  __m128 d = _mm_shuffle_ps( r, r, _MM_SHUFFLE(3,3,3,3) );
321 
322  HWVector z;
323  z.v = _mm_div_ps( r, d );
324  return z;
325  };
326 
327  MB_SSE_ALIGN16_VAR(__m128 r0);
328  MB_SSE_ALIGN16_VAR(__m128 r1);
329  MB_SSE_ALIGN16_VAR(__m128 r2);
330  MB_SSE_ALIGN16_VAR(__m128 r3);
331 };
332 
333 
334 //-----------------------------------------------------------------------------
/// Query if SSE3 is available. Present on Prescott, Core 2 and later.
336 bool MBDLL_DECL hasSSE3();
337 
/// Query if SSE4.1 is available. Present on Penryn (later Core 2).
339 bool MBDLL_DECL hasSSE41();
340 
/// Query if SSE4.2 is available. Present on Nehalem (Core i5, Core i7).
342 bool MBDLL_DECL hasSSE42();
343 
/// Query if AVX is available. Present on Sandy Bridge.
345 bool MBDLL_DECL hasAVX256();
346 
347 //-----------------------------------------------------------------------------
348 
349 #endif
HWVector LengthSquared(void) const __attribute__((always_inline))
Definition: SSE.h:125
GLdouble GLdouble GLdouble r
Definition: GLee.h:1189
float z
Definition: math.h:340
bool MBDLL_DECL hasAVX256()
Query if AVX is available. Present on Sandy Bridge.
const QPoint operator/(const QPoint &p, qreal c)
Definition: qpoint.h:201
GLenum GLint GLint y
Definition: GLee.h:876
bool MBDLL_DECL hasSSE41()
Query if SSE4.1 is available. Present on Penryn (later Core 2)
HWVector(const float a[4]) __attribute__((always_inline))
Definition: SSE.h:87
Represents a 3D vector or point with S23E8 floating point elements.
Definition: math.h:35
HWVector Floor(void) __attribute__((always_inline))
Definition: SSE.h:149
GLdouble GLdouble z
Definition: GLee.h:1393
This class represents a four dimensional vector stored in the SSE registers.
Definition: SSE.h:64
QByteArray & operator+=(QByteArray &a, const QStringBuilder< A, B > &b)
float z
Definition: math.h:632
HWVector Length(void) const __attribute__((always_inline))
Definition: SSE.h:118
GLfloat GLfloat GLfloat v2
Definition: GLee.h:1736
HWMatrix(void) __attribute__((always_inline))
Definition: SSE.h:267
HWVector Minimum(const HWVector &o) const __attribute__((always_inline))
Definition: SSE.h:174
mudbox::Vector & operator<<(mudbox::Vector &v, const HWVector &r)
Definition: SSE.h:245
HWVector(float x, float y, float z, float w=0) __attribute__((always_inline))
Definition: SSE.h:82
float y
Definition: math.h:632
This class represents a four by four matrix stored in the SSE registers.
Definition: SSE.h:265
Q_CORE_EXPORT QBitArray operator&(const QBitArray &, const QBitArray &)
float y
Definition: math.h:340
float DistanceFromLine(const HWVector &vStart, const HWVector &vEnd) const __attribute__((always_inline))
Definition: SSE.h:131
bool MBDLL_DECL hasSSE42()
Query if SSE4.2 is available. Present on Nehalem (Core i5, Core i7)
GLfloat GLfloat v1
Definition: GLee.h:1735
GLubyte g
Definition: GLee.h:5404
float w
Definition: math.h:632
This class represents a 4x4 transformation matrix.
Definition: math.h:1122
HWVector ShiftLeft(void) const __attribute__((always_inline))
Definition: SSE.h:91
HWVector(void) __attribute__((always_inline))
Definition: SSE.h:66
bool MBDLL_DECL hasSSE3()
Query if SSE3 is available. Present on Prescott, Core2 and later.
void ShiftRight(HWVector &result) __attribute__((always_inline))
Definition: SSE.h:95
GLenum GLint x
Definition: GLee.h:876
A four dimensional vector (X, Y, Z, and W)
Definition: math.h:617
const QByteArray operator+(const QByteArray &a1, const QByteArray &a2)
Definition: qbytearray.h:564
GLfloat v0
Definition: GLee.h:1734
GLenum GLsizei n
Definition: GLee.h:3432
HWVector(const mudbox::Vector &h) __attribute__((always_inline))
Definition: SSE.h:78
const GLdouble * v
Definition: GLee.h:1174
HWVector ProjectedTransform(const HWVector &v) const __attribute__((always_inline))
Definition: SSE.h:305
GLubyte GLubyte b
Definition: GLee.h:5404
void Normalize(void) __attribute__((always_inline))
Definition: SSE.h:142
void ShiftLeft(HWVector &result) __attribute__((always_inline))
Definition: SSE.h:92
GLfloat GLfloat p
Definition: GLee.h:5416
MBDLL_DECL Vector operator*(float f, const Vector &v)
Multiplies a float scalar value by a vector, the result is a vector.
Definition: math.h:575
const GLubyte * c
Definition: GLee.h:5419
int int int int int int h
Definition: GLee.h:10534
HWVector(float f) __attribute__((always_inline))
Definition: SSE.h:73
HWVector Maximum(const HWVector &o) const __attribute__((always_inline))
Definition: SSE.h:175
void setZero(void) __attribute__((always_inline))
Definition: SSE.h:68
GLubyte GLubyte GLubyte a
Definition: GLee.h:5404
float x
Definition: math.h:632
HWVector Transform(const HWVector &v, float w=1.0f) const __attribute__((always_inline))
Definition: SSE.h:282
const QPoint operator-(const QPoint &p1, const QPoint &p2)
Definition: qpoint.h:170
void Store(float *p) __attribute__((always_inline))
Definition: SSE.h:186
void Load(float f) __attribute__((always_inline))
Definition: SSE.h:187
void MirrorX(void) __attribute__((always_inline))
Definition: SSE.h:275
MB_SSE_ALIGN16_VAR(__m128 r0)
void StoreAsIntTruncate(int *pBuffer) const __attribute__((always_inline))
Definition: SSE.h:206
Q_CORE_EXPORT QBitArray operator|(const QBitArray &, const QBitArray &)
GLubyte GLubyte GLubyte GLubyte w
Definition: GLee.h:1775
HWVector ShiftRight(void) const __attribute__((always_inline))
Definition: SSE.h:94
GLdouble GLdouble t
Definition: GLee.h:1181
GLfloat GLfloat GLfloat GLfloat v3
Definition: GLee.h:1737
float x
Definition: math.h:340
GLclampf f
Definition: GLee.h:9303
void ShiftLeftInPlace() __attribute__((always_inline))
Definition: SSE.h:93
#define MBDLL_DECL
Definition: dllinterface.h:35
HWMatrix(const mudbox::Matrix &m) __attribute__((always_inline))
Definition: SSE.h:268
void StoreNormalAsInt(int *pBuffer) const __attribute__((always_inline))
Definition: SSE.h:188
void StoreAsInt(int *pBuffer) const __attribute__((always_inline))
Definition: SSE.h:201