26 #include <xmmintrin.h> 27 #include <pmmintrin.h> 39 _NET(_mm_storeu_ps(x,_p[0]);
printf(
"%e %e %e %e ",x[0],x[1],x[2],x[3]);,
40 _mm_storeu_ps(x,_p[1]);
printf(
"%e %e %e %e ",x[0],x[1],x[2],x[3]);)
45 _NET(_p[0] = _mm_setzero_ps();,
46 _p[1] = _mm_setzero_ps();)
51 _NET(_p[0] = _mm_load1_ps(&a);,
52 _p[1] = _mm_load1_ps(&a);)
57 __m128 _b = _mm_load1_ps(&b);
58 _NET(_a[0] = _mm_mul_ps(_a[0],_b);,
59 _a[1] = _mm_mul_ps(_a[1],_b);)
63 _NET(_a[0] = _mm_mul_ps(_a[0],_b[0]);,
64 _a[1] = _mm_mul_ps(_a[1],_b[1]);)
67 static inline float _sse_mul_ps(__m128* _a, __m128* _b, __m128* _o) {
70 _NET(_o[0] = _mm_mul_ps(_a[0],_b[0]);_mm_storeu_ps(x,_o[0]); out = XSUM(x);,
71 _o[1] = _mm_mul_ps(_a[1],_b[1]);_mm_storeu_ps(x,_o[1]); out+= YSUM(x);)
76 float c[4]; _mm_storeu_ps(c,_c);
78 _NET(_c=_mm_load1_ps( c );*_a=_mm_mul_ps(*_a,_c);_a++;, *_a=_mm_mul_ps(*_a,_c);_a++;)
79 _NET(_c=_mm_load1_ps(c+1);*_a=_mm_mul_ps(*_a,_c);_a++;, *_a=_mm_mul_ps(*_a,_c);_a++;)
80 _NET(_c=_mm_load1_ps(c+2);*_a=_mm_mul_ps(*_a,_c);_a++;, *_a=_mm_mul_ps(*_a,_c);_a++;)
81 _NET(_c=_mm_load1_ps(c+3);*_a=_mm_mul_ps(*_a,_c);_a++;, *_a=_mm_mul_ps(*_a,_c);_a++;)
84 static inline void _sse_hard4_ps(__m128* _uu, __m128* _am, __m128* _AM, __m128 _c) {
89 float c[4]; _mm_storeu_ps(c,_c);
95 _r = _mm_set1_ps(c[0]); _R = _mm_set1_ps(1-c[0]);
96 *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;,
97 *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;
100 _r = _mm_set1_ps(c[1]); _R = _mm_set1_ps(1-c[1]);
101 *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;,
102 *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;
105 _r = _mm_set1_ps(c[2]); _R = _mm_set1_ps(1-c[2]);
106 *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;,
107 *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;
110 _r = _mm_set1_ps(c[3]); _R = _mm_set1_ps(1-c[3]);
111 *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;,
112 *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;
118 float c[4]; _mm_storeu_ps(c,_c);
122 _NET(_1 = _mm_set1_ps(c[0]); _0 = _mm_set1_ps(1-c[0]);
123 *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;,
124 *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;)
125 _NET(_1 = _mm_set1_ps(c[1]); _0 = _mm_set1_ps(1-c[1]);
126 *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;,
127 *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;)
128 _NET(_1 = _mm_set1_ps(c[2]); _0 = _mm_set1_ps(1-c[2]);
129 *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;,
130 *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;)
131 _NET(_1 = _mm_set1_ps(c[3]); _0 = _mm_set1_ps(1-c[3]);
132 *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;,
133 *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;)
141 _mm_storeu_ps(x,_mm_mul_ps(_a[0],_a[0])); out = XSUM(x);,
142 _mm_storeu_ps(x,_mm_mul_ps(_a[1],_a[1])); out+= YSUM(x);
151 _mm_storeu_ps(x,_mm_add_ps(_mm_mul_ps(_a[0],_a[0]),_mm_mul_ps(_A[0],_A[0]))); out = XSUM(x);,
152 _mm_storeu_ps(x,_mm_add_ps(_mm_mul_ps(_a[1],_a[1]),_mm_mul_ps(_A[1],_A[1]))); out+= YSUM(x);
163 _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[0] = XSUM(x);,
164 _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[0]+= YSUM(x);
167 _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[1] = XSUM(x);,
168 _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[1]+= YSUM(x);
171 _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[2] = XSUM(x);,
172 _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[2]+= YSUM(x);
175 _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[3] = XSUM(x);,
176 _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[3]+= YSUM(x);
178 return _mm_load_ps(o);
185 _mm_set1_ps(1.
e-24))));
195 _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[0] = XSUM(x)+1.e-24;,
196 _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[0]+= YSUM(x);
199 _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[1] = XSUM(x)+1.e-24;,
200 _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[1]+= YSUM(x);
203 _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[2] = XSUM(x)+1.e-24;,
204 _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[2]+= YSUM(x);
207 _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[3] = XSUM(x)+1.e-24;,
208 _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[3]+= YSUM(x);
210 return _mm_div_ps(_mm_set1_ps(1.),_mm_sqrt_ps(_mm_load_ps(o)));
217 _mm_storeu_ps(x,_mm_mul_ps(_a[0],_b[0])); out = XSUM(x);,
218 _mm_storeu_ps(x,_mm_mul_ps(_a[1],_b[1])); out+= YSUM(x);
226 __m128* _o = (__m128*) o;
230 _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[0] = XSUM(x);,
231 _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[0]+= YSUM(x);
234 _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[1] = XSUM(x);,
235 _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[1]+= YSUM(x);
238 _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[2] = XSUM(x);,
239 _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[2]+= YSUM(x);
242 _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[3] = XSUM(x);,
243 _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[3]+= YSUM(x);
250 _NET(_a[0] = _mm_add_ps(_a[0],_b[0]);,
251 _a[1] = _mm_add_ps(_a[1],_b[1]);)
255 static inline void _sse_add_ps(__m128* _a, __m128* _b, __m128 _c) {
257 _NET(_a[0] = _mm_add_ps(_a[0],_mm_mul_ps(_b[0],_c));,
258 _a[1] = _mm_add_ps(_a[1],_mm_mul_ps(_b[1],_c));)
267 float c[4]; _mm_storeu_ps(c,_c);
270 _NET(*_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps( c ))); _p++;,
271 *_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps( c ))); _p++;)
272 _NET(*_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+1))); _p++;,
273 *_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+1))); _p++;)
274 _NET(*_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+2))); _p++;,
275 *_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+2))); _p++;)
276 _NET(*_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+3))); _p++;,
277 *_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+3))); _p++;)
283 _NET(_a[0] = _mm_sub_ps(_a[0],_b[0]);,
284 _a[1] = _mm_sub_ps(_a[1],_b[1]);)
293 float c[4]; _mm_storeu_ps(c,_c);
296 _NET(*_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps( c ))); _p++;,
297 *_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps( c ))); _p++;)
298 _NET(*_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+1))); _p++;,
299 *_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+1))); _p++;)
300 _NET(*_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+2))); _p++;,
301 *_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+2))); _p++;)
302 _NET(*_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+3))); _p++;,
303 *_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+3))); _p++;)
308 _NET(_mm_storeu_ps(a,*_p);,
309 _mm_storeu_ps(a+4,*(_p+1));)
314 _NET(*_a = *_p;, *(_a+1) = *(_p+1);)
320 _NET(*_a++ = *_p++;, *_a++ = *_p++;)
321 _NET(*_a++ = *_p++;, *_a++ = *_p++;)
322 _NET(*_a++ = *_p++;, *_a++ = *_p++;)
323 _NET(*_a++ = *_p++;, *_a++ = *_p++;)
328 __m128 _b = _mm_load1_ps(&b);
329 _NET(_mm_storeu_ps(a,_mm_mul_ps(*_p,_b));,
330 _mm_storeu_ps(a+4,_mm_mul_ps(*(_p+1),_b));)
339 _NET(_mm_storeu_ps(a,*_p++); a+=4;,
340 _mm_storeu_ps(a,*_p++); a+=4;)
341 _NET(_mm_storeu_ps(a,*_p++); a+=4;,
342 _mm_storeu_ps(a,*_p++); a+=4;)
343 _NET(_mm_storeu_ps(a,*_p++); a+=4;,
344 _mm_storeu_ps(a,*_p++); a+=4;)
345 _NET(_mm_storeu_ps(a,*_p++); a+=4;,
346 _mm_storeu_ps(a,*_p++); a+=4;)
357 float c[4]; _mm_storeu_ps(c,_c);
361 _NET(*_a++ = _mm_mul_ps(*_p++,_mm_load1_ps( c ));,
362 *_a++ = _mm_mul_ps(*_p++,_mm_load1_ps( c ));)
363 _NET(*_a++ = _mm_mul_ps(*_p++,_mm_load1_ps(c+1));,
364 *_a++ = _mm_mul_ps(*_p++,_mm_load1_ps(c+1));)
365 _NET(*_a++ = _mm_mul_ps(*_p++,_mm_load1_ps(c+2));,
366 *_a++ = _mm_mul_ps(*_p++,_mm_load1_ps(c+2));)
367 _NET(*_a++ = _mm_mul_ps(*_p++,_mm_load1_ps(c+3));,
368 *_a++ = _mm_mul_ps(*_p++,_mm_load1_ps(c+3));)
372 static inline float _sse_nrg_ps(__m128* _u,
float c, __m128* _v,
float s, __m128* _a) {
377 __m128 _c = _mm_load1_ps(&c);
378 __m128 _s = _mm_load1_ps(&s);
380 _NET(_b = _mm_sub_ps(_a[0], _mm_add_ps(_mm_mul_ps(*_u,_c), _mm_mul_ps(*_v,_s)));
381 _mm_storeu_ps(x,_mm_mul_ps(_b,_b)); out = XSUM(x);,
382 _b = _mm_sub_ps(_a[1], _mm_add_ps(_mm_mul_ps(*(_u+1),_c), _mm_mul_ps(*(_v+1), _s)));
383 _mm_storeu_ps(x,_mm_mul_ps(_b,_b)); out+= YSUM(x);)
390 __m128 _c = _mm_load1_ps(&c);
391 __m128 _s = _mm_load1_ps(&s);
393 _a[0] = _mm_add_ps(_a[0], _mm_add_ps(_mm_mul_ps(_u[0],_c), _mm_mul_ps(_v[0],_s)));,
394 _a[1] = _mm_add_ps(_a[1], _mm_add_ps(_mm_mul_ps(_u[1],_c), _mm_mul_ps(_v[1],_s)));
403 __m128 _c = _mm_load1_ps(&c);
404 __m128 _s = _mm_load1_ps(&s);
407 _a[0] = _mm_sub_ps(_a[0], _mm_add_ps(_mm_mul_ps(_u[0],_c), _mm_mul_ps(_v[0],_s)));
408 _mm_storeu_ps(x,_mm_mul_ps(_a[0],_a[0])); out = XSUM(x);,
409 _a[1] = _mm_sub_ps(_a[1], _mm_add_ps(_mm_mul_ps(_u[1],_c), _mm_mul_ps(_v[1], _s)));
410 _mm_storeu_ps(x,_mm_mul_ps(_a[1],_a[1])); out+= YSUM(x);
416 static inline void _sse_rotp_ps(__m128* u,
float*
c, __m128*
v,
float*
s, __m128* a) {
419 a[0] = _mm_add_ps(_mm_mul_ps(u[0],_mm_load1_ps(c)), _mm_mul_ps(v[0],_mm_load1_ps(s)));,
420 a[1] = _mm_add_ps(_mm_mul_ps(u[1],_mm_load1_ps(c)), _mm_mul_ps(v[1],_mm_load1_ps(s)));
424 static inline void _sse_rotm_ps(__m128* u,
float*
c, __m128*
v,
float*
s, __m128* a) {
427 a[0] = _mm_sub_ps(_mm_mul_ps(u[0],_mm_load1_ps(c)), _mm_mul_ps(v[0],_mm_load1_ps(s)));,
428 a[1] = _mm_sub_ps(_mm_mul_ps(u[1],_mm_load1_ps(c)), _mm_mul_ps(v[1],_mm_load1_ps(s)));
432 static inline __m128
_sse_rotp_ps(__m128 _u, __m128 _c, __m128 _v, __m128 _s) {
434 return _mm_add_ps(_mm_mul_ps(_u,_c), _mm_mul_ps(_v,_s));
437 static inline __m128
_sse_rotm_ps(__m128 _u, __m128 _c, __m128 _v, __m128 _s) {
439 return _mm_sub_ps(_mm_mul_ps(_u,_c), _mm_mul_ps(_v,_s));
442 static inline void _sse_rot4p_ps(__m128* _u, __m128* _c, __m128* _v, __m128* _s, __m128* _a) {
449 _mm_storeu_ps(c,*_c);
450 _mm_storeu_ps(s,*_s);
455 *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps( c )), _mm_mul_ps(*v++,_mm_load1_ps( s )));,
456 *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps( c )), _mm_mul_ps(*v++,_mm_load1_ps( s )));
459 *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+1)), _mm_mul_ps(*v++,_mm_load1_ps(s+1)));,
460 *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+1)), _mm_mul_ps(*v++,_mm_load1_ps(s+1)));
463 *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+2)), _mm_mul_ps(*v++,_mm_load1_ps(s+2)));,
464 *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+2)), _mm_mul_ps(*v++,_mm_load1_ps(s+2)));
467 *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+3)), _mm_mul_ps(*v++,_mm_load1_ps(s+3)));,
468 *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+3)), _mm_mul_ps(*v++,_mm_load1_ps(s+3)));
472 static inline void _sse_rot4m_ps(__m128* _u, __m128* _c, __m128* _v, __m128* _s, __m128* _a) {
479 _mm_storeu_ps(c,*_c);
480 _mm_storeu_ps(s,*_s);
485 *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps( c )), _mm_mul_ps(*v++,_mm_load1_ps( s )));,
486 *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps( c )), _mm_mul_ps(*v++,_mm_load1_ps( s )));
489 *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+1)), _mm_mul_ps(*v++,_mm_load1_ps(s+1)));,
490 *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+1)), _mm_mul_ps(*v++,_mm_load1_ps(s+1)));
493 *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+2)), _mm_mul_ps(*v++,_mm_load1_ps(s+2)));,
494 *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+2)), _mm_mul_ps(*v++,_mm_load1_ps(s+2)));
497 *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+3)), _mm_mul_ps(*v++,_mm_load1_ps(s+3)));,
498 *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+3)), _mm_mul_ps(*v++,_mm_load1_ps(s+3)));
504 NETX(_p[0] = (__m128*) (p[0] + m[0][l]*n);,
505 _p[1] = (__m128*) (p[1] + m[1][l]*n);,
506 _p[2] = (__m128*) (p[2] + m[2][l]*n);,
507 _p[3] = (__m128*) (p[3] + m[3][l]*n);,
508 _p[4] = (__m128*) (p[4] + m[4][l]*n);,
509 _p[5] = (__m128*) (p[5] + m[5][l]*n);,
510 _p[6] = (__m128*) (p[6] + m[6][l]*n);,
511 _p[7] = (__m128*) (p[7] + m[7][l]*n);)
516 __m128 _q = _mm_setzero_ps();
517 NETX(_q = _mm_add_ps(_q, *_p[0]);,
518 _q = _mm_add_ps(_q, *_p[1]);,
519 _q = _mm_add_ps(_q, *_p[2]);,
520 _q = _mm_add_ps(_q, *_p[3]);,
521 _q = _mm_add_ps(_q, *_p[4]);,
522 _q = _mm_add_ps(_q, *_p[5]);,
523 _q = _mm_add_ps(_q, *_p[6]);,
524 _q = _mm_add_ps(_q, *_p[7]);)
528 static inline __m128
_sse_cut_ps(__m128* _pE, __m128** _pe, __m128 _Es, __m128 _cmp) {
529 NETX(_cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[0]++),_Es));,
530 _cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[1]++),_Es));,
531 _cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[2]++),_Es));,
532 _cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[3]++),_Es));,
533 _cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[4]++),_Es));,
534 _cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[5]++),_Es));,
535 _cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[6]++),_Es));,
536 _cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[7]++),_Es));)
543 NETX(*_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[0]++));,
544 *_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[1]++));,
545 *_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[2]++));,
546 *_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[3]++));,
547 *_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[4]++));,
548 *_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[5]++));,
549 *_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[6]++));,
550 *_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[7]++));
560 __m128 _o2 = _mm_setzero_ps();
562 _o1 = _mm_add_ps(_mm_mul_ps(_a[0],_a[0]),_mm_mul_ps(_A[0],_A[0]));,
563 _o2 = _mm_add_ps(_mm_mul_ps(_a[1],_a[1]),_mm_mul_ps(_A[1],_A[1]));
565 _o1 = _mm_max_ps(_o1,_o2); _mm_storeu_ps(x,_o1); out=x[0];
566 if(out<x[1]) out=x[1];
567 if(out<x[2]) out=x[2];
568 if(out<x[3]) out=x[3];
572 static inline void _sse_ort4_ps(__m128* _u, __m128* _v, __m128* _s, __m128* _c) {
576 static const __m128
sm = _mm_set1_ps(-0.
f);
577 static const __m128 _o = _mm_set1_ps(1.
e-24);
578 static const __m128 _0 = _mm_set1_ps(0.);
579 static const __m128 _1 = _mm_set1_ps(1.);
580 static const __m128 _2 = _mm_set1_ps(2.);
581 __m128 _n,_m,gI,gR,_p,_q;
584 _p = _mm_and_ps(_mm_cmpge_ps(gR,_0),_1);
585 _q = _mm_sub_ps(_1,_p);
586 _n = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(gI,gI),_mm_mul_ps(gR,gR)));
587 gR = _mm_add_ps(_mm_andnot_ps(sm,gR),_mm_add_ps(_n,_o));
588 _n = _mm_add_ps(_mm_mul_ps(_2,_n),_o);
589 gI = _mm_div_ps(gI,_n);
590 _n = _mm_sqrt_ps(_mm_div_ps(gR,_n));
591 _m = _mm_and_ps(_mm_cmpge_ps(gI,_0),_1);
592 _m = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_m,_2),_1),_n);
593 *_s = _mm_add_ps(_mm_mul_ps(_q,_m),_mm_mul_ps(_p,_mm_div_ps(gI,_n)));
594 gI = _mm_andnot_ps(sm,gI);
595 *_c = _mm_add_ps(_mm_mul_ps(_p,_n),_mm_mul_ps(_q,_mm_div_ps(gI,_n)));
610 static const __m128
sm = _mm_set1_ps(-0.
f);
611 static const __m128 _0 = _mm_set1_ps(0.);
612 static const __m128 _5 = _mm_set1_ps(0.5);
613 static const __m128 _1 = _mm_set1_ps(1.);
614 static const __m128 _2 = _mm_set1_ps(2.);
615 __m128 _n,_m,_C,_S,_p,_q;
616 _r = _mm_mul_ps(_mm_add_ps(_1,_r),_mm_set1_ps(1.
e-6));
617 _m = _mm_and_ps(_mm_cmpge_ps(*_s,_0),_1);
618 _m = _mm_sub_ps(_mm_mul_ps(_m,_2),_1);
619 _p = _mm_and_ps(_mm_cmpge_ps(*_c,_0),_1);
620 _q = _mm_sub_ps(_1,_p);
621 _C = _mm_add_ps(_mm_andnot_ps(sm,*_c),_r);
622 _n = _mm_add_ps(_mm_mul_ps(*_s,*_s),_mm_mul_ps(_C,_C));
623 _n = _mm_div_ps(_1,_mm_mul_ps(_mm_sqrt_ps(_n),_2));
624 _C = _mm_sqrt_ps(_mm_add_ps(_5,_mm_mul_ps(_C,_n)));
625 _S = _mm_div_ps(_mm_mul_ps(*_s,_n),_C);
626 *_s = _mm_add_ps(_mm_mul_ps(_p,_S),_mm_mul_ps(_q,_mm_mul_ps(_C,_m)));
627 *_c = _mm_add_ps(_mm_mul_ps(_p,_C),_mm_mul_ps(_q,_mm_mul_ps(_S,_m)));
631 static inline void _sse_dpf4_ps(__m128* _Fp, __m128* _Fx, __m128* _fp, __m128* _fx) {
641 static inline void _sse_pnp4_ps(__m128* _fp, __m128* _fx, __m128* _am, __m128* _AM, __m128* _u, __m128* _v) {
648 static const __m128 _o = _mm_set1_ps(1.
e-24);
649 static const __m128 _1 = _mm_set1_ps(1.0);
650 __m128 gp = _mm_div_ps(_1,_mm_add_ps(
_sse_dot4_ps(_fp,_fp),_o));
662 static inline void _sse_dsp4_ps(__m128* u, __m128* v, __m128* _am, __m128* _AM, __m128* _u, __m128* _v) {
687 _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[0] = XSUM(x);,
688 _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[0]+= YSUM(x);
691 _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[1] = XSUM(x);,
692 _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[1]+= YSUM(x);
695 _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[2] = XSUM(x);,
696 _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[2]+= YSUM(x);
699 _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[3] = XSUM(x);,
700 _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[3]+= YSUM(x);
702 return _mm_div_ps(_mm_load_ps(o),_mm_add_ps(_L,_mm_set1_ps(1.
e-12)));
716 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0] = XSUM(x);,
717 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0]+= YSUM(x);
720 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1] = XSUM(x);,
721 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1]+= YSUM(x);
724 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2] = XSUM(x);,
725 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2]+= YSUM(x);
728 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3] = XSUM(x);,
729 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3]+= YSUM(x);
731 return _mm_div_ps(_mm_load_ps(o),_mm_add_ps(_L,_mm_set1_ps(1.
e-12)));
745 _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[0] = XSUM(x);,
746 _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[0]+= YSUM(x);
749 _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[1] = XSUM(x);,
750 _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[1]+= YSUM(x);
753 _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[2] = XSUM(x);,
754 _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[2]+= YSUM(x);
757 _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[3] = XSUM(x);,
758 _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[3]+= YSUM(x);
760 return _mm_div_ps(_mm_load_ps(o),_mm_add_ps(_L,_mm_set1_ps(1.
e-12)));
767 __m128 _sm = _mm_set1_ps(-0.
f);
772 _pi = _mm_sub_ps(_pi,_qi);
773 _pe = _mm_sub_ps(_mm_sub_ps(_pe,_qe),_pi);
774 return _mm_add_ps(_mm_andnot_ps(_sm,_pi),_mm_andnot_ps(_sm,_pe));
791 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0] = XSUM(x);,
792 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0]+= YSUM(x);
795 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1] = XSUM(x);,
796 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1]+= YSUM(x);
799 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2] = XSUM(x);,
800 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2]+= YSUM(x);
803 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3] = XSUM(x);,
804 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3]+= YSUM(x);
806 return _mm_sub_ps(_L,_mm_div_ps(_mm_load_ps(o),_mm_add_ps(_L,_mm_set1_ps(1.
e-12))));
819 _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0] = XSUM(x);,
820 _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0]+= YSUM(x);
823 _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1] = XSUM(x);,
824 _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1]+= YSUM(x);
827 _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2] = XSUM(x);,
828 _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2]+= YSUM(x);
831 _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3] = XSUM(x);,
832 _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3]+= YSUM(x);
834 _c = _mm_add_ps(_mm_mul_ps(_L,_L),_mm_set1_ps(1.
e-16));
835 return _mm_div_ps(_mm_load_ps(o),_c);
850 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0] = XSUM(x);,
851 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0]+= YSUM(x);
854 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1] = XSUM(x);,
855 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1]+= YSUM(x);
858 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2] = XSUM(x);,
859 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2]+= YSUM(x);
862 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3] = XSUM(x);,
863 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3]+= YSUM(x);
867 _c = _mm_add_ps(_mm_set1_ps(1.
e-12),_mm_mul_ps(_c,_c));
868 _c = _mm_div_ps(_mm_load_ps(o),_c);
870 return _mm_mul_ps(
_sse_abs4_ps(_q),_mm_sub_ps(_mm_set1_ps(1.),_c));
873 static inline __m128
_sse_ed4_ps(__m128* _p, __m128* _q, __m128 _L) {
884 _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
885 _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[0] = XSUM(x);,
886 _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
887 _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[0]+= YSUM(x);
890 _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
891 _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[1] = XSUM(x);,
892 _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
893 _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[1]+= YSUM(x);
896 _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
897 _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[2] = XSUM(x);,
898 _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
899 _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[2]+= YSUM(x);
902 _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
903 _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[3] = XSUM(x);,
904 _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
905 _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[3]+= YSUM(x);
907 _aa = _mm_mul_ps(_mm_load_ps(o),_mm_set1_ps(0.5));
908 return _mm_div_ps(_aa,_mm_add_ps(_L,_mm_set1_ps(1.
e-12)));
922 _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
923 _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[0] = XSUM(x);,
924 _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
925 _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[0]+= YSUM(x);
928 _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
929 _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[1] = XSUM(x);,
930 _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
931 _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[1]+= YSUM(x);
934 _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
935 _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[2] = XSUM(x);,
936 _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
937 _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[2]+= YSUM(x);
940 _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
941 _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[3] = XSUM(x);,
942 _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
943 _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[3]+= YSUM(x);
946 _aa = _mm_add_ps(_mm_set1_ps(1.
e-12),_mm_mul_ps(_aa,_aa));
949 return _mm_mul_ps(_aa,_mm_mul_ps(_mm_load_ps(o),_mm_set1_ps(0.5)));
963 _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
964 _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
965 _a++; _b++; o[0] = XSUM(x);,
966 _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
967 _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
968 _a++; _b++; o[0]+= YSUM(x);
971 _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
972 _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
973 _a++; _b++; o[1] = XSUM(x);,
974 _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
975 _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
976 _a++; _b++; o[1]+= YSUM(x);
979 _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
980 _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
981 _a++; _b++; o[2] = XSUM(x);,
982 _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
983 _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
984 _a++; _b++; o[2]+= YSUM(x);
987 _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
988 _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
989 _a++; _b++; o[3] = XSUM(x);,
990 _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
991 _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
992 _a++; _b++; o[3]+= YSUM(x);
994 _aa = _mm_mul_ps(_mm_load_ps(o),_mm_set1_ps(2.));
995 return _mm_div_ps(_aa,_mm_add_ps(_L,_mm_set1_ps(1.
e-12)));
1004 xx = _mm_add_ps(_mm_mul_ps(xx,xx),_mm_mul_ps(XX,XX));
1005 return _mm_div_ps(xx,_mm_add_ps(
_sse_dot4_ps(_f,_f),_mm_set1_ps(1.
e-12)));
1008 static inline __m128
_sse_like4_ps(__m128* fp, __m128* fx, __m128* am, __m128* AM, __m128 _D) {
1017 __m128 gp = _mm_add_ps(
_sse_dot4_ps(fp,fp),_mm_set1_ps(1.
e-12));
1019 xp = _mm_add_ps(_mm_mul_ps(xp,xp),_mm_mul_ps(XP,XP));
1020 xx = _mm_add_ps(_mm_mul_ps(xx,xx),_mm_mul_ps(XX,XX));
1021 return _mm_add_ps(_mm_div_ps(xp,gp),_mm_div_ps(xx,gx));
1024 static inline __m128
_sse_like4_ps(__m128* fp, __m128* fx, __m128* am, __m128* AM) {
1032 __m128 gp = _mm_add_ps(
_sse_dot4_ps(fp,fp),_mm_set1_ps(1.
e-12));
1034 xp = _mm_add_ps(_mm_mul_ps(xp,xp),_mm_mul_ps(XP,XP));
1035 xx = _mm_add_ps(_mm_mul_ps(xx,xx),_mm_mul_ps(XX,XX));
1036 return _mm_add_ps(_mm_div_ps(xp,gp),_mm_div_ps(xx,gx));
1039 static inline __m128
_sse_like4w_ps(__m128* fp, __m128* fx, __m128* am, __m128* AM) {
1047 __m128 gp = _mm_add_ps(
_sse_dot4_ps(fp,fp),_mm_set1_ps(1.
e-9));
1048 xp = _mm_add_ps(_mm_mul_ps(xp,xp),_mm_mul_ps(XP,XP));
1049 xx = _mm_add_ps(_mm_mul_ps(xx,xx),_mm_mul_ps(XX,XX));
1050 return _mm_div_ps(_mm_add_ps(xp,xx),gp);
1058 static inline __m128
_sse_reg4x_ps(__m128 _L, __m128* fx, __m128* am, __m128* AM, __m128 _D) {
1065 static const __m128 _o = _mm_set1_ps(1.
e-12);
1069 xx = _mm_add_ps(_mm_mul_ps(xx,xx),_mm_mul_ps(XX,XX));
1070 xx = _mm_div_ps(_mm_mul_ps(xx,_D),_mm_mul_ps(_L,FF));
1071 return _mm_div_ps(_mm_sub_ps(FF,xx),_mm_add_ps(FF,_D));
1082 _ll = _mm_add_ps(_mm_add_ps(_ll,_LL),_mm_set1_ps(1.
e-12));
1083 return _mm_div_ps(_mm_add_ps(_ei,_EI),_ll);
1086 static inline void _sse_pol4_ps(__m128* _fp, __m128* _fx, __m128* _v,
double*
r,
double* a) {
1094 __m128 _oo = _mm_set1_ps(1.
e-12);
1095 float rpol[4],cpol[4],spol[4];
1098 _cc = _mm_add_ps(_mm_sqrt_ps(_cc),_oo);
1100 _mm_storeu_ps(cpol,_cc);
1103 _ss = _mm_add_ps(_mm_sqrt_ps(_ss),_oo);
1105 _mm_storeu_ps(spol,_ss);
1109 for(
int n=0;n<4;n++) {
1110 r[
n] = sqrt(rpol[n]);
1111 a[
n] = atan2(spol[n],cpol[n]);
static float _sse_abs_ps(__m128 *_a)
static float _sse_dot_ps(__m128 *_a, __m128 *_b)
static void _sse_hard4_ps(__m128 *_uu, __m128 *_am, __m128 *_AM, __m128 _c)
static __m128 _sse_rnorm4_ps(__m128 *_p)
static __m128 _sse_reg4x_ps(__m128 _L, __m128 *fx, __m128 *am, __m128 *AM, __m128 _D)
wavearray< double > a(hp.size())
static void _sse_add4_ps(__m128 *_a, __m128 *_b, __m128 _c)
static __m128 _sse_abs4_ps(__m128 *_p)
static void _sse_zero_ps(__m128 *_p)
static __m128 _sse_ed4_ps(__m128 *_p, __m128 *_q, __m128 _L)
static void _sse_dsp4_ps(__m128 *u, __m128 *v, __m128 *_am, __m128 *_AM, __m128 *_u, __m128 *_v)
static __m128 _sse_ei4xu_ps(__m128 *_x, __m128 *_u, __m128 _L)
static __m128 _sse_dot4_ps(__m128 *_p, __m128 *_q)
cout<< endl;cout<< "ts size = "<< ts.size()<< " ts rate = "<< ts.rate()<< endl;tf.Forward(ts, wdm);int levels=tf.getLevel();cout<< "tf size = "<< tf.size()<< endl;double dF=tf.resolution();double dT=1./(2 *dF);cout<< "rate(hz) : "<< RATE<< "\ layers : "<< nLAYERS<< "\ dF(hz) : "<< dF<< "\ dT(ms) : "<< dT *1000.<< endl;int itime=TIME_PIXEL_INDEX;int ifreq=FREQ_PIXEL_INDEX;int index=(levels+1) *itime+ifreq;double time=itime *dT;double freq=(ifreq >0) ? ifreq *dF :dF/4;cout<< endl;cout<< "PIXEL TIME = "<< time<< " sec "<< endl;cout<< "PIXEL FREQ = "<< freq<< " Hz "<< endl;cout<< endl;wavearray< double > x
static void _sse_cpf4_ps(__m128 *_aa, __m128 *_pp)
static float _sse_rotsub_ps(__m128 *_u, float c, __m128 *_v, float s, __m128 *_a)
static __m128 _sse_nind4_ps(__m128 *_am, __m128 *_AM)
static void _sse_cpf_ps(float *a, __m128 *_p)
static void _sse_rotm_ps(__m128 *u, float *c, __m128 *v, float *s, __m128 *a)
static void _sse_rotadd_ps(__m128 *_u, float c, __m128 *_v, float s, __m128 *_a)
cout<< "SNR "<< xsnr<< endl;wavearray< double > f
cout<< "Selected Pixels : "<< nPix<< endl;wc.cluster(1, 1);SSeries< double > ss
static float _sse_nrg_ps(__m128 *_u, float c, __m128 *_v, float s, __m128 *_a)
static __m128 _sse_ecoh4_ps(__m128 *_p, __m128 *_q, __m128 _L)
gwavearray< double > * gx
static void _sse_ifcp4_ps(__m128 *_aa, __m128 *_bb, __m128 _c)
static __m128 _sse_ind4_ps(__m128 *_p, __m128 _L)
printf("total live time: non-zero lags = %10.1f \, liveTot)
static void _sse_pol4_ps(__m128 *_fp, __m128 *_fx, __m128 *_v, double *r, double *a)
static __m128 _sse_like4w_ps(__m128 *fp, __m128 *fx, __m128 *am, __m128 *AM)
static void _sse_rot4m_ps(__m128 *_u, __m128 *_c, __m128 *_v, __m128 *_s, __m128 *_a)
static void _sse_load_ps(__m128 *_p, float a)
static void _sse_add_ps(__m128 *_a, __m128 *_b)
static void _sse_mul4_ps(__m128 *_am, __m128 _c)
static __m128 _sse_ei4_ps(__m128 *_u, __m128 _L)
static void _sse_print_ps(__m128 *_p)
static __m128 _sse_ed4i_ps(__m128 *_p, __m128 *_q, __m128 _L)
static __m128 _sse_sum_ps(__m128 **_p)
static void _sse_point_ps(__m128 **_p, float **p, short **m, int l, int n)
static void _sse_mul_ps(__m128 *_a, float b)
static __m128 _sse_div4_ps(__m128 *_v, __m128 *_u)
static __m128 _sse_null4_ps(__m128 *_p, __m128 *_q)
static float _sse_maxE_ps(__m128 *_a, __m128 *_A)
static void _sse_minSNE_ps(__m128 *_pE, __m128 **_pe, __m128 *_es)
static __m128 _sse_like4_ps(__m128 *_f, __m128 *_a, __m128 *_A)
static void _sse_rot4p_ps(__m128 *_u, __m128 *_c, __m128 *_v, __m128 *_s, __m128 *_a)
static void _sse_rotp_ps(__m128 *u, float *c, __m128 *v, float *s, __m128 *a)
static void _sse_sub4_ps(__m128 *_a, __m128 *_b, __m128 _c)
static void _sse_dpf4_ps(__m128 *_Fp, __m128 *_Fx, __m128 *_fp, __m128 *_fx)
static __m128 _sse_cut_ps(__m128 *_pE, __m128 **_pe, __m128 _Es, __m128 _cmp)
static void _sse_pnp4_ps(__m128 *_fp, __m128 *_fx, __m128 *_am, __m128 *_AM, __m128 *_u, __m128 *_v)
static __m128 _sse_ei4xx_ps(__m128 *_x, __m128 *_u, __m128 _L)
static void _sse_ort4_ps(__m128 *_u, __m128 *_v, __m128 *_s, __m128 *_c)
static void _sse_sub_ps(__m128 *_a, __m128 *_b)