26 #include <xmmintrin.h> 27 #include <pmmintrin.h> 28 #include <immintrin.h> 41 for(
int n=0;
n<v.size();
n++) _mm_free(v[
n]);
42 v.clear(); std::vector<float*>().swap(v);
45 float x[4]; _mm_storeu_ps(x,y);
46 return x[0]+x[1]+x[2]+x[3];
50 float** u,
float **
v,
int I) {
52 NETX(__m128* _p0 = (__m128*)p[0]; __m128* _q0 = (__m128*)q[0]; ,
53 __m128* _p1 = (__m128*)p[1]; __m128* _q1 = (__m128*)q[1]; ,
54 __m128* _p2 = (__m128*)p[2]; __m128* _q2 = (__m128*)q[2]; ,
55 __m128* _p3 = (__m128*)p[3]; __m128* _q3 = (__m128*)q[3]; ,
56 __m128* _p4 = (__m128*)p[4]; __m128* _q4 = (__m128*)q[4]; ,
57 __m128* _p5 = (__m128*)p[5]; __m128* _q5 = (__m128*)q[5]; ,
58 __m128* _p6 = (__m128*)p[6]; __m128* _q6 = (__m128*)q[6]; ,
59 __m128* _p7 = (__m128*)p[7]; __m128* _q7 = (__m128*)q[7]; )
61 NETX(__m128* _u0 = (__m128*)u[0]; __m128* _v0 = (__m128*)v[0]; ,
62 __m128* _u1 = (__m128*)u[1]; __m128* _v1 = (__m128*)v[1]; ,
63 __m128* _u2 = (__m128*)u[2]; __m128* _v2 = (__m128*)v[2]; ,
64 __m128* _u3 = (__m128*)u[3]; __m128* _v3 = (__m128*)v[3]; ,
65 __m128* _u4 = (__m128*)u[4]; __m128* _v4 = (__m128*)v[4]; ,
66 __m128* _u5 = (__m128*)u[5]; __m128* _v5 = (__m128*)v[5]; ,
67 __m128* _u6 = (__m128*)u[6]; __m128* _v6 = (__m128*)v[6]; ,
68 __m128* _u7 = (__m128*)u[7]; __m128* _v7 = (__m128*)v[7]; )
70 for(
int i=0;
i<I;
i+=4) {
71 NETX(*_u0++ = *_p0++; *_v0++ = *_q0++; ,
72 *_u1++ = *_p1++; *_v1++ = *_q1++; ,
73 *_u2++ = *_p2++; *_v2++ = *_q2++; ,
74 *_u3++ = *_p3++; *_v3++ = *_q3++; ,
75 *_u4++ = *_p4++; *_v4++ = *_q4++; ,
76 *_u5++ = *_p5++; *_v5++ = *_q5++; ,
77 *_u6++ = *_p6++; *_v6++ = *_q6++; ,
78 *_u7++ = *_p7++; *_v7++ = *_q7++; )
85 std::vector<float*> &pAPN,
86 std::vector<float*> &pAVX,
int I) {
94 __m128* _fp = (__m128*)pAVX[2];
95 __m128* _fx = (__m128*)pAVX[3];
96 __m128* _si = (__m128*)pAVX[4];
97 __m128* _co = (__m128*)pAVX[5];
98 __m128* _ni = (__m128*)pAVX[18];
99 __m128 _ff,_FF,_fF,_AP,_cc,_ss,_nn,_cn,_sn;
100 static const __m128 _0 = _mm_set1_ps(0);
101 static const __m128 _1 = _mm_set1_ps(1);
102 static const __m128 _2 = _mm_set1_ps(2);
103 static const __m128 _o = _mm_set1_ps(0.0001);
104 static const __m128
sm = _mm_set1_ps(-0.
f);
105 __m128 _NI= _mm_setzero_ps();
106 __m128 _NN= _mm_setzero_ps();
108 NETX(__m128* _f0=(__m128*)pAPN[0]; __m128* _F0=(__m128*)(pAPN[0]+I); __m128* _r0=(__m128*)(pAPN[0]+2*I);,
109 __m128* _f1=(__m128*)pAPN[1]; __m128* _F1=(__m128*)(pAPN[1]+I); __m128* _r1=(__m128*)(pAPN[1]+2*I);,
110 __m128* _f2=(__m128*)pAPN[2]; __m128* _F2=(__m128*)(pAPN[2]+I); __m128* _r2=(__m128*)(pAPN[2]+2*I);,
111 __m128* _f3=(__m128*)pAPN[3]; __m128* _F3=(__m128*)(pAPN[3]+I); __m128* _r3=(__m128*)(pAPN[3]+2*I);,
112 __m128* _f4=(__m128*)pAPN[4]; __m128* _F4=(__m128*)(pAPN[4]+I); __m128* _r4=(__m128*)(pAPN[4]+2*I);,
113 __m128* _f5=(__m128*)pAPN[5]; __m128* _F5=(__m128*)(pAPN[5]+I); __m128* _r5=(__m128*)(pAPN[5]+2*I);,
114 __m128* _f6=(__m128*)pAPN[6]; __m128* _F6=(__m128*)(pAPN[6]+I); __m128* _r6=(__m128*)(pAPN[6]+2*I);,
115 __m128* _f7=(__m128*)pAPN[7]; __m128* _F7=(__m128*)(pAPN[7]+I); __m128* _r7=(__m128*)(pAPN[7]+2*I);)
117 NETX(__m128 _Fp0 = _mm_set1_ps(*(Fp[0]+l)); __m128 _Fx0 = _mm_set1_ps(*(Fx[0]+l)); ,
118 __m128 _Fp1 = _mm_set1_ps(*(Fp[1]+l)); __m128 _Fx1 = _mm_set1_ps(*(Fx[1]+l)); ,
119 __m128 _Fp2 = _mm_set1_ps(*(Fp[2]+l)); __m128 _Fx2 = _mm_set1_ps(*(Fx[2]+l)); ,
120 __m128 _Fp3 = _mm_set1_ps(*(Fp[3]+l)); __m128 _Fx3 = _mm_set1_ps(*(Fx[3]+l)); ,
121 __m128 _Fp4 = _mm_set1_ps(*(Fp[4]+l)); __m128 _Fx4 = _mm_set1_ps(*(Fx[4]+l)); ,
122 __m128 _Fp5 = _mm_set1_ps(*(Fp[5]+l)); __m128 _Fx5 = _mm_set1_ps(*(Fx[5]+l)); ,
123 __m128 _Fp6 = _mm_set1_ps(*(Fp[6]+l)); __m128 _Fx6 = _mm_set1_ps(*(Fx[6]+l)); ,
124 __m128 _Fp7 = _mm_set1_ps(*(Fp[7]+l)); __m128 _Fx7 = _mm_set1_ps(*(Fx[7]+l)); )
127 NETX(
if(*(pAPN[0]+2*I)>0.) sign+=*(Fp[0]+l) * *(Fx[0]+l);,
128 if(*(pAPN[1]+2*I)>0.) sign+=*(Fp[1]+l) * *(Fx[1]+l);,
129 if(*(pAPN[2]+2*I)>0.) sign+=*(Fp[2]+l) * *(Fx[2]+l);,
130 if(*(pAPN[3]+2*I)>0.) sign+=*(Fp[3]+l) * *(Fx[3]+l);,
131 if(*(pAPN[4]+2*I)>0.) sign+=*(Fp[4]+l) * *(Fx[4]+l);,
132 if(*(pAPN[5]+2*I)>0.) sign+=*(Fp[5]+l) * *(Fx[5]+l);,
133 if(*(pAPN[6]+2*I)>0.) sign+=*(Fp[6]+l) * *(Fx[6]+l);,
134 if(*(pAPN[7]+2*I)>0.) sign+=*(Fp[7]+l) * *(Fx[7]+l);)
135 sign = sign>0 ? 1. : -1.;
136 __m128 _sign = _mm_set1_ps(sign);
138 for(
int i=0;
i<I;
i+=4) {
140 *_f0 = _mm_mul_ps(*_r0,_Fp0); *_F0 = _mm_mul_ps(*_r0++,_Fx0);
141 _ff = _mm_mul_ps(*_f0,*_f0);
142 _FF = _mm_mul_ps(*_F0,*_F0);
143 _fF = _mm_mul_ps(*_f0,*_F0); ,
145 *_f1 = _mm_mul_ps(*_r1,_Fp1); *_F1 = _mm_mul_ps(*_r1++,_Fx1);
146 _ff = _mm_add_ps(_ff,_mm_mul_ps(*_f1,*_f1));
147 _FF = _mm_add_ps(_FF,_mm_mul_ps(*_F1,*_F1));
148 _fF = _mm_add_ps(_fF,_mm_mul_ps(*_f1,*_F1)); ,
150 *_f2 = _mm_mul_ps(*_r2,_Fp2); *_F2 = _mm_mul_ps(*_r2++,_Fx2);
151 _ff = _mm_add_ps(_ff,_mm_mul_ps(*_f2,*_f2));
152 _FF = _mm_add_ps(_FF,_mm_mul_ps(*_F2,*_F2));
153 _fF = _mm_add_ps(_fF,_mm_mul_ps(*_f2,*_F2)); ,
155 *_f3 = _mm_mul_ps(*_r3,_Fp3); *_F3 = _mm_mul_ps(*_r3++,_Fx3);
156 _ff = _mm_add_ps(_ff,_mm_mul_ps(*_f3,*_f3));
157 _FF = _mm_add_ps(_FF,_mm_mul_ps(*_F3,*_F3));
158 _fF = _mm_add_ps(_fF,_mm_mul_ps(*_f3,*_F3)); ,
160 *_f4 = _mm_mul_ps(*_r4,_Fp4); *_F4 = _mm_mul_ps(*_r4++,_Fx4);
161 _ff = _mm_add_ps(_ff,_mm_mul_ps(*_f4,*_f4));
162 _FF = _mm_add_ps(_FF,_mm_mul_ps(*_F4,*_F4));
163 _fF = _mm_add_ps(_fF,_mm_mul_ps(*_f4,*_F4)); ,
165 *_f5 = _mm_mul_ps(*_r5,_Fp5); *_F5 = _mm_mul_ps(*_r5++,_Fx5);
166 _ff = _mm_add_ps(_ff,_mm_mul_ps(*_f5,*_f5));
167 _FF = _mm_add_ps(_FF,_mm_mul_ps(*_F5,*_F5));
168 _fF = _mm_add_ps(_fF,_mm_mul_ps(*_f5,*_F5)); ,
170 *_f6 = _mm_mul_ps(*_r6,_Fp6); *_F6 = _mm_mul_ps(*_r6++,_Fx6);
171 _ff = _mm_add_ps(_ff,_mm_mul_ps(*_f6,*_f6));
172 _FF = _mm_add_ps(_FF,_mm_mul_ps(*_F6,*_F6));
173 _fF = _mm_add_ps(_fF,_mm_mul_ps(*_f6,*_F6)); ,
175 *_f7 = _mm_mul_ps(*_r7,_Fp7); *_F7 = _mm_mul_ps(*_r7++,_Fx7);
176 _ff = _mm_add_ps(_ff,_mm_mul_ps(*_f7,*_f7));
177 _FF = _mm_add_ps(_FF,_mm_mul_ps(*_F7,*_F7));
178 _fF = _mm_add_ps(_fF,_mm_mul_ps(*_f7,*_F7)); )
180 *_si = _mm_mul_ps(_fF,_2);
181 *_co = _mm_sub_ps(_ff,_FF);
182 _AP = _mm_add_ps(_ff,_FF);
183 _cc = _mm_mul_ps(*_co,*_co);
184 _ss = _mm_mul_ps(*_si,*_si);
185 _nn = _mm_sqrt_ps(_mm_add_ps(_cc,_ss));
186 *_fp = _mm_div_ps(_mm_add_ps(_AP,_nn),_2);
187 _cc = _mm_div_ps(*_co,_mm_add_ps(_nn,_o));
188 _nn = _mm_and_ps(_mm_cmpgt_ps(*_si,_0),_1);
189 _ss = _mm_sub_ps(_mm_mul_ps(_2,_nn),_1);
190 *_si = _mm_sqrt_ps(_mm_div_ps(_mm_sub_ps(_1,_cc),_2));
191 *_co = _mm_sqrt_ps(_mm_div_ps(_mm_add_ps(_1,_cc),_2));
192 *_co = _mm_mul_ps(*_co,_ss);
196 _ff = _mm_add_ps(_mm_mul_ps(*_f0,*_co),_mm_mul_ps(*_F0,*_si)); _cc = _mm_mul_ps(_ff,_ff);
197 _FF = _mm_sub_ps(_mm_mul_ps(*_F0,*_co),_mm_mul_ps(*_f0,*_si)); *_f0 = _ff; *_F0 = _FF;
198 _nn = _mm_mul_ps(_cc,_cc); _fF = _mm_mul_ps(_ff,_FF);,
200 _ff = _mm_add_ps(_mm_mul_ps(*_f1,*_co),_mm_mul_ps(*_F1,*_si)); _cc = _mm_mul_ps(_ff,_ff);
201 _FF = _mm_sub_ps(_mm_mul_ps(*_F1,*_co),_mm_mul_ps(*_f1,*_si)); *_f1 = _ff; *_F1 = _FF;
202 _nn = _mm_add_ps(_nn,_mm_mul_ps(_cc,_cc)); _fF = _mm_add_ps(_fF,_mm_mul_ps(_ff,_FF));,
204 _ff = _mm_add_ps(_mm_mul_ps(*_f2,*_co),_mm_mul_ps(*_F2,*_si)); _cc = _mm_mul_ps(_ff,_ff);
205 _FF = _mm_sub_ps(_mm_mul_ps(*_F2,*_co),_mm_mul_ps(*_f2,*_si)); *_f2 = _ff; *_F2 = _FF;
206 _nn = _mm_add_ps(_nn,_mm_mul_ps(_cc,_cc)); _fF = _mm_add_ps(_fF,_mm_mul_ps(_ff,_FF));,
208 _ff = _mm_add_ps(_mm_mul_ps(*_f3,*_co),_mm_mul_ps(*_F3,*_si)); _cc = _mm_mul_ps(_ff,_ff);
209 _FF = _mm_sub_ps(_mm_mul_ps(*_F3,*_co),_mm_mul_ps(*_f3,*_si)); *_f3 = _ff; *_F3 = _FF;
210 _nn = _mm_add_ps(_nn,_mm_mul_ps(_cc,_cc)); _fF = _mm_add_ps(_fF,_mm_mul_ps(_ff,_FF));,
212 _ff = _mm_add_ps(_mm_mul_ps(*_f4,*_co),_mm_mul_ps(*_F4,*_si)); _cc = _mm_mul_ps(_ff,_ff);
213 _FF = _mm_sub_ps(_mm_mul_ps(*_F4,*_co),_mm_mul_ps(*_f4,*_si)); *_f4 = _ff; *_F4 = _FF;
214 _nn = _mm_add_ps(_nn,_mm_mul_ps(_cc,_cc)); _fF = _mm_add_ps(_fF,_mm_mul_ps(_ff,_FF));,
216 _ff = _mm_add_ps(_mm_mul_ps(*_f5,*_co),_mm_mul_ps(*_F5,*_si)); _cc = _mm_mul_ps(_ff,_ff);
217 _FF = _mm_sub_ps(_mm_mul_ps(*_F5,*_co),_mm_mul_ps(*_f5,*_si)); *_f5 = _ff; *_F5 = _FF;
218 _nn = _mm_add_ps(_nn,_mm_mul_ps(_cc,_cc)); _fF = _mm_add_ps(_fF,_mm_mul_ps(_ff,_FF));,
220 _ff = _mm_add_ps(_mm_mul_ps(*_f6,*_co),_mm_mul_ps(*_F6,*_si)); _cc = _mm_mul_ps(_ff,_ff);
221 _FF = _mm_sub_ps(_mm_mul_ps(*_F6,*_co),_mm_mul_ps(*_f6,*_si)); *_f6 = _ff; *_F6 = _FF;
222 _nn = _mm_add_ps(_nn,_mm_mul_ps(_cc,_cc)); _fF = _mm_add_ps(_fF,_mm_mul_ps(_ff,_FF));,
224 _ff = _mm_add_ps(_mm_mul_ps(*_f7,*_co),_mm_mul_ps(*_F7,*_si)); _cc = _mm_mul_ps(_ff,_ff);
225 _FF = _mm_sub_ps(_mm_mul_ps(*_F7,*_co),_mm_mul_ps(*_f7,*_si)); *_f7 = _ff; *_F7 = _FF;
226 _nn = _mm_add_ps(_nn,_mm_mul_ps(_cc,_cc)); _fF = _mm_add_ps(_fF,_mm_mul_ps(_ff,_FF));)
228 _fF = _mm_div_ps(_fF,_mm_add_ps(*_fp,_o));
231 *_F0 = _mm_sub_ps(*_F0,_mm_mul_ps(*_f0++,_fF)); *_fx = _mm_mul_ps(*_F0,*_F0); _F0++;,
232 *_F1 = _mm_sub_ps(*_F1,_mm_mul_ps(*_f1++,_fF)); *_fx = _mm_add_ps(*_fx,_mm_mul_ps(*_F1,*_F1)); _F1++;,
233 *_F2 = _mm_sub_ps(*_F2,_mm_mul_ps(*_f2++,_fF)); *_fx = _mm_add_ps(*_fx,_mm_mul_ps(*_F2,*_F2)); _F2++;,
234 *_F3 = _mm_sub_ps(*_F3,_mm_mul_ps(*_f3++,_fF)); *_fx = _mm_add_ps(*_fx,_mm_mul_ps(*_F3,*_F3)); _F3++;,
235 *_F4 = _mm_sub_ps(*_F4,_mm_mul_ps(*_f4++,_fF)); *_fx = _mm_add_ps(*_fx,_mm_mul_ps(*_F4,*_F4)); _F4++;,
236 *_F5 = _mm_sub_ps(*_F5,_mm_mul_ps(*_f5++,_fF)); *_fx = _mm_add_ps(*_fx,_mm_mul_ps(*_F5,*_F5)); _F5++;,
237 *_F6 = _mm_sub_ps(*_F6,_mm_mul_ps(*_f6++,_fF)); *_fx = _mm_add_ps(*_fx,_mm_mul_ps(*_F6,*_F6)); _F6++;,
238 *_F7 = _mm_sub_ps(*_F7,_mm_mul_ps(*_f7++,_fF)); *_fx = _mm_add_ps(*_fx,_mm_mul_ps(*_F7,*_F7)); _F7++;)
240 *_ni = _mm_div_ps(_nn,_mm_add_ps(_mm_mul_ps(*_fp,*_fp),_o));
241 _ff = _mm_add_ps(_mm_mul_ps(_1,*_ni),_o);
242 _NI = _mm_add_ps(_NI,(_mm_div_ps(*_fx,_ff)));
243 _NN = _mm_add_ps(_NN,_mm_and_ps(_mm_cmpgt_ps(*_fp,_0),_1));
245 _fp++; _fx++; _si++; _co++; _ni++;
252 std::vector<float*> &pAPN,
float* rr,
253 std::vector<float*> &pAVX,
int II) {
263 __m128* _et = (__m128*)pAVX[0];
264 __m128* _mk = (__m128*)pAVX[1];
265 __m128* _fp = (__m128*)pAVX[2];
266 __m128* _fx = (__m128*)pAVX[3];
267 __m128* _au = (__m128*)pAVX[10];
268 __m128* _AU = (__m128*)pAVX[11];
269 __m128* _av = (__m128*)pAVX[12];
270 __m128* _AV = (__m128*)pAVX[13];
271 __m128* _ni = (__m128*)pAVX[18];
273 __m128 _uu = _mm_setzero_ps();
274 __m128 _UU = _mm_setzero_ps();
275 __m128 _uU = _mm_setzero_ps();
276 __m128 _vv = _mm_setzero_ps();
277 __m128 _VV = _mm_setzero_ps();
278 __m128 _vV = _mm_setzero_ps();
279 __m128 _NN = _mm_setzero_ps();
281 __m128 _h,_H,_f,_F,_R,_a,_nn,_m,_ff,_xp,_XP,_xx,_XX;
282 float cu,su,cv,sv,cc,
ss,nn,uu,UU,vv,VV,et,ET;
283 static const __m128 _0 = _mm_set1_ps(0);
284 static const __m128 _1 = _mm_set1_ps(1);
285 static const __m128 o1 = _mm_set1_ps(0.1);
286 static const __m128 _o = _mm_set1_ps(1.
e-5);
287 __m128 _rr = _mm_set1_ps(rr[0]);
288 __m128 _RR = _mm_set1_ps(rr[1]);
291 NETX(__m128* _f0=(__m128*)pAPN[0]; __m128* _F0=(__m128*)(pAPN[0]+I);,
292 __m128* _f1=(__m128*)pAPN[1]; __m128* _F1=(__m128*)(pAPN[1]+I);,
293 __m128* _f2=(__m128*)pAPN[2]; __m128* _F2=(__m128*)(pAPN[2]+I);,
294 __m128* _f3=(__m128*)pAPN[3]; __m128* _F3=(__m128*)(pAPN[3]+I);,
295 __m128* _f4=(__m128*)pAPN[4]; __m128* _F4=(__m128*)(pAPN[4]+I);,
296 __m128* _f5=(__m128*)pAPN[5]; __m128* _F5=(__m128*)(pAPN[5]+I);,
297 __m128* _f6=(__m128*)pAPN[6]; __m128* _F6=(__m128*)(pAPN[6]+I);,
298 __m128* _f7=(__m128*)pAPN[7]; __m128* _F7=(__m128*)(pAPN[7]+I);)
301 NETX(__m128* _p0 = (__m128*)p[0]; __m128* _q0 = (__m128*)q[0];,
302 __m128* _p1 = (__m128*)p[1]; __m128* _q1 = (__m128*)q[1];,
303 __m128* _p2 = (__m128*)p[2]; __m128* _q2 = (__m128*)q[2];,
304 __m128* _p3 = (__m128*)p[3]; __m128* _q3 = (__m128*)q[3];,
305 __m128* _p4 = (__m128*)p[4]; __m128* _q4 = (__m128*)q[4];,
306 __m128* _p5 = (__m128*)p[5]; __m128* _q5 = (__m128*)q[5];,
307 __m128* _p6 = (__m128*)p[6]; __m128* _q6 = (__m128*)q[6];,
308 __m128* _p7 = (__m128*)p[7]; __m128* _q7 = (__m128*)q[7];)
310 for(
int i=0;
i<I;
i+=4) {
313 _xp = _mm_mul_ps(*_f0,*_p0);
314 _XP = _mm_mul_ps(*_f0,*_q0);
315 _xx = _mm_mul_ps(*_F0,*_p0);
316 _XX = _mm_mul_ps(*_F0,*_q0); ,
318 _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f1,*_p1));
319 _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f1,*_q1));
320 _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F1,*_p1));
321 _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F1,*_q1)); ,
323 _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f2,*_p2));
324 _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f2,*_q2));
325 _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F2,*_p2));
326 _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F2,*_q2)); ,
328 _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f3,*_p3));
329 _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f3,*_q3));
330 _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F3,*_p3));
331 _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F3,*_q3)); ,
333 _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f4,*_p4));
334 _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f4,*_q4));
335 _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F4,*_p4));
336 _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F4,*_q4)); ,
338 _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f5,*_p5));
339 _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f5,*_q5));
340 _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F5,*_p5));
341 _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F5,*_q5)); ,
343 _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f6,*_p6));
344 _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f6,*_q6));
345 _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F6,*_p6));
346 _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F6,*_q6)); ,
348 _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f7,*_p7));
349 _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f7,*_q7));
350 _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F7,*_p7));
351 _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F7,*_q7)); )
355 _f = _mm_add_ps(_mm_mul_ps(_xp,_xp),_mm_mul_ps(_XP,_XP));
356 _f = _mm_div_ps(_f,_mm_add_ps(*_et,_o));
357 _f = _mm_sqrt_ps(_mm_mul_ps(*_ni,_f));
358 _f = _mm_sub_ps(_mm_mul_ps(_f,_rr),*_fp);
359 _f = _mm_mul_ps(_f,_mm_and_ps(_mm_cmpgt_ps(_f,_0),_1));
360 _f = _mm_div_ps(*_mk,_mm_add_ps(*_fp,_mm_add_ps(_o,_f)));
362 _h = _mm_mul_ps(_xp,_f);
363 _H = _mm_mul_ps(_XP,_f);
364 _h = _mm_add_ps(_mm_mul_ps(_h,_h),_mm_mul_ps(_H,_H));
365 _H = _mm_add_ps(_mm_mul_ps(_xx,_xx),_mm_mul_ps(_XX,_XX));
366 _F = _mm_sqrt_ps(_mm_div_ps(_H,_mm_add_ps(_h,_o)));
367 _R = _mm_add_ps(o1,_mm_div_ps(_RR,_mm_add_ps(*_et,_o)));
368 _F = _mm_sub_ps(_mm_mul_ps(_F,_R),*_fx);
369 _F = _mm_mul_ps(_F,_mm_and_ps(_mm_cmpgt_ps(_F,_0),_1));
370 _F = _mm_div_ps(*_mk,_mm_add_ps(*_fx,_mm_add_ps(_o,_F)));
372 *_au = _mm_mul_ps(_xp,_f);
373 *_AU = _mm_mul_ps(_XP,_f);
374 *_av = _mm_mul_ps(_xx,_F);
375 *_AV = _mm_mul_ps(_XX,_F);
377 _a = _mm_add_ps(_mm_mul_ps(_f,*_fp),_mm_mul_ps(_F,*_fx));
378 _NN = _mm_add_ps(_NN,*_mk);
379 *_mk = _mm_add_ps(_a,_mm_sub_ps(*_mk,_1));
381 _mk++; _et++; _fp++; _fx++; _ni++;
384 NETX(*_p0++ = _mm_add_ps(_mm_mul_ps(*_f0, *_au),_mm_mul_ps(*_F0 ,*_av));
385 *_q0++ = _mm_add_ps(_mm_mul_ps(*_f0++,*_AU),_mm_mul_ps(*_F0++,*_AV)); ,
386 *_p1++ = _mm_add_ps(_mm_mul_ps(*_f1 ,*_au),_mm_mul_ps(*_F1 ,*_av));
387 *_q1++ = _mm_add_ps(_mm_mul_ps(*_f1++,*_AU),_mm_mul_ps(*_F1++,*_AV)); ,
388 *_p2++ = _mm_add_ps(_mm_mul_ps(*_f2 ,*_au),_mm_mul_ps(*_F2 ,*_av));
389 *_q2++ = _mm_add_ps(_mm_mul_ps(*_f2++,*_AU),_mm_mul_ps(*_F2++,*_AV)); ,
390 *_p3++ = _mm_add_ps(_mm_mul_ps(*_f3 ,*_au),_mm_mul_ps(*_F3 ,*_av));
391 *_q3++ = _mm_add_ps(_mm_mul_ps(*_f3++,*_AU),_mm_mul_ps(*_F3++,*_AV)); ,
392 *_p4++ = _mm_add_ps(_mm_mul_ps(*_f4 ,*_au),_mm_mul_ps(*_F4 ,*_av));
393 *_q4++ = _mm_add_ps(_mm_mul_ps(*_f4++,*_AU),_mm_mul_ps(*_F4++,*_AV)); ,
394 *_p5++ = _mm_add_ps(_mm_mul_ps(*_f5 ,*_au),_mm_mul_ps(*_F5 ,*_av));
395 *_q5++ = _mm_add_ps(_mm_mul_ps(*_f5++,*_AU),_mm_mul_ps(*_F5++,*_AV)); ,
396 *_p6++ = _mm_add_ps(_mm_mul_ps(*_f6 ,*_au),_mm_mul_ps(*_F6 ,*_av));
397 *_q6++ = _mm_add_ps(_mm_mul_ps(*_f6++,*_AU),_mm_mul_ps(*_F6++,*_AV)); ,
398 *_p7++ = _mm_add_ps(_mm_mul_ps(*_f7 ,*_au),_mm_mul_ps(*_F7 ,*_av));
399 *_q7++ = _mm_add_ps(_mm_mul_ps(*_f7++,*_AU),_mm_mul_ps(*_F7++,*_AV)); )
401 _uU = _mm_add_ps(_uU,_mm_mul_ps(*_au,*_AU));
402 _vV = _mm_add_ps(_vV,_mm_mul_ps(*_av,*_AV));
403 _uu = _mm_add_ps(_uu,_mm_mul_ps(*_au,*_au)); _au++;
404 _UU = _mm_add_ps(_UU,_mm_mul_ps(*_AU,*_AU)); _AU++;
405 _vv = _mm_add_ps(_vv,_mm_mul_ps(*_av,*_av)); _av++;
406 _VV = _mm_add_ps(_VV,_mm_mul_ps(*_AV,*_AV)); _AV++;
413 nn = sqrt((uu-UU)*(uu-UU)+4*su*su)+0.0001;
414 cu = (uu-UU)/nn; et=uu+UU;
415 uu = sqrt((et+nn)/2); UU = et>nn?sqrt((et-nn)/2):0;
417 su = sqrt((1-cu)/2); cu = nn*sqrt((1+cu)/2);
420 nn = sqrt((vv-VV)*(vv-VV)+4*sv*sv)+0.0001;
421 cv = (vv-VV)/nn; ET=vv+VV;
422 vv = sqrt((ET+nn)/2); VV = et>nn?sqrt((ET-nn)/2):0;
424 sv = sqrt((1-cv)/2); cv = nn*sqrt((1+cv)/2);
433 std::vector<float*> &pAPN,
434 std::vector<float*> &pAVX,
int II) {
446 __m128* _MK = (__m128*)pAVX[1];
447 __m128* _fp = (__m128*)pAVX[2];
448 __m128* _fx = (__m128*)pAVX[3];
450 __m128 _xp,_XP,_xx,_XX,_rr,_RR,_mk;
452 static const __m128 _0 = _mm_set1_ps(0);
453 static const __m128 _1 = _mm_set1_ps(1);
454 static const __m128 _o = _mm_set1_ps(1.
e-5);
456 __m128 _ss,_cc,_SS,_CC;
457 float rpol[4],cpol[4],spol[4];
458 float RPOL[4],CPOL[4],SPOL[4];
460 double*
r = pol00[0].
data;
461 double*
a = pol00[1].
data;
462 double* R = pol90[0].
data;
463 double*
A = pol90[1].
data;
466 NETX(__m128* _f0=(__m128*)pAPN[0]; __m128* _F0=(__m128*)(pAPN[0]+I);,
467 __m128* _f1=(__m128*)pAPN[1]; __m128* _F1=(__m128*)(pAPN[1]+I);,
468 __m128* _f2=(__m128*)pAPN[2]; __m128* _F2=(__m128*)(pAPN[2]+I);,
469 __m128* _f3=(__m128*)pAPN[3]; __m128* _F3=(__m128*)(pAPN[3]+I);,
470 __m128* _f4=(__m128*)pAPN[4]; __m128* _F4=(__m128*)(pAPN[4]+I);,
471 __m128* _f5=(__m128*)pAPN[5]; __m128* _F5=(__m128*)(pAPN[5]+I);,
472 __m128* _f6=(__m128*)pAPN[6]; __m128* _F6=(__m128*)(pAPN[6]+I);,
473 __m128* _f7=(__m128*)pAPN[7]; __m128* _F7=(__m128*)(pAPN[7]+I);)
476 NETX(__m128* _p0 = (__m128*)p[0]; __m128* _q0 = (__m128*)q[0];,
477 __m128* _p1 = (__m128*)p[1]; __m128* _q1 = (__m128*)q[1];,
478 __m128* _p2 = (__m128*)p[2]; __m128* _q2 = (__m128*)q[2];,
479 __m128* _p3 = (__m128*)p[3]; __m128* _q3 = (__m128*)q[3];,
480 __m128* _p4 = (__m128*)p[4]; __m128* _q4 = (__m128*)q[4];,
481 __m128* _p5 = (__m128*)p[5]; __m128* _q5 = (__m128*)q[5];,
482 __m128* _p6 = (__m128*)p[6]; __m128* _q6 = (__m128*)q[6];,
483 __m128* _p7 = (__m128*)p[7]; __m128* _q7 = (__m128*)q[7];)
486 for(
int i=0;
i<I;
i+=4) {
490 _mk = _mm_and_ps(_mm_cmpgt_ps(*_MK++,_0),_1);
493 _xp = _mm_mul_ps(*_f0,_mm_mul_ps(_mk,*_p0));
494 _XP = _mm_mul_ps(*_f0,_mm_mul_ps(_mk,*_q0));
495 _xx = _mm_mul_ps(*_F0,_mm_mul_ps(_mk,*_p0));
496 _XX = _mm_mul_ps(*_F0,_mm_mul_ps(_mk,*_q0)); ,
498 _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f1,_mm_mul_ps(_mk,*_p1)));
499 _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f1,_mm_mul_ps(_mk,*_q1)));
500 _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F1,_mm_mul_ps(_mk,*_p1)));
501 _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F1,_mm_mul_ps(_mk,*_q1))); ,
503 _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f2,_mm_mul_ps(_mk,*_p2)));
504 _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f2,_mm_mul_ps(_mk,*_q2)));
505 _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F2,_mm_mul_ps(_mk,*_p2)));
506 _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F2,_mm_mul_ps(_mk,*_q2))); ,
508 _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f3,_mm_mul_ps(_mk,*_p3)));
509 _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f3,_mm_mul_ps(_mk,*_q3)));
510 _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F3,_mm_mul_ps(_mk,*_p3)));
511 _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F3,_mm_mul_ps(_mk,*_q3))); ,
513 _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f4,_mm_mul_ps(_mk,*_p4)));
514 _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f4,_mm_mul_ps(_mk,*_q4)));
515 _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F4,_mm_mul_ps(_mk,*_p4)));
516 _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F4,_mm_mul_ps(_mk,*_q4))); ,
518 _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f5,_mm_mul_ps(_mk,*_p5)));
519 _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f5,_mm_mul_ps(_mk,*_q5)));
520 _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F5,_mm_mul_ps(_mk,*_p5)));
521 _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F5,_mm_mul_ps(_mk,*_q5))); ,
523 _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f6,_mm_mul_ps(_mk,*_p6)));
524 _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f6,_mm_mul_ps(_mk,*_q6)));
525 _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F6,_mm_mul_ps(_mk,*_p6)));
526 _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F6,_mm_mul_ps(_mk,*_q6))); ,
528 _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f7,_mm_mul_ps(_mk,*_p7)));
529 _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f7,_mm_mul_ps(_mk,*_q7)));
530 _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F7,_mm_mul_ps(_mk,*_p7)));
531 _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F7,_mm_mul_ps(_mk,*_q7))); )
535 _cc = _mm_div_ps(_xp,_mm_add_ps(_mm_sqrt_ps(*_fp),_o));
536 _ss = _mm_div_ps(_xx,_mm_add_ps(_mm_sqrt_ps(*_fx),_o));
538 _rr = _mm_div_ps(_mm_mul_ps(_xp,_xp),_mm_add_ps(*_fp,_o));
539 _rr = _mm_add_ps(_rr,_mm_div_ps(_mm_mul_ps(_xx,_xx),_mm_add_ps(*_fx,_o)));
541 _mm_storeu_ps(cpol,_cc);
542 _mm_storeu_ps(spol,_ss);
543 _mm_storeu_ps(rpol,_rr);
545 _CC = _mm_div_ps(_XP,_mm_add_ps(_mm_sqrt_ps(*_fp),_o));
546 _SS = _mm_div_ps(_XX,_mm_add_ps(_mm_sqrt_ps(*_fx),_o));
548 _RR = _mm_div_ps(_mm_mul_ps(_XP,_XP),_mm_add_ps(*_fp,_o));
549 _RR = _mm_add_ps(_RR,_mm_div_ps(_mm_mul_ps(_XX,_XX),_mm_add_ps(*_fx,_o)));
551 _mm_storeu_ps(CPOL,_CC);
552 _mm_storeu_ps(SPOL,_SS);
553 _mm_storeu_ps(RPOL,_RR);
555 for(
int n=0;
n<4;
n++) {
556 r[
m] = sqrt(rpol[
n]);
557 a[
m] = atan2(spol[n],cpol[n]);
558 R[
m] = sqrt(RPOL[n]);
559 A[
m] = atan2(SPOL[n],CPOL[n]);
565 _cc = _mm_div_ps(_cc,_mm_add_ps(_mm_sqrt_ps(*_fp),_o));
566 _ss = _mm_div_ps(_ss,_mm_add_ps(_mm_sqrt_ps(*_fx),_o));
568 _CC = _mm_div_ps(_CC,_mm_add_ps(_mm_sqrt_ps(*_fp),_o));
569 _SS = _mm_div_ps(_SS,_mm_add_ps(_mm_sqrt_ps(*_fx),_o));
571 NETX(*_p0 = _mm_add_ps(_mm_mul_ps(*_f0,_cc),_mm_mul_ps(*_F0,_ss));
572 *_q0 = _mm_add_ps(_mm_mul_ps(*_f0,_CC),_mm_mul_ps(*_F0,_SS)); ,
573 *_p1 = _mm_add_ps(_mm_mul_ps(*_f1,_cc),_mm_mul_ps(*_F1,_ss));
574 *_q1 = _mm_add_ps(_mm_mul_ps(*_f1,_CC),_mm_mul_ps(*_F1,_SS)); ,
575 *_p2 = _mm_add_ps(_mm_mul_ps(*_f2,_cc),_mm_mul_ps(*_F2,_ss));
576 *_q2 = _mm_add_ps(_mm_mul_ps(*_f2,_CC),_mm_mul_ps(*_F2,_SS)); ,
577 *_p3 = _mm_add_ps(_mm_mul_ps(*_f3,_cc),_mm_mul_ps(*_F3,_ss));
578 *_q3 = _mm_add_ps(_mm_mul_ps(*_f3,_CC),_mm_mul_ps(*_F3,_SS)); ,
579 *_p4 = _mm_add_ps(_mm_mul_ps(*_f4,_cc),_mm_mul_ps(*_F4,_ss));
580 *_q4 = _mm_add_ps(_mm_mul_ps(*_f4,_CC),_mm_mul_ps(*_F4,_SS)); ,
581 *_p5 = _mm_add_ps(_mm_mul_ps(*_f5,_cc),_mm_mul_ps(*_F5,_ss));
582 *_q5 = _mm_add_ps(_mm_mul_ps(*_f5,_CC),_mm_mul_ps(*_F5,_SS)); ,
583 *_p6 = _mm_add_ps(_mm_mul_ps(*_f6,_cc),_mm_mul_ps(*_F6,_ss));
584 *_q6 = _mm_add_ps(_mm_mul_ps(*_f6,_CC),_mm_mul_ps(*_F6,_SS)); ,
585 *_p7 = _mm_add_ps(_mm_mul_ps(*_f7,_cc),_mm_mul_ps(*_F7,_ss));
586 *_q7 = _mm_add_ps(_mm_mul_ps(*_f7,_CC),_mm_mul_ps(*_F7,_SS)); )
590 __m128 _N = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(_cc,_cc),_mm_mul_ps(_CC,_CC)));
592 _cc = _mm_div_ps(_cc,_mm_add_ps(_N,_o));
593 _CC = _mm_div_ps(_CC,_mm_add_ps(_N,_o));
597 NETX(_y = _mm_add_ps(_mm_mul_ps(*_p0,_cc),_mm_mul_ps(*_q0,_CC));
598 _Y = _mm_sub_ps(_mm_mul_ps(*_q0,_cc),_mm_mul_ps(*_p0,_CC)); *_p0 = _y; *_q0 = _Y; ,
599 _y = _mm_add_ps(_mm_mul_ps(*_p1,_cc),_mm_mul_ps(*_q1,_CC));
600 _Y = _mm_sub_ps(_mm_mul_ps(*_q1,_cc),_mm_mul_ps(*_p1,_CC)); *_p1 = _y; *_q1 = _Y; ,
601 _y = _mm_add_ps(_mm_mul_ps(*_p2,_cc),_mm_mul_ps(*_q2,_CC));
602 _Y = _mm_sub_ps(_mm_mul_ps(*_q2,_cc),_mm_mul_ps(*_p2,_CC)); *_p2 = _y; *_q2 = _Y; ,
603 _y = _mm_add_ps(_mm_mul_ps(*_p3,_cc),_mm_mul_ps(*_q3,_CC));
604 _Y = _mm_sub_ps(_mm_mul_ps(*_q3,_cc),_mm_mul_ps(*_p3,_CC)); *_p3 = _y; *_q3 = _Y; ,
605 _y = _mm_add_ps(_mm_mul_ps(*_p4,_cc),_mm_mul_ps(*_q4,_CC));
606 _Y = _mm_sub_ps(_mm_mul_ps(*_q4,_cc),_mm_mul_ps(*_p4,_CC)); *_p4 = _y; *_q4 = _Y; ,
607 _y = _mm_add_ps(_mm_mul_ps(*_p5,_cc),_mm_mul_ps(*_q5,_CC));
608 _Y = _mm_sub_ps(_mm_mul_ps(*_q5,_cc),_mm_mul_ps(*_p5,_CC)); *_p5 = _y; *_q5 = _Y; ,
609 _y = _mm_add_ps(_mm_mul_ps(*_p6,_cc),_mm_mul_ps(*_q6,_CC));
610 _Y = _mm_sub_ps(_mm_mul_ps(*_q6,_cc),_mm_mul_ps(*_p6,_CC)); *_p6 = _y; *_q6 = _Y; ,
611 _y = _mm_add_ps(_mm_mul_ps(*_p7,_cc),_mm_mul_ps(*_q7,_CC));
612 _Y = _mm_sub_ps(_mm_mul_ps(*_q7,_cc),_mm_mul_ps(*_p7,_CC)); *_p7 = _y; *_q7 = _Y; )
618 _p0++;_q0++;_f0++;_F0++; ,
619 _p1++;_q1++;_f1++;_F1++; ,
620 _p2++;_q2++;_f2++;_F2++; ,
621 _p3++;_q3++;_f3++;_F3++; ,
622 _p4++;_q4++;_f4++;_F4++; ,
623 _p5++;_q5++;_f5++;_F5++; ,
624 _p6++;_q6++;_f6++;_F6++; ,
625 _p7++;_q7++;_f7++;_F7++; )
635 float** u,
float**
v,
float En,
636 std::vector<float*> &pAVX,
int I) {
643 __m128 _aa, _AA, _aA;
645 static const __m128 _1 = _mm_set1_ps(1);
646 static const __m128 _o = _mm_set1_ps(1.
e-12);
647 static const __m128
sm = _mm_set1_ps(-0.
f);
648 static const __m128 _En= _mm_set1_ps(En);
650 __m128* _et = (__m128*)pAVX[0];
651 __m128* _mk = (__m128*)pAVX[1];
652 __m128 _ee = _mm_setzero_ps();
653 __m128 _EE = _mm_setzero_ps();
654 __m128 _NN = _mm_setzero_ps();
656 NETX(__m128* _p0 = (__m128*)p[0]; __m128* _q0 = (__m128*)q[0]; ,
657 __m128* _p1 = (__m128*)p[1]; __m128* _q1 = (__m128*)q[1]; ,
658 __m128* _p2 = (__m128*)p[2]; __m128* _q2 = (__m128*)q[2]; ,
659 __m128* _p3 = (__m128*)p[3]; __m128* _q3 = (__m128*)q[3]; ,
660 __m128* _p4 = (__m128*)p[4]; __m128* _q4 = (__m128*)q[4]; ,
661 __m128* _p5 = (__m128*)p[5]; __m128* _q5 = (__m128*)q[5]; ,
662 __m128* _p6 = (__m128*)p[6]; __m128* _q6 = (__m128*)q[6]; ,
663 __m128* _p7 = (__m128*)p[7]; __m128* _q7 = (__m128*)q[7]; )
665 NETX(__m128* _u0 = (__m128*)u[0]; __m128* _v0 = (__m128*)v[0]; ,
666 __m128* _u1 = (__m128*)u[1]; __m128* _v1 = (__m128*)v[1]; ,
667 __m128* _u2 = (__m128*)u[2]; __m128* _v2 = (__m128*)v[2]; ,
668 __m128* _u3 = (__m128*)u[3]; __m128* _v3 = (__m128*)v[3]; ,
669 __m128* _u4 = (__m128*)u[4]; __m128* _v4 = (__m128*)v[4]; ,
670 __m128* _u5 = (__m128*)u[5]; __m128* _v5 = (__m128*)v[5]; ,
671 __m128* _u6 = (__m128*)u[6]; __m128* _v6 = (__m128*)v[6]; ,
672 __m128* _u7 = (__m128*)u[7]; __m128* _v7 = (__m128*)v[7]; )
674 for(
int i=0;
i<I;
i+=4) {
676 _aa = _mm_mul_ps(*_p0,*_p0); *_u0++ = *_p0++;
677 _AA = _mm_mul_ps(*_q0,*_q0); *_v0++ = *_q0++; ,
678 _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p1,*_p1)); *_u1++ = *_p1++;
679 _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q1,*_q1)); *_v1++ = *_q1++; ,
680 _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p2,*_p2)); *_u2++ = *_p2++;
681 _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q2,*_q2)); *_v2++ = *_q2++; ,
682 _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p3,*_p3)); *_u3++ = *_p3++;
683 _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q3,*_q3)); *_v3++ = *_q3++; ,
684 _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p4,*_p4)); *_u4++ = *_p4++;
685 _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q4,*_q4)); *_v4++ = *_q4++; ,
686 _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p5,*_p5)); *_u5++ = *_p5++;
687 _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q5,*_q5)); *_v5++ = *_q5++; ,
688 _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p6,*_p6)); *_u6++ = *_p6++;
689 _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q6,*_q6)); *_v6++ = *_q6++; ,
690 _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p7,*_p7)); *_u7++ = *_p7++;
691 _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q7,*_q7)); *_v7++ = *_q7++; )
696 *_et = _mm_add_ps(_mm_add_ps(_aa,_AA),_o);
697 *_mk = _mm_and_ps(_mm_cmpgt_ps(*_et,_En),_1);
698 _NN = _mm_add_ps(_NN,*_mk);
699 _ee = _mm_add_ps(_ee,*_et);
700 *_et = _mm_mul_ps(*_et,*_mk++);
701 _EE = _mm_add_ps(_EE,*_et++);
708 float** d,
float**
D,
709 float**
h,
float** H,
int I) {
713 NETX(__m128* _n0 = (__m128*)n[0]; __m128* _N0 = (__m128*)N[0]; ,
714 __m128* _n1 = (__m128*)n[1]; __m128* _N1 = (__m128*)N[1]; ,
715 __m128* _n2 = (__m128*)n[2]; __m128* _N2 = (__m128*)N[2]; ,
716 __m128* _n3 = (__m128*)n[3]; __m128* _N3 = (__m128*)N[3]; ,
717 __m128* _n4 = (__m128*)n[4]; __m128* _N4 = (__m128*)N[4]; ,
718 __m128* _n5 = (__m128*)n[5]; __m128* _N5 = (__m128*)N[5]; ,
719 __m128* _n6 = (__m128*)n[6]; __m128* _N6 = (__m128*)N[6]; ,
720 __m128* _n7 = (__m128*)n[7]; __m128* _N7 = (__m128*)N[7]; )
722 NETX(__m128* _d0 = (__m128*)d[0]; __m128* _D0 = (__m128*)D[0]; ,
723 __m128* _d1 = (__m128*)d[1]; __m128* _D1 = (__m128*)D[1]; ,
724 __m128* _d2 = (__m128*)d[2]; __m128* _D2 = (__m128*)D[2]; ,
725 __m128* _d3 = (__m128*)d[3]; __m128* _D3 = (__m128*)D[3]; ,
726 __m128* _d4 = (__m128*)d[4]; __m128* _D4 = (__m128*)D[4]; ,
727 __m128* _d5 = (__m128*)d[5]; __m128* _D5 = (__m128*)D[5]; ,
728 __m128* _d6 = (__m128*)d[6]; __m128* _D6 = (__m128*)D[6]; ,
729 __m128* _d7 = (__m128*)d[7]; __m128* _D7 = (__m128*)D[7]; )
731 NETX(__m128* _h0 = (__m128*)h[0]; __m128* _H0 = (__m128*)H[0]; ,
732 __m128* _h1 = (__m128*)h[1]; __m128* _H1 = (__m128*)H[1]; ,
733 __m128* _h2 = (__m128*)h[2]; __m128* _H2 = (__m128*)H[2]; ,
734 __m128* _h3 = (__m128*)h[3]; __m128* _H3 = (__m128*)H[3]; ,
735 __m128* _h4 = (__m128*)h[4]; __m128* _H4 = (__m128*)H[4]; ,
736 __m128* _h5 = (__m128*)h[5]; __m128* _H5 = (__m128*)H[5]; ,
737 __m128* _h6 = (__m128*)h[6]; __m128* _H6 = (__m128*)H[6]; ,
738 __m128* _h7 = (__m128*)h[7]; __m128* _H7 = (__m128*)H[7]; )
740 for(
int i=0;
i<I;
i+=4) {
741 NETX(*_n0++ = _mm_sub_ps(*_d0++,*_h0++); *_N0++ = _mm_sub_ps(*_D0++,*_H0++); ,
742 *_n1++ = _mm_sub_ps(*_d1++,*_h1++); *_N1++ = _mm_sub_ps(*_D1++,*_H1++); ,
743 *_n2++ = _mm_sub_ps(*_d2++,*_h2++); *_N2++ = _mm_sub_ps(*_D2++,*_H2++); ,
744 *_n3++ = _mm_sub_ps(*_d3++,*_h3++); *_N3++ = _mm_sub_ps(*_D3++,*_H3++); ,
745 *_n4++ = _mm_sub_ps(*_d4++,*_h4++); *_N4++ = _mm_sub_ps(*_D4++,*_H4++); ,
746 *_n5++ = _mm_sub_ps(*_d5++,*_h5++); *_N5++ = _mm_sub_ps(*_D5++,*_H5++); ,
747 *_n6++ = _mm_sub_ps(*_d6++,*_h6++); *_N6++ = _mm_sub_ps(*_D6++,*_H6++); ,
748 *_n7++ = _mm_sub_ps(*_d7++,*_h7++); *_N7++ = _mm_sub_ps(*_D7++,*_H7++); )
// ---------------------------------------------------------------------------
// Tail of a per-detector packet-amplitude routine. The signature head falls
// outside this chunk; the stub index at the end of the file suggests it is
//    static float _avx_packet_ps(float** p, float** q,
//                                std::vector<float*>& pAVX, int I)
// -- TODO confirm against the full file.
// p[d]/q[d] hold the 00/90-phase data streams of detector d (d = 0..7,
// selected by the NETX network-size macro defined elsewhere); pAVX[1] is the
// pixel mask array. For each detector the routine accumulates the masked
// quadratic form  aa=<p,p>, AA=<q,q>, aA=<p,q>, derives packet amplitudes and
// a phase rotation (half-angle formulas on (aa-AA, 2*aA)), stores them at the
// tail of q[d], and finally applies the rotation/normalization back onto the
// p/q streams.
// NOTE(review): a few original lines (a closing brace, a NETX( opener, and
// the loop close + return after the last visible statement) are missing from
// this chunk; restored tokens are flagged inline.
// ---------------------------------------------------------------------------
               std::vector<float*> &pAVX, int I) {

   __m128 _a, _A, _aa, _AA, _aA, _x, _cc, _ss, _nn, _cn, _sn, _mk;

   static const __m128 _0 = _mm_set1_ps(0);
   static const __m128 _1 = _mm_set1_ps(1);
   static const __m128 _2 = _mm_set1_ps(2);
   static const __m128 _o = _mm_set1_ps(0.0001);   // regularizer: keeps divisions finite
   static const __m128 sm = _mm_set1_ps(-0.f);     // sign-bit mask; andnot(sm,x) == |x|

   // Per-detector scalar scratch, aligned so each quad is addressable as __m128.
   float a[8] __attribute__((aligned(32))); __m128* _am = (__m128*)a;   // 00-phase amplitude
   float A[8] __attribute__((aligned(32))); __m128* _AM = (__m128*)A;   // 90-phase amplitude
   float s[8] __attribute__((aligned(32))); __m128* _si = (__m128*)s;   // sin of rotation angle
   float c[8] __attribute__((aligned(32))); __m128* _co = (__m128*)c;   // cos of rotation angle

   __m128* _MK = (__m128*)(pAVX[1]);               // pixel mask (>0 selects a pixel)
   __m128 aa[8], AA[8], aA[8], si[8], co[8];
   for(int i=0; i<8; i++) {                        // zero the per-detector accumulators
      aa[i] = _mm_setzero_ps();                    // running sum of p*p
      AA[i] = _mm_setzero_ps();                    // running sum of q*q
      aA[i] = _mm_setzero_ps();                    // running sum of p*q
   }                                               // NOTE(review): brace restored (chunk gap)

   NETX(__m128* _p0 = (__m128*)p[0]; __m128* _q0 = (__m128*)q[0]; ,
        __m128* _p1 = (__m128*)p[1]; __m128* _q1 = (__m128*)q[1]; ,
        __m128* _p2 = (__m128*)p[2]; __m128* _q2 = (__m128*)q[2]; ,
        __m128* _p3 = (__m128*)p[3]; __m128* _q3 = (__m128*)q[3]; ,
        __m128* _p4 = (__m128*)p[4]; __m128* _q4 = (__m128*)q[4]; ,
        __m128* _p5 = (__m128*)p[5]; __m128* _q5 = (__m128*)q[5]; ,
        __m128* _p6 = (__m128*)p[6]; __m128* _q6 = (__m128*)q[6]; ,
        __m128* _p7 = (__m128*)p[7]; __m128* _q7 = (__m128*)q[7]; )

   // Pass 1: accumulate the masked quadratic form over all pixels.
   for(int i=0; i<I; i+=4) {
      _mk = _mm_and_ps(_mm_cmpgt_ps(*_MK++,_0),_1);    // 1 where mask>0, else 0
      NETX(                                            // NOTE(review): NETX( opener restored (chunk gap)
      aa[0] = _mm_add_ps(aa[0],_mm_mul_ps(_mm_mul_ps(*_p0,*_p0), _mk));
      AA[0] = _mm_add_ps(AA[0],_mm_mul_ps(_mm_mul_ps(*_q0,*_q0), _mk));
      aA[0] = _mm_add_ps(aA[0],_mm_mul_ps(_mm_mul_ps(*_p0++,*_q0++),_mk));,
      aa[1] = _mm_add_ps(aa[1],_mm_mul_ps(_mm_mul_ps(*_p1,*_p1), _mk));
      AA[1] = _mm_add_ps(AA[1],_mm_mul_ps(_mm_mul_ps(*_q1,*_q1), _mk));
      aA[1] = _mm_add_ps(aA[1],_mm_mul_ps(_mm_mul_ps(*_p1++,*_q1++),_mk));,
      aa[2] = _mm_add_ps(aa[2],_mm_mul_ps(_mm_mul_ps(*_p2,*_p2), _mk));
      AA[2] = _mm_add_ps(AA[2],_mm_mul_ps(_mm_mul_ps(*_q2,*_q2), _mk));
      aA[2] = _mm_add_ps(aA[2],_mm_mul_ps(_mm_mul_ps(*_p2++,*_q2++),_mk));,
      aa[3] = _mm_add_ps(aa[3],_mm_mul_ps(_mm_mul_ps(*_p3,*_p3), _mk));
      AA[3] = _mm_add_ps(AA[3],_mm_mul_ps(_mm_mul_ps(*_q3,*_q3), _mk));
      aA[3] = _mm_add_ps(aA[3],_mm_mul_ps(_mm_mul_ps(*_p3++,*_q3++),_mk));,
      aa[4] = _mm_add_ps(aa[4],_mm_mul_ps(_mm_mul_ps(*_p4,*_p4), _mk));
      AA[4] = _mm_add_ps(AA[4],_mm_mul_ps(_mm_mul_ps(*_q4,*_q4), _mk));
      aA[4] = _mm_add_ps(aA[4],_mm_mul_ps(_mm_mul_ps(*_p4++,*_q4++),_mk));,
      aa[5] = _mm_add_ps(aa[5],_mm_mul_ps(_mm_mul_ps(*_p5,*_p5), _mk));
      AA[5] = _mm_add_ps(AA[5],_mm_mul_ps(_mm_mul_ps(*_q5,*_q5), _mk));
      aA[5] = _mm_add_ps(aA[5],_mm_mul_ps(_mm_mul_ps(*_p5++,*_q5++),_mk));,
      aa[6] = _mm_add_ps(aa[6],_mm_mul_ps(_mm_mul_ps(*_p6,*_p6), _mk));
      AA[6] = _mm_add_ps(AA[6],_mm_mul_ps(_mm_mul_ps(*_q6,*_q6), _mk));
      aA[6] = _mm_add_ps(aA[6],_mm_mul_ps(_mm_mul_ps(*_p6++,*_q6++),_mk));,
      aa[7] = _mm_add_ps(aa[7],_mm_mul_ps(_mm_mul_ps(*_p7,*_p7), _mk));
      AA[7] = _mm_add_ps(AA[7],_mm_mul_ps(_mm_mul_ps(*_q7,*_q7), _mk));
      aA[7] = _mm_add_ps(aA[7],_mm_mul_ps(_mm_mul_ps(*_p7++,*_q7++),_mk));)
   }                                                   // NOTE(review): loop brace restored (chunk gap)

   // Horizontal reduction for detectors 0-3: _aa/_AA/_aA hold the four
   // per-detector sums packed into one __m128 (one lane per detector).
   _a = _mm_hadd_ps(aa[0],aa[1]); _A = _mm_hadd_ps(aa[2],aa[3]); _aa = _mm_hadd_ps(_a,_A);
   _a = _mm_hadd_ps(AA[0],AA[1]); _A = _mm_hadd_ps(AA[2],AA[3]); _AA = _mm_hadd_ps(_a,_A);
   _a = _mm_hadd_ps(aA[0],aA[1]); _A = _mm_hadd_ps(aA[2],aA[3]); _aA = _mm_hadd_ps(_a,_A);

   // Amplitudes and rotation angle from the 2x2 form:
   //   nn = sqrt((aa-AA)^2 + (2*aA)^2),  am/AM = sqrt((x +/- nn)/2),
   //   then sin/cos via the half-angle formulas with sign from 2*aA.
   *_si = _mm_mul_ps(_aA,_2);
   *_co = _mm_sub_ps(_aa,_AA);
   _x  = _mm_add_ps(_mm_add_ps(_aa,_AA),_o);
   _cc = _mm_mul_ps(*_co,*_co);
   _ss = _mm_mul_ps(*_si,*_si);
   _nn = _mm_sqrt_ps(_mm_add_ps(_cc,_ss));
   *_am = _mm_sqrt_ps(_mm_div_ps(_mm_add_ps(_x,_nn),_2));
   *_AM = _mm_div_ps(_mm_sub_ps(_x,_nn),_2);
   *_AM = _mm_sqrt_ps(_mm_andnot_ps(sm,*_AM));       // |.| guards tiny negative round-off
   _cc = _mm_div_ps(*_co,_mm_add_ps(_nn,_o));        // cos(2t)
   _nn = _mm_and_ps(_mm_cmpgt_ps(*_si,_0),_1);       // sign(sin(2t)) as 0/1
   _ss = _mm_sub_ps(_mm_mul_ps(_2,_nn),_1);          // ... mapped to -1/+1
   *_si = _mm_sqrt_ps(_mm_div_ps(_mm_sub_ps(_1,_cc),_2));   // sin(t)
   *_co = _mm_sqrt_ps(_mm_div_ps(_mm_add_ps(_1,_cc),_2));   // cos(t)
   *_co = _mm_mul_ps(*_co,_ss);                      // carry the sign on cos

   _am++; _AM++; _si++; _co++;                       // advance to detectors 4-7
   // Same reduction and angle computation for detectors 4-7.
   _a = _mm_hadd_ps(aa[4],aa[5]); _A = _mm_hadd_ps(aa[6],aa[7]); _aa = _mm_hadd_ps(_a,_A);
   _a = _mm_hadd_ps(AA[4],AA[5]); _A = _mm_hadd_ps(AA[6],AA[7]); _AA = _mm_hadd_ps(_a,_A);
   _a = _mm_hadd_ps(aA[4],aA[5]); _A = _mm_hadd_ps(aA[6],aA[7]); _aA = _mm_hadd_ps(_a,_A);

   *_si = _mm_mul_ps(_aA,_2);
   *_co = _mm_sub_ps(_aa,_AA);
   _x  = _mm_add_ps(_mm_add_ps(_aa,_AA),_o);
   _cc = _mm_mul_ps(*_co,*_co);
   _ss = _mm_mul_ps(*_si,*_si);
   _nn = _mm_sqrt_ps(_mm_add_ps(_cc,_ss));
   *_am = _mm_sqrt_ps(_mm_div_ps(_mm_add_ps(_x,_nn),_2));
   *_AM = _mm_div_ps(_mm_sub_ps(_x,_nn),_2);
   *_AM = _mm_sqrt_ps(_mm_andnot_ps(sm,*_AM));
   _cc = _mm_div_ps(*_co,_mm_add_ps(_nn,_o));
   _nn = _mm_and_ps(_mm_cmpgt_ps(*_si,_0),_1);
   _ss = _mm_sub_ps(_mm_mul_ps(_2,_nn),_1);
   *_si = _mm_sqrt_ps(_mm_div_ps(_mm_sub_ps(_1,_cc),_2));
   *_co = _mm_sqrt_ps(_mm_div_ps(_mm_add_ps(_1,_cc),_2));
   *_co = _mm_mul_ps(*_co,_ss);

   // Store the packet parameters at offset II in each detector's q stream.
   // NOTE(review): II is declared in lines missing from this chunk -- confirm
   // its definition (likely derived from I) against the full file.
   NETX(q[0][II]=a[0]; q[0][II+1]=A[0]; q[0][II+2]=s[0]; q[0][II+3]=c[0]; q[0][II+5]=1;,
        q[1][II]=a[1]; q[1][II+1]=A[1]; q[1][II+2]=s[1]; q[1][II+3]=c[1]; q[1][II+5]=1;,
        q[2][II]=a[2]; q[2][II+1]=A[2]; q[2][II+2]=s[2]; q[2][II+3]=c[2]; q[2][II+5]=1;,
        q[3][II]=a[3]; q[3][II+1]=A[3]; q[3][II+2]=s[3]; q[3][II+3]=c[3]; q[3][II+5]=1;,
        q[4][II]=a[4]; q[4][II+1]=A[4]; q[4][II+2]=s[4]; q[4][II+3]=c[4]; q[4][II+5]=1;,
        q[5][II]=a[5]; q[5][II+1]=A[5]; q[5][II+2]=s[5]; q[5][II+3]=c[5]; q[5][II+5]=1;,
        q[6][II]=a[6]; q[6][II+1]=A[6]; q[6][II+2]=s[6]; q[6][II+3]=c[6]; q[6][II+5]=1;,
        q[7][II]=a[7]; q[7][II+1]=A[7]; q[7][II+2]=s[7]; q[7][II+3]=c[7]; q[7][II+5]=1;)

   // Total packet amplitude per detector; Ep accumulates sum (a+A)^2/2 over
   // detectors, and the stream pointers are rewound for pass 2.
   // NOTE(review): Ep is declared in lines missing from this chunk.
   NETX(q[0][II+4]=(a[0]+A[0]); Ep+=q[0][II+4]*q[0][II+4]/2; _p0 = (__m128*)p[0]; _q0 = (__m128*)q[0];,
        q[1][II+4]=(a[1]+A[1]); Ep+=q[1][II+4]*q[1][II+4]/2; _p1 = (__m128*)p[1]; _q1 = (__m128*)q[1];,
        q[2][II+4]=(a[2]+A[2]); Ep+=q[2][II+4]*q[2][II+4]/2; _p2 = (__m128*)p[2]; _q2 = (__m128*)q[2];,
        q[3][II+4]=(a[3]+A[3]); Ep+=q[3][II+4]*q[3][II+4]/2; _p3 = (__m128*)p[3]; _q3 = (__m128*)q[3];,
        q[4][II+4]=(a[4]+A[4]); Ep+=q[4][II+4]*q[4][II+4]/2; _p4 = (__m128*)p[4]; _q4 = (__m128*)q[4];,
        q[5][II+4]=(a[5]+A[5]); Ep+=q[5][II+4]*q[5][II+4]/2; _p5 = (__m128*)p[5]; _q5 = (__m128*)q[5];,
        q[6][II+4]=(a[6]+A[6]); Ep+=q[6][II+4]*q[6][II+4]/2; _p6 = (__m128*)p[6]; _q6 = (__m128*)q[6];,
        q[7][II+4]=(a[7]+A[7]); Ep+=q[7][II+4]*q[7][II+4]/2; _p7 = (__m128*)p[7]; _q7 = (__m128*)q[7];)

   // Replace amplitudes by regularized reciprocals 1/(amp+_o), for both quads
   // (_am/_AM currently point at the second quad; step back for the first).
   *_am = _mm_div_ps(_1,_mm_add_ps(*_am,_o)); _am--;
   *_am = _mm_div_ps(_1,_mm_add_ps(*_am,_o));
   *_AM = _mm_div_ps(_1,_mm_add_ps(*_AM,_o)); _AM--;
   *_AM = _mm_div_ps(_1,_mm_add_ps(*_AM,_o));

   // Broadcast the per-detector scalars into full __m128 registers for pass 2.
   NETX(aa[0]=_mm_set1_ps(a[0]); AA[0]=_mm_set1_ps(A[0]); si[0]=_mm_set1_ps(s[0]); co[0]=_mm_set1_ps(c[0]); ,
        aa[1]=_mm_set1_ps(a[1]); AA[1]=_mm_set1_ps(A[1]); si[1]=_mm_set1_ps(s[1]); co[1]=_mm_set1_ps(c[1]); ,
        aa[2]=_mm_set1_ps(a[2]); AA[2]=_mm_set1_ps(A[2]); si[2]=_mm_set1_ps(s[2]); co[2]=_mm_set1_ps(c[2]); ,
        aa[3]=_mm_set1_ps(a[3]); AA[3]=_mm_set1_ps(A[3]); si[3]=_mm_set1_ps(s[3]); co[3]=_mm_set1_ps(c[3]); ,
        aa[4]=_mm_set1_ps(a[4]); AA[4]=_mm_set1_ps(A[4]); si[4]=_mm_set1_ps(s[4]); co[4]=_mm_set1_ps(c[4]); ,
        aa[5]=_mm_set1_ps(a[5]); AA[5]=_mm_set1_ps(A[5]); si[5]=_mm_set1_ps(s[5]); co[5]=_mm_set1_ps(c[5]); ,
        aa[6]=_mm_set1_ps(a[6]); AA[6]=_mm_set1_ps(A[6]); si[6]=_mm_set1_ps(s[6]); co[6]=_mm_set1_ps(c[6]); ,
        aa[7]=_mm_set1_ps(a[7]); AA[7]=_mm_set1_ps(A[7]); si[7]=_mm_set1_ps(s[7]); co[7]=_mm_set1_ps(c[7]); )

   // Pass 2: rotate each (p,q) pair by the packet angle, normalize by the
   // reciprocal amplitudes (aa/AA now hold 1/(amp+_o)), and apply the mask.
   _MK = (__m128*)(pAVX[1]);
   for(int i=0; i<I; i+=4) {
      _mk = _mm_and_ps(_mm_cmpgt_ps(*_MK++,_0),_1);
      NETX(_a=_mm_add_ps(_mm_mul_ps(*_p0,co[0]),_mm_mul_ps(*_q0,si[0]));
           _A=_mm_sub_ps(_mm_mul_ps(*_q0,co[0]),_mm_mul_ps(*_p0,si[0]));
           *_p0++ = _mm_mul_ps(_mk,_mm_mul_ps(_a,aa[0]));
           *_q0++ = _mm_mul_ps(_mk,_mm_mul_ps(_A,AA[0])); ,
           _a=_mm_add_ps(_mm_mul_ps(*_p1,co[1]),_mm_mul_ps(*_q1,si[1]));
           _A=_mm_sub_ps(_mm_mul_ps(*_q1,co[1]),_mm_mul_ps(*_p1,si[1]));
           *_p1++ = _mm_mul_ps(_mk,_mm_mul_ps(_a,aa[1]));
           *_q1++ = _mm_mul_ps(_mk,_mm_mul_ps(_A,AA[1])); ,
           _a=_mm_add_ps(_mm_mul_ps(*_p2,co[2]),_mm_mul_ps(*_q2,si[2]));
           _A=_mm_sub_ps(_mm_mul_ps(*_q2,co[2]),_mm_mul_ps(*_p2,si[2]));
           *_p2++ = _mm_mul_ps(_mk,_mm_mul_ps(_a,aa[2]));
           *_q2++ = _mm_mul_ps(_mk,_mm_mul_ps(_A,AA[2])); ,
           _a=_mm_add_ps(_mm_mul_ps(*_p3,co[3]),_mm_mul_ps(*_q3,si[3]));
           _A=_mm_sub_ps(_mm_mul_ps(*_q3,co[3]),_mm_mul_ps(*_p3,si[3]));
           *_p3++ = _mm_mul_ps(_mk,_mm_mul_ps(_a,aa[3]));
           *_q3++ = _mm_mul_ps(_mk,_mm_mul_ps(_A,AA[3])); ,
           _a=_mm_add_ps(_mm_mul_ps(*_p4,co[4]),_mm_mul_ps(*_q4,si[4]));
           _A=_mm_sub_ps(_mm_mul_ps(*_q4,co[4]),_mm_mul_ps(*_p4,si[4]));
           *_p4++ = _mm_mul_ps(_mk,_mm_mul_ps(_a,aa[4]));
           *_q4++ = _mm_mul_ps(_mk,_mm_mul_ps(_A,AA[4])); ,
           _a=_mm_add_ps(_mm_mul_ps(*_p5,co[5]),_mm_mul_ps(*_q5,si[5]));
           _A=_mm_sub_ps(_mm_mul_ps(*_q5,co[5]),_mm_mul_ps(*_p5,si[5]));
           *_p5++ = _mm_mul_ps(_mk,_mm_mul_ps(_a,aa[5]));
           *_q5++ = _mm_mul_ps(_mk,_mm_mul_ps(_A,AA[5])); ,
           _a=_mm_add_ps(_mm_mul_ps(*_p6,co[6]),_mm_mul_ps(*_q6,si[6]));
           _A=_mm_sub_ps(_mm_mul_ps(*_q6,co[6]),_mm_mul_ps(*_p6,si[6]));
           *_p6++ = _mm_mul_ps(_mk,_mm_mul_ps(_a,aa[6]));
           *_q6++ = _mm_mul_ps(_mk,_mm_mul_ps(_A,AA[6])); ,
           _a=_mm_add_ps(_mm_mul_ps(*_p7,co[7]),_mm_mul_ps(*_q7,si[7]));
           _A=_mm_sub_ps(_mm_mul_ps(*_q7,co[7]),_mm_mul_ps(*_p7,si[7]));
           *_p7++ = _mm_mul_ps(_mk,_mm_mul_ps(_a,aa[7]));
           *_q7++ = _mm_mul_ps(_mk,_mm_mul_ps(_A,AA[7])); )
   // NOTE(review): the loop close and the function's return statement fall in
   // a gap of this chunk; see the full file for the remaining lines.
// ---------------------------------------------------------------------------
// Tail of the amplitude-setting routine (signature head is outside this
// chunk; the trailing stub index suggests
//    static float _avx_setAMP_ps(float** p, float** q,
//                                std::vector<float*>& pAVX, int I)
// -- TODO confirm against the full file).
// For each detector d the second half of q[d] (q[d]+I) holds a noise/weight
// array; q[d][I2..I4] hold the per-detector sin/cos/amplitude produced
// earlier. The loop re-scales each (p,q) pair by amplitude*mask*noise and
// rotates it by (c,s), accumulating the masked noise sum in _Np.
// ---------------------------------------------------------------------------
               std::vector<float*> &pAVX, int I) {

   float k = pAVX[1][I];          // scalar stored past the mask array -- not used in
                                  // the visible portion; presumably used in the missing tail
   __m128 _a, _A, _n, _mk, _nn;
   static const __m128 _o = _mm_set1_ps(1.e-9);   // regularizer
   static const __m128 _0 = _mm_set1_ps(0);
   static const __m128 _1 = _mm_set1_ps(1);
   static const __m128 _4 = _mm_set1_ps(4);       // not used in the visible portion
   static const __m128 o5 = _mm_set1_ps(0.5);

   // stream pointers: _pd/_qd walk the data, _nd walks the noise tail q[d]+I
   NETX(__m128* _p0 = (__m128*)p[0]; __m128* _q0 = (__m128*)q[0]; __m128* _n0 = (__m128*)(q[0]+I); ,
        __m128* _p1 = (__m128*)p[1]; __m128* _q1 = (__m128*)q[1]; __m128* _n1 = (__m128*)(q[1]+I); ,
        __m128* _p2 = (__m128*)p[2]; __m128* _q2 = (__m128*)q[2]; __m128* _n2 = (__m128*)(q[2]+I); ,
        __m128* _p3 = (__m128*)p[3]; __m128* _q3 = (__m128*)q[3]; __m128* _n3 = (__m128*)(q[3]+I); ,
        __m128* _p4 = (__m128*)p[4]; __m128* _q4 = (__m128*)q[4]; __m128* _n4 = (__m128*)(q[4]+I); ,
        __m128* _p5 = (__m128*)p[5]; __m128* _q5 = (__m128*)q[5]; __m128* _n5 = (__m128*)(q[5]+I); ,
        __m128* _p6 = (__m128*)p[6]; __m128* _q6 = (__m128*)q[6]; __m128* _n6 = (__m128*)(q[6]+I); ,
        __m128* _p7 = (__m128*)p[7]; __m128* _q7 = (__m128*)q[7]; __m128* _n7 = (__m128*)(q[7]+I); )

   // Broadcast per-detector amplitude a?, sin s?, cos c? from q[d][I4/I2/I3].
   // NOTE(review): I2, I3, I4 are defined in lines missing from this chunk
   // (presumably I+2, I+3, I+4) -- confirm against the full file.
   NETX(__m128 a0=_mm_set1_ps(q[0][I4]); __m128 s0=_mm_set1_ps(q[0][I2]); __m128 c0=_mm_set1_ps(q[0][I3]); ,
        __m128 a1=_mm_set1_ps(q[1][I4]); __m128 s1=_mm_set1_ps(q[1][I2]); __m128 c1=_mm_set1_ps(q[1][I3]); ,
        __m128 a2=_mm_set1_ps(q[2][I4]); __m128 s2=_mm_set1_ps(q[2][I2]); __m128 c2=_mm_set1_ps(q[2][I3]); ,
        __m128 a3=_mm_set1_ps(q[3][I4]); __m128 s3=_mm_set1_ps(q[3][I2]); __m128 c3=_mm_set1_ps(q[3][I3]); ,
        __m128 a4=_mm_set1_ps(q[4][I4]); __m128 s4=_mm_set1_ps(q[4][I2]); __m128 c4=_mm_set1_ps(q[4][I3]); ,
        __m128 a5=_mm_set1_ps(q[5][I4]); __m128 s5=_mm_set1_ps(q[5][I2]); __m128 c5=_mm_set1_ps(q[5][I3]); ,
        __m128 a6=_mm_set1_ps(q[6][I4]); __m128 s6=_mm_set1_ps(q[6][I2]); __m128 c6=_mm_set1_ps(q[6][I3]); ,
        __m128 a7=_mm_set1_ps(q[7][I4]); __m128 s7=_mm_set1_ps(q[7][I2]); __m128 c7=_mm_set1_ps(q[7][I3]); )

   __m128* _MK = (__m128*)pAVX[1];   // pixel mask
   __m128* _fp = (__m128*)pAVX[2];   // not used in the visible portion
   __m128* _fx = (__m128*)pAVX[3];   // not used in the visible portion
   __m128* _ee = (__m128*)pAVX[15];  // not used in the visible portion
   __m128* _EE = (__m128*)pAVX[16];  // not used in the visible portion
   __m128* _gn = (__m128*)pAVX[20];  // not used in the visible portion

   __m128 _Np = _mm_setzero_ps();    // masked noise accumulator

   for(int i=0; i<I; i+=4) {
      _mk = _mm_mul_ps(o5,_mm_and_ps(_mm_cmpgt_ps(*_MK++,_0),_1));  // 0.5 where mask>0
      NETX(                                   // NOTE(review): NETX( opener restored (chunk gap)
      _n = _mm_mul_ps(_mm_mul_ps(a0,_mk),*_n0); _a=*_p0; _A=*_q0; _nn=*_n0++;
      *_p0++ = _mm_mul_ps(_n,_mm_sub_ps(_mm_mul_ps(_a,c0),_mm_mul_ps(_A,s0)));
      *_q0++ = _mm_mul_ps(_n,_mm_add_ps(_mm_mul_ps(_A,c0),_mm_mul_ps(_a,s0))); ,
      _n = _mm_mul_ps(_mm_mul_ps(a1,_mk),*_n1); _a=*_p1; _A=*_q1; _nn=_mm_add_ps(_nn,*_n1++);
      *_p1++ = _mm_mul_ps(_n,_mm_sub_ps(_mm_mul_ps(_a,c1),_mm_mul_ps(_A,s1)));
      *_q1++ = _mm_mul_ps(_n,_mm_add_ps(_mm_mul_ps(_A,c1),_mm_mul_ps(_a,s1))); ,
      _n = _mm_mul_ps(_mm_mul_ps(a2,_mk),*_n2); _a=*_p2; _A=*_q2; _nn=_mm_add_ps(_nn,*_n2++);
      *_p2++ = _mm_mul_ps(_n,_mm_sub_ps(_mm_mul_ps(_a,c2),_mm_mul_ps(_A,s2)));
      *_q2++ = _mm_mul_ps(_n,_mm_add_ps(_mm_mul_ps(_A,c2),_mm_mul_ps(_a,s2))); ,
      _n = _mm_mul_ps(_mm_mul_ps(a3,_mk),*_n3); _a=*_p3; _A=*_q3; _nn=_mm_add_ps(_nn,*_n3++);
      *_p3++ = _mm_mul_ps(_n,_mm_sub_ps(_mm_mul_ps(_a,c3),_mm_mul_ps(_A,s3)));
      *_q3++ = _mm_mul_ps(_n,_mm_add_ps(_mm_mul_ps(_A,c3),_mm_mul_ps(_a,s3))); ,
      _n = _mm_mul_ps(_mm_mul_ps(a4,_mk),*_n4); _a=*_p4; _A=*_q4; _nn=_mm_add_ps(_nn,*_n4++);
      *_p4++ = _mm_mul_ps(_n,_mm_sub_ps(_mm_mul_ps(_a,c4),_mm_mul_ps(_A,s4)));
      *_q4++ = _mm_mul_ps(_n,_mm_add_ps(_mm_mul_ps(_A,c4),_mm_mul_ps(_a,s4))); ,
      _n = _mm_mul_ps(_mm_mul_ps(a5,_mk),*_n5); _a=*_p5; _A=*_q5; _nn=_mm_add_ps(_nn,*_n5++);
      *_p5++ = _mm_mul_ps(_n,_mm_sub_ps(_mm_mul_ps(_a,c5),_mm_mul_ps(_A,s5)));
      *_q5++ = _mm_mul_ps(_n,_mm_add_ps(_mm_mul_ps(_A,c5),_mm_mul_ps(_a,s5))); ,
      _n = _mm_mul_ps(_mm_mul_ps(a6,_mk),*_n6); _a=*_p6; _A=*_q6; _nn=_mm_add_ps(_nn,*_n6++);
      *_p6++ = _mm_mul_ps(_n,_mm_sub_ps(_mm_mul_ps(_a,c6),_mm_mul_ps(_A,s6)));
      *_q6++ = _mm_mul_ps(_n,_mm_add_ps(_mm_mul_ps(_A,c6),_mm_mul_ps(_a,s6))); ,
      _n = _mm_mul_ps(_mm_mul_ps(a7,_mk),*_n7); _a=*_p7; _A=*_q7; _nn=_mm_add_ps(_nn,*_n7++);
      *_p7++ = _mm_mul_ps(_n,_mm_sub_ps(_mm_mul_ps(_a,c7),_mm_mul_ps(_A,s7)));
      *_q7++ = _mm_mul_ps(_n,_mm_add_ps(_mm_mul_ps(_A,c7),_mm_mul_ps(_a,s7))); )

      _nn = _mm_mul_ps(_nn,_mk);          // mask the summed noise for this quad
      _Np = _mm_add_ps(_Np,_nn);          // accumulate network noise
   // NOTE(review): the loop close and the function's return fall in a gap of
   // this chunk; see the full file for the remaining lines.
// ---------------------------------------------------------------------------
// Tail of the orthogonalization routine (signature head is outside this
// chunk; the trailing stub index gives
//    static float _avx_ort_ps(float** p, float** q,
//                             std::vector<float*>& pAVX, int I)
// -- TODO confirm against the full file).
// For every pixel quad it sums aa=<p,p>, AA=<q,q>, aA=<p,q> across the
// detectors, then derives the orthogonalization angle (half-angle formulas on
// (aa-AA, 2*aA)) plus the rotated energies ee/EE, storing the per-pixel
// sin/cos into pAVX[4]/pAVX[5] and the energies into pAVX[15]/pAVX[16].
// Masked totals are accumulated in _e/_E.
// ---------------------------------------------------------------------------
               std::vector<float*> &pAVX, int I) {

   __m128 _a,_A,_aa,_AA,_aA,_et,_cc,_ss,_nn,_cn,_sn,_mk;

   static const __m128 _0 = _mm_set1_ps(0);
   static const __m128 _1 = _mm_set1_ps(1);
   static const __m128 _2 = _mm_set1_ps(2);
   static const __m128 _o = _mm_set1_ps(1.e-21);   // regularizer for the energy sum
   static const __m128 sm = _mm_set1_ps(-0.f);     // sign-bit mask (unused in visible portion)

   __m128* _MK = (__m128*)pAVX[1];   // pixel mask
   __m128* _si = (__m128*)pAVX[4];   // output: sin of orthogonalization angle
   __m128* _co = (__m128*)pAVX[5];   // output: cos of orthogonalization angle
   __m128* _ee = (__m128*)pAVX[15];  // output: major (00) energy
   __m128* _EE = (__m128*)pAVX[16];  // output: minor (90) energy
   __m128 _e = _mm_setzero_ps();     // masked sum of ee
   __m128 _E = _mm_setzero_ps();     // masked sum of EE

   NETX(__m128* _p0 = (__m128*)p[0]; __m128* _q0 = (__m128*)q[0]; ,
        __m128* _p1 = (__m128*)p[1]; __m128* _q1 = (__m128*)q[1]; ,
        __m128* _p2 = (__m128*)p[2]; __m128* _q2 = (__m128*)q[2]; ,
        __m128* _p3 = (__m128*)p[3]; __m128* _q3 = (__m128*)q[3]; ,
        __m128* _p4 = (__m128*)p[4]; __m128* _q4 = (__m128*)q[4]; ,
        __m128* _p5 = (__m128*)p[5]; __m128* _q5 = (__m128*)q[5]; ,
        __m128* _p6 = (__m128*)p[6]; __m128* _q6 = (__m128*)q[6]; ,
        __m128* _p7 = (__m128*)p[7]; __m128* _q7 = (__m128*)q[7]; )

   for(int i=0; i<I; i+=4) {
      // network sums over detectors: slot 0 initializes, slots 1-7 accumulate
      NETX(                                   // NOTE(review): NETX( opener restored (chunk gap)
      _aa=_mm_mul_ps(*_p0,*_p0);
      _AA=_mm_mul_ps(*_q0,*_q0);
      _aA=_mm_mul_ps(*_p0++,*_q0++); ,
      _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p1,*_p1));
      _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q1,*_q1));
      _aA = _mm_add_ps(_aA,_mm_mul_ps(*_p1++,*_q1++)); ,
      _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p2,*_p2));
      _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q2,*_q2));
      _aA = _mm_add_ps(_aA,_mm_mul_ps(*_p2++,*_q2++)); ,
      _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p3,*_p3));
      _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q3,*_q3));
      _aA = _mm_add_ps(_aA,_mm_mul_ps(*_p3++,*_q3++)); ,
      _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p4,*_p4));
      _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q4,*_q4));
      _aA = _mm_add_ps(_aA,_mm_mul_ps(*_p4++,*_q4++)); ,
      _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p5,*_p5));
      _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q5,*_q5));
      _aA = _mm_add_ps(_aA,_mm_mul_ps(*_p5++,*_q5++)); ,
      _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p6,*_p6));
      _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q6,*_q6));
      _aA = _mm_add_ps(_aA,_mm_mul_ps(*_p6++,*_q6++)); ,
      _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p7,*_p7));
      _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q7,*_q7));
      _aA = _mm_add_ps(_aA,_mm_mul_ps(*_p7++,*_q7++)); )

      // angle and rotated energies for this pixel quad
      _mk = _mm_and_ps(_mm_cmpgt_ps(*_MK,_0),_1);     // 1 where mask>0
      *_si = _mm_mul_ps(_aA,_2);                      // 2*<p,q>
      *_co = _mm_sub_ps(_aa,_AA);                     // <p,p>-<q,q>
      _et = _mm_add_ps(_mm_add_ps(_aa,_AA),_o);       // total energy, regularized
      _cc = _mm_mul_ps(*_co,*_co);
      _ss = _mm_mul_ps(*_si,*_si);
      _nn = _mm_sqrt_ps(_mm_add_ps(_cc,_ss));         // norm of (co,si)
      *_ee = _mm_div_ps(_mm_add_ps(_et,_nn),_2);      // major energy
      *_EE = _mm_div_ps(_mm_sub_ps(_et,_nn),_2);      // minor energy
      _cc = _mm_div_ps(*_co,_mm_add_ps(_nn,_o));      // cos(2t)
      _nn = _mm_and_ps(_mm_cmpgt_ps(*_si,_0),_1);     // sign(sin(2t)) as 0/1
      _ss = _mm_sub_ps(_mm_mul_ps(_2,_nn),_1);        // ... mapped to -1/+1
      *_si = _mm_sqrt_ps(_mm_div_ps(_mm_sub_ps(_1,_cc),_2));   // sin(t)
      *_co = _mm_sqrt_ps(_mm_div_ps(_mm_add_ps(_1,_cc),_2));   // cos(t)
      *_co = _mm_mul_ps(*_co,_ss);                    // sign carried on cos

      _e = _mm_add_ps(_e,_mm_mul_ps(_mk,*_ee));       // masked energy totals
      _E = _mm_add_ps(_E,_mm_mul_ps(_mk,*_EE));

      _si++; _co++; _MK++; _ee++; _EE++;
   // NOTE(review): the loop close and the function's return fall in a gap of
   // this chunk; see the full file for the remaining lines.
// ---------------------------------------------------------------------------
// Tail of the coherent-statistics routine (the head of the signature is
// outside this chunk; the trailing stub index gives
//    static __m256 _avx_stat_ps(float** x, float** X, float** s, float** S,
//                               std::vector<float*>& pAVX, int I) ).
// x/X are the 00/90-phase data, s/S the reconstructed responses. Per pixel
// quad it rotates both pairs into the (co,si) frame stored in pAVX[4]/[5],
// accumulates cross-correlation (_xs/_XS), auto terms and residuals, then
// forms per-pixel coherent energy (_ec), Gaussian-noise (_gn) and residual
// (_rn) arrays plus the network accumulators _LL,_Lr,_EC,_GN,_RN,_NN.
// ---------------------------------------------------------------------------
               float** s, float** S,
               std::vector<float*> &pAVX, int I) {

   float k = pAVX[1][I];             // network parameter stored past the mask array
   __m128 _a,_A,_x,_X,_c,_C,_s,_S,_r,_R,_rr,_RR,_xs,_XS;
   __m128 _ll,_ei,_cc,_mk,_mm,_ss,_SS;
   static const __m128 _o = _mm_set1_ps(0.001);   // regularizer
   static const __m128 _0 = _mm_set1_ps(0);
   static const __m128 _1 = _mm_set1_ps(1);
   static const __m128 _2 = _mm_set1_ps(2);
   static const __m128 _k = _mm_set1_ps(2*(1-k)); // not used in the visible portion
   static const __m128 sm = _mm_set1_ps(-0.f);    // sign-bit mask; andnot(sm,x) == |x|

   __m128* _et = (__m128*)pAVX[0];   // total energy (unused in visible portion)
   __m128* _MK = (__m128*)pAVX[1];   // pixel mask
   __m128* _si = (__m128*)pAVX[4];   // per-pixel sin
   __m128* _co = (__m128*)pAVX[5];   // per-pixel cos
   __m128* _ec = (__m128*)pAVX[19];  // output: coherent energy
   __m128* _gn = (__m128*)pAVX[20];  // output: Gaussian-noise term
   __m128* _ed = (__m128*)pAVX[21];  // advanced but not written in visible portion
   __m128* _rn = (__m128*)pAVX[22];  // output: residual energy

   __m128 _LL = _mm_setzero_ps();    // masked total of ss+SS
   __m128 _Lr = _mm_setzero_ps();    // correlation-weighted total
   __m128 _EC = _mm_setzero_ps();    // total coherent energy
   __m128 _GN = _mm_setzero_ps();    // total Gaussian noise
   __m128 _RN = _mm_setzero_ps();    // total residual
   __m128 _NN = _mm_setzero_ps();    // count of pixels with ec > _o

   NETX(__m128* _x0 = (__m128*)x[0]; __m128* _X0 = (__m128*)X[0]; ,
        __m128* _x1 = (__m128*)x[1]; __m128* _X1 = (__m128*)X[1]; ,
        __m128* _x2 = (__m128*)x[2]; __m128* _X2 = (__m128*)X[2]; ,
        __m128* _x3 = (__m128*)x[3]; __m128* _X3 = (__m128*)X[3]; ,
        __m128* _x4 = (__m128*)x[4]; __m128* _X4 = (__m128*)X[4]; ,
        __m128* _x5 = (__m128*)x[5]; __m128* _X5 = (__m128*)X[5]; ,
        __m128* _x6 = (__m128*)x[6]; __m128* _X6 = (__m128*)X[6]; ,
        __m128* _x7 = (__m128*)x[7]; __m128* _X7 = (__m128*)X[7]; )

   NETX(__m128* _s0 = (__m128*)s[0]; __m128* _S0 = (__m128*)S[0]; ,
        __m128* _s1 = (__m128*)s[1]; __m128* _S1 = (__m128*)S[1]; ,
        __m128* _s2 = (__m128*)s[2]; __m128* _S2 = (__m128*)S[2]; ,
        __m128* _s3 = (__m128*)s[3]; __m128* _S3 = (__m128*)S[3]; ,
        __m128* _s4 = (__m128*)s[4]; __m128* _S4 = (__m128*)S[4]; ,
        __m128* _s5 = (__m128*)s[5]; __m128* _S5 = (__m128*)S[5]; ,
        __m128* _s6 = (__m128*)s[6]; __m128* _S6 = (__m128*)S[6]; ,
        __m128* _s7 = (__m128*)s[7]; __m128* _S7 = (__m128*)S[7]; )

   for(int i=0; i<I; i+=4) {
      // Per detector: rotate (s,S) and (x,X), form residuals r/R = s-x, S-X,
      // cross terms a/A = s*x, S*X, and accumulate network sums
      // _xs/_XS (cross), _c/_C (cross^2), _ss/_SS (signal), _rr/_RR (residual).
      // Slot 0 initializes, slots 1-7 accumulate.
      NETX(                                   // NOTE(review): NETX( opener restored (chunk gap)
      _s=_mm_add_ps(_mm_mul_ps(*_s0,*_co),_mm_mul_ps(*_S0,*_si));    _r=_mm_sub_ps(*_s0,*_x0);
      _x=_mm_add_ps(_mm_mul_ps(*_x0,*_co),_mm_mul_ps(*_X0,*_si));    _R=_mm_sub_ps(*_S0,*_X0);
      _S=_mm_sub_ps(_mm_mul_ps(*_S0++,*_co),_mm_mul_ps(*_s0++,*_si)); _a=_mm_mul_ps(_s,_x); _xs=_a;
      _X=_mm_sub_ps(_mm_mul_ps(*_X0++,*_co),_mm_mul_ps(*_x0++,*_si)); _A=_mm_mul_ps(_S,_X); _XS=_A;
      _c=_mm_mul_ps(_a,_a); _ss=_mm_mul_ps(_s,_s); _rr=_mm_mul_ps(_r,_r);
      _C=_mm_mul_ps(_A,_A); _SS=_mm_mul_ps(_S,_S); _RR=_mm_mul_ps(_R,_R); ,
      _s=_mm_add_ps(_mm_mul_ps(*_s1,*_co), _mm_mul_ps(*_S1,*_si));    _r=_mm_sub_ps(*_s1,*_x1);
      _x=_mm_add_ps(_mm_mul_ps(*_x1,*_co), _mm_mul_ps(*_X1,*_si));    _R=_mm_sub_ps(*_S1,*_X1);
      _S=_mm_sub_ps(_mm_mul_ps(*_S1++,*_co),_mm_mul_ps(*_s1++,*_si)); _a=_mm_mul_ps(_s,_x); _xs=_mm_add_ps(_xs,_a);
      _X=_mm_sub_ps(_mm_mul_ps(*_X1++,*_co),_mm_mul_ps(*_x1++,*_si)); _A=_mm_mul_ps(_S,_X); _XS=_mm_add_ps(_XS,_A);
      _c=_mm_add_ps(_c,_mm_mul_ps(_a,_a)); _ss=_mm_add_ps(_ss,_mm_mul_ps(_s,_s)); _rr=_mm_add_ps(_rr,_mm_mul_ps(_r,_r));
      _C=_mm_add_ps(_C,_mm_mul_ps(_A,_A)); _SS=_mm_add_ps(_SS,_mm_mul_ps(_S,_S)); _RR=_mm_add_ps(_RR,_mm_mul_ps(_R,_R)); ,
      _s=_mm_add_ps(_mm_mul_ps(*_s2,*_co), _mm_mul_ps(*_S2,*_si));    _r=_mm_sub_ps(*_s2,*_x2);
      _x=_mm_add_ps(_mm_mul_ps(*_x2,*_co), _mm_mul_ps(*_X2,*_si));    _R=_mm_sub_ps(*_S2,*_X2);
      _S=_mm_sub_ps(_mm_mul_ps(*_S2++,*_co),_mm_mul_ps(*_s2++,*_si)); _a=_mm_mul_ps(_s,_x); _xs=_mm_add_ps(_xs,_a);
      _X=_mm_sub_ps(_mm_mul_ps(*_X2++,*_co),_mm_mul_ps(*_x2++,*_si)); _A=_mm_mul_ps(_S,_X); _XS=_mm_add_ps(_XS,_A);
      _c=_mm_add_ps(_c,_mm_mul_ps(_a,_a)); _ss=_mm_add_ps(_ss,_mm_mul_ps(_s,_s)); _rr=_mm_add_ps(_rr,_mm_mul_ps(_r,_r));
      _C=_mm_add_ps(_C,_mm_mul_ps(_A,_A)); _SS=_mm_add_ps(_SS,_mm_mul_ps(_S,_S)); _RR=_mm_add_ps(_RR,_mm_mul_ps(_R,_R)); ,
      _s=_mm_add_ps(_mm_mul_ps(*_s3,*_co), _mm_mul_ps(*_S3,*_si));    _r=_mm_sub_ps(*_s3,*_x3);
      _x=_mm_add_ps(_mm_mul_ps(*_x3,*_co), _mm_mul_ps(*_X3,*_si));    _R=_mm_sub_ps(*_S3,*_X3);
      _S=_mm_sub_ps(_mm_mul_ps(*_S3++,*_co),_mm_mul_ps(*_s3++,*_si)); _a=_mm_mul_ps(_s,_x); _xs=_mm_add_ps(_xs,_a);
      _X=_mm_sub_ps(_mm_mul_ps(*_X3++,*_co),_mm_mul_ps(*_x3++,*_si)); _A=_mm_mul_ps(_S,_X); _XS=_mm_add_ps(_XS,_A);
      _c=_mm_add_ps(_c,_mm_mul_ps(_a,_a)); _ss=_mm_add_ps(_ss,_mm_mul_ps(_s,_s)); _rr=_mm_add_ps(_rr,_mm_mul_ps(_r,_r));
      _C=_mm_add_ps(_C,_mm_mul_ps(_A,_A)); _SS=_mm_add_ps(_SS,_mm_mul_ps(_S,_S)); _RR=_mm_add_ps(_RR,_mm_mul_ps(_R,_R)); ,
      _s=_mm_add_ps(_mm_mul_ps(*_s4,*_co), _mm_mul_ps(*_S4,*_si));    _r=_mm_sub_ps(*_s4,*_x4);
      _x=_mm_add_ps(_mm_mul_ps(*_x4,*_co), _mm_mul_ps(*_X4,*_si));    _R=_mm_sub_ps(*_S4,*_X4);
      _S=_mm_sub_ps(_mm_mul_ps(*_S4++,*_co),_mm_mul_ps(*_s4++,*_si)); _a=_mm_mul_ps(_s,_x); _xs=_mm_add_ps(_xs,_a);
      _X=_mm_sub_ps(_mm_mul_ps(*_X4++,*_co),_mm_mul_ps(*_x4++,*_si)); _A=_mm_mul_ps(_S,_X); _XS=_mm_add_ps(_XS,_A);
      _c=_mm_add_ps(_c,_mm_mul_ps(_a,_a)); _ss=_mm_add_ps(_ss,_mm_mul_ps(_s,_s)); _rr=_mm_add_ps(_rr,_mm_mul_ps(_r,_r));
      _C=_mm_add_ps(_C,_mm_mul_ps(_A,_A)); _SS=_mm_add_ps(_SS,_mm_mul_ps(_S,_S)); _RR=_mm_add_ps(_RR,_mm_mul_ps(_R,_R)); ,
      _s=_mm_add_ps(_mm_mul_ps(*_s5,*_co), _mm_mul_ps(*_S5,*_si));    _r=_mm_sub_ps(*_s5,*_x5);
      _x=_mm_add_ps(_mm_mul_ps(*_x5,*_co), _mm_mul_ps(*_X5,*_si));    _R=_mm_sub_ps(*_S5,*_X5);
      _S=_mm_sub_ps(_mm_mul_ps(*_S5++,*_co),_mm_mul_ps(*_s5++,*_si)); _a=_mm_mul_ps(_s,_x); _xs=_mm_add_ps(_xs,_a);
      _X=_mm_sub_ps(_mm_mul_ps(*_X5++,*_co),_mm_mul_ps(*_x5++,*_si)); _A=_mm_mul_ps(_S,_X); _XS=_mm_add_ps(_XS,_A);
      _c=_mm_add_ps(_c,_mm_mul_ps(_a,_a)); _ss=_mm_add_ps(_ss,_mm_mul_ps(_s,_s)); _rr=_mm_add_ps(_rr,_mm_mul_ps(_r,_r));
      _C=_mm_add_ps(_C,_mm_mul_ps(_A,_A)); _SS=_mm_add_ps(_SS,_mm_mul_ps(_S,_S)); _RR=_mm_add_ps(_RR,_mm_mul_ps(_R,_R)); ,
      _s=_mm_add_ps(_mm_mul_ps(*_s6,*_co), _mm_mul_ps(*_S6,*_si));    _r=_mm_sub_ps(*_s6,*_x6);
      _x=_mm_add_ps(_mm_mul_ps(*_x6,*_co), _mm_mul_ps(*_X6,*_si));    _R=_mm_sub_ps(*_S6,*_X6);
      _S=_mm_sub_ps(_mm_mul_ps(*_S6++,*_co),_mm_mul_ps(*_s6++,*_si)); _a=_mm_mul_ps(_s,_x); _xs=_mm_add_ps(_xs,_a);
      _X=_mm_sub_ps(_mm_mul_ps(*_X6++,*_co),_mm_mul_ps(*_x6++,*_si)); _A=_mm_mul_ps(_S,_X); _XS=_mm_add_ps(_XS,_A);
      _c=_mm_add_ps(_c,_mm_mul_ps(_a,_a)); _ss=_mm_add_ps(_ss,_mm_mul_ps(_s,_s)); _rr=_mm_add_ps(_rr,_mm_mul_ps(_r,_r));
      _C=_mm_add_ps(_C,_mm_mul_ps(_A,_A)); _SS=_mm_add_ps(_SS,_mm_mul_ps(_S,_S)); _RR=_mm_add_ps(_RR,_mm_mul_ps(_R,_R)); ,
      _s=_mm_add_ps(_mm_mul_ps(*_s7,*_co), _mm_mul_ps(*_S7,*_si));    _r=_mm_sub_ps(*_s7,*_x7);
      _x=_mm_add_ps(_mm_mul_ps(*_x7,*_co), _mm_mul_ps(*_X7,*_si));    _R=_mm_sub_ps(*_S7,*_X7);
      _S=_mm_sub_ps(_mm_mul_ps(*_S7++,*_co),_mm_mul_ps(*_s7++,*_si)); _a=_mm_mul_ps(_s,_x); _xs=_mm_add_ps(_xs,_a);
      _X=_mm_sub_ps(_mm_mul_ps(*_X7++,*_co),_mm_mul_ps(*_x7++,*_si)); _A=_mm_mul_ps(_S,_X); _XS=_mm_add_ps(_XS,_A);
      _c=_mm_add_ps(_c,_mm_mul_ps(_a,_a)); _ss=_mm_add_ps(_ss,_mm_mul_ps(_s,_s)); _rr=_mm_add_ps(_rr,_mm_mul_ps(_r,_r));
      _C=_mm_add_ps(_C,_mm_mul_ps(_A,_A)); _SS=_mm_add_ps(_SS,_mm_mul_ps(_S,_S)); _RR=_mm_add_ps(_RR,_mm_mul_ps(_R,_R)); )

      // Per-pixel statistics for this quad.
      _mk = _mm_and_ps(_mm_cmpge_ps(*_MK,_0),_1);                 // note: >=0 here (cmpge), unlike the >0 masks elsewhere
      _c  = _mm_div_ps(_c,_mm_add_ps(_mm_mul_ps(_xs,_xs),_o));    // normalized 00-phase correlation
      _C  = _mm_div_ps(_C,_mm_add_ps(_mm_mul_ps(_XS,_XS),_o));    // normalized 90-phase correlation
      _ll = _mm_mul_ps(_mk,_mm_add_ps(_ss,_SS));                  // masked likelihood term
      _ss = _mm_mul_ps(_ss,_mm_sub_ps(_1,_c));                    // incoherent 00 part
      _SS = _mm_mul_ps(_SS,_mm_sub_ps(_1,_C));                    // incoherent 90 part
      *_ec = _mm_mul_ps(_mk,_mm_add_ps(_ss,_SS));                 // coherent energy
      *_gn = _mm_mul_ps(_mk,_mm_mul_ps(_2,*_MK));                 // Gaussian-noise term
      *_rn = _mm_mul_ps(_mk,_mm_add_ps(_rr,_RR));                 // residual energy

      _a = _mm_mul_ps(_2,_mm_andnot_ps(sm,*_ec));                 // 2*|ec|
      _A = _mm_add_ps(*_rn,_mm_add_ps(_o,*_gn));                  // rn + gn + _o
      _cc = _mm_div_ps(*_ec,_mm_add_ps(_a,_A));                   // correlation coefficient
      _Lr = _mm_add_ps(_Lr,_mm_mul_ps(_ll,_cc));                  // cc-weighted likelihood
      _mm = _mm_and_ps(_mm_cmpgt_ps(*_ec,_o),_1);                 // 1 where ec exceeds threshold

      _LL = _mm_add_ps(_LL, _ll);
      _GN = _mm_add_ps(_GN,*_gn);
      _EC = _mm_add_ps(_EC,*_ec);
      _RN = _mm_add_ps(_RN,*_rn);
      _NN = _mm_add_ps(_NN, _mm);

      _si++; _co++; _MK++; _ec++; _gn++; _ed++; _rn++;

   // NOTE(review): the loop close and the horizontal sums producing the
   // scalars cc, nn, ec, rn (and es) are missing from this chunk; the tail
   // below uses them as computed in the full file.
   float ch = rn/(k*nn+sqrt(nn))/2;     // chi2/DoF-like residual figure (unused in visible tail)
   return _mm256_set_ps(0,0,0,0,rn/2,nn,ec,2*cc);
// ---------------------------------------------------------------------------
// Accumulate per-pixel noise/consistency statistics over the network.
// p[d]+I and q[d]+I hold per-detector auxiliary arrays (tails of the data
// streams); pAVX[0]=et (energy), pAVX[1]=mask, pAVX[19]=ec (coherent energy),
// pAVX[20]=gn (Gaussian noise), pAVX[22]=rn (residual). For each pixel quad
// the detector tails are averaged (scaled by 1/k, k = pAVX[1][I]), pixels are
// split by the signs of the averages, and the masked statistics are summed
// into _EC,_NC,_ES,_SC,_RC,_GN,_NS,_EH. Returns eight packed scalars.
// NOTE(review): the loop close and the horizontal sums producing the scalars
// used in the return are missing from this chunk (see inline note).
// ---------------------------------------------------------------------------
static inline __m256 _avx_noise_ps(float** p, float** q, std::vector<float*> &pAVX, int I) {

   float k = pAVX[1][I];             // detector count / normalization -- TODO confirm
   __m128 _nx,_ns,_nm,_mk,_gg,_rc,_nn;
   __m128* _et = (__m128*)pAVX[0];
   __m128* _MK = (__m128*)pAVX[1];
   __m128* _ec = (__m128*)pAVX[19];
   __m128* _gn = (__m128*)pAVX[20];
   __m128* _rn = (__m128*)pAVX[22];
   __m128 _GN = _mm_setzero_ps();
   __m128 _RC = _mm_setzero_ps();
   __m128 _ES = _mm_setzero_ps();
   __m128 _EH = _mm_setzero_ps();
   __m128 _EC = _mm_setzero_ps();
   __m128 _SC = _mm_setzero_ps();
   __m128 _NC = _mm_setzero_ps();
   __m128 _NS = _mm_setzero_ps();

   static const __m128 _0 = _mm_set1_ps(0);
   static const __m128 o5 = _mm_set1_ps(0.5);
   static const __m128 _o = _mm_set1_ps(1.e-9);   // regularizer
   static const __m128 _1 = _mm_set1_ps(1);
   static const __m128 _2 = _mm_set1_ps(2);
   static const __m128 _k = _mm_set1_ps(1./k);    // 1/k normalization

   // per-detector auxiliary tails
   NETX(__m128* _s0 = (__m128*)(p[0]+I);, __m128* _s1 = (__m128*)(p[1]+I);,
        __m128* _s2 = (__m128*)(p[2]+I);, __m128* _s3 = (__m128*)(p[3]+I);,
        __m128* _s4 = (__m128*)(p[4]+I);, __m128* _s5 = (__m128*)(p[5]+I);,
        __m128* _s6 = (__m128*)(p[6]+I);, __m128* _s7 = (__m128*)(p[7]+I);)

   NETX(__m128* _x0 = (__m128*)(q[0]+I);, __m128* _x1 = (__m128*)(q[1]+I);,
        __m128* _x2 = (__m128*)(q[2]+I);, __m128* _x3 = (__m128*)(q[3]+I);,
        __m128* _x4 = (__m128*)(q[4]+I);, __m128* _x5 = (__m128*)(q[5]+I);,
        __m128* _x6 = (__m128*)(q[6]+I);, __m128* _x7 = (__m128*)(q[7]+I);)

   for(int i=0; i<I; i+=4) {
      // network sums of the per-detector tails
      NETX(_ns =*_s0++;,                _ns = _mm_add_ps(*_s1++,_ns); ,
           _ns = _mm_add_ps(*_s2++,_ns);, _ns = _mm_add_ps(*_s3++,_ns); ,
           _ns = _mm_add_ps(*_s4++,_ns);, _ns = _mm_add_ps(*_s5++,_ns); ,
           _ns = _mm_add_ps(*_s6++,_ns);, _ns = _mm_add_ps(*_s7++,_ns); )

      NETX(_nx =*_x0++;,                _nx = _mm_add_ps(*_x1++,_nx); ,
           _nx = _mm_add_ps(*_x2++,_nx);, _nx = _mm_add_ps(*_x3++,_nx); ,
           _nx = _mm_add_ps(*_x4++,_nx);, _nx = _mm_add_ps(*_x5++,_nx); ,
           _nx = _mm_add_ps(*_x6++,_nx);, _nx = _mm_add_ps(*_x7++,_nx); )

      _ns = _mm_mul_ps(_ns,_k);                       // network averages
      _nx = _mm_mul_ps(_nx,_k);
      _mk = _mm_and_ps(_mm_cmpgt_ps(*_MK,_0),_1);     // pixel mask as 0/1
      _nm = _mm_and_ps(_mm_cmpgt_ps(_nx,_0),_1);      // pixels with positive nx
      _nm = _mm_mul_ps(_mk,_nm);
      _EC = _mm_add_ps(_EC,_mm_mul_ps(_nm,*_ec));     // coherent energy on those pixels
      _NC = _mm_add_ps(_NC,_nm);                      // ... and their count

      _nm = _mm_sub_ps(_mk,_nm);                      // complementary masked pixels
      _ES = _mm_add_ps(_ES,_mm_mul_ps(_nm,*_rn));     // residual energy there

      // reduced correlation: ec / (1 or gn/2, whichever branch applies)
      _rc = _mm_and_ps(_mm_cmplt_ps(*_gn,_2),_1);     // 1 where gn < 2
      _nn = _mm_mul_ps(*_gn,_mm_sub_ps(_1,_rc));      // gn where gn >= 2, else 0
      _rc = _mm_add_ps(_rc,_mm_mul_ps(_nn,o5));       // denominator: max(1, gn/2)
      _rc = _mm_div_ps(*_ec,_mm_add_ps(_rc,_o));

      _nm = _mm_and_ps(_mm_cmpgt_ps(_ns,_0),_1);      // pixels with positive ns
      _nm = _mm_mul_ps(_mk,_nm);
      *_gn = _mm_mul_ps(_mk,_mm_mul_ps(*_gn,_nx));    // rescale gn in place by nx
      _SC = _mm_add_ps(_SC,_mm_mul_ps(_nm,*_ec));
      _RC = _mm_add_ps(_RC,_mm_mul_ps(_nm, _rc));
      _GN = _mm_add_ps(_GN,_mm_mul_ps(_nm,*_gn));
      _NS = _mm_add_ps(_NS,_nm);

      _nm = _mm_sub_ps(_mk,_nm);                      // complementary masked pixels
      _EH = _mm_add_ps(_EH,_mm_mul_ps(_nm,*_et));     // their total energy

      _MK++; _gn++; _et++; _ec++; _rn++;

   // NOTE(review): the loop close and the horizontal sums reducing
   // _NS,_NC,_ES,_EH,_RC,_SC,_EC,_GN to the scalars ns,nc,es,eh,rc,sc,ec,gn
   // are missing from this chunk; the return uses them as in the full file.
   return _mm256_set_ps(ns,nc,es,eh,rc/(sc+0.01),sc-ec,ec,gn);
static void _avx_pol_ps(float **p, float **q, wavearray< double > *pol00, wavearray< double > *pol90, std::vector< float *> &pAPN, std::vector< float *> &pAVX, int II)
wavearray< double > a(hp.size())
static float _avx_dpf_ps(double **Fp, double **Fx, int l, std::vector< float *> &pAPN, std::vector< float *> &pAVX, int I)
cout<< endl;cout<< "ts size = "<< ts.size()<< " ts rate = "<< ts.rate()<< endl;tf.Forward(ts, wdm);int levels=tf.getLevel();cout<< "tf size = "<< tf.size()<< endl;double dF=tf.resolution();double dT=1./(2 *dF);cout<< "rate(hz) : "<< RATE<< "\ layers : "<< nLAYERS<< "\ dF(hz) : "<< dF<< "\ dT(ms) : "<< dT *1000.<< endl;int itime=TIME_PIXEL_INDEX;int ifreq=FREQ_PIXEL_INDEX;int index=(levels+1) *itime+ifreq;double time=itime *dT;double freq=(ifreq >0) ? ifreq *dF :dF/4;cout<< endl;cout<< "PIXEL TIME = "<< time<< " sec "<< endl;cout<< "PIXEL FREQ = "<< freq<< " Hz "<< endl;cout<< endl;wavearray< double > x
static __m256 _avx_stat_ps(float **x, float **X, float **s, float **S, std::vector< float *> &pAVX, int I)
cout<< "SNR "<< xsnr<< endl;wavearray< double > f
static float _avx_GW_ps(float **p, float **q, std::vector< float *> &pAPN, float *rr, std::vector< float *> &pAVX, int II)
cout<< "Selected Pixels : "<< nPix<< endl;wc.cluster(1, 1);SSeries< double > ss
static float _avx_packet_ps(float **p, float **q, std::vector< float *> &pAVX, int I)
static void _avx_free_ps(std::vector< float *> &v)
static void _avx_loadNULL_ps(float **n, float **N, float **d, float **D, float **h, float **H, int I)
static void _avx_cpf_ps(float **p, float **q, float **u, float **v, int I)
static float _avx_ort_ps(float **p, float **q, std::vector< float *> &pAVX, int I)
static float _avx_setAMP_ps(float **p, float **q, std::vector< float *> &pAVX, int I)
static float _wat_hsum(__m128 y)
Meyer< double > S(1024, 2)
static float _avx_loadata_ps(float **p, float **q, float **u, float **v, float En, std::vector< float *> &pAVX, int I)
static __m256 _avx_noise_ps(float **p, float **q, std::vector< float *> &pAVX, int I)