Logo coherent WaveBurst  
Library Reference Guide
Logo
watavx.hh
Go to the documentation of this file.
1 /*
2 # Copyright (C) 2019 Sergey Klimenko, Gabriele Vedovato
3 #
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License as published by
6 # the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
8 #
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU General Public License for more details.
13 #
14 # You should have received a copy of the GNU General Public License
15 # along with this program. If not, see <https://www.gnu.org/licenses/>.
16 */
17 
18 
19 // Wavelet Analysis Tool
20 // S.Klimenko, University of Florida
21 // library of general functions
22 
23 #ifndef WATAVX_HH
24 #define WATAVX_HH
25 
26 #include <xmmintrin.h>
27 #include <pmmintrin.h> // need for hadd
28 #include <immintrin.h> // AVX
29 //#include <x86intrin.h>
30 #include <iostream>
31 #include <stdlib.h>
32 #include <math.h>
33 #include <TMath.h>
34 #include "wat.hh"
35 #include "wavearray.hh"
36 
37 // WAT AVX functions
38 
39 
static inline void _avx_free_ps(std::vector<float*> &v) {
// Release every aligned buffer held in v and leave v empty with zero capacity.
// v - vector of pointers previously obtained from _mm_malloc; on return v is
//     empty and its heap storage is released (swap-with-temporary idiom).
 for(size_t n=0; n<v.size(); n++) _mm_free(v[n]);   // size_t avoids signed/unsigned mismatch
 std::vector<float*>().swap(v);                     // clear() before swap was redundant
}
static inline float _wat_hsum(__m128 y) {
// Horizontal sum of the four packed floats in y.
// Lanes are accumulated in order 0..3, matching the left-to-right
// evaluation of x[0]+x[1]+x[2]+x[3] bit-for-bit.
 float lane[4];
 _mm_storeu_ps(lane, y);
 float sum = lane[0];
 for(int k=1; k<4; k++) sum += lane[k];
 return sum;
}
48 
static inline void _avx_cpf_ps(float** p, float ** q,
 float** u, float ** v, int I) {
// copy arrays from p-->u and from q-->v
// p,q - input  - per-detector data streams (00 and 90 degree phase)
// u,v - output - destination streams, same per-detector layout as p,q
// I   - number of floats per stream; the loop advances 4 floats per
//       iteration, so I must be a multiple of 4 and all buffers must be
//       16-byte aligned (they are accessed through __m128 pointers)
// NETX is a project macro that expands one comma-separated statement group
// per detector for the number of detectors the network is built for (<=8).
 NETX(__m128* _p0 = (__m128*)p[0]; __m128* _q0 = (__m128*)q[0]; ,
 __m128* _p1 = (__m128*)p[1]; __m128* _q1 = (__m128*)q[1]; ,
 __m128* _p2 = (__m128*)p[2]; __m128* _q2 = (__m128*)q[2]; ,
 __m128* _p3 = (__m128*)p[3]; __m128* _q3 = (__m128*)q[3]; ,
 __m128* _p4 = (__m128*)p[4]; __m128* _q4 = (__m128*)q[4]; ,
 __m128* _p5 = (__m128*)p[5]; __m128* _q5 = (__m128*)q[5]; ,
 __m128* _p6 = (__m128*)p[6]; __m128* _q6 = (__m128*)q[6]; ,
 __m128* _p7 = (__m128*)p[7]; __m128* _q7 = (__m128*)q[7]; )

 NETX(__m128* _u0 = (__m128*)u[0]; __m128* _v0 = (__m128*)v[0]; ,
 __m128* _u1 = (__m128*)u[1]; __m128* _v1 = (__m128*)v[1]; ,
 __m128* _u2 = (__m128*)u[2]; __m128* _v2 = (__m128*)v[2]; ,
 __m128* _u3 = (__m128*)u[3]; __m128* _v3 = (__m128*)v[3]; ,
 __m128* _u4 = (__m128*)u[4]; __m128* _v4 = (__m128*)v[4]; ,
 __m128* _u5 = (__m128*)u[5]; __m128* _v5 = (__m128*)v[5]; ,
 __m128* _u6 = (__m128*)u[6]; __m128* _v6 = (__m128*)v[6]; ,
 __m128* _u7 = (__m128*)u[7]; __m128* _v7 = (__m128*)v[7]; )

 // vectorized copy: 4 floats per detector per iteration
 for(int i=0; i<I; i+=4) {
 NETX(*_u0++ = *_p0++; *_v0++ = *_q0++; ,
 *_u1++ = *_p1++; *_v1++ = *_q1++; ,
 *_u2++ = *_p2++; *_v2++ = *_q2++; ,
 *_u3++ = *_p3++; *_v3++ = *_q3++; ,
 *_u4++ = *_p4++; *_v4++ = *_q4++; ,
 *_u5++ = *_p5++; *_v5++ = *_q5++; ,
 *_u6++ = *_p6++; *_v6++ = *_q6++; ,
 *_u7++ = *_p7++; *_v7++ = *_q7++; )
 }
 return;
}
82 
83 
static inline float _avx_dpf_ps(double** Fp, double** Fx, int l,
 std::vector<float*> &pAPN,
 std::vector<float*> &pAVX, int I) {
// calculate DPF sin and cos and antenna f+ and fx amplitudes
// Fp, Fx - arrays of pointers to the antenna patterns, one per detector
// l      - sky location index (selects Fp[n][l], Fx[n][l])
// pAPN   - per-detector arrays: [0,I) = f+, [I,2I) = fx, [2I,3I) = noise rms
// pAVX   - vector with pixel statistics; slots used here:
//          [2]=|f+|^2, [3]=|fx|^2, [4]=sin(2p), [5]=cos(2p), [18]=network index
// I      - number of pixels; processed 4 at a time, must be multiple of 4
// returns sqrt(<|fx|^2/ni>) averaged over pixels with |f+|^2>0
// in likelihoodWP these arrays should be stored exactly in the same order.
// this function increments pointers stored in the input pointer arrays.
 __m128* _fp = (__m128*)pAVX[2];
 __m128* _fx = (__m128*)pAVX[3];
 __m128* _si = (__m128*)pAVX[4];
 __m128* _co = (__m128*)pAVX[5];
 __m128* _ni = (__m128*)pAVX[18];
 __m128 _ff,_FF,_fF,_AP,_cc,_ss,_nn,_cn,_sn;
 static const __m128 _0 = _mm_set1_ps(0);
 static const __m128 _1 = _mm_set1_ps(1);
 static const __m128 _2 = _mm_set1_ps(2);
 static const __m128 _o = _mm_set1_ps(0.0001);    // regularizer against /0
 static const __m128 sm = _mm_set1_ps(-0.f);      // -0.f = 1 << 31 (sign-bit mask; unused here)
 __m128 _NI= _mm_setzero_ps();                    // accumulates |fx|^2/ni
 __m128 _NN= _mm_setzero_ps();                    // accumulates pixel count

 // per-detector pointers into pAPN: _fN=f+, _FN=fx, _rN=rms
 NETX(__m128* _f0=(__m128*)pAPN[0]; __m128* _F0=(__m128*)(pAPN[0]+I); __m128* _r0=(__m128*)(pAPN[0]+2*I);,
 __m128* _f1=(__m128*)pAPN[1]; __m128* _F1=(__m128*)(pAPN[1]+I); __m128* _r1=(__m128*)(pAPN[1]+2*I);,
 __m128* _f2=(__m128*)pAPN[2]; __m128* _F2=(__m128*)(pAPN[2]+I); __m128* _r2=(__m128*)(pAPN[2]+2*I);,
 __m128* _f3=(__m128*)pAPN[3]; __m128* _F3=(__m128*)(pAPN[3]+I); __m128* _r3=(__m128*)(pAPN[3]+2*I);,
 __m128* _f4=(__m128*)pAPN[4]; __m128* _F4=(__m128*)(pAPN[4]+I); __m128* _r4=(__m128*)(pAPN[4]+2*I);,
 __m128* _f5=(__m128*)pAPN[5]; __m128* _F5=(__m128*)(pAPN[5]+I); __m128* _r5=(__m128*)(pAPN[5]+2*I);,
 __m128* _f6=(__m128*)pAPN[6]; __m128* _F6=(__m128*)(pAPN[6]+I); __m128* _r6=(__m128*)(pAPN[6]+2*I);,
 __m128* _f7=(__m128*)pAPN[7]; __m128* _F7=(__m128*)(pAPN[7]+I); __m128* _r7=(__m128*)(pAPN[7]+2*I);)

 // antenna pattern values at sky location l, broadcast to all 4 lanes
 NETX(__m128 _Fp0 = _mm_set1_ps(*(Fp[0]+l)); __m128 _Fx0 = _mm_set1_ps(*(Fx[0]+l)); ,
 __m128 _Fp1 = _mm_set1_ps(*(Fp[1]+l)); __m128 _Fx1 = _mm_set1_ps(*(Fx[1]+l)); ,
 __m128 _Fp2 = _mm_set1_ps(*(Fp[2]+l)); __m128 _Fx2 = _mm_set1_ps(*(Fx[2]+l)); ,
 __m128 _Fp3 = _mm_set1_ps(*(Fp[3]+l)); __m128 _Fx3 = _mm_set1_ps(*(Fx[3]+l)); ,
 __m128 _Fp4 = _mm_set1_ps(*(Fp[4]+l)); __m128 _Fx4 = _mm_set1_ps(*(Fx[4]+l)); ,
 __m128 _Fp5 = _mm_set1_ps(*(Fp[5]+l)); __m128 _Fx5 = _mm_set1_ps(*(Fx[5]+l)); ,
 __m128 _Fp6 = _mm_set1_ps(*(Fp[6]+l)); __m128 _Fx6 = _mm_set1_ps(*(Fx[6]+l)); ,
 __m128 _Fp7 = _mm_set1_ps(*(Fp[7]+l)); __m128 _Fx7 = _mm_set1_ps(*(Fx[7]+l)); )

 // overall sign of sum(F+*Fx) over detectors with positive rms,
 // used to fix the sign convention of cos(p) below
 float sign=0;
 NETX(if(*(pAPN[0]+2*I)>0.) sign+=*(Fp[0]+l) * *(Fx[0]+l);,
 if(*(pAPN[1]+2*I)>0.) sign+=*(Fp[1]+l) * *(Fx[1]+l);,
 if(*(pAPN[2]+2*I)>0.) sign+=*(Fp[2]+l) * *(Fx[2]+l);,
 if(*(pAPN[3]+2*I)>0.) sign+=*(Fp[3]+l) * *(Fx[3]+l);,
 if(*(pAPN[4]+2*I)>0.) sign+=*(Fp[4]+l) * *(Fx[4]+l);,
 if(*(pAPN[5]+2*I)>0.) sign+=*(Fp[5]+l) * *(Fx[5]+l);,
 if(*(pAPN[6]+2*I)>0.) sign+=*(Fp[6]+l) * *(Fx[6]+l);,
 if(*(pAPN[7]+2*I)>0.) sign+=*(Fp[7]+l) * *(Fx[7]+l);)
 sign = sign>0 ? 1. : -1.;
 __m128 _sign = _mm_set1_ps(sign);

 for(int i=0; i<I; i+=4) {
 // noise-scaled antenna vectors f=rms*F+ and F=rms*Fx per detector,
 // accumulating the network sums (f,f), (F,F), (f,F)
 NETX(
 *_f0 = _mm_mul_ps(*_r0,_Fp0); *_F0 = _mm_mul_ps(*_r0++,_Fx0);
 _ff = _mm_mul_ps(*_f0,*_f0);
 _FF = _mm_mul_ps(*_F0,*_F0);
 _fF = _mm_mul_ps(*_f0,*_F0); ,

 *_f1 = _mm_mul_ps(*_r1,_Fp1); *_F1 = _mm_mul_ps(*_r1++,_Fx1);
 _ff = _mm_add_ps(_ff,_mm_mul_ps(*_f1,*_f1));
 _FF = _mm_add_ps(_FF,_mm_mul_ps(*_F1,*_F1));
 _fF = _mm_add_ps(_fF,_mm_mul_ps(*_f1,*_F1)); ,

 *_f2 = _mm_mul_ps(*_r2,_Fp2); *_F2 = _mm_mul_ps(*_r2++,_Fx2);
 _ff = _mm_add_ps(_ff,_mm_mul_ps(*_f2,*_f2));
 _FF = _mm_add_ps(_FF,_mm_mul_ps(*_F2,*_F2));
 _fF = _mm_add_ps(_fF,_mm_mul_ps(*_f2,*_F2)); ,

 *_f3 = _mm_mul_ps(*_r3,_Fp3); *_F3 = _mm_mul_ps(*_r3++,_Fx3);
 _ff = _mm_add_ps(_ff,_mm_mul_ps(*_f3,*_f3));
 _FF = _mm_add_ps(_FF,_mm_mul_ps(*_F3,*_F3));
 _fF = _mm_add_ps(_fF,_mm_mul_ps(*_f3,*_F3)); ,

 *_f4 = _mm_mul_ps(*_r4,_Fp4); *_F4 = _mm_mul_ps(*_r4++,_Fx4);
 _ff = _mm_add_ps(_ff,_mm_mul_ps(*_f4,*_f4));
 _FF = _mm_add_ps(_FF,_mm_mul_ps(*_F4,*_F4));
 _fF = _mm_add_ps(_fF,_mm_mul_ps(*_f4,*_F4)); ,

 *_f5 = _mm_mul_ps(*_r5,_Fp5); *_F5 = _mm_mul_ps(*_r5++,_Fx5);
 _ff = _mm_add_ps(_ff,_mm_mul_ps(*_f5,*_f5));
 _FF = _mm_add_ps(_FF,_mm_mul_ps(*_F5,*_F5));
 _fF = _mm_add_ps(_fF,_mm_mul_ps(*_f5,*_F5)); ,

 *_f6 = _mm_mul_ps(*_r6,_Fp6); *_F6 = _mm_mul_ps(*_r6++,_Fx6);
 _ff = _mm_add_ps(_ff,_mm_mul_ps(*_f6,*_f6));
 _FF = _mm_add_ps(_FF,_mm_mul_ps(*_F6,*_F6));
 _fF = _mm_add_ps(_fF,_mm_mul_ps(*_f6,*_F6)); ,

 *_f7 = _mm_mul_ps(*_r7,_Fp7); *_F7 = _mm_mul_ps(*_r7++,_Fx7);
 _ff = _mm_add_ps(_ff,_mm_mul_ps(*_f7,*_f7));
 _FF = _mm_add_ps(_FF,_mm_mul_ps(*_F7,*_F7));
 _fF = _mm_add_ps(_fF,_mm_mul_ps(*_f7,*_F7)); )

 // DPF rotation angle p from the 2p quantities
 *_si = _mm_mul_ps(_fF,_2); // rotation 2*sin*cos*norm
 *_co = _mm_sub_ps(_ff,_FF); // rotation (cos^2-sin^2)*norm
 _AP = _mm_add_ps(_ff,_FF); // total antenna norm
 _cc = _mm_mul_ps(*_co,*_co);
 _ss = _mm_mul_ps(*_si,*_si);
 _nn = _mm_sqrt_ps(_mm_add_ps(_cc,_ss)); // co/si norm
 *_fp = _mm_div_ps(_mm_add_ps(_AP,_nn),_2); // |f+|^2
 _cc = _mm_div_ps(*_co,_mm_add_ps(_nn,_o)); // cos(2p)
 _nn = _mm_and_ps(_mm_cmpgt_ps(*_si,_0),_1); // 1 if sin(2p)>0. or 0 if sin(2p)<0.
 _ss = _mm_sub_ps(_mm_mul_ps(_2,_nn),_1); // 1 if sin(2p)>0. or-1 if sin(2p)<0.
 *_si = _mm_sqrt_ps(_mm_div_ps(_mm_sub_ps(_1,_cc),_2)); // |sin(p)|
 *_co = _mm_sqrt_ps(_mm_div_ps(_mm_add_ps(_1,_cc),_2)); // |cos(p)|
 *_co = _mm_mul_ps(*_co,_ss); // cos(p)

// DPF antenna patterns: rotate (f,F) by angle p in place;
// _nn accumulates sum(f+^4), _fF accumulates sum(f+*fx) for the fx projection
 NETX(
 _ff = _mm_add_ps(_mm_mul_ps(*_f0,*_co),_mm_mul_ps(*_F0,*_si)); _cc = _mm_mul_ps(_ff,_ff); // f+
 _FF = _mm_sub_ps(_mm_mul_ps(*_F0,*_co),_mm_mul_ps(*_f0,*_si)); *_f0 = _ff; *_F0 = _FF; // fx
 _nn = _mm_mul_ps(_cc,_cc); _fF = _mm_mul_ps(_ff,_FF);, // ni

 _ff = _mm_add_ps(_mm_mul_ps(*_f1,*_co),_mm_mul_ps(*_F1,*_si)); _cc = _mm_mul_ps(_ff,_ff); // f+
 _FF = _mm_sub_ps(_mm_mul_ps(*_F1,*_co),_mm_mul_ps(*_f1,*_si)); *_f1 = _ff; *_F1 = _FF; // fx
 _nn = _mm_add_ps(_nn,_mm_mul_ps(_cc,_cc)); _fF = _mm_add_ps(_fF,_mm_mul_ps(_ff,_FF));, // ni

 _ff = _mm_add_ps(_mm_mul_ps(*_f2,*_co),_mm_mul_ps(*_F2,*_si)); _cc = _mm_mul_ps(_ff,_ff); // f+
 _FF = _mm_sub_ps(_mm_mul_ps(*_F2,*_co),_mm_mul_ps(*_f2,*_si)); *_f2 = _ff; *_F2 = _FF; // fx
 _nn = _mm_add_ps(_nn,_mm_mul_ps(_cc,_cc)); _fF = _mm_add_ps(_fF,_mm_mul_ps(_ff,_FF));, // ni

 _ff = _mm_add_ps(_mm_mul_ps(*_f3,*_co),_mm_mul_ps(*_F3,*_si)); _cc = _mm_mul_ps(_ff,_ff); // f+
 _FF = _mm_sub_ps(_mm_mul_ps(*_F3,*_co),_mm_mul_ps(*_f3,*_si)); *_f3 = _ff; *_F3 = _FF; // fx
 _nn = _mm_add_ps(_nn,_mm_mul_ps(_cc,_cc)); _fF = _mm_add_ps(_fF,_mm_mul_ps(_ff,_FF));, // ni

 _ff = _mm_add_ps(_mm_mul_ps(*_f4,*_co),_mm_mul_ps(*_F4,*_si)); _cc = _mm_mul_ps(_ff,_ff); // f+
 _FF = _mm_sub_ps(_mm_mul_ps(*_F4,*_co),_mm_mul_ps(*_f4,*_si)); *_f4 = _ff; *_F4 = _FF; // fx
 _nn = _mm_add_ps(_nn,_mm_mul_ps(_cc,_cc)); _fF = _mm_add_ps(_fF,_mm_mul_ps(_ff,_FF));, // ni

 _ff = _mm_add_ps(_mm_mul_ps(*_f5,*_co),_mm_mul_ps(*_F5,*_si)); _cc = _mm_mul_ps(_ff,_ff); // f+
 _FF = _mm_sub_ps(_mm_mul_ps(*_F5,*_co),_mm_mul_ps(*_f5,*_si)); *_f5 = _ff; *_F5 = _FF; // fx
 _nn = _mm_add_ps(_nn,_mm_mul_ps(_cc,_cc)); _fF = _mm_add_ps(_fF,_mm_mul_ps(_ff,_FF));, // ni

 _ff = _mm_add_ps(_mm_mul_ps(*_f6,*_co),_mm_mul_ps(*_F6,*_si)); _cc = _mm_mul_ps(_ff,_ff); // f+
 _FF = _mm_sub_ps(_mm_mul_ps(*_F6,*_co),_mm_mul_ps(*_f6,*_si)); *_f6 = _ff; *_F6 = _FF; // fx
 _nn = _mm_add_ps(_nn,_mm_mul_ps(_cc,_cc)); _fF = _mm_add_ps(_fF,_mm_mul_ps(_ff,_FF));, // ni

 _ff = _mm_add_ps(_mm_mul_ps(*_f7,*_co),_mm_mul_ps(*_F7,*_si)); _cc = _mm_mul_ps(_ff,_ff); // f+
 _FF = _mm_sub_ps(_mm_mul_ps(*_F7,*_co),_mm_mul_ps(*_f7,*_si)); *_f7 = _ff; *_F7 = _FF; // fx
 _nn = _mm_add_ps(_nn,_mm_mul_ps(_cc,_cc)); _fF = _mm_add_ps(_fF,_mm_mul_ps(_ff,_FF));) // ni

 // projection coefficient (f+,fx)/|f+|^2 used to orthogonalize fx w.r.t. f+
 _fF = _mm_div_ps(_fF,_mm_add_ps(*_fp,_o));

 // fx -= f+ * (f+,fx)/|f+|^2 per detector; *_fx accumulates |fx|^2
 NETX(
 *_F0 = _mm_sub_ps(*_F0,_mm_mul_ps(*_f0++,_fF)); *_fx = _mm_mul_ps(*_F0,*_F0); _F0++;,
 *_F1 = _mm_sub_ps(*_F1,_mm_mul_ps(*_f1++,_fF)); *_fx = _mm_add_ps(*_fx,_mm_mul_ps(*_F1,*_F1)); _F1++;,
 *_F2 = _mm_sub_ps(*_F2,_mm_mul_ps(*_f2++,_fF)); *_fx = _mm_add_ps(*_fx,_mm_mul_ps(*_F2,*_F2)); _F2++;,
 *_F3 = _mm_sub_ps(*_F3,_mm_mul_ps(*_f3++,_fF)); *_fx = _mm_add_ps(*_fx,_mm_mul_ps(*_F3,*_F3)); _F3++;,
 *_F4 = _mm_sub_ps(*_F4,_mm_mul_ps(*_f4++,_fF)); *_fx = _mm_add_ps(*_fx,_mm_mul_ps(*_F4,*_F4)); _F4++;,
 *_F5 = _mm_sub_ps(*_F5,_mm_mul_ps(*_f5++,_fF)); *_fx = _mm_add_ps(*_fx,_mm_mul_ps(*_F5,*_F5)); _F5++;,
 *_F6 = _mm_sub_ps(*_F6,_mm_mul_ps(*_f6++,_fF)); *_fx = _mm_add_ps(*_fx,_mm_mul_ps(*_F6,*_F6)); _F6++;,
 *_F7 = _mm_sub_ps(*_F7,_mm_mul_ps(*_f7++,_fF)); *_fx = _mm_add_ps(*_fx,_mm_mul_ps(*_F7,*_F7)); _F7++;)

 *_ni = _mm_div_ps(_nn,_mm_add_ps(_mm_mul_ps(*_fp,*_fp),_o)); // network index
 _ff = _mm_add_ps(_mm_mul_ps(_1,*_ni),_o); // network index
 _NI = _mm_add_ps(_NI,(_mm_div_ps(*_fx,_ff))); // sum of |fx|^2/2/ni
 _NN = _mm_add_ps(_NN,_mm_and_ps(_mm_cmpgt_ps(*_fp,_0),_1)); // pixel count

 _fp++; _fx++; _si++; _co++; _ni++;
 }
 // RMS of |fx|^2/ni over counted pixels (0.01 guards against empty count)
 return sqrt(_wat_hsum(_NI)/(_wat_hsum(_NN)+0.01));
}
249 
250 
251 static inline float _avx_GW_ps(float** p, float ** q,
252  std::vector<float*> &pAPN, float* rr,
253  std::vector<float*> &pAVX, int II) {
254 // initialize GW strain amplitudes and impose regulator
255 // p,q - input - data vector
256 // pAVX - pixel statistics
257 // J - number of AVX pixels
258 // in likelihoodWP these arrays should be stored exactly in the same order.
259 // this function updates the definition of the mask array:
260 
261  int I = abs(II);
262 
263  __m128* _et = (__m128*)pAVX[0];
264  __m128* _mk = (__m128*)pAVX[1];
265  __m128* _fp = (__m128*)pAVX[2];
266  __m128* _fx = (__m128*)pAVX[3];
267  __m128* _au = (__m128*)pAVX[10];
268  __m128* _AU = (__m128*)pAVX[11];
269  __m128* _av = (__m128*)pAVX[12];
270  __m128* _AV = (__m128*)pAVX[13];
271  __m128* _ni = (__m128*)pAVX[18];
272 
273  __m128 _uu = _mm_setzero_ps();
274  __m128 _UU = _mm_setzero_ps();
275  __m128 _uU = _mm_setzero_ps();
276  __m128 _vv = _mm_setzero_ps();
277  __m128 _VV = _mm_setzero_ps();
278  __m128 _vV = _mm_setzero_ps();
279  __m128 _NN = _mm_setzero_ps();
280 
281  __m128 _h,_H,_f,_F,_R,_a,_nn,_m,_ff,_xp,_XP,_xx,_XX;
282  float cu,su,cv,sv,cc,ss,nn,uu,UU,vv,VV,et,ET;
283  static const __m128 _0 = _mm_set1_ps(0);
284  static const __m128 _1 = _mm_set1_ps(1);
285  static const __m128 o1 = _mm_set1_ps(0.1);
286  static const __m128 _o = _mm_set1_ps(1.e-5);
287  __m128 _rr = _mm_set1_ps(rr[0]);
288  __m128 _RR = _mm_set1_ps(rr[1]);
289 
290  // pointers to antenna patterns
291  NETX(__m128* _f0=(__m128*)pAPN[0]; __m128* _F0=(__m128*)(pAPN[0]+I);,
292  __m128* _f1=(__m128*)pAPN[1]; __m128* _F1=(__m128*)(pAPN[1]+I);,
293  __m128* _f2=(__m128*)pAPN[2]; __m128* _F2=(__m128*)(pAPN[2]+I);,
294  __m128* _f3=(__m128*)pAPN[3]; __m128* _F3=(__m128*)(pAPN[3]+I);,
295  __m128* _f4=(__m128*)pAPN[4]; __m128* _F4=(__m128*)(pAPN[4]+I);,
296  __m128* _f5=(__m128*)pAPN[5]; __m128* _F5=(__m128*)(pAPN[5]+I);,
297  __m128* _f6=(__m128*)pAPN[6]; __m128* _F6=(__m128*)(pAPN[6]+I);,
298  __m128* _f7=(__m128*)pAPN[7]; __m128* _F7=(__m128*)(pAPN[7]+I);)
299 
300  // pointers to data
301  NETX(__m128* _p0 = (__m128*)p[0]; __m128* _q0 = (__m128*)q[0];,
302  __m128* _p1 = (__m128*)p[1]; __m128* _q1 = (__m128*)q[1];,
303  __m128* _p2 = (__m128*)p[2]; __m128* _q2 = (__m128*)q[2];,
304  __m128* _p3 = (__m128*)p[3]; __m128* _q3 = (__m128*)q[3];,
305  __m128* _p4 = (__m128*)p[4]; __m128* _q4 = (__m128*)q[4];,
306  __m128* _p5 = (__m128*)p[5]; __m128* _q5 = (__m128*)q[5];,
307  __m128* _p6 = (__m128*)p[6]; __m128* _q6 = (__m128*)q[6];,
308  __m128* _p7 = (__m128*)p[7]; __m128* _q7 = (__m128*)q[7];)
309 
310  for(int i=0; i<I; i+=4) { // calculate scalar products
311 
312  NETX(
313  _xp = _mm_mul_ps(*_f0,*_p0); // (x,f+)
314  _XP = _mm_mul_ps(*_f0,*_q0); // (X,f+)
315  _xx = _mm_mul_ps(*_F0,*_p0); // (x,fx)
316  _XX = _mm_mul_ps(*_F0,*_q0); , // (X,fx)
317 
318  _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f1,*_p1)); // (x,f+)
319  _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f1,*_q1)); // (X,f+)
320  _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F1,*_p1)); // (x,fx)
321  _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F1,*_q1)); , // (X,fx)
322 
323  _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f2,*_p2)); // (x,f+)
324  _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f2,*_q2)); // (X,f+)
325  _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F2,*_p2)); // (x,fx)
326  _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F2,*_q2)); , // (X,fx)
327 
328  _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f3,*_p3)); // (x,f+)
329  _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f3,*_q3)); // (X,f+)
330  _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F3,*_p3)); // (x,fx)
331  _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F3,*_q3)); , // (X,fx)
332 
333  _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f4,*_p4)); // (x,f+)
334  _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f4,*_q4)); // (X,f+)
335  _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F4,*_p4)); // (x,fx)
336  _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F4,*_q4)); , // (X,fx)
337 
338  _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f5,*_p5)); // (x,f+)
339  _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f5,*_q5)); // (X,f+)
340  _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F5,*_p5)); // (x,fx)
341  _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F5,*_q5)); , // (X,fx)
342 
343  _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f6,*_p6)); // (x,f+)
344  _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f6,*_q6)); // (X,f+)
345  _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F6,*_p6)); // (x,fx)
346  _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F6,*_q6)); , // (X,fx)
347 
348  _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f7,*_p7)); // (x,f+)
349  _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f7,*_q7)); // (X,f+)
350  _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F7,*_p7)); // (x,fx)
351  _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F7,*_q7)); ) // (X,fx)
352 
353 // regulator
354 
355  _f = _mm_add_ps(_mm_mul_ps(_xp,_xp),_mm_mul_ps(_XP,_XP)); // f=(x,f+)^2+(X,f+)^2
356  _f = _mm_div_ps(_f,_mm_add_ps(*_et,_o)); // f=f/[ni*(|x|^2+|X|^2)]
357  _f = _mm_sqrt_ps(_mm_mul_ps(*_ni,_f)); // f=ni*sqrt[(x,f+)^2+(X,f+)^2/(|x|^2+|X|^2)]
358  _f = _mm_sub_ps(_mm_mul_ps(_f,_rr),*_fp); // f*R-|f+|^2
359  _f = _mm_mul_ps(_f,_mm_and_ps(_mm_cmpgt_ps(_f,_0),_1)); // f if f*R>|f+|^2 or 0 if f*R<|f+|^2
360  _f = _mm_div_ps(*_mk,_mm_add_ps(*_fp,_mm_add_ps(_o,_f))); // 1 / {|f+|^2+epsilon}
361 
362  _h = _mm_mul_ps(_xp,_f); // use inequality
363  _H = _mm_mul_ps(_XP,_f); // fx^2 *(s^2+e^2*c^2)/(1+e^2) < fx^2
364  _h = _mm_add_ps(_mm_mul_ps(_h,_h),_mm_mul_ps(_H,_H));
365  _H = _mm_add_ps(_mm_mul_ps(_xx,_xx),_mm_mul_ps(_XX,_XX)); // (x,fx)^2+(X,fx)^2
366  _F = _mm_sqrt_ps(_mm_div_ps(_H,_mm_add_ps(_h,_o)));
367  _R = _mm_add_ps(o1,_mm_div_ps(_RR,_mm_add_ps(*_et,_o))); // dynamic x-regulator
368  _F = _mm_sub_ps(_mm_mul_ps(_F,_R),*_fx); // F*R-|fx|^2
369  _F = _mm_mul_ps(_F,_mm_and_ps(_mm_cmpgt_ps(_F,_0),_1)); // F*R-|fx|^2 if F*R>|fx|^2 or 0 if F*R<|fx|^2
370  _F = _mm_div_ps(*_mk,_mm_add_ps(*_fx,_mm_add_ps(_o,_F))); // 1/ {|fx|^2+epsilon}
371 
372  *_au = _mm_mul_ps(_xp,_f);
373  *_AU = _mm_mul_ps(_XP,_f);
374  *_av = _mm_mul_ps(_xx,_F);
375  *_AV = _mm_mul_ps(_XX,_F);
376 
377  _a = _mm_add_ps(_mm_mul_ps(_f,*_fp),_mm_mul_ps(_F,*_fx)); // Gaussin noise correction
378  _NN = _mm_add_ps(_NN,*_mk); // number of pixels
379  *_mk = _mm_add_ps(_a,_mm_sub_ps(*_mk,_1)); // -1 - rejected, >=0 accepted
380 
381  _mk++; _et++; _fp++; _fx++; _ni++;
382 
383 // set GW amplitudes in wavelet domain
384  NETX(*_p0++ = _mm_add_ps(_mm_mul_ps(*_f0, *_au),_mm_mul_ps(*_F0 ,*_av));
385  *_q0++ = _mm_add_ps(_mm_mul_ps(*_f0++,*_AU),_mm_mul_ps(*_F0++,*_AV)); ,
386  *_p1++ = _mm_add_ps(_mm_mul_ps(*_f1 ,*_au),_mm_mul_ps(*_F1 ,*_av));
387  *_q1++ = _mm_add_ps(_mm_mul_ps(*_f1++,*_AU),_mm_mul_ps(*_F1++,*_AV)); ,
388  *_p2++ = _mm_add_ps(_mm_mul_ps(*_f2 ,*_au),_mm_mul_ps(*_F2 ,*_av));
389  *_q2++ = _mm_add_ps(_mm_mul_ps(*_f2++,*_AU),_mm_mul_ps(*_F2++,*_AV)); ,
390  *_p3++ = _mm_add_ps(_mm_mul_ps(*_f3 ,*_au),_mm_mul_ps(*_F3 ,*_av));
391  *_q3++ = _mm_add_ps(_mm_mul_ps(*_f3++,*_AU),_mm_mul_ps(*_F3++,*_AV)); ,
392  *_p4++ = _mm_add_ps(_mm_mul_ps(*_f4 ,*_au),_mm_mul_ps(*_F4 ,*_av));
393  *_q4++ = _mm_add_ps(_mm_mul_ps(*_f4++,*_AU),_mm_mul_ps(*_F4++,*_AV)); ,
394  *_p5++ = _mm_add_ps(_mm_mul_ps(*_f5 ,*_au),_mm_mul_ps(*_F5 ,*_av));
395  *_q5++ = _mm_add_ps(_mm_mul_ps(*_f5++,*_AU),_mm_mul_ps(*_F5++,*_AV)); ,
396  *_p6++ = _mm_add_ps(_mm_mul_ps(*_f6 ,*_au),_mm_mul_ps(*_F6 ,*_av));
397  *_q6++ = _mm_add_ps(_mm_mul_ps(*_f6++,*_AU),_mm_mul_ps(*_F6++,*_AV)); ,
398  *_p7++ = _mm_add_ps(_mm_mul_ps(*_f7 ,*_au),_mm_mul_ps(*_F7 ,*_av));
399  *_q7++ = _mm_add_ps(_mm_mul_ps(*_f7++,*_AU),_mm_mul_ps(*_F7++,*_AV)); )
400 
401  _uU = _mm_add_ps(_uU,_mm_mul_ps(*_au,*_AU));
402  _vV = _mm_add_ps(_vV,_mm_mul_ps(*_av,*_AV));
403  _uu = _mm_add_ps(_uu,_mm_mul_ps(*_au,*_au)); _au++;
404  _UU = _mm_add_ps(_UU,_mm_mul_ps(*_AU,*_AU)); _AU++;
405  _vv = _mm_add_ps(_vv,_mm_mul_ps(*_av,*_av)); _av++;
406  _VV = _mm_add_ps(_VV,_mm_mul_ps(*_AV,*_AV)); _AV++;
407  }
408  su = _wat_hsum(_uU); sv = _wat_hsum(_vV); // rotation sin*cos*norm
409  uu = _wat_hsum(_uu); vv = _wat_hsum(_vv); // 00 energy
410  UU = _wat_hsum(_UU); VV = _wat_hsum(_VV); // 90 energy
411 
412 // first packet
413  nn = sqrt((uu-UU)*(uu-UU)+4*su*su)+0.0001; // co/si norm
414  cu = (uu-UU)/nn; et=uu+UU; // rotation cos(2p) and sin(2p)
415  uu = sqrt((et+nn)/2); UU = et>nn?sqrt((et-nn)/2):0; // amplitude of first/second component
416  nn = su>0 ? 1 : -1; // norm^2 of 2*cos^2 and 2*sin*cos
417  su = sqrt((1-cu)/2); cu = nn*sqrt((1+cu)/2); // normalized rotation sin/cos
418 
419 // second packet
420  nn = sqrt((vv-VV)*(vv-VV)+4*sv*sv)+0.0001; // co/si norm
421  cv = (vv-VV)/nn; ET=vv+VV; // rotation cos(2p) and sin(2p)
422  vv = sqrt((ET+nn)/2); VV = et>nn?sqrt((ET-nn)/2):0; // first/second component energy
423  nn = sv>0 ? 1 : -1; // norm^2 of 2*cos^2 and 2*sin*cos
424  sv = sqrt((1-cv)/2); cv = nn*sqrt((1+cv)/2); // normalized rotation sin/cos// first packet
425 
426  //return _mm_set_ps(ET,et,VV+vv,UU+uu); // reversed order
427  return _wat_hsum(_NN); // returns number of pixels above threshold
428 }
429 
430 
static inline void _avx_pol_ps(float** p, float ** q,
 wavearray<double>* pol00, wavearray<double>* pol90,
 std::vector<float*> &pAPN,
 std::vector<float*> &pAVX, int II) {
// calculates the polar coordinates of the input vector v in the DPF frame
// p,q - input/output - data vector; on output replaced by the projection
//       onto the network plane after the dual stream phase transform
// pol00 - output - 00 component in polar coordinates (pol00[0] : radius, pol00[1] : angle in radians)
// pol90 - output - 90 component in polar coordinates (pol90[0] : radius, pol90[1] : angle in radians)
// pAPN - per-detector arrays: [0,I) = f+, [I,2I) = fx (DPF frame)
// pAVX - pixel statistics; slots used: [1]=mask, [2]=|f+|^2, [3]=|fx|^2
// II - number of AVX pixels (sign ignored)
// in likelihoodWP these arrays should be stored exactly in the same order.

 int I = abs(II);

 __m128* _MK = (__m128*)pAVX[1];
 __m128* _fp = (__m128*)pAVX[2];
 __m128* _fx = (__m128*)pAVX[3];

 __m128 _xp,_XP,_xx,_XX,_rr,_RR,_mk;

 static const __m128 _0 = _mm_set1_ps(0);
 static const __m128 _1 = _mm_set1_ps(1);
 static const __m128 _o = _mm_set1_ps(1.e-5);    // regularizer against /0

 __m128 _ss,_cc,_SS,_CC;
 float rpol[4],cpol[4],spol[4];                  // per-lane 00-phase radius/cos/sin
 float RPOL[4],CPOL[4],SPOL[4];                  // per-lane 90-phase radius/cos/sin

 double* r = pol00[0].data;                      // |x|  output
 double* a = pol00[1].data;                      // 00-phase angle output
 double* R = pol90[0].data;                      // |X|  output
 double* A = pol90[1].data;                      // 90-phase angle output

 // pointers to antenna patterns
 NETX(__m128* _f0=(__m128*)pAPN[0]; __m128* _F0=(__m128*)(pAPN[0]+I);,
 __m128* _f1=(__m128*)pAPN[1]; __m128* _F1=(__m128*)(pAPN[1]+I);,
 __m128* _f2=(__m128*)pAPN[2]; __m128* _F2=(__m128*)(pAPN[2]+I);,
 __m128* _f3=(__m128*)pAPN[3]; __m128* _F3=(__m128*)(pAPN[3]+I);,
 __m128* _f4=(__m128*)pAPN[4]; __m128* _F4=(__m128*)(pAPN[4]+I);,
 __m128* _f5=(__m128*)pAPN[5]; __m128* _F5=(__m128*)(pAPN[5]+I);,
 __m128* _f6=(__m128*)pAPN[6]; __m128* _F6=(__m128*)(pAPN[6]+I);,
 __m128* _f7=(__m128*)pAPN[7]; __m128* _F7=(__m128*)(pAPN[7]+I);)

 // pointers to data
 NETX(__m128* _p0 = (__m128*)p[0]; __m128* _q0 = (__m128*)q[0];,
 __m128* _p1 = (__m128*)p[1]; __m128* _q1 = (__m128*)q[1];,
 __m128* _p2 = (__m128*)p[2]; __m128* _q2 = (__m128*)q[2];,
 __m128* _p3 = (__m128*)p[3]; __m128* _q3 = (__m128*)q[3];,
 __m128* _p4 = (__m128*)p[4]; __m128* _q4 = (__m128*)q[4];,
 __m128* _p5 = (__m128*)p[5]; __m128* _q5 = (__m128*)q[5];,
 __m128* _p6 = (__m128*)p[6]; __m128* _q6 = (__m128*)q[6];,
 __m128* _p7 = (__m128*)p[7]; __m128* _q7 = (__m128*)q[7];)

 int m=0;                                        // scalar output index into r,a,R,A
 for(int i=0; i<I; i+=4) {

// Compute scalar products

 _mk = _mm_and_ps(_mm_cmpgt_ps(*_MK++,_0),_1); // event mask - apply energy threshold En

 NETX(
 _xp = _mm_mul_ps(*_f0,_mm_mul_ps(_mk,*_p0)); // (x,f+)
 _XP = _mm_mul_ps(*_f0,_mm_mul_ps(_mk,*_q0)); // (X,f+)
 _xx = _mm_mul_ps(*_F0,_mm_mul_ps(_mk,*_p0)); // (x,fx)
 _XX = _mm_mul_ps(*_F0,_mm_mul_ps(_mk,*_q0)); , // (X,fx)

 _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f1,_mm_mul_ps(_mk,*_p1))); // (x,f+)
 _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f1,_mm_mul_ps(_mk,*_q1))); // (X,f+)
 _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F1,_mm_mul_ps(_mk,*_p1))); // (x,fx)
 _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F1,_mm_mul_ps(_mk,*_q1))); , // (X,fx)

 _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f2,_mm_mul_ps(_mk,*_p2))); // (x,f+)
 _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f2,_mm_mul_ps(_mk,*_q2))); // (X,f+)
 _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F2,_mm_mul_ps(_mk,*_p2))); // (x,fx)
 _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F2,_mm_mul_ps(_mk,*_q2))); , // (X,fx)

 _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f3,_mm_mul_ps(_mk,*_p3))); // (x,f+)
 _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f3,_mm_mul_ps(_mk,*_q3))); // (X,f+)
 _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F3,_mm_mul_ps(_mk,*_p3))); // (x,fx)
 _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F3,_mm_mul_ps(_mk,*_q3))); , // (X,fx)

 _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f4,_mm_mul_ps(_mk,*_p4))); // (x,f+)
 _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f4,_mm_mul_ps(_mk,*_q4))); // (X,f+)
 _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F4,_mm_mul_ps(_mk,*_p4))); // (x,fx)
 _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F4,_mm_mul_ps(_mk,*_q4))); , // (X,fx)

 _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f5,_mm_mul_ps(_mk,*_p5))); // (x,f+)
 _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f5,_mm_mul_ps(_mk,*_q5))); // (X,f+)
 _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F5,_mm_mul_ps(_mk,*_p5))); // (x,fx)
 _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F5,_mm_mul_ps(_mk,*_q5))); , // (X,fx)

 _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f6,_mm_mul_ps(_mk,*_p6))); // (x,f+)
 _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f6,_mm_mul_ps(_mk,*_q6))); // (X,f+)
 _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F6,_mm_mul_ps(_mk,*_p6))); // (x,fx)
 _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F6,_mm_mul_ps(_mk,*_q6))); , // (X,fx)

 _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f7,_mm_mul_ps(_mk,*_p7))); // (x,f+)
 _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f7,_mm_mul_ps(_mk,*_q7))); // (X,f+)
 _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F7,_mm_mul_ps(_mk,*_p7))); // (x,fx)
 _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F7,_mm_mul_ps(_mk,*_q7))); ) // (X,fx)

// 00/90 components in polar coordinates (pol00/90[0] : radius, pol00/90[1] : angle in radians)

 _cc = _mm_div_ps(_xp,_mm_add_ps(_mm_sqrt_ps(*_fp),_o)); // (x,f+) / {|f+|+epsilon}
 _ss = _mm_div_ps(_xx,_mm_add_ps(_mm_sqrt_ps(*_fx),_o)); // (x,fx) / {|fx|+epsilon}

 _rr = _mm_div_ps(_mm_mul_ps(_xp,_xp),_mm_add_ps(*_fp,_o));
 _rr = _mm_add_ps(_rr,_mm_div_ps(_mm_mul_ps(_xx,_xx),_mm_add_ps(*_fx,_o)));

 _mm_storeu_ps(cpol,_cc); // cos
 _mm_storeu_ps(spol,_ss); // sin
 _mm_storeu_ps(rpol,_rr); // (x,x);

 _CC = _mm_div_ps(_XP,_mm_add_ps(_mm_sqrt_ps(*_fp),_o)); // (X,f+) / {|f+|+epsilon}
 _SS = _mm_div_ps(_XX,_mm_add_ps(_mm_sqrt_ps(*_fx),_o)); // (X,fx) / {|fx|+epsilon}

 _RR = _mm_div_ps(_mm_mul_ps(_XP,_XP),_mm_add_ps(*_fp,_o));
 _RR = _mm_add_ps(_RR,_mm_div_ps(_mm_mul_ps(_XX,_XX),_mm_add_ps(*_fx,_o)));

 _mm_storeu_ps(CPOL,_CC); // cos
 _mm_storeu_ps(SPOL,_SS); // sin
 _mm_storeu_ps(RPOL,_RR); // (X,X);

 // unpack the 4 lanes into the scalar polar-coordinate output arrays
 for(int n=0;n<4;n++) {
 r[m] = sqrt(rpol[n]); // |x|
 a[m] = atan2(spol[n],cpol[n]); // atan2(spol,cpol)
 R[m] = sqrt(RPOL[n]); // |X|
 A[m] = atan2(SPOL[n],CPOL[n]); // atan2(SPOL,CPOL)
 m++;
 }

// PnP - Projection to network Plane

 _cc = _mm_div_ps(_cc,_mm_add_ps(_mm_sqrt_ps(*_fp),_o)); // (x,f+) / {|f+|^2+epsilon}
 _ss = _mm_div_ps(_ss,_mm_add_ps(_mm_sqrt_ps(*_fx),_o)); // (x,fx) / {|fx|^2+epsilon}

 _CC = _mm_div_ps(_CC,_mm_add_ps(_mm_sqrt_ps(*_fp),_o)); // (X,f+) / {|f+|^2+epsilon}
 _SS = _mm_div_ps(_SS,_mm_add_ps(_mm_sqrt_ps(*_fx),_o)); // (X,fx) / {|fx|^2+epsilon}

 // overwrite p,q with the data projected onto the (f+,fx) plane
 NETX(*_p0 = _mm_add_ps(_mm_mul_ps(*_f0,_cc),_mm_mul_ps(*_F0,_ss));
 *_q0 = _mm_add_ps(_mm_mul_ps(*_f0,_CC),_mm_mul_ps(*_F0,_SS)); ,
 *_p1 = _mm_add_ps(_mm_mul_ps(*_f1,_cc),_mm_mul_ps(*_F1,_ss));
 *_q1 = _mm_add_ps(_mm_mul_ps(*_f1,_CC),_mm_mul_ps(*_F1,_SS)); ,
 *_p2 = _mm_add_ps(_mm_mul_ps(*_f2,_cc),_mm_mul_ps(*_F2,_ss));
 *_q2 = _mm_add_ps(_mm_mul_ps(*_f2,_CC),_mm_mul_ps(*_F2,_SS)); ,
 *_p3 = _mm_add_ps(_mm_mul_ps(*_f3,_cc),_mm_mul_ps(*_F3,_ss));
 *_q3 = _mm_add_ps(_mm_mul_ps(*_f3,_CC),_mm_mul_ps(*_F3,_SS)); ,
 *_p4 = _mm_add_ps(_mm_mul_ps(*_f4,_cc),_mm_mul_ps(*_F4,_ss));
 *_q4 = _mm_add_ps(_mm_mul_ps(*_f4,_CC),_mm_mul_ps(*_F4,_SS)); ,
 *_p5 = _mm_add_ps(_mm_mul_ps(*_f5,_cc),_mm_mul_ps(*_F5,_ss));
 *_q5 = _mm_add_ps(_mm_mul_ps(*_f5,_CC),_mm_mul_ps(*_F5,_SS)); ,
 *_p6 = _mm_add_ps(_mm_mul_ps(*_f6,_cc),_mm_mul_ps(*_F6,_ss));
 *_q6 = _mm_add_ps(_mm_mul_ps(*_f6,_CC),_mm_mul_ps(*_F6,_SS)); ,
 *_p7 = _mm_add_ps(_mm_mul_ps(*_f7,_cc),_mm_mul_ps(*_F7,_ss));
 *_q7 = _mm_add_ps(_mm_mul_ps(*_f7,_CC),_mm_mul_ps(*_F7,_SS)); )

// DSP - Dual Stream Phase Transform

 __m128 _N = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(_cc,_cc),_mm_mul_ps(_CC,_CC)));

 _cc = _mm_div_ps(_cc,_mm_add_ps(_N,_o)); // cos_dsp = N * (x,f+)/|f+|^2
 _CC = _mm_div_ps(_CC,_mm_add_ps(_N,_o)); // sin_dsp = N * (X,f+)/|f+|^2

 __m128 _y,_Y;

 // rotate each (p,q) pair by the DSP angle in place
 NETX(_y = _mm_add_ps(_mm_mul_ps(*_p0,_cc),_mm_mul_ps(*_q0,_CC));
 _Y = _mm_sub_ps(_mm_mul_ps(*_q0,_cc),_mm_mul_ps(*_p0,_CC)); *_p0 = _y; *_q0 = _Y; ,
 _y = _mm_add_ps(_mm_mul_ps(*_p1,_cc),_mm_mul_ps(*_q1,_CC));
 _Y = _mm_sub_ps(_mm_mul_ps(*_q1,_cc),_mm_mul_ps(*_p1,_CC)); *_p1 = _y; *_q1 = _Y; ,
 _y = _mm_add_ps(_mm_mul_ps(*_p2,_cc),_mm_mul_ps(*_q2,_CC));
 _Y = _mm_sub_ps(_mm_mul_ps(*_q2,_cc),_mm_mul_ps(*_p2,_CC)); *_p2 = _y; *_q2 = _Y; ,
 _y = _mm_add_ps(_mm_mul_ps(*_p3,_cc),_mm_mul_ps(*_q3,_CC));
 _Y = _mm_sub_ps(_mm_mul_ps(*_q3,_cc),_mm_mul_ps(*_p3,_CC)); *_p3 = _y; *_q3 = _Y; ,
 _y = _mm_add_ps(_mm_mul_ps(*_p4,_cc),_mm_mul_ps(*_q4,_CC));
 _Y = _mm_sub_ps(_mm_mul_ps(*_q4,_cc),_mm_mul_ps(*_p4,_CC)); *_p4 = _y; *_q4 = _Y; ,
 _y = _mm_add_ps(_mm_mul_ps(*_p5,_cc),_mm_mul_ps(*_q5,_CC));
 _Y = _mm_sub_ps(_mm_mul_ps(*_q5,_cc),_mm_mul_ps(*_p5,_CC)); *_p5 = _y; *_q5 = _Y; ,
 _y = _mm_add_ps(_mm_mul_ps(*_p6,_cc),_mm_mul_ps(*_q6,_CC));
 _Y = _mm_sub_ps(_mm_mul_ps(*_q6,_cc),_mm_mul_ps(*_p6,_CC)); *_p6 = _y; *_q6 = _Y; ,
 _y = _mm_add_ps(_mm_mul_ps(*_p7,_cc),_mm_mul_ps(*_q7,_CC));
 _Y = _mm_sub_ps(_mm_mul_ps(*_q7,_cc),_mm_mul_ps(*_p7,_CC)); *_p7 = _y; *_q7 = _Y; )


// Increment pointers

 NETX(
 _p0++;_q0++;_f0++;_F0++; ,
 _p1++;_q1++;_f1++;_F1++; ,
 _p2++;_q2++;_f2++;_F2++; ,
 _p3++;_q3++;_f3++;_F3++; ,
 _p4++;_q4++;_f4++;_F4++; ,
 _p5++;_q5++;_f5++;_F5++; ,
 _p6++;_q6++;_f6++;_F6++; ,
 _p7++;_q7++;_f7++;_F7++; )

 _fp++;_fx++;
 }

 return;
}
632 
633 
634 static inline float _avx_loadata_ps(float** p, float** q,
635  float** u, float** v, float En,
636  std::vector<float*> &pAVX, int I) {
637 // load data vectors p,q into temporary arrays u,v
638 // calculate total energy of network pixels stored in pAVX[5]
639 // apply energy threshold En, initialize pixel mask stored in pAVX[24]
640 // returns total energy
641 // in likelihoodWP these arrays should be stored exactly in the same order.
642 // this function increments pointers stored in tne input pointer arrays.
643  __m128 _aa, _AA, _aA;
644 
645  static const __m128 _1 = _mm_set1_ps(1);
646  static const __m128 _o = _mm_set1_ps(1.e-12);
647  static const __m128 sm = _mm_set1_ps(-0.f); // -0.f = 1 << 31
648  static const __m128 _En= _mm_set1_ps(En);
649 
650  __m128* _et = (__m128*)pAVX[0];
651  __m128* _mk = (__m128*)pAVX[1];
652  __m128 _ee = _mm_setzero_ps();
653  __m128 _EE = _mm_setzero_ps();
654  __m128 _NN = _mm_setzero_ps();
655 
656  NETX(__m128* _p0 = (__m128*)p[0]; __m128* _q0 = (__m128*)q[0]; ,
657  __m128* _p1 = (__m128*)p[1]; __m128* _q1 = (__m128*)q[1]; ,
658  __m128* _p2 = (__m128*)p[2]; __m128* _q2 = (__m128*)q[2]; ,
659  __m128* _p3 = (__m128*)p[3]; __m128* _q3 = (__m128*)q[3]; ,
660  __m128* _p4 = (__m128*)p[4]; __m128* _q4 = (__m128*)q[4]; ,
661  __m128* _p5 = (__m128*)p[5]; __m128* _q5 = (__m128*)q[5]; ,
662  __m128* _p6 = (__m128*)p[6]; __m128* _q6 = (__m128*)q[6]; ,
663  __m128* _p7 = (__m128*)p[7]; __m128* _q7 = (__m128*)q[7]; )
664 
665  NETX(__m128* _u0 = (__m128*)u[0]; __m128* _v0 = (__m128*)v[0]; ,
666  __m128* _u1 = (__m128*)u[1]; __m128* _v1 = (__m128*)v[1]; ,
667  __m128* _u2 = (__m128*)u[2]; __m128* _v2 = (__m128*)v[2]; ,
668  __m128* _u3 = (__m128*)u[3]; __m128* _v3 = (__m128*)v[3]; ,
669  __m128* _u4 = (__m128*)u[4]; __m128* _v4 = (__m128*)v[4]; ,
670  __m128* _u5 = (__m128*)u[5]; __m128* _v5 = (__m128*)v[5]; ,
671  __m128* _u6 = (__m128*)u[6]; __m128* _v6 = (__m128*)v[6]; ,
672  __m128* _u7 = (__m128*)u[7]; __m128* _v7 = (__m128*)v[7]; )
673 
674  for(int i=0; i<I; i+=4) {
675  NETX(
676  _aa = _mm_mul_ps(*_p0,*_p0); *_u0++ = *_p0++;
677  _AA = _mm_mul_ps(*_q0,*_q0); *_v0++ = *_q0++; ,
678  _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p1,*_p1)); *_u1++ = *_p1++;
679  _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q1,*_q1)); *_v1++ = *_q1++; ,
680  _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p2,*_p2)); *_u2++ = *_p2++;
681  _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q2,*_q2)); *_v2++ = *_q2++; ,
682  _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p3,*_p3)); *_u3++ = *_p3++;
683  _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q3,*_q3)); *_v3++ = *_q3++; ,
684  _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p4,*_p4)); *_u4++ = *_p4++;
685  _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q4,*_q4)); *_v4++ = *_q4++; ,
686  _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p5,*_p5)); *_u5++ = *_p5++;
687  _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q5,*_q5)); *_v5++ = *_q5++; ,
688  _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p6,*_p6)); *_u6++ = *_p6++;
689  _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q6,*_q6)); *_v6++ = *_q6++; ,
690  _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p7,*_p7)); *_u7++ = *_p7++;
691  _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q7,*_q7)); *_v7++ = *_q7++; )
692 
693 
694 // calculate data network orthogonalization sin and cos
695 
696  *_et = _mm_add_ps(_mm_add_ps(_aa,_AA),_o); // total energy
697  *_mk = _mm_and_ps(_mm_cmpgt_ps(*_et,_En),_1); // 1 if et>En or 0 if et<En
698  _NN = _mm_add_ps(_NN,*_mk); // total energy threshold
699  _ee = _mm_add_ps(_ee,*_et); // total energy threshold
700  *_et = _mm_mul_ps(*_et,*_mk++); // apply En threshold
701  _EE = _mm_add_ps(_EE,*_et++); // total energy above threshold
702  }
703  pAVX[1][I+1] = _wat_hsum(_NN); // store number of pixels
704  return _wat_hsum(_EE)/2; // total energy in one quadrature
705 }
706 
static inline void _avx_loadNULL_ps(float** n, float** N,
                                    float** d, float** D,
                                    float** h, float** H, int I) {
// load NULL packet amplitudes for all detectors and pixels:
// null stream = data minus reconstructed response, computed per phase as
// n = d - h (00 phase) and N = D - H (90 phase)
// these amplitudes are used for reconstruction of data time series
// n/N - output null-stream arrays (00/90 phase), one per detector
// d/D - input data arrays (00/90 phase), one per detector
// h/H - input reconstructed signal arrays (00/90 phase), one per detector
// I   - number of pixels (processed 4 at a time as __m128)
// NOTE(review): original comment said "works only for <4 detector", but the
// NETX unroll below covers up to 8 detectors - confirm the actual limit.
   NETX(__m128* _n0 = (__m128*)n[0]; __m128* _N0 = (__m128*)N[0]; ,
        __m128* _n1 = (__m128*)n[1]; __m128* _N1 = (__m128*)N[1]; ,
        __m128* _n2 = (__m128*)n[2]; __m128* _N2 = (__m128*)N[2]; ,
        __m128* _n3 = (__m128*)n[3]; __m128* _N3 = (__m128*)N[3]; ,
        __m128* _n4 = (__m128*)n[4]; __m128* _N4 = (__m128*)N[4]; ,
        __m128* _n5 = (__m128*)n[5]; __m128* _N5 = (__m128*)N[5]; ,
        __m128* _n6 = (__m128*)n[6]; __m128* _N6 = (__m128*)N[6]; ,
        __m128* _n7 = (__m128*)n[7]; __m128* _N7 = (__m128*)N[7]; )

   NETX(__m128* _d0 = (__m128*)d[0]; __m128* _D0 = (__m128*)D[0]; ,
        __m128* _d1 = (__m128*)d[1]; __m128* _D1 = (__m128*)D[1]; ,
        __m128* _d2 = (__m128*)d[2]; __m128* _D2 = (__m128*)D[2]; ,
        __m128* _d3 = (__m128*)d[3]; __m128* _D3 = (__m128*)D[3]; ,
        __m128* _d4 = (__m128*)d[4]; __m128* _D4 = (__m128*)D[4]; ,
        __m128* _d5 = (__m128*)d[5]; __m128* _D5 = (__m128*)D[5]; ,
        __m128* _d6 = (__m128*)d[6]; __m128* _D6 = (__m128*)D[6]; ,
        __m128* _d7 = (__m128*)d[7]; __m128* _D7 = (__m128*)D[7]; )

   NETX(__m128* _h0 = (__m128*)h[0]; __m128* _H0 = (__m128*)H[0]; ,
        __m128* _h1 = (__m128*)h[1]; __m128* _H1 = (__m128*)H[1]; ,
        __m128* _h2 = (__m128*)h[2]; __m128* _H2 = (__m128*)H[2]; ,
        __m128* _h3 = (__m128*)h[3]; __m128* _H3 = (__m128*)H[3]; ,
        __m128* _h4 = (__m128*)h[4]; __m128* _H4 = (__m128*)H[4]; ,
        __m128* _h5 = (__m128*)h[5]; __m128* _H5 = (__m128*)H[5]; ,
        __m128* _h6 = (__m128*)h[6]; __m128* _H6 = (__m128*)H[6]; ,
        __m128* _h7 = (__m128*)h[7]; __m128* _H7 = (__m128*)H[7]; )

   // per pixel-quad, per detector: subtract reconstructed response from data
   for(int i=0; i<I; i+=4) {
      NETX(*_n0++ = _mm_sub_ps(*_d0++,*_h0++); *_N0++ = _mm_sub_ps(*_D0++,*_H0++); ,
           *_n1++ = _mm_sub_ps(*_d1++,*_h1++); *_N1++ = _mm_sub_ps(*_D1++,*_H1++); ,
           *_n2++ = _mm_sub_ps(*_d2++,*_h2++); *_N2++ = _mm_sub_ps(*_D2++,*_H2++); ,
           *_n3++ = _mm_sub_ps(*_d3++,*_h3++); *_N3++ = _mm_sub_ps(*_D3++,*_H3++); ,
           *_n4++ = _mm_sub_ps(*_d4++,*_h4++); *_N4++ = _mm_sub_ps(*_D4++,*_H4++); ,
           *_n5++ = _mm_sub_ps(*_d5++,*_h5++); *_N5++ = _mm_sub_ps(*_D5++,*_H5++); ,
           *_n6++ = _mm_sub_ps(*_d6++,*_h6++); *_N6++ = _mm_sub_ps(*_D6++,*_H6++); ,
           *_n7++ = _mm_sub_ps(*_d7++,*_h7++); *_N7++ = _mm_sub_ps(*_D7++,*_H7++); )
   }
   return;
}
752 
753 
754 static inline float _avx_packet_ps(float** p, float** q,
755  std::vector<float*> &pAVX, int I) {
756 // calculates packet rotation sin/cos, amplitudes and unit vectors
757 // initialize unit vector arrays stored in pAVX in format used in
758 // in likelihoodWP these arrays should be stored exactly in the same order.
759 // this function increments pointers stored in tne input pointer arrays.
760 
761  int II = abs(I)*2;
762  __m128 _a, _A, _aa, _AA, _aA, _x, _cc, _ss, _nn, _cn, _sn, _mk;
763 
764  static const __m128 _0 = _mm_set1_ps(0);
765  static const __m128 _1 = _mm_set1_ps(1);
766  static const __m128 _2 = _mm_set1_ps(2);
767  static const __m128 _o = _mm_set1_ps(0.0001);
768  static const __m128 sm = _mm_set1_ps(-0.f); // -0.f = 1 << 31
769 
770  float a[8] __attribute__((aligned(32))); __m128* _am = (__m128*)a;
771  float A[8] __attribute__((aligned(32))); __m128* _AM = (__m128*)A;
772  float s[8] __attribute__((aligned(32))); __m128* _si = (__m128*)s;
773  float c[8] __attribute__((aligned(32))); __m128* _co = (__m128*)c;
774 
775  __m128* _MK = (__m128*)(pAVX[1]);
776  __m128 aa[8], AA[8], aA[8], si[8], co[8];
777  for(int i=0; i<8; i++) {
778  aa[i] = _mm_setzero_ps();
779  AA[i] = _mm_setzero_ps();
780  aA[i] = _mm_setzero_ps();
781  }
782 
783  NETX(__m128* _p0 = (__m128*)p[0]; __m128* _q0 = (__m128*)q[0]; ,
784  __m128* _p1 = (__m128*)p[1]; __m128* _q1 = (__m128*)q[1]; ,
785  __m128* _p2 = (__m128*)p[2]; __m128* _q2 = (__m128*)q[2]; ,
786  __m128* _p3 = (__m128*)p[3]; __m128* _q3 = (__m128*)q[3]; ,
787  __m128* _p4 = (__m128*)p[4]; __m128* _q4 = (__m128*)q[4]; ,
788  __m128* _p5 = (__m128*)p[5]; __m128* _q5 = (__m128*)q[5]; ,
789  __m128* _p6 = (__m128*)p[6]; __m128* _q6 = (__m128*)q[6]; ,
790  __m128* _p7 = (__m128*)p[7]; __m128* _q7 = (__m128*)q[7]; )
791 
792  for(int i=0; i<I; i+=4) {
793  _mk = _mm_and_ps(_mm_cmpgt_ps(*_MK++,_0),_1); // event mask
794  NETX(
795  aa[0] = _mm_add_ps(aa[0],_mm_mul_ps(_mm_mul_ps(*_p0,*_p0), _mk));
796  AA[0] = _mm_add_ps(AA[0],_mm_mul_ps(_mm_mul_ps(*_q0,*_q0), _mk));
797  aA[0] = _mm_add_ps(aA[0],_mm_mul_ps(_mm_mul_ps(*_p0++,*_q0++),_mk));,
798 
799  aa[1] = _mm_add_ps(aa[1],_mm_mul_ps(_mm_mul_ps(*_p1,*_p1), _mk));
800  AA[1] = _mm_add_ps(AA[1],_mm_mul_ps(_mm_mul_ps(*_q1,*_q1), _mk));
801  aA[1] = _mm_add_ps(aA[1],_mm_mul_ps(_mm_mul_ps(*_p1++,*_q1++),_mk));,
802 
803  aa[2] = _mm_add_ps(aa[2],_mm_mul_ps(_mm_mul_ps(*_p2,*_p2), _mk));
804  AA[2] = _mm_add_ps(AA[2],_mm_mul_ps(_mm_mul_ps(*_q2,*_q2), _mk));
805  aA[2] = _mm_add_ps(aA[2],_mm_mul_ps(_mm_mul_ps(*_p2++,*_q2++),_mk));,
806 
807  aa[3] = _mm_add_ps(aa[3],_mm_mul_ps(_mm_mul_ps(*_p3,*_p3), _mk));
808  AA[3] = _mm_add_ps(AA[3],_mm_mul_ps(_mm_mul_ps(*_q3,*_q3), _mk));
809  aA[3] = _mm_add_ps(aA[3],_mm_mul_ps(_mm_mul_ps(*_p3++,*_q3++),_mk));,
810 
811  aa[4] = _mm_add_ps(aa[4],_mm_mul_ps(_mm_mul_ps(*_p4,*_p4), _mk));
812  AA[4] = _mm_add_ps(AA[4],_mm_mul_ps(_mm_mul_ps(*_q4,*_q4), _mk));
813  aA[4] = _mm_add_ps(aA[4],_mm_mul_ps(_mm_mul_ps(*_p4++,*_q4++),_mk));,
814 
815  aa[5] = _mm_add_ps(aa[5],_mm_mul_ps(_mm_mul_ps(*_p5,*_p5), _mk));
816  AA[5] = _mm_add_ps(AA[5],_mm_mul_ps(_mm_mul_ps(*_q5,*_q5), _mk));
817  aA[5] = _mm_add_ps(aA[5],_mm_mul_ps(_mm_mul_ps(*_p5++,*_q5++),_mk));,
818 
819  aa[6] = _mm_add_ps(aa[6],_mm_mul_ps(_mm_mul_ps(*_p6,*_p6), _mk));
820  AA[6] = _mm_add_ps(AA[6],_mm_mul_ps(_mm_mul_ps(*_q6,*_q6), _mk));
821  aA[6] = _mm_add_ps(aA[6],_mm_mul_ps(_mm_mul_ps(*_p6++,*_q6++),_mk));,
822 
823  aa[7] = _mm_add_ps(aa[7],_mm_mul_ps(_mm_mul_ps(*_p7,*_p7), _mk));
824  AA[7] = _mm_add_ps(AA[7],_mm_mul_ps(_mm_mul_ps(*_q7,*_q7), _mk));
825  aA[7] = _mm_add_ps(aA[7],_mm_mul_ps(_mm_mul_ps(*_p7++,*_q7++),_mk));)
826  }
827 
828 // packet amplitudes and sin/cos for detectors 0-3
829  _a = _mm_hadd_ps(aa[0],aa[1]); _A = _mm_hadd_ps(aa[2],aa[3]); _aa = _mm_hadd_ps(_a,_A);
830  _a = _mm_hadd_ps(AA[0],AA[1]); _A = _mm_hadd_ps(AA[2],AA[3]); _AA = _mm_hadd_ps(_a,_A);
831  _a = _mm_hadd_ps(aA[0],aA[1]); _A = _mm_hadd_ps(aA[2],aA[3]); _aA = _mm_hadd_ps(_a,_A);
832 
833  *_si = _mm_mul_ps(_aA,_2); // rotation 2*sin*cos*norm
834  *_co = _mm_sub_ps(_aa,_AA); // rotation (cos^2-sin^2)*norm
835  _x = _mm_add_ps(_mm_add_ps(_aa,_AA),_o); // total energy
836  _cc = _mm_mul_ps(*_co,*_co);
837  _ss = _mm_mul_ps(*_si,*_si);
838  _nn = _mm_sqrt_ps(_mm_add_ps(_cc,_ss)); // co/si norm
839  *_am = _mm_sqrt_ps(_mm_div_ps(_mm_add_ps(_x,_nn),_2)); // first component amplitude
840  *_AM = _mm_div_ps(_mm_sub_ps(_x,_nn),_2); // second component energy
841  *_AM = _mm_sqrt_ps(_mm_andnot_ps(sm,*_AM)); // make sure |AM|>0
842  _cc = _mm_div_ps(*_co,_mm_add_ps(_nn,_o)); // cos(2p)
843  _nn = _mm_and_ps(_mm_cmpgt_ps(*_si,_0),_1); // 1 if sin(2p)>0. or 0 if sin(2p)<0.
844  _ss = _mm_sub_ps(_mm_mul_ps(_2,_nn),_1); // 1 if sin(2p)>0. or-1 if sin(2p)<0.
845  *_si = _mm_sqrt_ps(_mm_div_ps(_mm_sub_ps(_1,_cc),_2)); // |sin(p)|
846  *_co = _mm_sqrt_ps(_mm_div_ps(_mm_add_ps(_1,_cc),_2)); // |cos(p)|
847  *_co = _mm_mul_ps(*_co,_ss); // cos(p)
848 
849 // packet amplitudes and sin/cos for detectors 4-7
850  _am++; _AM++; _si++; _co++;
851  _a = _mm_hadd_ps(aa[4],aa[5]); _A = _mm_hadd_ps(aa[6],aa[7]); _aa = _mm_hadd_ps(_a,_A);
852  _a = _mm_hadd_ps(AA[4],AA[5]); _A = _mm_hadd_ps(AA[6],AA[7]); _AA = _mm_hadd_ps(_a,_A);
853  _a = _mm_hadd_ps(aA[4],aA[5]); _A = _mm_hadd_ps(aA[6],aA[7]); _aA = _mm_hadd_ps(_a,_A);
854 
855  *_si = _mm_mul_ps(_aA,_2); // rotation 2*sin*cos*norm
856  *_co = _mm_sub_ps(_aa,_AA); // rotation (cos^2-sin^2)*norm
857  _x = _mm_add_ps(_mm_add_ps(_aa,_AA),_o); // total energy
858  _cc = _mm_mul_ps(*_co,*_co);
859  _ss = _mm_mul_ps(*_si,*_si);
860  _nn = _mm_sqrt_ps(_mm_add_ps(_cc,_ss)); // co/si norm
861  *_am = _mm_sqrt_ps(_mm_div_ps(_mm_add_ps(_x,_nn),_2)); // first component amplitude
862  *_AM = _mm_div_ps(_mm_sub_ps(_x,_nn),_2); // second component energy
863  *_AM = _mm_sqrt_ps(_mm_andnot_ps(sm,*_AM)); // make sure |AM|>0
864  _cc = _mm_div_ps(*_co,_mm_add_ps(_nn,_o)); // cos(2p)
865  _nn = _mm_and_ps(_mm_cmpgt_ps(*_si,_0),_1); // 1 if sin(2p)>0. or 0 if sin(2p)<0.
866  _ss = _mm_sub_ps(_mm_mul_ps(_2,_nn),_1); // 1 if sin(2p)>0. or-1 if sin(2p)<0.
867  *_si = _mm_sqrt_ps(_mm_div_ps(_mm_sub_ps(_1,_cc),_2)); // |sin(p)|
868  *_co = _mm_sqrt_ps(_mm_div_ps(_mm_add_ps(_1,_cc),_2)); // |cos(p)|
869  *_co = _mm_mul_ps(*_co,_ss); // cos(p)
870 
871  //cout<<"packet1: "<<a[0]<<" "<<A[0]<<" "<<a[1]<<" "<<A[1]<<" "<<a[2]<<" "<<A[2]<<" ";
872  //cout<<((a[0]+A[0])*(a[0]+A[0])/2+(a[1]+A[1])*(a[1]+A[1])/2+(a[2]+A[2])*(a[2]+A[2])/2)<<"\n";
873 
874  NETX(q[0][II]=a[0]; q[0][II+1]=A[0]; q[0][II+2]=s[0]; q[0][II+3]=c[0]; q[0][II+5]=1;,
875  q[1][II]=a[1]; q[1][II+1]=A[1]; q[1][II+2]=s[1]; q[1][II+3]=c[1]; q[1][II+5]=1;,
876  q[2][II]=a[2]; q[2][II+1]=A[2]; q[2][II+2]=s[2]; q[2][II+3]=c[2]; q[2][II+5]=1;,
877  q[3][II]=a[3]; q[3][II+1]=A[3]; q[3][II+2]=s[3]; q[3][II+3]=c[3]; q[3][II+5]=1;,
878  q[4][II]=a[4]; q[4][II+1]=A[4]; q[4][II+2]=s[4]; q[4][II+3]=c[4]; q[4][II+5]=1;,
879  q[5][II]=a[5]; q[5][II+1]=A[5]; q[5][II+2]=s[5]; q[5][II+3]=c[5]; q[5][II+5]=1;,
880  q[6][II]=a[6]; q[6][II+1]=A[6]; q[6][II+2]=s[6]; q[6][II+3]=c[6]; q[6][II+5]=1;,
881  q[7][II]=a[7]; q[7][II+1]=A[7]; q[7][II+2]=s[7]; q[7][II+3]=c[7]; q[7][II+5]=1;)
882 
883  float Ep = 0;
884  NETX(q[0][II+4]=(a[0]+A[0]); Ep+=q[0][II+4]*q[0][II+4]/2; _p0 = (__m128*)p[0]; _q0 = (__m128*)q[0];,
885  q[1][II+4]=(a[1]+A[1]); Ep+=q[1][II+4]*q[1][II+4]/2; _p1 = (__m128*)p[1]; _q1 = (__m128*)q[1];,
886  q[2][II+4]=(a[2]+A[2]); Ep+=q[2][II+4]*q[2][II+4]/2; _p2 = (__m128*)p[2]; _q2 = (__m128*)q[2];,
887  q[3][II+4]=(a[3]+A[3]); Ep+=q[3][II+4]*q[3][II+4]/2; _p3 = (__m128*)p[3]; _q3 = (__m128*)q[3];,
888  q[4][II+4]=(a[4]+A[4]); Ep+=q[4][II+4]*q[4][II+4]/2; _p4 = (__m128*)p[4]; _q4 = (__m128*)q[4];,
889  q[5][II+4]=(a[5]+A[5]); Ep+=q[5][II+4]*q[5][II+4]/2; _p5 = (__m128*)p[5]; _q5 = (__m128*)q[5];,
890  q[6][II+4]=(a[6]+A[6]); Ep+=q[6][II+4]*q[6][II+4]/2; _p6 = (__m128*)p[6]; _q6 = (__m128*)q[6];,
891  q[7][II+4]=(a[7]+A[7]); Ep+=q[7][II+4]*q[7][II+4]/2; _p7 = (__m128*)p[7]; _q7 = (__m128*)q[7];)
892 
893  *_am = _mm_div_ps(_1,_mm_add_ps(*_am,_o)); _am--;
894  *_am = _mm_div_ps(_1,_mm_add_ps(*_am,_o));
895  *_AM = _mm_div_ps(_1,_mm_add_ps(*_AM,_o)); _AM--;
896  *_AM = _mm_div_ps(_1,_mm_add_ps(*_AM,_o));
897 
898  //cout<<"packet2: "<<a[0]<<" "<<A[0]<<" "<<a[1]<<" "<<A[1]<<" "<<a[2]<<" "<<A[2]<<" ";
899 
900 
901  NETX(aa[0]=_mm_set1_ps(a[0]); AA[0]=_mm_set1_ps(A[0]); si[0]=_mm_set1_ps(s[0]); co[0]=_mm_set1_ps(c[0]); ,
902  aa[1]=_mm_set1_ps(a[1]); AA[1]=_mm_set1_ps(A[1]); si[1]=_mm_set1_ps(s[1]); co[1]=_mm_set1_ps(c[1]); ,
903  aa[2]=_mm_set1_ps(a[2]); AA[2]=_mm_set1_ps(A[2]); si[2]=_mm_set1_ps(s[2]); co[2]=_mm_set1_ps(c[2]); ,
904  aa[3]=_mm_set1_ps(a[3]); AA[3]=_mm_set1_ps(A[3]); si[3]=_mm_set1_ps(s[3]); co[3]=_mm_set1_ps(c[3]); ,
905  aa[4]=_mm_set1_ps(a[4]); AA[4]=_mm_set1_ps(A[4]); si[4]=_mm_set1_ps(s[4]); co[4]=_mm_set1_ps(c[4]); ,
906  aa[5]=_mm_set1_ps(a[5]); AA[5]=_mm_set1_ps(A[5]); si[5]=_mm_set1_ps(s[5]); co[5]=_mm_set1_ps(c[5]); ,
907  aa[6]=_mm_set1_ps(a[6]); AA[6]=_mm_set1_ps(A[6]); si[6]=_mm_set1_ps(s[6]); co[6]=_mm_set1_ps(c[6]); ,
908  aa[7]=_mm_set1_ps(a[7]); AA[7]=_mm_set1_ps(A[7]); si[7]=_mm_set1_ps(s[7]); co[7]=_mm_set1_ps(c[7]); )
909 
910  _MK = (__m128*)(pAVX[1]);
911  for(int i=0; i<I; i+=4) { // calculate and store unit vector components
912  _mk = _mm_and_ps(_mm_cmpgt_ps(*_MK++,_0),_1); // event mask
913  NETX(_a=_mm_add_ps(_mm_mul_ps(*_p0,co[0]),_mm_mul_ps(*_q0,si[0]));
914  _A=_mm_sub_ps(_mm_mul_ps(*_q0,co[0]),_mm_mul_ps(*_p0,si[0]));
915  *_p0++ = _mm_mul_ps(_mk,_mm_mul_ps(_a,aa[0]));
916  *_q0++ = _mm_mul_ps(_mk,_mm_mul_ps(_A,AA[0])); ,
917 
918  _a=_mm_add_ps(_mm_mul_ps(*_p1,co[1]),_mm_mul_ps(*_q1,si[1]));
919  _A=_mm_sub_ps(_mm_mul_ps(*_q1,co[1]),_mm_mul_ps(*_p1,si[1]));
920  *_p1++ = _mm_mul_ps(_mk,_mm_mul_ps(_a,aa[1]));
921  *_q1++ = _mm_mul_ps(_mk,_mm_mul_ps(_A,AA[1])); ,
922 
923  _a=_mm_add_ps(_mm_mul_ps(*_p2,co[2]),_mm_mul_ps(*_q2,si[2]));
924  _A=_mm_sub_ps(_mm_mul_ps(*_q2,co[2]),_mm_mul_ps(*_p2,si[2]));
925  *_p2++ = _mm_mul_ps(_mk,_mm_mul_ps(_a,aa[2]));
926  *_q2++ = _mm_mul_ps(_mk,_mm_mul_ps(_A,AA[2])); ,
927 
928  _a=_mm_add_ps(_mm_mul_ps(*_p3,co[3]),_mm_mul_ps(*_q3,si[3]));
929  _A=_mm_sub_ps(_mm_mul_ps(*_q3,co[3]),_mm_mul_ps(*_p3,si[3]));
930  *_p3++ = _mm_mul_ps(_mk,_mm_mul_ps(_a,aa[3]));
931  *_q3++ = _mm_mul_ps(_mk,_mm_mul_ps(_A,AA[3])); ,
932 
933  _a=_mm_add_ps(_mm_mul_ps(*_p4,co[4]),_mm_mul_ps(*_q4,si[4]));
934  _A=_mm_sub_ps(_mm_mul_ps(*_q4,co[4]),_mm_mul_ps(*_p4,si[4]));
935  *_p4++ = _mm_mul_ps(_mk,_mm_mul_ps(_a,aa[4]));
936  *_q4++ = _mm_mul_ps(_mk,_mm_mul_ps(_A,AA[4])); ,
937 
938  _a=_mm_add_ps(_mm_mul_ps(*_p5,co[5]),_mm_mul_ps(*_q5,si[5]));
939  _A=_mm_sub_ps(_mm_mul_ps(*_q5,co[5]),_mm_mul_ps(*_p5,si[5]));
940  *_p5++ = _mm_mul_ps(_mk,_mm_mul_ps(_a,aa[5]));
941  *_q5++ = _mm_mul_ps(_mk,_mm_mul_ps(_A,AA[5])); ,
942 
943  _a=_mm_add_ps(_mm_mul_ps(*_p6,co[6]),_mm_mul_ps(*_q6,si[6]));
944  _A=_mm_sub_ps(_mm_mul_ps(*_q6,co[6]),_mm_mul_ps(*_p6,si[6]));
945  *_p6++ = _mm_mul_ps(_mk,_mm_mul_ps(_a,aa[6]));
946  *_q6++ = _mm_mul_ps(_mk,_mm_mul_ps(_A,AA[6])); ,
947 
948  _a=_mm_add_ps(_mm_mul_ps(*_p7,co[7]),_mm_mul_ps(*_q7,si[7]));
949  _A=_mm_sub_ps(_mm_mul_ps(*_q7,co[7]),_mm_mul_ps(*_p7,si[7]));
950  *_p7++ = _mm_mul_ps(_mk,_mm_mul_ps(_a,aa[7]));
951  *_q7++ = _mm_mul_ps(_mk,_mm_mul_ps(_A,AA[7])); )
952  }
953  return Ep/2; // returm packet energy p[er quadrature
954 }
955 
956 static inline float _avx_setAMP_ps(float** p, float** q,
957  std::vector<float*> &pAVX, int I) {
958 // set packet amplitudes for waveform reconstruction
959 // returns number of degrees of freedom - effective # of pixels per detector
960  int II = I*2;
961  int I2 = II+2;
962  int I3 = II+3;
963  int I4 = II+4;
964  int I5 = II+5;
965  float k = pAVX[1][I]; // number of detectors
966 
967  __m128 _a, _A, _n, _mk, _nn;
968  static const __m128 _o = _mm_set1_ps(1.e-9);
969  static const __m128 _0 = _mm_set1_ps(0);
970  static const __m128 _1 = _mm_set1_ps(1);
971  static const __m128 _4 = _mm_set1_ps(4);
972  static const __m128 o5 = _mm_set1_ps(0.5);
973 
974  NETX(__m128* _p0 = (__m128*)p[0]; __m128* _q0 = (__m128*)q[0]; __m128* _n0 = (__m128*)(q[0]+I); ,
975  __m128* _p1 = (__m128*)p[1]; __m128* _q1 = (__m128*)q[1]; __m128* _n1 = (__m128*)(q[1]+I); ,
976  __m128* _p2 = (__m128*)p[2]; __m128* _q2 = (__m128*)q[2]; __m128* _n2 = (__m128*)(q[2]+I); ,
977  __m128* _p3 = (__m128*)p[3]; __m128* _q3 = (__m128*)q[3]; __m128* _n3 = (__m128*)(q[3]+I); ,
978  __m128* _p4 = (__m128*)p[4]; __m128* _q4 = (__m128*)q[4]; __m128* _n4 = (__m128*)(q[4]+I); ,
979  __m128* _p5 = (__m128*)p[5]; __m128* _q5 = (__m128*)q[5]; __m128* _n5 = (__m128*)(q[5]+I); ,
980  __m128* _p6 = (__m128*)p[6]; __m128* _q6 = (__m128*)q[6]; __m128* _n6 = (__m128*)(q[6]+I); ,
981  __m128* _p7 = (__m128*)p[7]; __m128* _q7 = (__m128*)q[7]; __m128* _n7 = (__m128*)(q[7]+I); )
982 
983  NETX(__m128 a0=_mm_set1_ps(q[0][I4]); __m128 s0=_mm_set1_ps(q[0][I2]); __m128 c0=_mm_set1_ps(q[0][I3]); ,
984  __m128 a1=_mm_set1_ps(q[1][I4]); __m128 s1=_mm_set1_ps(q[1][I2]); __m128 c1=_mm_set1_ps(q[1][I3]); ,
985  __m128 a2=_mm_set1_ps(q[2][I4]); __m128 s2=_mm_set1_ps(q[2][I2]); __m128 c2=_mm_set1_ps(q[2][I3]); ,
986  __m128 a3=_mm_set1_ps(q[3][I4]); __m128 s3=_mm_set1_ps(q[3][I2]); __m128 c3=_mm_set1_ps(q[3][I3]); ,
987  __m128 a4=_mm_set1_ps(q[4][I4]); __m128 s4=_mm_set1_ps(q[4][I2]); __m128 c4=_mm_set1_ps(q[4][I3]); ,
988  __m128 a5=_mm_set1_ps(q[5][I4]); __m128 s5=_mm_set1_ps(q[5][I2]); __m128 c5=_mm_set1_ps(q[5][I3]); ,
989  __m128 a6=_mm_set1_ps(q[6][I4]); __m128 s6=_mm_set1_ps(q[6][I2]); __m128 c6=_mm_set1_ps(q[6][I3]); ,
990  __m128 a7=_mm_set1_ps(q[7][I4]); __m128 s7=_mm_set1_ps(q[7][I2]); __m128 c7=_mm_set1_ps(q[7][I3]); )
991 
992  __m128* _MK = (__m128*)pAVX[1];
993  __m128* _fp = (__m128*)pAVX[2];
994  __m128* _fx = (__m128*)pAVX[3];
995  __m128* _ee = (__m128*)pAVX[15];
996  __m128* _EE = (__m128*)pAVX[16];
997  __m128* _gn = (__m128*)pAVX[20];
998 
999  __m128 _Np = _mm_setzero_ps(); // number of effective pixels per detector
1000 
1001  for(int i=0; i<I; i+=4) { // packet amplitudes
1002  _mk = _mm_mul_ps(o5,_mm_and_ps(_mm_cmpgt_ps(*_MK++,_0),_1)); // event mask
1003  NETX(
1004  _n = _mm_mul_ps(_mm_mul_ps(a0,_mk),*_n0); _a=*_p0; _A=*_q0; _nn=*_n0++;
1005  *_p0++ = _mm_mul_ps(_n,_mm_sub_ps(_mm_mul_ps(_a,c0),_mm_mul_ps(_A,s0)));
1006  *_q0++ = _mm_mul_ps(_n,_mm_add_ps(_mm_mul_ps(_A,c0),_mm_mul_ps(_a,s0))); ,
1007  _n = _mm_mul_ps(_mm_mul_ps(a1,_mk),*_n1); _a=*_p1; _A=*_q1; _nn=_mm_add_ps(_nn,*_n1++);
1008  *_p1++ = _mm_mul_ps(_n,_mm_sub_ps(_mm_mul_ps(_a,c1),_mm_mul_ps(_A,s1)));
1009  *_q1++ = _mm_mul_ps(_n,_mm_add_ps(_mm_mul_ps(_A,c1),_mm_mul_ps(_a,s1))); ,
1010  _n = _mm_mul_ps(_mm_mul_ps(a2,_mk),*_n2); _a=*_p2; _A=*_q2; _nn=_mm_add_ps(_nn,*_n2++);
1011  *_p2++ = _mm_mul_ps(_n,_mm_sub_ps(_mm_mul_ps(_a,c2),_mm_mul_ps(_A,s2)));
1012  *_q2++ = _mm_mul_ps(_n,_mm_add_ps(_mm_mul_ps(_A,c2),_mm_mul_ps(_a,s2))); ,
1013  _n = _mm_mul_ps(_mm_mul_ps(a3,_mk),*_n3); _a=*_p3; _A=*_q3; _nn=_mm_add_ps(_nn,*_n3++);
1014  *_p3++ = _mm_mul_ps(_n,_mm_sub_ps(_mm_mul_ps(_a,c3),_mm_mul_ps(_A,s3)));
1015  *_q3++ = _mm_mul_ps(_n,_mm_add_ps(_mm_mul_ps(_A,c3),_mm_mul_ps(_a,s3))); ,
1016  _n = _mm_mul_ps(_mm_mul_ps(a4,_mk),*_n4); _a=*_p4; _A=*_q4; _nn=_mm_add_ps(_nn,*_n4++);
1017  *_p4++ = _mm_mul_ps(_n,_mm_sub_ps(_mm_mul_ps(_a,c4),_mm_mul_ps(_A,s4)));
1018  *_q4++ = _mm_mul_ps(_n,_mm_add_ps(_mm_mul_ps(_A,c4),_mm_mul_ps(_a,s4))); ,
1019  _n = _mm_mul_ps(_mm_mul_ps(a5,_mk),*_n5); _a=*_p5; _A=*_q5; _nn=_mm_add_ps(_nn,*_n5++);
1020  *_p5++ = _mm_mul_ps(_n,_mm_sub_ps(_mm_mul_ps(_a,c5),_mm_mul_ps(_A,s5)));
1021  *_q5++ = _mm_mul_ps(_n,_mm_add_ps(_mm_mul_ps(_A,c5),_mm_mul_ps(_a,s5))); ,
1022  _n = _mm_mul_ps(_mm_mul_ps(a6,_mk),*_n6); _a=*_p6; _A=*_q6; _nn=_mm_add_ps(_nn,*_n6++);
1023  *_p6++ = _mm_mul_ps(_n,_mm_sub_ps(_mm_mul_ps(_a,c6),_mm_mul_ps(_A,s6)));
1024  *_q6++ = _mm_mul_ps(_n,_mm_add_ps(_mm_mul_ps(_A,c6),_mm_mul_ps(_a,s6))); ,
1025  _n = _mm_mul_ps(_mm_mul_ps(a7,_mk),*_n7); _a=*_p7; _A=*_q7; _nn=_mm_add_ps(_nn,*_n7++);
1026  *_p7++ = _mm_mul_ps(_n,_mm_sub_ps(_mm_mul_ps(_a,c7),_mm_mul_ps(_A,s7)));
1027  *_q7++ = _mm_mul_ps(_n,_mm_add_ps(_mm_mul_ps(_A,c7),_mm_mul_ps(_a,s7))); )
1028 
1029  _nn = _mm_mul_ps(_nn,_mk);
1030  _Np = _mm_add_ps(_Np,_nn); // Dof * k/4
1031  }
1032  return _wat_hsum(_Np)*4/k;
1033 }
1034 
1035 static inline float _avx_ort_ps(float** p, float** q,
1036  std::vector<float*> &pAVX, int I) {
1037 // orthogonalize data vectors p and q
1038 // calculate norms of orthogonal vectors and rotation sin & cos
1039 // p/q - array of pointers to 00/90 phase detector data
1040 // returns signal energy
1041 // in likelihoodWP these arrays should be stored exactly in the same order.
1042 // this function increments pointers stored in tne input pointer arrays.
1043  __m128 _a,_A,_aa,_AA,_aA,_et,_cc,_ss,_nn,_cn,_sn,_mk;
1044 
1045  static const __m128 _0 = _mm_set1_ps(0);
1046  static const __m128 _1 = _mm_set1_ps(1);
1047  static const __m128 _2 = _mm_set1_ps(2);
1048  static const __m128 _o = _mm_set1_ps(1.e-21);
1049  static const __m128 sm = _mm_set1_ps(-0.f); // -0.f = 1 << 31
1050 
1051  __m128* _MK = (__m128*)pAVX[1];
1052  __m128* _si = (__m128*)pAVX[4];
1053  __m128* _co = (__m128*)pAVX[5];
1054  __m128* _ee = (__m128*)pAVX[15];
1055  __m128* _EE = (__m128*)pAVX[16];
1056  __m128 _e = _mm_setzero_ps();
1057  __m128 _E = _mm_setzero_ps();
1058 
1059  NETX(__m128* _p0 = (__m128*)p[0]; __m128* _q0 = (__m128*)q[0]; ,
1060  __m128* _p1 = (__m128*)p[1]; __m128* _q1 = (__m128*)q[1]; ,
1061  __m128* _p2 = (__m128*)p[2]; __m128* _q2 = (__m128*)q[2]; ,
1062  __m128* _p3 = (__m128*)p[3]; __m128* _q3 = (__m128*)q[3]; ,
1063  __m128* _p4 = (__m128*)p[4]; __m128* _q4 = (__m128*)q[4]; ,
1064  __m128* _p5 = (__m128*)p[5]; __m128* _q5 = (__m128*)q[5]; ,
1065  __m128* _p6 = (__m128*)p[6]; __m128* _q6 = (__m128*)q[6]; ,
1066  __m128* _p7 = (__m128*)p[7]; __m128* _q7 = (__m128*)q[7]; )
1067 
1068  for(int i=0; i<I; i+=4) {
1069  NETX(
1070  _aa=_mm_mul_ps(*_p0,*_p0);
1071  _AA=_mm_mul_ps(*_q0,*_q0);
1072  _aA=_mm_mul_ps(*_p0++,*_q0++); ,
1073 
1074  _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p1,*_p1));
1075  _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q1,*_q1));
1076  _aA = _mm_add_ps(_aA,_mm_mul_ps(*_p1++,*_q1++)); ,
1077 
1078  _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p2,*_p2));
1079  _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q2,*_q2));
1080  _aA = _mm_add_ps(_aA,_mm_mul_ps(*_p2++,*_q2++)); ,
1081 
1082  _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p3,*_p3));
1083  _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q3,*_q3));
1084  _aA = _mm_add_ps(_aA,_mm_mul_ps(*_p3++,*_q3++)); ,
1085 
1086  _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p4,*_p4));
1087  _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q4,*_q4));
1088  _aA = _mm_add_ps(_aA,_mm_mul_ps(*_p4++,*_q4++)); ,
1089 
1090  _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p5,*_p5));
1091  _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q5,*_q5));
1092  _aA = _mm_add_ps(_aA,_mm_mul_ps(*_p5++,*_q5++)); ,
1093 
1094  _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p6,*_p6));
1095  _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q6,*_q6));
1096  _aA = _mm_add_ps(_aA,_mm_mul_ps(*_p6++,*_q6++)); ,
1097 
1098  _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p7,*_p7));
1099  _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q7,*_q7));
1100  _aA = _mm_add_ps(_aA,_mm_mul_ps(*_p7++,*_q7++)); )
1101 
1102 // calculate data network orthogonalization sin and cos
1103 
1104  _mk = _mm_and_ps(_mm_cmpgt_ps(*_MK,_0),_1); // event mask
1105  *_si = _mm_mul_ps(_aA,_2); // rotation 2*sin*cos*norm
1106  *_co = _mm_sub_ps(_aa,_AA); // rotation (cos^2-sin^2)*norm
1107  _et = _mm_add_ps(_mm_add_ps(_aa,_AA),_o); // total energy
1108  _cc = _mm_mul_ps(*_co,*_co);
1109  _ss = _mm_mul_ps(*_si,*_si);
1110  _nn = _mm_sqrt_ps(_mm_add_ps(_cc,_ss)); // co/si norm
1111  *_ee = _mm_div_ps(_mm_add_ps(_et,_nn),_2); // first component energy
1112  *_EE = _mm_div_ps(_mm_sub_ps(_et,_nn),_2); // second component energy
1113  _cc = _mm_div_ps(*_co,_mm_add_ps(_nn,_o)); // cos(2p)
1114  _nn = _mm_and_ps(_mm_cmpgt_ps(*_si,_0),_1); // 1 if sin(2p)>0. or 0 if sin(2p)<0.
1115  _ss = _mm_sub_ps(_mm_mul_ps(_2,_nn),_1); // 1 if sin(2p)>0. or-1 if sin(2p)<0.
1116  *_si = _mm_sqrt_ps(_mm_div_ps(_mm_sub_ps(_1,_cc),_2)); // |sin(p)|
1117  *_co = _mm_sqrt_ps(_mm_div_ps(_mm_add_ps(_1,_cc),_2)); // |cos(p)|
1118  *_co = _mm_mul_ps(*_co,_ss); // cos(p)
1119 
1120  _e = _mm_add_ps(_e,_mm_mul_ps(_mk,*_ee));
1121  _E = _mm_add_ps(_E,_mm_mul_ps(_mk,*_EE));
1122  _si++; _co++; _MK++; _ee++; _EE++;
1123  }
1124  return _wat_hsum(_e)+_wat_hsum(_E);
1125 }
1126 
1127 
1128 static inline __m256 _avx_stat_ps(float** x, float** X,
1129  float** s, float** S,
1130  std::vector<float*> &pAVX, int I) {
1131 // returns coherent statistics in the format {cc,ec,ed,gn}
1132 // ei - incoherent energy: sum{s[i]^4}/|s|^2 + sum{S[i]^4}/|S|^2
1133 // ec - coherent energy: |s|^2+|S|^2 - ei
1134 // ed - energy disbalance: 0.5*sum_k{(x[k]*s[k]-s[k]*s[k])^2} * (s,s)/(x,s)^2
1135 // + its 90-degrees phase value
1136 // cc - reduced network correlation coefficient
1137 // p/q - pointers to data/signal amplitudes
1138 // nIFO - number of detectors.
1139 // I - number of pixels
1140  float k = pAVX[1][I]; // number of detectors
1141  __m128 _a,_A,_x,_X,_c,_C,_s,_S,_r,_R,_rr,_RR,_xs,_XS;
1142  __m128 _ll,_ei,_cc,_mk,_mm,_ss,_SS;
1143  static const __m128 _o = _mm_set1_ps(0.001);
1144  static const __m128 _0 = _mm_set1_ps(0);
1145  static const __m128 _1 = _mm_set1_ps(1);
1146  static const __m128 _2 = _mm_set1_ps(2);
1147  static const __m128 _k = _mm_set1_ps(2*(1-k));
1148  static const __m128 sm = _mm_set1_ps(-0.f); // -0.f = 1 << 31
1149 
1150  __m128* _et = (__m128*)pAVX[0];
1151  __m128* _MK = (__m128*)pAVX[1];
1152  __m128* _si = (__m128*)pAVX[4];
1153  __m128* _co = (__m128*)pAVX[5];
1154  __m128* _ec = (__m128*)pAVX[19];
1155  __m128* _gn = (__m128*)pAVX[20];
1156  __m128* _ed = (__m128*)pAVX[21];
1157  __m128* _rn = (__m128*)pAVX[22];
1158 
1159  __m128 _LL = _mm_setzero_ps();
1160  __m128 _Lr = _mm_setzero_ps();
1161  __m128 _EC = _mm_setzero_ps();
1162  __m128 _GN = _mm_setzero_ps();
1163  __m128 _RN = _mm_setzero_ps();
1164  __m128 _NN = _mm_setzero_ps();
1165 
1166  NETX(__m128* _x0 = (__m128*)x[0]; __m128* _X0 = (__m128*)X[0]; ,
1167  __m128* _x1 = (__m128*)x[1]; __m128* _X1 = (__m128*)X[1]; ,
1168  __m128* _x2 = (__m128*)x[2]; __m128* _X2 = (__m128*)X[2]; ,
1169  __m128* _x3 = (__m128*)x[3]; __m128* _X3 = (__m128*)X[3]; ,
1170  __m128* _x4 = (__m128*)x[4]; __m128* _X4 = (__m128*)X[4]; ,
1171  __m128* _x5 = (__m128*)x[5]; __m128* _X5 = (__m128*)X[5]; ,
1172  __m128* _x6 = (__m128*)x[6]; __m128* _X6 = (__m128*)X[6]; ,
1173  __m128* _x7 = (__m128*)x[7]; __m128* _X7 = (__m128*)X[7]; )
1174 
1175  NETX(__m128* _s0 = (__m128*)s[0]; __m128* _S0 = (__m128*)S[0]; ,
1176  __m128* _s1 = (__m128*)s[1]; __m128* _S1 = (__m128*)S[1]; ,
1177  __m128* _s2 = (__m128*)s[2]; __m128* _S2 = (__m128*)S[2]; ,
1178  __m128* _s3 = (__m128*)s[3]; __m128* _S3 = (__m128*)S[3]; ,
1179  __m128* _s4 = (__m128*)s[4]; __m128* _S4 = (__m128*)S[4]; ,
1180  __m128* _s5 = (__m128*)s[5]; __m128* _S5 = (__m128*)S[5]; ,
1181  __m128* _s6 = (__m128*)s[6]; __m128* _S6 = (__m128*)S[6]; ,
1182  __m128* _s7 = (__m128*)s[7]; __m128* _S7 = (__m128*)S[7]; )
1183 
1184  for(int i=0; i<I; i+=4) {
1185  NETX(
1186  _s=_mm_add_ps(_mm_mul_ps(*_s0,*_co),_mm_mul_ps(*_S0,*_si)); _r=_mm_sub_ps(*_s0,*_x0);
1187  _x=_mm_add_ps(_mm_mul_ps(*_x0,*_co),_mm_mul_ps(*_X0,*_si)); _R=_mm_sub_ps(*_S0,*_X0);
1188  _S=_mm_sub_ps(_mm_mul_ps(*_S0++,*_co),_mm_mul_ps(*_s0++,*_si)); _a=_mm_mul_ps(_s,_x); _xs=_a;
1189  _X=_mm_sub_ps(_mm_mul_ps(*_X0++,*_co),_mm_mul_ps(*_x0++,*_si)); _A=_mm_mul_ps(_S,_X); _XS=_A;
1190  _c=_mm_mul_ps(_a,_a); _ss=_mm_mul_ps(_s,_s); _rr=_mm_mul_ps(_r,_r);
1191  _C=_mm_mul_ps(_A,_A); _SS=_mm_mul_ps(_S,_S); _RR=_mm_mul_ps(_R,_R); ,
1192 
1193  _s=_mm_add_ps(_mm_mul_ps(*_s1,*_co), _mm_mul_ps(*_S1,*_si)); _r=_mm_sub_ps(*_s1,*_x1);
1194  _x=_mm_add_ps(_mm_mul_ps(*_x1,*_co), _mm_mul_ps(*_X1,*_si)); _R=_mm_sub_ps(*_S1,*_X1);
1195  _S=_mm_sub_ps(_mm_mul_ps(*_S1++,*_co),_mm_mul_ps(*_s1++,*_si)); _a=_mm_mul_ps(_s,_x); _xs=_mm_add_ps(_xs,_a);
1196  _X=_mm_sub_ps(_mm_mul_ps(*_X1++,*_co),_mm_mul_ps(*_x1++,*_si)); _A=_mm_mul_ps(_S,_X); _XS=_mm_add_ps(_XS,_A);
1197  _c=_mm_add_ps(_c,_mm_mul_ps(_a,_a)); _ss=_mm_add_ps(_ss,_mm_mul_ps(_s,_s)); _rr=_mm_add_ps(_rr,_mm_mul_ps(_r,_r));
1198  _C=_mm_add_ps(_C,_mm_mul_ps(_A,_A)); _SS=_mm_add_ps(_SS,_mm_mul_ps(_S,_S)); _RR=_mm_add_ps(_RR,_mm_mul_ps(_R,_R)); ,
1199 
1200  _s=_mm_add_ps(_mm_mul_ps(*_s2,*_co), _mm_mul_ps(*_S2,*_si)); _r=_mm_sub_ps(*_s2,*_x2);
1201  _x=_mm_add_ps(_mm_mul_ps(*_x2,*_co), _mm_mul_ps(*_X2,*_si)); _R=_mm_sub_ps(*_S2,*_X2);
1202  _S=_mm_sub_ps(_mm_mul_ps(*_S2++,*_co),_mm_mul_ps(*_s2++,*_si)); _a=_mm_mul_ps(_s,_x); _xs=_mm_add_ps(_xs,_a);
1203  _X=_mm_sub_ps(_mm_mul_ps(*_X2++,*_co),_mm_mul_ps(*_x2++,*_si)); _A=_mm_mul_ps(_S,_X); _XS=_mm_add_ps(_XS,_A);
1204  _c=_mm_add_ps(_c,_mm_mul_ps(_a,_a)); _ss=_mm_add_ps(_ss,_mm_mul_ps(_s,_s)); _rr=_mm_add_ps(_rr,_mm_mul_ps(_r,_r));
1205  _C=_mm_add_ps(_C,_mm_mul_ps(_A,_A)); _SS=_mm_add_ps(_SS,_mm_mul_ps(_S,_S)); _RR=_mm_add_ps(_RR,_mm_mul_ps(_R,_R)); ,
1206 
1207  _s=_mm_add_ps(_mm_mul_ps(*_s3,*_co), _mm_mul_ps(*_S3,*_si)); _r=_mm_sub_ps(*_s3,*_x3);
1208  _x=_mm_add_ps(_mm_mul_ps(*_x3,*_co), _mm_mul_ps(*_X3,*_si)); _R=_mm_sub_ps(*_S3,*_X3);
1209  _S=_mm_sub_ps(_mm_mul_ps(*_S3++,*_co),_mm_mul_ps(*_s3++,*_si)); _a=_mm_mul_ps(_s,_x); _xs=_mm_add_ps(_xs,_a);
1210  _X=_mm_sub_ps(_mm_mul_ps(*_X3++,*_co),_mm_mul_ps(*_x3++,*_si)); _A=_mm_mul_ps(_S,_X); _XS=_mm_add_ps(_XS,_A);
1211  _c=_mm_add_ps(_c,_mm_mul_ps(_a,_a)); _ss=_mm_add_ps(_ss,_mm_mul_ps(_s,_s)); _rr=_mm_add_ps(_rr,_mm_mul_ps(_r,_r));
1212  _C=_mm_add_ps(_C,_mm_mul_ps(_A,_A)); _SS=_mm_add_ps(_SS,_mm_mul_ps(_S,_S)); _RR=_mm_add_ps(_RR,_mm_mul_ps(_R,_R)); ,
1213 
1214  _s=_mm_add_ps(_mm_mul_ps(*_s4,*_co), _mm_mul_ps(*_S4,*_si)); _r=_mm_sub_ps(*_s4,*_x4);
1215  _x=_mm_add_ps(_mm_mul_ps(*_x4,*_co), _mm_mul_ps(*_X4,*_si)); _R=_mm_sub_ps(*_S4,*_X4);
1216  _S=_mm_sub_ps(_mm_mul_ps(*_S4++,*_co),_mm_mul_ps(*_s4++,*_si)); _a=_mm_mul_ps(_s,_x); _xs=_mm_add_ps(_xs,_a);
1217  _X=_mm_sub_ps(_mm_mul_ps(*_X4++,*_co),_mm_mul_ps(*_x4++,*_si)); _A=_mm_mul_ps(_S,_X); _XS=_mm_add_ps(_XS,_A);
1218  _c=_mm_add_ps(_c,_mm_mul_ps(_a,_a)); _ss=_mm_add_ps(_ss,_mm_mul_ps(_s,_s)); _rr=_mm_add_ps(_rr,_mm_mul_ps(_r,_r));
1219  _C=_mm_add_ps(_C,_mm_mul_ps(_A,_A)); _SS=_mm_add_ps(_SS,_mm_mul_ps(_S,_S)); _RR=_mm_add_ps(_RR,_mm_mul_ps(_R,_R)); ,
1220 
1221  _s=_mm_add_ps(_mm_mul_ps(*_s5,*_co), _mm_mul_ps(*_S5,*_si)); _r=_mm_sub_ps(*_s5,*_x5);
1222  _x=_mm_add_ps(_mm_mul_ps(*_x5,*_co), _mm_mul_ps(*_X5,*_si)); _R=_mm_sub_ps(*_S5,*_X5);
1223  _S=_mm_sub_ps(_mm_mul_ps(*_S5++,*_co),_mm_mul_ps(*_s5++,*_si)); _a=_mm_mul_ps(_s,_x); _xs=_mm_add_ps(_xs,_a);
1224  _X=_mm_sub_ps(_mm_mul_ps(*_X5++,*_co),_mm_mul_ps(*_x5++,*_si)); _A=_mm_mul_ps(_S,_X); _XS=_mm_add_ps(_XS,_A);
1225  _c=_mm_add_ps(_c,_mm_mul_ps(_a,_a)); _ss=_mm_add_ps(_ss,_mm_mul_ps(_s,_s)); _rr=_mm_add_ps(_rr,_mm_mul_ps(_r,_r));
1226  _C=_mm_add_ps(_C,_mm_mul_ps(_A,_A)); _SS=_mm_add_ps(_SS,_mm_mul_ps(_S,_S)); _RR=_mm_add_ps(_RR,_mm_mul_ps(_R,_R)); ,
1227 
1228  _s=_mm_add_ps(_mm_mul_ps(*_s6,*_co), _mm_mul_ps(*_S6,*_si)); _r=_mm_sub_ps(*_s6,*_x6);
1229  _x=_mm_add_ps(_mm_mul_ps(*_x6,*_co), _mm_mul_ps(*_X6,*_si)); _R=_mm_sub_ps(*_S6,*_X6);
1230  _S=_mm_sub_ps(_mm_mul_ps(*_S6++,*_co),_mm_mul_ps(*_s6++,*_si)); _a=_mm_mul_ps(_s,_x); _xs=_mm_add_ps(_xs,_a);
1231  _X=_mm_sub_ps(_mm_mul_ps(*_X6++,*_co),_mm_mul_ps(*_x6++,*_si)); _A=_mm_mul_ps(_S,_X); _XS=_mm_add_ps(_XS,_A);
1232  _c=_mm_add_ps(_c,_mm_mul_ps(_a,_a)); _ss=_mm_add_ps(_ss,_mm_mul_ps(_s,_s)); _rr=_mm_add_ps(_rr,_mm_mul_ps(_r,_r));
1233  _C=_mm_add_ps(_C,_mm_mul_ps(_A,_A)); _SS=_mm_add_ps(_SS,_mm_mul_ps(_S,_S)); _RR=_mm_add_ps(_RR,_mm_mul_ps(_R,_R)); ,
1234 
1235  _s=_mm_add_ps(_mm_mul_ps(*_s7,*_co), _mm_mul_ps(*_S7,*_si)); _r=_mm_sub_ps(*_s7,*_x7);
1236  _x=_mm_add_ps(_mm_mul_ps(*_x7,*_co), _mm_mul_ps(*_X7,*_si)); _R=_mm_sub_ps(*_S7,*_X7);
1237  _S=_mm_sub_ps(_mm_mul_ps(*_S7++,*_co),_mm_mul_ps(*_s7++,*_si)); _a=_mm_mul_ps(_s,_x); _xs=_mm_add_ps(_xs,_a);
1238  _X=_mm_sub_ps(_mm_mul_ps(*_X7++,*_co),_mm_mul_ps(*_x7++,*_si)); _A=_mm_mul_ps(_S,_X); _XS=_mm_add_ps(_XS,_A);
1239  _c=_mm_add_ps(_c,_mm_mul_ps(_a,_a)); _ss=_mm_add_ps(_ss,_mm_mul_ps(_s,_s)); _rr=_mm_add_ps(_rr,_mm_mul_ps(_r,_r));
1240  _C=_mm_add_ps(_C,_mm_mul_ps(_A,_A)); _SS=_mm_add_ps(_SS,_mm_mul_ps(_S,_S)); _RR=_mm_add_ps(_RR,_mm_mul_ps(_R,_R)); )
1241 
1242 
1243  _mk = _mm_and_ps(_mm_cmpge_ps(*_MK,_0),_1); // event mask
1244  _c = _mm_div_ps(_c,_mm_add_ps(_mm_mul_ps(_xs,_xs),_o)); // first component incoherent energy
1245  _C = _mm_div_ps(_C,_mm_add_ps(_mm_mul_ps(_XS,_XS),_o)); // second component incoherent energy
1246  _ll = _mm_mul_ps(_mk,_mm_add_ps(_ss,_SS)); // signal energy
1247  _ss = _mm_mul_ps(_ss,_mm_sub_ps(_1,_c)); // 00 coherent energy
1248  _SS = _mm_mul_ps(_SS,_mm_sub_ps(_1,_C)); // 90 coherent energy
1249  *_ec = _mm_mul_ps(_mk,_mm_add_ps(_ss,_SS)); // coherent energy
1250  *_gn = _mm_mul_ps(_mk,_mm_mul_ps(_2,*_MK)); // G-noise correction
1251  *_rn = _mm_mul_ps(_mk,_mm_add_ps(_rr,_RR)); // residual noise in TF domain
1252 
1253  _a = _mm_mul_ps(_2,_mm_andnot_ps(sm,*_ec)); // 2*|ec|
1254  _A = _mm_add_ps(*_rn,_mm_add_ps(_o,*_gn)); // NULL
1255  _cc = _mm_div_ps(*_ec,_mm_add_ps(_a,_A)); // correlation coefficient
1256  _Lr = _mm_add_ps(_Lr,_mm_mul_ps(_ll,_cc)); // reduced likelihood
1257  _mm = _mm_and_ps(_mm_cmpgt_ps(*_ec,_o),_1); // coherent energy mask
1258 
1259  _LL = _mm_add_ps(_LL, _ll); // total signal energy
1260  _GN = _mm_add_ps(_GN,*_gn); // total G-noise correction
1261  _EC = _mm_add_ps(_EC,*_ec); // total coherent energy
1262  _RN = _mm_add_ps(_RN,*_rn); // residual noise in TF domain
1263  _NN = _mm_add_ps(_NN, _mm); // number of pixel in TF domain with Ec>0
1264 
1265  _si++; _co++; _MK++; _ec++; _gn++; _ed++; _rn++;
1266  }
1267  float cc = _wat_hsum(_Lr)/(_wat_hsum(_LL)+0.001); // network correlation coefficient
1268  float ec = _wat_hsum(_EC); // total coherent energy
1269  float rn = _wat_hsum(_GN)+_wat_hsum(_RN); // total noise x 2
1270  float nn = _wat_hsum(_NN); // number of pixels
1271  float ch = rn/(k*nn+sqrt(nn))/2; // chi2 in TF domain
1272  return _mm256_set_ps(0,0,0,0,rn/2,nn,ec,2*cc); // reversed order
1273 }
1274 
1275 
1276 static inline __m256 _avx_noise_ps(float** p, float** q, std::vector<float*> &pAVX, int I) {
1277 // q - pointer to pixel norms
1278 // returns noise correction
1279 // I - number of pixels
1280  float k = pAVX[1][I]; // number of detectors
1281  __m128 _nx,_ns,_nm,_mk,_gg,_rc,_nn;
1282  __m128* _et = (__m128*)pAVX[0];
1283  __m128* _MK = (__m128*)pAVX[1];
1284  __m128* _ec = (__m128*)pAVX[19];
1285  __m128* _gn = (__m128*)pAVX[20];
1286  __m128* _rn = (__m128*)pAVX[22];
1287  __m128 _GN = _mm_setzero_ps();
1288  __m128 _RC = _mm_setzero_ps();
1289  __m128 _ES = _mm_setzero_ps();
1290  __m128 _EH = _mm_setzero_ps();
1291  __m128 _EC = _mm_setzero_ps();
1292  __m128 _SC = _mm_setzero_ps();
1293  __m128 _NC = _mm_setzero_ps();
1294  __m128 _NS = _mm_setzero_ps();
1295 
1296  static const __m128 _0 = _mm_set1_ps(0);
1297  static const __m128 o5 = _mm_set1_ps(0.5);
1298  static const __m128 _o = _mm_set1_ps(1.e-9);
1299  static const __m128 _1 = _mm_set1_ps(1);
1300  static const __m128 _2 = _mm_set1_ps(2);
1301  static const __m128 _k = _mm_set1_ps(1./k);
1302 
1303  NETX(__m128* _s0 = (__m128*)(p[0]+I);, __m128* _s1 = (__m128*)(p[1]+I);,
1304  __m128* _s2 = (__m128*)(p[2]+I);, __m128* _s3 = (__m128*)(p[3]+I);,
1305  __m128* _s4 = (__m128*)(p[4]+I);, __m128* _s5 = (__m128*)(p[5]+I);,
1306  __m128* _s6 = (__m128*)(p[6]+I);, __m128* _s7 = (__m128*)(p[7]+I);)
1307 
1308  NETX(__m128* _x0 = (__m128*)(q[0]+I);, __m128* _x1 = (__m128*)(q[1]+I);,
1309  __m128* _x2 = (__m128*)(q[2]+I);, __m128* _x3 = (__m128*)(q[3]+I);,
1310  __m128* _x4 = (__m128*)(q[4]+I);, __m128* _x5 = (__m128*)(q[5]+I);,
1311  __m128* _x6 = (__m128*)(q[6]+I);, __m128* _x7 = (__m128*)(q[7]+I);)
1312 
1313  for(int i=0; i<I; i+=4) {
1314  NETX(_ns =*_s0++;, _ns = _mm_add_ps(*_s1++,_ns); ,
1315  _ns = _mm_add_ps(*_s2++,_ns);, _ns = _mm_add_ps(*_s3++,_ns); ,
1316  _ns = _mm_add_ps(*_s4++,_ns);, _ns = _mm_add_ps(*_s5++,_ns); ,
1317  _ns = _mm_add_ps(*_s6++,_ns);, _ns = _mm_add_ps(*_s7++,_ns); )
1318 
1319  NETX(_nx =*_x0++;, _nx = _mm_add_ps(*_x1++,_nx); ,
1320  _nx = _mm_add_ps(*_x2++,_nx);, _nx = _mm_add_ps(*_x3++,_nx); ,
1321  _nx = _mm_add_ps(*_x4++,_nx);, _nx = _mm_add_ps(*_x5++,_nx); ,
1322  _nx = _mm_add_ps(*_x6++,_nx);, _nx = _mm_add_ps(*_x7++,_nx); )
1323 
1324  _ns = _mm_mul_ps(_ns,_k);
1325  _nx = _mm_mul_ps(_nx,_k);
1326  _mk = _mm_and_ps(_mm_cmpgt_ps(*_MK,_0),_1); // event mask
1327  _nm = _mm_and_ps(_mm_cmpgt_ps(_nx,_0),_1); // data norm mask
1328  _nm = _mm_mul_ps(_mk,_nm); // norm x event mask
1329  _EC = _mm_add_ps(_EC,_mm_mul_ps(_nm,*_ec)); // coherent energy
1330  _NC = _mm_add_ps(_NC,_nm); // number of core pixels
1331 
1332  _nm = _mm_sub_ps(_mk,_nm); // halo mask
1333  _ES = _mm_add_ps(_ES,_mm_mul_ps(_nm,*_rn)); // residual sattelite noise
1334 
1335  _rc = _mm_and_ps(_mm_cmplt_ps(*_gn,_2),_1); // rc=1 if gn<2, or rc=0 if gn>=2
1336  _nn = _mm_mul_ps(*_gn,_mm_sub_ps(_1,_rc)); // _nn = [_1 - _rc] * _gn
1337  _rc = _mm_add_ps(_rc,_mm_mul_ps(_nn,o5)); // Ec normalization
1338  _rc = _mm_div_ps(*_ec,_mm_add_ps(_rc,_o)); // normalized EC
1339 
1340  _nm = _mm_and_ps(_mm_cmpgt_ps(_ns,_0),_1); // signal norm mask
1341  _nm = _mm_mul_ps(_mk,_nm); // norm x event mask
1342  *_gn = _mm_mul_ps(_mk,_mm_mul_ps(*_gn,_nx)); // normalize Gaussian noise ecorrection
1343  _SC = _mm_add_ps(_SC,_mm_mul_ps(_nm,*_ec)); // signal coherent energy
1344  _RC = _mm_add_ps(_RC,_mm_mul_ps(_nm, _rc)); // total normalized EC
1345  _GN = _mm_add_ps(_GN,_mm_mul_ps(_nm,*_gn)); // G-noise correction in time domain
1346  _NS = _mm_add_ps(_NS,_nm); // number of signal pixels
1347 
1348  _nm = _mm_sub_ps(_mk,_nm); // satellite mask
1349  _EH = _mm_add_ps(_EH,_mm_mul_ps(_nm,*_et)); // halo energy in TF domain
1350 
1351  _MK++; _gn++; _et++; _ec++; _rn++;
1352  }
1353  float es = _wat_hsum(_ES)/2; // residual satellite energy in time domain
1354  float eh = _wat_hsum(_EH)/2; // halo energy in TF domain
1355  float gn = _wat_hsum(_GN); // G-noise correction
1356  float nc = _wat_hsum(_NC); // number of core pixels
1357  float ns = _wat_hsum(_NS); // number of signal pixels
1358  float rc = _wat_hsum(_RC); // normalized EC x 2
1359  float ec = _wat_hsum(_EC); // signal coherent energy x 2
1360  float sc = _wat_hsum(_SC); // core coherent energy x 2
1361  return _mm256_set_ps(ns,nc,es,eh,rc/(sc+0.01),sc-ec,ec,gn);
1362 }
1363 
1364 
1365 // vec.push_back(m[j]*(fp*(a+A)*(u[j]*cu-U[j]*su)/N[0] + fx*(b+B)*(V[j]*cv+v[j]*sv)/N[1]));
1366 // vec.push_back(m[j]*(fp*(a+A)*(U[j]*cu+u[j]*su)/N[0] + fx*(b+B)*(v[j]*cv-V[j]*sv)/N[1]));
1367 
1368 
1369 #endif // WATAVX_HH
1370 
1371 
1372 
1373 
1374 
1375 
1376 
1377 
1378 
1379 
1380 
1381 
1382 
1383 
1384 
1385 
1386 
static void _avx_pol_ps(float **p, float **q, wavearray< double > *pol00, wavearray< double > *pol90, std::vector< float *> &pAPN, std::vector< float *> &pAVX, int II)
Definition: watavx.hh:431
TCanvas * c2
wavearray< double > a(hp.size())
WSeries< float > v[nIFO]
Definition: cwb_net.C:80
int n
Definition: cwb_net.C:28
static float _avx_dpf_ps(double **Fp, double **Fx, int l, std::vector< float *> &pAPN, std::vector< float *> &pAVX, int I)
Definition: watavx.hh:84
cout<< endl;cout<< "ts size = "<< ts.size()<< " ts rate = "<< ts.rate()<< endl;tf.Forward(ts, wdm);int levels=tf.getLevel();cout<< "tf size = "<< tf.size()<< endl;double dF=tf.resolution();double dT=1./(2 *dF);cout<< "rate(hz) : "<< RATE<< "\ layers : "<< nLAYERS<< "\ dF(hz) : "<< dF<< "\ dT(ms) : "<< dT *1000.<< endl;int itime=TIME_PIXEL_INDEX;int ifreq=FREQ_PIXEL_INDEX;int index=(levels+1) *itime+ifreq;double time=itime *dT;double freq=(ifreq >0) ? ifreq *dF :dF/4;cout<< endl;cout<< "PIXEL TIME = "<< time<< " sec "<< endl;cout<< "PIXEL FREQ = "<< freq<< " Hz "<< endl;cout<< endl;wavearray< double > x
static __m256 _avx_stat_ps(float **x, float **X, float **s, float **S, std::vector< float *> &pAVX, int I)
Definition: watavx.hh:1128
int m
Definition: cwb_net.C:28
i drho i
#define N
cout<< "SNR "<< xsnr<< endl;wavearray< double > f
Definition: ComputeSNR.C:75
static float _avx_GW_ps(float **p, float **q, std::vector< float *> &pAPN, float *rr, std::vector< float *> &pAVX, int II)
Definition: watavx.hh:251
cout<< "Selected Pixels : "<< nPix<< endl;wc.cluster(1, 1);SSeries< double > ss
static float _avx_packet_ps(float **p, float **q, std::vector< float *> &pAVX, int I)
Definition: watavx.hh:754
wavearray< double > h
Definition: Regression_H1.C:25
TCanvas * c1
double D[50000]
static void _avx_free_ps(std::vector< float *> &v)
Definition: watavx.hh:40
int l
static void _avx_loadNULL_ps(float **n, float **N, float **d, float **D, float **h, float **H, int I)
Definition: watavx.hh:707
int k
static double A
Definition: geodesics.cc:26
double e
static void _avx_cpf_ps(float **p, float **q, float **u, float **v, int I)
Definition: watavx.hh:49
static float _avx_ort_ps(float **p, float **q, std::vector< float *> &pAVX, int I)
Definition: watavx.hh:1035
regression r
Definition: Regression_H1.C:44
s s
Definition: cwb_net.C:155
static float _avx_setAMP_ps(float **p, float **q, std::vector< float *> &pAVX, int I)
Definition: watavx.hh:956
static double a2
Definition: geodesics.cc:26
static float _wat_hsum(__m128 y)
Definition: watavx.hh:44
Meyer< double > S(1024, 2)
static float _avx_loadata_ps(float **p, float **q, float **u, float **v, float En, std::vector< float *> &pAVX, int I)
Definition: watavx.hh:634
DataType_t * data
Definition: wavearray.hh:319
wavearray< double > y
Definition: Test10.C:31
char c3[8]
static __m256 _avx_noise_ps(float **p, float **q, std::vector< float *> &pAVX, int I)
Definition: watavx.hh:1276