Logo coherent WaveBurst  
Library Reference Guide
Logo
watsse.hh
Go to the documentation of this file.
1 /*
2 # Copyright (C) 2019 Sergey Klimenko
3 #
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License as published by
6 # the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
8 #
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU General Public License for more details.
13 #
14 # You should have received a copy of the GNU General Public License
15 # along with this program. If not, see <https://www.gnu.org/licenses/>.
16 */
17 
18 
19 // Wavelet Analysis Tool
20 // S.Klimenko, University of Florida
21 // library of general functions
22 
23 #ifndef WATSSE_HH
24 #define WATSSE_HH
25 
26 #include <xmmintrin.h>
27 #include <pmmintrin.h>
28 #include <iostream>
29 #include <stdlib.h>
30 #include <math.h>
31 #include <TMath.h>
32 #include "wat.hh"
33 
34 // WAT SSE functions
35 
36 
static inline void _sse_print_ps(__m128* _p) {
// debug helper: print the contents of the network data vector(s) at _p
// _NET(A,B): A handles the first __m128 (detectors 1-4), B the second
// (detectors 5-8) — macro defined in wat.hh; confirm expansion there
 float x[4];
 _NET(_mm_storeu_ps(x,_p[0]); printf("%e %e %e %e ",x[0],x[1],x[2],x[3]);,
 _mm_storeu_ps(x,_p[1]); printf("%e %e %e %e ",x[0],x[1],x[2],x[3]);)
 // NOTE(review): unqualified cout/endl — presumably wat.hh pulls in
 // namespace std; confirm before reusing this header standalone
 cout<<endl;
}
43 
44 static inline void _sse_zero_ps(__m128* _p) {
45  _NET(_p[0] = _mm_setzero_ps();,
46  _p[1] = _mm_setzero_ps();)
47  return;
48 }
49 
50 static inline void _sse_load_ps(__m128* _p, float a) {
51  _NET(_p[0] = _mm_load1_ps(&a);,
52  _p[1] = _mm_load1_ps(&a);)
53  return;
54 }
55 
56 static inline void _sse_mul_ps(__m128* _a, float b) {
57  __m128 _b = _mm_load1_ps(&b);
58  _NET(_a[0] = _mm_mul_ps(_a[0],_b);,
59  _a[1] = _mm_mul_ps(_a[1],_b);)
60 }
61 
62 static inline void _sse_mul_ps(__m128* _a, __m128* _b) {
63  _NET(_a[0] = _mm_mul_ps(_a[0],_b[0]);,
64  _a[1] = _mm_mul_ps(_a[1],_b[1]);)
65 }
66 
static inline float _sse_mul_ps(__m128* _a, __m128* _b, __m128* _o) {
// element-wise product stored into _o; returns the horizontal sum
// of all active lanes (i.e. the dot product a·b over the network)
// XSUM/YSUM: horizontal-sum helpers from wat.hh — confirm lane coverage
 float x[4];
 float out;
 _NET(_o[0] = _mm_mul_ps(_a[0],_b[0]);_mm_storeu_ps(x,_o[0]); out = XSUM(x);,
 _o[1] = _mm_mul_ps(_a[1],_b[1]);_mm_storeu_ps(x,_o[1]); out+= YSUM(x);)
 return out;
}
74 
static inline void _sse_mul4_ps(__m128* _am, __m128 _c) {
// scale 4 consecutive pixels: pixel k (one or two __m128 per pixel,
// depending on network size) is multiplied by scalar c[k]
 float c[4]; _mm_storeu_ps(c,_c);
 __m128* _a = _am;
 // note: _c is reused as the broadcast register; the per-pixel load
 // happens in the first _NET argument, the second reuses the broadcast
 _NET(_c=_mm_load1_ps( c );*_a=_mm_mul_ps(*_a,_c);_a++;, *_a=_mm_mul_ps(*_a,_c);_a++;)
 _NET(_c=_mm_load1_ps(c+1);*_a=_mm_mul_ps(*_a,_c);_a++;, *_a=_mm_mul_ps(*_a,_c);_a++;)
 _NET(_c=_mm_load1_ps(c+2);*_a=_mm_mul_ps(*_a,_c);_a++;, *_a=_mm_mul_ps(*_a,_c);_a++;)
 _NET(_c=_mm_load1_ps(c+3);*_a=_mm_mul_ps(*_a,_c);_a++;, *_a=_mm_mul_ps(*_a,_c);_a++;)
}
83 
static inline void _sse_hard4_ps(__m128* _uu, __m128* _am, __m128* _AM, __m128 _c) {
// calculate hard responses for 4 consecutive pixels
// replace standard with hard responses if _c=0;
// uu - unity vector along f+
// am, AM - 00 and 90 phase responses.
// per pixel k: a = a*c[k] + (u*a)*(1-c[k]);  A = A*c[k]
// i.e. c[k]=1 keeps the standard response, c[k]=0 selects the hard one
 float c[4]; _mm_storeu_ps(c,_c);
 __m128* _u = _uu;
 __m128* _a = _am;
 __m128* _A = _AM;
 __m128 _r, _R;   // _r = c[k] broadcast, _R = 1-c[k] broadcast
 _NET(
 _r = _mm_set1_ps(c[0]); _R = _mm_set1_ps(1-c[0]);
 *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;,
 *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;
 )
 _NET(
 _r = _mm_set1_ps(c[1]); _R = _mm_set1_ps(1-c[1]);
 *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;,
 *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;
 )
 _NET(
 _r = _mm_set1_ps(c[2]); _R = _mm_set1_ps(1-c[2]);
 *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;,
 *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;
 )
 _NET(
 _r = _mm_set1_ps(c[3]); _R = _mm_set1_ps(1-c[3]);
 *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;,
 *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;
 )
}
115 
static inline void _sse_ifcp4_ps(__m128* _aa, __m128* _bb, __m128 _c) {
// substitute vector _aa with _bb if _c=0;
// branch-free blend per pixel k: a = a*c[k] + b*(1-c[k])
// (c[k] is expected to be 0 or 1 — intermediate values interpolate)
 float c[4]; _mm_storeu_ps(c,_c);
 __m128* _a = _aa;
 __m128* _b = _bb;
 __m128 _1, _0;   // _1 = c[k] broadcast, _0 = 1-c[k] broadcast
 _NET(_1 = _mm_set1_ps(c[0]); _0 = _mm_set1_ps(1-c[0]);
 *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;,
 *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;)
 _NET(_1 = _mm_set1_ps(c[1]); _0 = _mm_set1_ps(1-c[1]);
 *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;,
 *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;)
 _NET(_1 = _mm_set1_ps(c[2]); _0 = _mm_set1_ps(1-c[2]);
 *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;,
 *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;)
 _NET(_1 = _mm_set1_ps(c[3]); _0 = _mm_set1_ps(1-c[3]);
 *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;,
 *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;)
}
135 
136 
static inline float _sse_abs_ps(__m128* _a) {
// return |a|^2: sum of squares over all active detector lanes
 float x[4];
 float out;
 _NET(
 _mm_storeu_ps(x,_mm_mul_ps(_a[0],_a[0])); out = XSUM(x);,
 _mm_storeu_ps(x,_mm_mul_ps(_a[1],_a[1])); out+= YSUM(x);
 )
 return out;
}
146 
static inline float _sse_abs_ps(__m128* _a, __m128* _A) {
// return |a|^2 + |A|^2: total energy of the 00 and 90 phase vectors
 float x[4];
 float out;
 _NET(
 _mm_storeu_ps(x,_mm_add_ps(_mm_mul_ps(_a[0],_a[0]),_mm_mul_ps(_A[0],_A[0]))); out = XSUM(x);,
 _mm_storeu_ps(x,_mm_add_ps(_mm_mul_ps(_a[1],_a[1]),_mm_mul_ps(_A[1],_A[1]))); out+= YSUM(x);
 )
 return out;
}
156 
157 static inline __m128 _sse_abs4_ps(__m128* _p) {
158 // return |p|^2
159  float x[4];
160  float o[4];
161  __m128* _a = _p;
162  _NET(
163  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[0] = XSUM(x);,
164  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[0]+= YSUM(x);
165  )
166  _NET(
167  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[1] = XSUM(x);,
168  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[1]+= YSUM(x);
169  )
170  _NET(
171  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[2] = XSUM(x);,
172  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[2]+= YSUM(x);
173  )
174  _NET(
175  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[3] = XSUM(x);,
176  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[3]+= YSUM(x);
177  )
178  return _mm_load_ps(o);
179 }
180 
181 static inline __m128 _sse_div4_ps(__m128* _v, __m128* _u) {
182 // returns |v|/|u|
183  return _mm_sqrt_ps(_mm_div_ps(_sse_abs4_ps(_v),
184  _mm_add_ps(_sse_abs4_ps(_u),
185  _mm_set1_ps(1.e-24))));
186 }
187 
188 
189 static inline __m128 _sse_rnorm4_ps(__m128* _p) {
190 // return reciprocical norm: 1/|p|
191  float x[4];
192  float o[4];
193  __m128* _a = _p;
194  _NET(
195  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[0] = XSUM(x)+1.e-24;,
196  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[0]+= YSUM(x);
197  )
198  _NET(
199  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[1] = XSUM(x)+1.e-24;,
200  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[1]+= YSUM(x);
201  )
202  _NET(
203  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[2] = XSUM(x)+1.e-24;,
204  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[2]+= YSUM(x);
205  )
206  _NET(
207  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[3] = XSUM(x)+1.e-24;,
208  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[3]+= YSUM(x);
209  )
210  return _mm_div_ps(_mm_set1_ps(1.),_mm_sqrt_ps(_mm_load_ps(o)));
211 }
212 
static inline float _sse_dot_ps(__m128* _a, __m128* _b) {
// return the scalar dot product a·b over all active detector lanes
 float x[4];
 float out;
 _NET(
 _mm_storeu_ps(x,_mm_mul_ps(_a[0],_b[0])); out = XSUM(x);,
 _mm_storeu_ps(x,_mm_mul_ps(_a[1],_b[1])); out+= YSUM(x);
 )
 return out;
}
222 
223 static inline __m128 _sse_dot4_ps(__m128* _p, __m128* _q) {
224  float x[4];
225  float o[4];
226  __m128* _o = (__m128*) o;
227  __m128* _a = _p;
228  __m128* _b = _q;
229  _NET(
230  _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[0] = XSUM(x);,
231  _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[0]+= YSUM(x);
232  )
233  _NET(
234  _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[1] = XSUM(x);,
235  _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[1]+= YSUM(x);
236  )
237  _NET(
238  _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[2] = XSUM(x);,
239  _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[2]+= YSUM(x);
240  )
241  _NET(
242  _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[3] = XSUM(x);,
243  _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[3]+= YSUM(x);
244  )
245  return *_o;
246 }
247 
248 static inline void _sse_add_ps(__m128* _a, __m128* _b) {
249 // _a += _b
250  _NET(_a[0] = _mm_add_ps(_a[0],_b[0]);,
251  _a[1] = _mm_add_ps(_a[1],_b[1]);)
252  return;
253 }
254 
255 static inline void _sse_add_ps(__m128* _a, __m128* _b, __m128 _c) {
256 // _a += _b *_c
257  _NET(_a[0] = _mm_add_ps(_a[0],_mm_mul_ps(_b[0],_c));,
258  _a[1] = _mm_add_ps(_a[1],_mm_mul_ps(_b[1],_c));)
259  return;
260 }
261 
static inline void _sse_add4_ps(__m128* _a, __m128* _b, __m128 _c) {
// scaled accumulate over 4 consecutive pixels:
// _a++ += _b++ *c[0]
// _a++ += _b++ *c[1]
// _a++ += _b++ *c[2]
// _a++ += _b++ *c[3]
 float c[4]; _mm_storeu_ps(c,_c);
 __m128* _p = _a;
 __m128* _q = _b;
 _NET(*_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps( c ))); _p++;,
 *_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps( c ))); _p++;)
 _NET(*_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+1))); _p++;,
 *_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+1))); _p++;)
 _NET(*_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+2))); _p++;,
 *_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+2))); _p++;)
 _NET(*_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+3))); _p++;,
 *_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+3))); _p++;)
 return;
}
280 
281 static inline void _sse_sub_ps(__m128* _a, __m128* _b) {
282 // _a -= _b
283  _NET(_a[0] = _mm_sub_ps(_a[0],_b[0]);,
284  _a[1] = _mm_sub_ps(_a[1],_b[1]);)
285  return;
286 }
287 
static inline void _sse_sub4_ps(__m128* _a, __m128* _b, __m128 _c) {
// scaled subtract over 4 consecutive pixels:
// _a++ -= _b++ *c[0]
// _a++ -= _b++ *c[1]
// _a++ -= _b++ *c[2]
// _a++ -= _b++ *c[3]
 float c[4]; _mm_storeu_ps(c,_c);
 __m128* _p = _a;
 __m128* _q = _b;
 _NET(*_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps( c ))); _p++;,
 *_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps( c ))); _p++;)
 _NET(*_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+1))); _p++;,
 *_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+1))); _p++;)
 _NET(*_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+2))); _p++;,
 *_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+2))); _p++;)
 _NET(*_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+3))); _p++;,
 *_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+3))); _p++;)
 return;
}
306 
307 static inline void _sse_cpf_ps(float* a, __m128* _p) {
308  _NET(_mm_storeu_ps(a,*_p);,
309  _mm_storeu_ps(a+4,*(_p+1));)
310  return;
311 }
312 
313 static inline void _sse_cpf_ps(__m128* _a, __m128* _p) {
314  _NET(*_a = *_p;, *(_a+1) = *(_p+1);)
315 }
316 
static inline void _sse_cpf4_ps(__m128* _aa, __m128* _pp) {
// copy 4 consecutive pixels from _pp to _aa
// (one or two __m128 per pixel depending on network size)
 __m128* _a = _aa;
 __m128* _p = _pp;
 _NET(*_a++ = *_p++;, *_a++ = *_p++;)
 _NET(*_a++ = *_p++;, *_a++ = *_p++;)
 _NET(*_a++ = *_p++;, *_a++ = *_p++;)
 _NET(*_a++ = *_p++;, *_a++ = *_p++;)
}
325 
326 
327 static inline void _sse_cpf_ps(float* a, __m128* _p, float b) {
328  __m128 _b = _mm_load1_ps(&b);
329  _NET(_mm_storeu_ps(a,_mm_mul_ps(*_p,_b));,
330  _mm_storeu_ps(a+4,_mm_mul_ps(*(_p+1),_b));)
331  return;
332 }
333 
static inline void _sse_cpf4_ps(float* aa, __m128* _pp) {
// copy data from pp to aa for 4 consecutive pixels
// (unaligned stores; aa advances 4 floats per __m128 written)
 float* a = aa;
 __m128* _p = _pp;

 _NET(_mm_storeu_ps(a,*_p++); a+=4;,
 _mm_storeu_ps(a,*_p++); a+=4;)
 _NET(_mm_storeu_ps(a,*_p++); a+=4;,
 _mm_storeu_ps(a,*_p++); a+=4;)
 _NET(_mm_storeu_ps(a,*_p++); a+=4;,
 _mm_storeu_ps(a,*_p++); a+=4;)
 _NET(_mm_storeu_ps(a,*_p++); a+=4;,
 _mm_storeu_ps(a,*_p++); a+=4;)

 return;
}
350 
static inline void _sse_cpf4_ps(__m128* _aa, __m128* _pp, __m128 _c) {
// multiply p by c and copy data from p to a for 4 consecutive pixels
// calculate _a[0]=_p[0]*c[0]
// calculate _a[1]=_p[1]*c[1]
// calculate _a[2]=_p[2]*c[2]
// calculate _a[3]=_p[3]*c[3]
 float c[4]; _mm_storeu_ps(c,_c);
 __m128* _a = _aa;
 __m128* _p = _pp;

 _NET(*_a++ = _mm_mul_ps(*_p++,_mm_load1_ps( c ));,
 *_a++ = _mm_mul_ps(*_p++,_mm_load1_ps( c ));)
 _NET(*_a++ = _mm_mul_ps(*_p++,_mm_load1_ps(c+1));,
 *_a++ = _mm_mul_ps(*_p++,_mm_load1_ps(c+1));)
 _NET(*_a++ = _mm_mul_ps(*_p++,_mm_load1_ps(c+2));,
 *_a++ = _mm_mul_ps(*_p++,_mm_load1_ps(c+2));)
 _NET(*_a++ = _mm_mul_ps(*_p++,_mm_load1_ps(c+3));,
 *_a++ = _mm_mul_ps(*_p++,_mm_load1_ps(c+3));)
 return;
}
371 
static inline float _sse_nrg_ps(__m128* _u, float c, __m128* _v, float s, __m128* _a) {
// calculate b = (a - u*c - v*s) and return b*b
// NOTE: the result is divided by 2 before returning — presumably an
// energy normalization convention of the callers; confirm in wat.hh users
 float x[4];
 float out;
 __m128 _b;
 __m128 _c = _mm_load1_ps(&c);
 __m128 _s = _mm_load1_ps(&s);

 _NET(_b = _mm_sub_ps(_a[0], _mm_add_ps(_mm_mul_ps(*_u,_c), _mm_mul_ps(*_v,_s)));
 _mm_storeu_ps(x,_mm_mul_ps(_b,_b)); out = XSUM(x);,
 _b = _mm_sub_ps(_a[1], _mm_add_ps(_mm_mul_ps(*(_u+1),_c), _mm_mul_ps(*(_v+1), _s)));
 _mm_storeu_ps(x,_mm_mul_ps(_b,_b)); out+= YSUM(x);)

 return out/2.;
}
387 
388 static inline void _sse_rotadd_ps(__m128* _u, float c, __m128* _v, float s, __m128* _a) {
389 // calculate a += u*c + v*s
390  __m128 _c = _mm_load1_ps(&c);
391  __m128 _s = _mm_load1_ps(&s);
392  _NET(
393  _a[0] = _mm_add_ps(_a[0], _mm_add_ps(_mm_mul_ps(_u[0],_c), _mm_mul_ps(_v[0],_s)));,
394  _a[1] = _mm_add_ps(_a[1], _mm_add_ps(_mm_mul_ps(_u[1],_c), _mm_mul_ps(_v[1],_s)));
395  )
396  return;
397 }
398 
static inline float _sse_rotsub_ps(__m128* _u, float c, __m128* _v, float s, __m128* _a) {
// calculate a -= u*c + v*s and return a*a
// (mutates _a in place; returns the residual energy |a|^2)
 float x[4];
 float out;
 __m128 _c = _mm_load1_ps(&c);
 __m128 _s = _mm_load1_ps(&s);

 _NET(
 _a[0] = _mm_sub_ps(_a[0], _mm_add_ps(_mm_mul_ps(_u[0],_c), _mm_mul_ps(_v[0],_s)));
 _mm_storeu_ps(x,_mm_mul_ps(_a[0],_a[0])); out = XSUM(x);,
 _a[1] = _mm_sub_ps(_a[1], _mm_add_ps(_mm_mul_ps(_u[1],_c), _mm_mul_ps(_v[1], _s)));
 _mm_storeu_ps(x,_mm_mul_ps(_a[1],_a[1])); out+= YSUM(x);
 )

 return out;
}
415 
416 static inline void _sse_rotp_ps(__m128* u, float* c, __m128* v, float* s, __m128* a) {
417 // calculate a = u*c + v*s
418  _NET(
419  a[0] = _mm_add_ps(_mm_mul_ps(u[0],_mm_load1_ps(c)), _mm_mul_ps(v[0],_mm_load1_ps(s)));,
420  a[1] = _mm_add_ps(_mm_mul_ps(u[1],_mm_load1_ps(c)), _mm_mul_ps(v[1],_mm_load1_ps(s)));
421  )
422 }
423 
424 static inline void _sse_rotm_ps(__m128* u, float* c, __m128* v, float* s, __m128* a) {
425 // calculate a = u*c - v*s
426  _NET(
427  a[0] = _mm_sub_ps(_mm_mul_ps(u[0],_mm_load1_ps(c)), _mm_mul_ps(v[0],_mm_load1_ps(s)));,
428  a[1] = _mm_sub_ps(_mm_mul_ps(u[1],_mm_load1_ps(c)), _mm_mul_ps(v[1],_mm_load1_ps(s)));
429  )
430 }
431 
static inline __m128 _sse_rotp_ps(__m128 _u, __m128 _c, __m128 _v, __m128 _s) {
// return u*c + v*s (element-wise rotation, plus phase)
 __m128 _uc = _mm_mul_ps(_u,_c);
 __m128 _vs = _mm_mul_ps(_v,_s);
 return _mm_add_ps(_uc,_vs);
}
436 
static inline __m128 _sse_rotm_ps(__m128 _u, __m128 _c, __m128 _v, __m128 _s) {
// return u*c - v*s (element-wise rotation, minus phase)
 __m128 _uc = _mm_mul_ps(_u,_c);
 __m128 _vs = _mm_mul_ps(_v,_s);
 return _mm_sub_ps(_uc,_vs);
}
441 
static inline void _sse_rot4p_ps(__m128* _u, __m128* _c, __m128* _v, __m128* _s, __m128* _a) {
// per-pixel rotation (plus phase) for 4 consecutive pixels:
// calculate a[0] = u[0]*c[0] + v[0]*s[0]
// calculate a[1] = u[1]*c[1] + v[1]*s[1]
// calculate a[2] = u[2]*c[2] + v[2]*s[2]
// calculate a[3] = u[3]*c[3] + v[3]*s[3]
 float c[4];
 float s[4];
 _mm_storeu_ps(c,*_c);
 _mm_storeu_ps(s,*_s);
 __m128* u = _u;
 __m128* v = _v;
 __m128* a = _a;
 _NET(
 *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps( c )), _mm_mul_ps(*v++,_mm_load1_ps( s )));,
 *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps( c )), _mm_mul_ps(*v++,_mm_load1_ps( s )));
 )
 _NET(
 *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+1)), _mm_mul_ps(*v++,_mm_load1_ps(s+1)));,
 *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+1)), _mm_mul_ps(*v++,_mm_load1_ps(s+1)));
 )
 _NET(
 *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+2)), _mm_mul_ps(*v++,_mm_load1_ps(s+2)));,
 *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+2)), _mm_mul_ps(*v++,_mm_load1_ps(s+2)));
 )
 _NET(
 *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+3)), _mm_mul_ps(*v++,_mm_load1_ps(s+3)));,
 *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+3)), _mm_mul_ps(*v++,_mm_load1_ps(s+3)));
 )
}
471 
static inline void _sse_rot4m_ps(__m128* _u, __m128* _c, __m128* _v, __m128* _s, __m128* _a) {
// per-pixel rotation (minus phase) for 4 consecutive pixels:
// calculate a[0] = u[0]*c[0] - v[0]*s[0]
// calculate a[1] = u[1]*c[1] - v[1]*s[1]
// calculate a[2] = u[2]*c[2] - v[2]*s[2]
// calculate a[3] = u[3]*c[3] - v[3]*s[3]
 float c[4];
 float s[4];
 _mm_storeu_ps(c,*_c);
 _mm_storeu_ps(s,*_s);
 __m128* u = _u;
 __m128* v = _v;
 __m128* a = _a;
 _NET(
 *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps( c )), _mm_mul_ps(*v++,_mm_load1_ps( s )));,
 *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps( c )), _mm_mul_ps(*v++,_mm_load1_ps( s )));
 )
 _NET(
 *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+1)), _mm_mul_ps(*v++,_mm_load1_ps(s+1)));,
 *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+1)), _mm_mul_ps(*v++,_mm_load1_ps(s+1)));
 )
 _NET(
 *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+2)), _mm_mul_ps(*v++,_mm_load1_ps(s+2)));,
 *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+2)), _mm_mul_ps(*v++,_mm_load1_ps(s+2)));
 )
 _NET(
 *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+3)), _mm_mul_ps(*v++,_mm_load1_ps(s+3)));,
 *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+3)), _mm_mul_ps(*v++,_mm_load1_ps(s+3)));
 )
}
501 
static inline void _sse_point_ps(__m128** _p, float** p, short** m, int l, int n) {
// point 0-7 __m128 pointers to first network pixel
// p[k] - base data array of detector k; m[k][l] - pixel index for sky
// location l; n - per-pixel stride in floats
// NETX(A0..A7): one statement per detector up to the configured network
// size (macro from wat.hh) — confirm expansion there
 NETX(_p[0] = (__m128*) (p[0] + m[0][l]*n);,
 _p[1] = (__m128*) (p[1] + m[1][l]*n);,
 _p[2] = (__m128*) (p[2] + m[2][l]*n);,
 _p[3] = (__m128*) (p[3] + m[3][l]*n);,
 _p[4] = (__m128*) (p[4] + m[4][l]*n);,
 _p[5] = (__m128*) (p[5] + m[5][l]*n);,
 _p[6] = (__m128*) (p[6] + m[6][l]*n);,
 _p[7] = (__m128*) (p[7] + m[7][l]*n);)
 return;
}
514 
static inline __m128 _sse_sum_ps(__m128** _p) {
// element-wise sum of the pointed-to __m128 over all active detectors
 __m128 _q = _mm_setzero_ps();
 NETX(_q = _mm_add_ps(_q, *_p[0]);,
 _q = _mm_add_ps(_q, *_p[1]);,
 _q = _mm_add_ps(_q, *_p[2]);,
 _q = _mm_add_ps(_q, *_p[3]);,
 _q = _mm_add_ps(_q, *_p[4]);,
 _q = _mm_add_ps(_q, *_p[5]);,
 _q = _mm_add_ps(_q, *_p[6]);,
 _q = _mm_add_ps(_q, *_p[7]);)
 return _q;
}
527 
static inline __m128 _sse_cut_ps(__m128* _pE, __m128** _pe, __m128 _Es, __m128 _cmp) {
// per-lane veto mask: AND _cmp with the condition that the subnetwork
// energy (total _pE minus each detector's _pe[k]) stays >= threshold _Es
// NOTE: each _pe[k] pointer is post-incremented — callers rely on this
// side effect to advance through consecutive pixels
 NETX(_cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[0]++),_Es));,
 _cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[1]++),_Es));,
 _cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[2]++),_Es));,
 _cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[3]++),_Es));,
 _cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[4]++),_Es));,
 _cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[5]++),_Es));,
 _cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[6]++),_Es));,
 _cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[7]++),_Es));)
 return _cmp;
}
539 
static inline void _sse_minSNE_ps(__m128* _pE, __m128** _pe, __m128* _es) {
// put pixel minimum subnetwork energy in _es
// input _es should be initialized to _pE before call
// NOTE: each _pe[k] pointer is post-incremented (caller-visible side effect)
 NETX(*_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[0]++));,
 *_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[1]++));,
 *_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[2]++));,
 *_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[3]++));,
 *_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[4]++));,
 *_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[5]++));,
 *_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[6]++));,
 *_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[7]++));
 )
}
553 
static inline float _sse_maxE_ps(__m128* _a, __m128* _A) {
// given input 00 and 90 data vectors
// returns energy of dominant detector (max energy)
// per-detector energy is a[k]^2 + A[k]^2; the scalar max over all
// lanes of both __m128 words is returned
 float out;
 float x[4];
 __m128 _o1;   // energies of detectors 1-4 (always filled by _NET 1st arg)
 __m128 _o2 = _mm_setzero_ps();   // detectors 5-8, zero if network <= 4
 _NET(
 _o1 = _mm_add_ps(_mm_mul_ps(_a[0],_a[0]),_mm_mul_ps(_A[0],_A[0]));,
 _o2 = _mm_add_ps(_mm_mul_ps(_a[1],_a[1]),_mm_mul_ps(_A[1],_A[1]));
 )
 _o1 = _mm_max_ps(_o1,_o2); _mm_storeu_ps(x,_o1); out=x[0];
 if(out<x[1]) out=x[1];
 if(out<x[2]) out=x[2];
 if(out<x[3]) out=x[3];
 return out;
}
571 
static inline void _sse_ort4_ps(__m128* _u, __m128* _v, __m128* _s, __m128* _c) {
// orthogonalize vectors _u and _v: take vectors u and v,
// make them orthogonal, calculate rotation phase
// fill in sin and cos in _s and _c respectively
// branch-free: _p/_q select between the gR>0 and gR<0 half-angle formulas
 static const __m128 sm = _mm_set1_ps(-0.f); // -0.f = 1 << 31 (sign-bit mask)
 static const __m128 _o = _mm_set1_ps(1.e-24);   // epsilon against /0
 static const __m128 _0 = _mm_set1_ps(0.);
 static const __m128 _1 = _mm_set1_ps(1.);
 static const __m128 _2 = _mm_set1_ps(2.);
 __m128 _n,_m,gI,gR,_p,_q;
 gI = _mm_mul_ps(_sse_dot4_ps(_u,_v),_2); // ~sin(2*psi) or 2*u*v
 gR = _mm_sub_ps(_sse_dot4_ps(_u,_u),_sse_dot4_ps(_v,_v)); // u^2-v^2
 _p = _mm_and_ps(_mm_cmpge_ps(gR,_0),_1); // 1 if gR>0. or 0 if gR<0.
 _q = _mm_sub_ps(_1,_p); // 0 if gR>0. or 1 if gR<0.
 _n = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(gI,gI),_mm_mul_ps(gR,gR))); // gc
 gR = _mm_add_ps(_mm_andnot_ps(sm,gR),_mm_add_ps(_n,_o)); // gc+|gR|+eps
 _n = _mm_add_ps(_mm_mul_ps(_2,_n),_o); // 2*gc + eps
 gI = _mm_div_ps(gI,_n); // sin(2*psi)
 _n = _mm_sqrt_ps(_mm_div_ps(gR,_n)); // sqrt((gc+|gR|)/(2gc+eps))
 _m = _mm_and_ps(_mm_cmpge_ps(gI,_0),_1); // 1 if gI>0. or 0 if gI<0.
 _m = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_m,_2),_1),_n); // _n if gI>0 or -_n if gI<0
 *_s = _mm_add_ps(_mm_mul_ps(_q,_m),_mm_mul_ps(_p,_mm_div_ps(gI,_n))); // sin(psi)
 gI = _mm_andnot_ps(sm,gI); // |gI|
 *_c = _mm_add_ps(_mm_mul_ps(_p,_n),_mm_mul_ps(_q,_mm_div_ps(gI,_n))); // cos(psi)
 return;
}
598 
static inline void _sse_ort4_ps(__m128* _s, __m128* _c, __m128 _r) {
// solves equation sin(2a) * _c - cos(2a) * _s = 0
// input: _s ~ sin(2a), _c ~ cos(2a) (not normalized), _r - regulator
// regularized _C = |_c| + _r*eps
// norm: _n = sqrt(_s*_s+_C*_C)
// cos(2a) = _C/_n
// sin(2a) = _s/_n
// output _s = sin(a), _c = cos(a)
// algorithm:
// _c>0: cos(a) = sqrt([1+cos(2a)]/2); sin(a) = sin(2a)/2/cos(a)
// _c<0: sin(a) = sign[_s]*sqrt([1+cos(2a)]/2); cos(a) = |sin(2a)|/2/|sin(a)|
// branch-free: _p/_q select the _c>0 / _c<0 formulas per lane
 static const __m128 sm = _mm_set1_ps(-0.f); // -0.f = 1 << 31 (sign-bit mask)
 static const __m128 _0 = _mm_set1_ps(0.);
 static const __m128 _5 = _mm_set1_ps(0.5);
 static const __m128 _1 = _mm_set1_ps(1.);
 static const __m128 _2 = _mm_set1_ps(2.);
 __m128 _n,_m,_C,_S,_p,_q;
 _r = _mm_mul_ps(_mm_add_ps(_1,_r),_mm_set1_ps(1.e-6)); // regulator
 _m = _mm_and_ps(_mm_cmpge_ps(*_s,_0),_1); // 1 if _S>0. or 0 if _S<0.
 _m = _mm_sub_ps(_mm_mul_ps(_m,_2),_1); // sign(_s)
 _p = _mm_and_ps(_mm_cmpge_ps(*_c,_0),_1); // 1 if _c>0. or 0 if _c<0.
 _q = _mm_sub_ps(_1,_p); // 0 if _c>0. or 1 if _c<0.
 _C = _mm_add_ps(_mm_andnot_ps(sm,*_c),_r); // |_c|+regulator
 _n = _mm_add_ps(_mm_mul_ps(*_s,*_s),_mm_mul_ps(_C,_C)); // norm^2 for sin(2a) and cos(2a)
 _n = _mm_div_ps(_1,_mm_mul_ps(_mm_sqrt_ps(_n),_2)); // 0.5/norm for sin(2a) and cos(2a)
 _C = _mm_sqrt_ps(_mm_add_ps(_5,_mm_mul_ps(_C,_n))); // |cos(a)|
 _S = _mm_div_ps(_mm_mul_ps(*_s,_n),_C); // sin(a)*cos(a)/|cos(a)|
 *_s = _mm_add_ps(_mm_mul_ps(_p,_S),_mm_mul_ps(_q,_mm_mul_ps(_C,_m))); // sin(psi)
 *_c = _mm_add_ps(_mm_mul_ps(_p,_C),_mm_mul_ps(_q,_mm_mul_ps(_S,_m))); // cos(psi)
 return;
}
630 
631 static inline void _sse_dpf4_ps(__m128* _Fp, __m128* _Fx, __m128* _fp, __m128* _fx) {
632 // transformation to DPF for 4 consecutive pixels.
633 // rotate vectors Fp and Fx into DPF: fp and fx
634  __m128 _c, _s;
635  _sse_ort4_ps(_Fp,_Fx,&_s,&_c); // get sin and cos
636  _sse_rot4p_ps(_Fp,&_c,_Fx,&_s,_fp); // get fp=Fp*c+Fx*s
637  _sse_rot4m_ps(_Fx,&_c,_Fp,&_s,_fx); // get fx=Fx*c-Fp*s
638  return;
639 }
640 
static inline void _sse_pnp4_ps(__m128* _fp, __m128* _fx, __m128* _am, __m128* _AM, __m128* _u, __m128* _v) {
// projection to network plane (pnp)
// _fp and _fx must be in DPF
// project vectors _am and _AM on the network plane _fp,_fx
// results go to _u (projection of _am) and _v (projection of _AM)
// c = xp/gp - cos of rotation to PCF
// s = xx/gx - sin of rotation to PCF
// NOTE: _fx is re-orthogonalized against _fp in place (side effect)
 static const __m128 _o = _mm_set1_ps(1.e-24);
 static const __m128 _1 = _mm_set1_ps(1.0);
 __m128 gp = _mm_div_ps(_1,_mm_add_ps(_sse_dot4_ps(_fp,_fp),_o)); // 1/fp*fp
 _sse_sub4_ps(_fx,_fp,_mm_mul_ps(gp,_sse_dot4_ps(_fp,_fx))); // check fx _|_ to fp
 __m128 gx = _mm_div_ps(_1,_mm_add_ps(_sse_dot4_ps(_fx,_fx),_o)); // 1/fx*fx
 __m128 cc = _mm_mul_ps(_sse_dot4_ps(_fp,_am),gp); // cos
 __m128 ss = _mm_mul_ps(_sse_dot4_ps(_fx,_am),gx); // sin
 _sse_rot4p_ps(_fp,&cc,_fx,&ss,_u); // get vector u
 cc = _mm_mul_ps(_sse_dot4_ps(_fp,_AM),gp); // cos
 ss = _mm_mul_ps(_sse_dot4_ps(_fx,_AM),gx); // sin
 _sse_rot4p_ps(_fp,&cc,_fx,&ss,_v); // get vector v
 return;
}
661 
static inline void _sse_dsp4_ps(__m128* u, __m128* v, __m128* _am, __m128* _AM, __m128* _u, __m128* _v) {
// dual stream phase (dsp) transformation
// take projection vectors uu and vu,
// make them orthogonal, calculate dual stream phase
// apply phase transformation both to data and projections
// NOTE: u and v are used as scratch and end up holding the rotated
// data vectors, which are then copied back into _am and _AM
 __m128 _c, _s;
 _sse_ort4_ps(u,v,&_s,&_c); // get sin and cos
 _sse_rot4p_ps(u,&_c,v,&_s,_u); // get 00 response
 _sse_rot4m_ps(v,&_c,u,&_s,_v); // get 90 response
 _sse_rot4p_ps(_am,&_c,_AM,&_s,u); // get 00 data vector
 _sse_rot4m_ps(_AM,&_c,_am,&_s,v); // get 90 data vector
 _sse_cpf4_ps(_am,u);
 _sse_cpf4_ps(_AM,v);
 return;
}
677 
678 static inline __m128 _sse_ei4_ps(__m128* _u, __m128 _L) {
679 // returns incoherent energy for vector u
680 // calculates sum: u[k]*u[k]*u[k]*u[k]/L
681 // where L should be |u|^2
682  float x[4];
683  float o[4];
684  __m128* _a = _u;
685  __m128 _c;
686  _NET(
687  _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[0] = XSUM(x);,
688  _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[0]+= YSUM(x);
689  )
690  _NET(
691  _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[1] = XSUM(x);,
692  _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[1]+= YSUM(x);
693  )
694  _NET(
695  _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[2] = XSUM(x);,
696  _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[2]+= YSUM(x);
697  )
698  _NET(
699  _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[3] = XSUM(x);,
700  _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[3]+= YSUM(x);
701  )
702  return _mm_div_ps(_mm_load_ps(o),_mm_add_ps(_L,_mm_set1_ps(1.e-12)));
703 }
704 
705 static inline __m128 _sse_ei4xx_ps(__m128* _x, __m128* _u, __m128 _L) {
706 // returns incoherent energy for vectors p,q
707 // _x - data vector
708 // _u - projection vector on the network plane
709 // calculates sum: x[k]*x[k]*u[k]*u[k]/L
710  float x[4];
711  float o[4];
712  __m128* _a = _x;
713  __m128* _b = _u;
714  __m128 _c;
715  _NET(
716  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0] = XSUM(x);,
717  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0]+= YSUM(x);
718  )
719  _NET(
720  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1] = XSUM(x);,
721  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1]+= YSUM(x);
722  )
723  _NET(
724  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2] = XSUM(x);,
725  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2]+= YSUM(x);
726  )
727  _NET(
728  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3] = XSUM(x);,
729  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3]+= YSUM(x);
730  )
731  return _mm_div_ps(_mm_load_ps(o),_mm_add_ps(_L,_mm_set1_ps(1.e-12)));
732 }
733 
734 static inline __m128 _sse_ei4xu_ps(__m128* _x, __m128* _u, __m128 _L) {
735 // returns incoherent energy for vectors p,q
736 // _x - data vector
737 // _u - projection vector on the network plane
738 // calculates sum: x[k]*u[k]*u[k]*u[k]/L
739  float x[4];
740  float o[4];
741  __m128* _a = _x;
742  __m128* _b = _u;
743  __m128 _c;
744  _NET(
745  _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[0] = XSUM(x);,
746  _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[0]+= YSUM(x);
747  )
748  _NET(
749  _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[1] = XSUM(x);,
750  _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[1]+= YSUM(x);
751  )
752  _NET(
753  _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[2] = XSUM(x);,
754  _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[2]+= YSUM(x);
755  )
756  _NET(
757  _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[3] = XSUM(x);,
758  _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[3]+= YSUM(x);
759  )
760  return _mm_div_ps(_mm_load_ps(o),_mm_add_ps(_L,_mm_set1_ps(1.e-12)));
761 }
762 
static inline __m128 _sse_null4_ps(__m128* _p, __m128* _q) {
// returns global NULL energy: |E-L-Ei+Li| + |Ei-Li|
// E = |p|^2; Ei = sum{p[k]^4}/|p|^2
// L = |q|^2; Li = sum{q[k]^4}/|q|^2
 __m128 _sm = _mm_set1_ps(-0.f); // sign mask: -0.f = 1 << 31
 __m128 _pe = _sse_abs4_ps(_p); // get total energy for _p
 __m128 _pi = _sse_ei4_ps(_p,_pe); // get incoherent energy for _p
 __m128 _qe = _sse_abs4_ps(_q); // get total energy for _q
 __m128 _qi = _sse_ei4_ps(_q,_qe); // get incoherent energy for _q
 _pi = _mm_sub_ps(_pi,_qi); // Ei-Li
 _pe = _mm_sub_ps(_mm_sub_ps(_pe,_qe),_pi); // (E-L) - (Ei-Li)
 // |...| via andnot with the sign mask (clears the sign bit)
 return _mm_add_ps(_mm_andnot_ps(_sm,_pi),_mm_andnot_ps(_sm,_pe));
}
776 
777 
778 static inline __m128 _sse_ecoh4_ps(__m128* _p, __m128* _q, __m128 _L) {
779 // returns coherent energy given
780 // _p - data vector
781 // _q - network response
782 // calculates vector: u[k] = p[k]*q[k]
783 // returns: L - u[k]*u[k]/L,
784 // where L should be (q,q)^2
785  float x[4];
786  float o[4];
787  __m128* _a = _p;
788  __m128* _b = _q;
789  __m128 _c;
790  _NET(
791  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0] = XSUM(x);,
792  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0]+= YSUM(x);
793  )
794  _NET(
795  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1] = XSUM(x);,
796  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1]+= YSUM(x);
797  )
798  _NET(
799  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2] = XSUM(x);,
800  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2]+= YSUM(x);
801  )
802  _NET(
803  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3] = XSUM(x);,
804  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3]+= YSUM(x);
805  )
806  return _mm_sub_ps(_L,_mm_div_ps(_mm_load_ps(o),_mm_add_ps(_L,_mm_set1_ps(1.e-12))));
807 }
808 
809 static inline __m128 _sse_ind4_ps(__m128* _p, __m128 _L) {
810 // returns vector index
811 // _p - data vector and L is its norm^2
812 // calculates vector: u[k] = p[k]*p[k]
813 // returns: u[k]*u[k]/L/L,
814  float x[4];
815  float o[4];
816  __m128* _a = _p;
817  __m128 _c;
818  _NET(
819  _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0] = XSUM(x);,
820  _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0]+= YSUM(x);
821  )
822  _NET(
823  _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1] = XSUM(x);,
824  _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1]+= YSUM(x);
825  )
826  _NET(
827  _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2] = XSUM(x);,
828  _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2]+= YSUM(x);
829  )
830  _NET(
831  _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3] = XSUM(x);,
832  _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3]+= YSUM(x);
833  )
834  _c = _mm_add_ps(_mm_mul_ps(_L,_L),_mm_set1_ps(1.e-16));
835  return _mm_div_ps(_mm_load_ps(o),_c);
836 }
837 
838 static inline __m128 _sse_ecoh4_ps(__m128* _p, __m128* _q) {
839 // returns coherent part of (p,q)^2
840 // calculates reduced unity vector: u[k] = q * |q| /(p,q)
841 // returns: (p,u)^2 - sum_k{p[k]*p[k]*u[k]*u[k]}
842 // = |q|^2 - p[k]^2 * q[k]^2 * |q|^2/(p,q)^2
843 // = |q|^2 (1 - sum_k{p[k]^2*q[k]^2}/(p,q)^2
844  float x[4];
845  float o[4];
846  __m128* _a = _p;
847  __m128* _b = _q;
848  __m128 _c;
849  _NET(
850  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0] = XSUM(x);,
851  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0]+= YSUM(x);
852  )
853  _NET(
854  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1] = XSUM(x);,
855  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1]+= YSUM(x);
856  )
857  _NET(
858  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2] = XSUM(x);,
859  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2]+= YSUM(x);
860  )
861  _NET(
862  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3] = XSUM(x);,
863  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3]+= YSUM(x);
864  )
865 
866  _c = _sse_dot4_ps(_p,_q); // scalar product
867  _c = _mm_add_ps(_mm_set1_ps(1.e-12),_mm_mul_ps(_c,_c)); // (p,q)^2+1.e-12
868  _c = _mm_div_ps(_mm_load_ps(o),_c); // get incoherent part
869 
870  return _mm_mul_ps(_sse_abs4_ps(_q),_mm_sub_ps(_mm_set1_ps(1.),_c));
871 }
872 
873 static inline __m128 _sse_ed4_ps(__m128* _p, __m128* _q, __m128 _L) {
874 // returns energy disbalance
875 // _p - data vector
876 // _q - network response
877 // calculates sum: 0.5*(p[k]*q[k]-q[k]*q[k])^2 / L
878  float x[4];
879  float o[4];
880  __m128* _a = _p;
881  __m128* _b = _q;
882  __m128 _aa;
883  _NET(
884  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
885  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[0] = XSUM(x);,
886  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
887  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[0]+= YSUM(x);
888  )
889  _NET(
890  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
891  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[1] = XSUM(x);,
892  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
893  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[1]+= YSUM(x);
894  )
895  _NET(
896  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
897  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[2] = XSUM(x);,
898  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
899  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[2]+= YSUM(x);
900  )
901  _NET(
902  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
903  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[3] = XSUM(x);,
904  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
905  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[3]+= YSUM(x);
906  )
907  _aa = _mm_mul_ps(_mm_load_ps(o),_mm_set1_ps(0.5));
908  return _mm_div_ps(_aa,_mm_add_ps(_L,_mm_set1_ps(1.e-12)));
909 }
910 
911 static inline __m128 _sse_ed4_ps(__m128* _p, __m128* _q) {
912 // returns energy disbalance
913 // _p - data vector
914 // _q - network response (may not be a projection)
915 // calculates sum: 0.5*sum_k{(p[k]*q[k]-q[k]*q[k])^2} * (q,q)/(p,q)^2
916  float x[4];
917  float o[4];
918  __m128* _a = _p;
919  __m128* _b = _q;
920  __m128 _aa;
921  _NET(
922  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
923  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[0] = XSUM(x);,
924  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
925  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[0]+= YSUM(x);
926  )
927  _NET(
928  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
929  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[1] = XSUM(x);,
930  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
931  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[1]+= YSUM(x);
932  )
933  _NET(
934  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
935  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[2] = XSUM(x);,
936  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
937  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[2]+= YSUM(x);
938  )
939  _NET(
940  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
941  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[3] = XSUM(x);,
942  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
943  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[3]+= YSUM(x);
944  )
945  _aa = _sse_dot4_ps(_p,_q); // scalar product
946  _aa = _mm_add_ps(_mm_set1_ps(1.e-12),_mm_mul_ps(_aa,_aa)); // (p,q)^2+1.e-12
947  _aa = _mm_div_ps(_sse_abs4_ps(_q),_aa); // (q,q)/(p,q)^2
948 
949  return _mm_mul_ps(_aa,_mm_mul_ps(_mm_load_ps(o),_mm_set1_ps(0.5)));
950 }
951 
952 static inline __m128 _sse_ed4i_ps(__m128* _p, __m128* _q, __m128 _L) {
953 // returns incoherent part of energy disbalance
954 // _p - projection amplitude on the network
955 // calculates vector: v[k] = p[k]*p[k] * (q[k]*q[k]-p[k]*p[k])
956 // returns: Sum_k{v[k]/L}
957  float x[4];
958  float o[4];
959  __m128* _a = _p;
960  __m128* _b = _q;
961  __m128 _aa,_bb;
962  _NET(
963  _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
964  _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
965  _a++; _b++; o[0] = XSUM(x);,
966  _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
967  _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
968  _a++; _b++; o[0]+= YSUM(x);
969  )
970  _NET(
971  _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
972  _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
973  _a++; _b++; o[1] = XSUM(x);,
974  _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
975  _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
976  _a++; _b++; o[1]+= YSUM(x);
977  )
978  _NET(
979  _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
980  _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
981  _a++; _b++; o[2] = XSUM(x);,
982  _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
983  _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
984  _a++; _b++; o[2]+= YSUM(x);
985  )
986  _NET(
987  _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
988  _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
989  _a++; _b++; o[3] = XSUM(x);,
990  _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
991  _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
992  _a++; _b++; o[3]+= YSUM(x);
993  )
994  _aa = _mm_mul_ps(_mm_load_ps(o),_mm_set1_ps(2.));
995  return _mm_div_ps(_aa,_mm_add_ps(_L,_mm_set1_ps(1.e-12)));
996 }
997 
998 static inline __m128 _sse_like4_ps(__m128* _f, __m128* _a, __m128* _A) {
999 // input ff - antenna pattern (f+ or fx) in DPF
1000 // input am,AM - network amplitude vectors
1001 // returns: (xp*xp+XP*XP)/|f+|^2 or (xx*xx+XX*XX)/|fx|^2
1002  __m128 xx = _sse_dot4_ps(_f,_a); // fp*am
1003  __m128 XX = _sse_dot4_ps(_f,_A); // fp*AM
1004  xx = _mm_add_ps(_mm_mul_ps(xx,xx),_mm_mul_ps(XX,XX)); // xx=xx*xx+XX*XX
1005  return _mm_div_ps(xx,_mm_add_ps(_sse_dot4_ps(_f,_f),_mm_set1_ps(1.e-12)));
1006 }
1007 
static inline __m128 _sse_like4_ps(__m128* fp, __m128* fx, __m128* am, __m128* AM, __m128 _D) {
// input fp,fx - antenna patterns in DPF
// input am,AM - network amplitude vectors (00 and 90 phase)
// input _D - (delta) - hard regulator added to the x-polarization norm
// returns: (xp*xp+XP*XP)/(|f+|^2+epsilon) + (xx*xx+XX*XX)/(|fx|^2+delta)
 __m128 xp = _sse_dot4_ps(fp,am); // fp*am
 __m128 XP = _sse_dot4_ps(fp,AM); // fp*AM
 __m128 xx = _sse_dot4_ps(fx,am); // fx*am
 __m128 XX = _sse_dot4_ps(fx,AM); // fx*AM
 __m128 gp = _mm_add_ps(_sse_dot4_ps(fp,fp),_mm_set1_ps(1.e-12)); // fp*fp + epsilon (guards /0)
 __m128 gx = _mm_add_ps(_sse_dot4_ps(fx,fx),_D); // fx*fx + delta
 xp = _mm_add_ps(_mm_mul_ps(xp,xp),_mm_mul_ps(XP,XP)); // xp=xp*xp+XP*XP
 xx = _mm_add_ps(_mm_mul_ps(xx,xx),_mm_mul_ps(XX,XX)); // xx=xx*xx+XX*XX
 return _mm_add_ps(_mm_div_ps(xp,gp),_mm_div_ps(xx,gx)); // regularized projected energy
}
1023 
1024 static inline __m128 _sse_like4_ps(__m128* fp, __m128* fx, __m128* am, __m128* AM) {
1025 // input fp,fx - antenna patterns in DPF
1026 // input am,AM - network amplitude vectors
1027 // returns: (xp*xp+XP*XP)/|f+|^2+(xx*xx+XX*XX)/(|fx|^2)
1028  __m128 xp = _sse_dot4_ps(fp,am); // fp*am
1029  __m128 XP = _sse_dot4_ps(fp,AM); // fp*AM
1030  __m128 xx = _sse_dot4_ps(fx,am); // fx*am
1031  __m128 XX = _sse_dot4_ps(fx,AM); // fx*AM
1032  __m128 gp = _mm_add_ps(_sse_dot4_ps(fp,fp),_mm_set1_ps(1.e-12)); // fx*fx + epsilon
1033  __m128 gx = _mm_add_ps(_sse_dot4_ps(fx,fx),_mm_set1_ps(1.e-12)); // fx*fx + epsilon
1034  xp = _mm_add_ps(_mm_mul_ps(xp,xp),_mm_mul_ps(XP,XP)); // xp=xp*xp+XP*XP
1035  xx = _mm_add_ps(_mm_mul_ps(xx,xx),_mm_mul_ps(XX,XX)); // xx=xx*xx+XX*XX
1036  return _mm_add_ps(_mm_div_ps(xp,gp),_mm_div_ps(xx,gx)); // regularized projected energy
1037 }
1038 
1039 static inline __m128 _sse_like4w_ps(__m128* fp, __m128* fx, __m128* am, __m128* AM) {
1040 // input fp,fx - antenna patterns in DPF
1041 // input am,AM - network amplitude vectors
1042 // returns: [(xp*xp+XP*XP)+(xx*xx+XX*XX)]/|f+|^2
1043  __m128 xp = _sse_dot4_ps(fp,am); // fp*am
1044  __m128 XP = _sse_dot4_ps(fp,AM); // fp*AM
1045  __m128 xx = _sse_dot4_ps(fx,am); // fx*am
1046  __m128 XX = _sse_dot4_ps(fx,AM); // fx*AM
1047  __m128 gp = _mm_add_ps(_sse_dot4_ps(fp,fp),_mm_set1_ps(1.e-9)); // fp*fp + epsilon
1048  xp = _mm_add_ps(_mm_mul_ps(xp,xp),_mm_mul_ps(XP,XP)); // xp=xp*xp+XP*XP
1049  xx = _mm_add_ps(_mm_mul_ps(xx,xx),_mm_mul_ps(XX,XX)); // xx=xx*xx+XX*XX
1050  return _mm_div_ps(_mm_add_ps(xp,xx),gp); // regularized projected energy
1051 }
1052 
1053 static inline __m128 _sse_like4_ps(__m128* am, __m128* AM) {
1054 // input am,AM - network projection vectors
1055  return _mm_add_ps(_mm_add_ps(_sse_abs4_ps(am),_sse_abs4_ps(AM)),_mm_set1_ps(1.e-12));
1056 }
1057 
static inline __m128 _sse_reg4x_ps(__m128 _L, __m128* fx, __m128* am, __m128* AM, __m128 _D) {
// x regulator (incoherent)
// input _L - non-regulated likelihood
// input fx - antenna pattern in DPF
// input am,AM - network amplitude vectors (00 and 90 phase)
// input _D - (delta) - regulator
// returns: (|fx|^2 - delta*Lx/L)/(|fx|^2 + delta)
// (numerator order follows the final statement below; an older version of this
// header comment had the difference reversed)
 static const __m128 _o = _mm_set1_ps(1.e-12); // epsilon: keeps FF non-zero when |fx|~0
 __m128 FF = _mm_add_ps(_sse_dot4_ps(fx,fx),_o); // fx*fx + epsilon
 __m128 xx = _sse_dot4_ps(fx,am); // fx*am
 __m128 XX = _sse_dot4_ps(fx,AM); // fx*AM
 xx = _mm_add_ps(_mm_mul_ps(xx,xx),_mm_mul_ps(XX,XX)); // xx=xx*xx+XX*XX = Lx*FF
 xx = _mm_div_ps(_mm_mul_ps(xx,_D),_mm_mul_ps(_L,FF)); // (Lx/L)*delta
 return _mm_div_ps(_mm_sub_ps(FF,xx),_mm_add_ps(FF,_D)); // [|fx|^2-(Lx/L)*delta]/|fx^2+delta|
}
1073 
1074 static inline __m128 _sse_nind4_ps(__m128* _am, __m128* _AM) {
1075 // calculates network index
1076 // input fx - antenna pattern in DPF
1077 // input am,AM - network projection vectors
1078  __m128 _ll = _sse_abs4_ps(_am); // L00
1079  __m128 _LL = _sse_abs4_ps(_AM); // L00
1080  __m128 _ei = _sse_ei4_ps(_am,_ll); // 00 incoherent
1081  __m128 _EI = _sse_ei4_ps(_AM,_LL); // 90 incoherent
1082  _ll = _mm_add_ps(_mm_add_ps(_ll,_LL),_mm_set1_ps(1.e-12)); // standard likelihood
1083  return _mm_div_ps(_mm_add_ps(_ei,_EI),_ll); // network index
1084 }
1085 
1086 static inline void _sse_pol4_ps(__m128* _fp, __m128* _fx, __m128* _v, double* r, double* a) {
1087 // calculates the polar coordinates of the input vector v in the DPF frame
1088 // input _fp,_fx - antenna patterns in DPF
1089 // input _v - network projection vector pointer
1090 // input r,a - polar coordinates (r : radius, a : angle in radians)
1091 // r,a are pointers to 4 dimensional float arrays
1092 
1093  __m128 _ss,_cc;
1094  __m128 _oo = _mm_set1_ps(1.e-12);
1095  float rpol[4],cpol[4],spol[4];
1096 
1097  _cc = _sse_abs4_ps(_fp); // |fp|^2
1098  _cc = _mm_add_ps(_mm_sqrt_ps(_cc),_oo); // |fp|
1099  _cc = _mm_div_ps(_sse_dot4_ps(_fp,_v),_cc); // (fp,v)/|fp|
1100  _mm_storeu_ps(cpol,_cc);
1101 
1102  _ss = _sse_abs4_ps(_fx); // |fx|^2
1103  _ss = _mm_add_ps(_mm_sqrt_ps(_ss),_oo); // |fx|
1104  _ss = _mm_div_ps(_sse_dot4_ps(_fx,_v),_ss); // (fx,v)/|fx|
1105  _mm_storeu_ps(spol,_ss);
1106 
1107  _mm_storeu_ps(rpol,_sse_abs4_ps(_v)); // |v|^2
1108 
1109  for(int n=0;n<4;n++) {
1110  r[n] = sqrt(rpol[n]); // |v|
1111  a[n] = atan2(spol[n],cpol[n]); // atan2(spol,cpol)
1112  }
1113 
1114  return;
1115 }
1116 
1117 /*
1118 static inline __m128 _sse_reg4_ps(__m128* fp, __m128* fx, __m128* am, __m128* AM, __m128 _D, __m128 _G) {
1119 // input fp,fx - antenna patterns in DPF
1120 // input am,AM - network amplitude vectors
1121 // input _D - (delta) - Tikhonov constraint
1122 // input _G - (gamma) - polarization constraint
1123 // calculate S^2 = sin(psi)^2, where psi is the angle between f+ and (xi+XI).
1124 // xi is the projection of x00 (_am) on the f+,fx plane
1125 // XI is the projection of x90 (_AM) on the f+,fx plane
// if asymmetry s/(s+S)<T, return 0, otherwise return |xi|^2+|XI|^2
1127 // where s^2 = 4*|f+|*|fx|/(|f+|^2+|fx|^2)^2 and
1128 // and S^2 = 1-(|f+,xi|-|f+,XI|)^2/|f+|^2/(x00^2+x90^2)
1129 // the actual condition checked: s^2 < G*S^2
1130 
1131  static const __m128 sm = _mm_set1_ps(-0.f); // sign mask: -0.f = 1 << 31
1132  static const __m128 _1 = _mm_set1_ps(1);
1133  static const __m128 _2 = _mm_set1_ps(2);
1134  static const __m128 _o = _mm_set1_ps(1.e-12); // epsilon
1135  __m128 xp = _sse_dot4_ps(fp,am); // fp*am
1136  __m128 XP = _sse_dot4_ps(fp,AM); // fp*AM
1137  __m128 xx = _sse_dot4_ps(fx,am); // fx*am
1138  __m128 XX = _sse_dot4_ps(fx,AM); // fx*AM
1139  __m128 cc = _mm_sub_ps(_mm_mul_ps(xx,XP),_mm_mul_ps(xp,XX)); // cc=xx*XP-xp*XX
1140  __m128 ss = _mm_add_ps(_mm_mul_ps(xx,xp),_mm_mul_ps(XX,XP)); // ss=xx*xp+XX*XP
1141  cc = _mm_andnot_ps(sm,_mm_mul_ps(_mm_mul_ps(cc,ss),_2)); // cc=|2*cc*ss| (chk-ed)
1142  xp = _mm_add_ps(_mm_mul_ps(xp,xp),_mm_mul_ps(XP,XP)); // xp=xp*xp+XP*XP
1143  xx = _mm_add_ps(_mm_mul_ps(xx,xx),_mm_mul_ps(XX,XX)); // xx=xx*xx+XX*XX
1144  __m128 gp = _sse_dot4_ps(fp,fp); // fp*fp
1145  __m128 gx = _mm_add_ps(_sse_dot4_ps(fx,fx),_o); // fx*fx + epsilon
1146  XX = _mm_add_ps(_mm_mul_ps(xx,gp),_mm_mul_ps(xp,gx)); // XX=xx*gp+xp*gx
1147  ss = _mm_add_ps(_sse_dot4_ps(am,am),_sse_dot4_ps(AM,AM)); // total energy
1148  ss = _mm_add_ps(ss,_D); // total energy + delta
1149  cc = _mm_div_ps(cc,_mm_add_ps(_mm_mul_ps(xx,ss),_o)); // second psi term - added
1150  ss = _mm_div_ps(xp,_mm_add_ps(_mm_mul_ps(gp,ss),_o)); // first psi term - subtracted
1151  ss = _mm_mul_ps(_G,_mm_add_ps(_mm_sub_ps(_1,ss),cc)); // G*sin(psi)^2
1152  cc = _mm_div_ps(_2,_mm_add_ps(gp,gx)); // cc=2*(1-T)/(gp+gx)
1153  cc = _mm_mul_ps(_mm_mul_ps(cc,cc),_mm_mul_ps(gp,gx)); // right is ready for comparison
1154  //cc = _mm_sqrt_ps(cc);
1155  //ss = _mm_sqrt_ps(ss);
1156  //cc = _mm_div_ps(cc,_mm_add_ps(ss,cc));
1157  cc = _mm_and_ps(_mm_cmpge_ps(cc,ss),_1); // 1 if cc>ss or 0 if cc<ss
1158  XX = _mm_div_ps(XX,_mm_mul_ps(_mm_add_ps(gp,_D),gx)); // std L with Tikhonov regulator
1159  return XX; // final projected energy
1160  return _mm_mul_ps(XX,cc); // final projected energy
1161 }
1162 
1163 
1164 static inline __m128 _sse_asy4_ps(__m128* _fp, __m128* _fx, __m128* _aa, __m128* _AA) {
1165 // calculate sample network asymmetry
1166 // _a - alignment factors
1167 // _e - ellipticity factors
1168 // _c - cos between f+ and 00 projection
1169 // return network asymmetry
1170  static const __m128 sm = _mm_set1_ps(-0.f); // sign mask: -0.f = 1 << 31
1171  static const __m128 _1 = _mm_set1_ps(1.);
1172  static const __m128 _2 = _mm_set1_ps(2.);
1173  static const __m128 _o = _mm_set1_ps(1.e-4);
1174  __m128 _a = _sse_dot4_ps(_fp,_fp); // |fp|^2
1175  __m128 _e = _sse_dot4_ps(_aa,_aa); // |x00|^2
1176  __m128 _c = _sse_dot4_ps(_fp,_aa); // (fp,aa)
1177  _c = _mm_div_ps(_mm_mul_ps(_c,_c),_mm_mul_ps(_a,_e)); // cos^2(psi)
1178  _a = _mm_add_ps(_mm_div_ps(_sse_dot4_ps(_fx,_fx),_a),_o); // a = (|fx|/|fp|)^2
1179  _e = _mm_div_ps(_sse_dot4_ps(_AA,_AA),_mm_mul_ps(_a,_e)); // (|x90|/|x00|)^2/a
1180  _e = _mm_div_ps(_1,_mm_add_ps(_1,_mm_sqrt_ps(_e))); // ee = 1/(1+sqrt(ee))
1181  _c = _mm_mul_ps(_mm_sub_ps(_1,_c),_mm_add_ps(_1,_a)); // _c = s*s*(1+a)
1182  _a = _mm_div_ps(_c,_mm_mul_ps(_2,_a)); // _a = s*s*(1+a)/(2*a)
1183  _a = _mm_div_ps(_1,_mm_add_ps(_1,_mm_sqrt_ps(_a))); // 1/(1+sqrt(aa))
1184  _a = _mm_div_ps(_mm_add_ps(_a,_e),_2); // (aa+ee)/2
1185  _e = _mm_andnot_ps(sm,_mm_sub_ps(_a,_e)); // |aa-ee|/2
1186  return _mm_sub_ps(_a,_e); // (aa-ee)/2-|aa-ee|/2
1187 }
1188 
1189 
1190 static inline float _sse_snc4_ps(__m128* _pe, float* _ne, float* out, int V4) {
1191 // sub net cut (snc)
1192 // _pe - pointer to energy vectors
1193 // _rE - pointer to network energy array
1194 // out - output array
1195 // V4 - number of iterations (pixels)
1196  __m128 _Etot = _mm_setzero_ps(); // total energy
1197  __m128 _Esub = _mm_setzero_ps(); // energy after subnet cut
1198  __m128* _rE = (__m128*) ne; // m128 pointer to energy array
1199 
1200  for(j=0; j<V4; j+=4) { // loop over selected pixels
1201  *_rE = _net_sum_ps(_pe);
1202 
1203  __m128 _cmp = _mm_cmpge_ps(*_rE,_En); // E>En
1204  __m128 _msk = _mm_and_ps(_cmp,_one); // 0/1 mask
1205 
1206  *_rE = _mm_mul_ps(*_rE,_msk); // zero sub-threshold pixels
1207  _Etot = _mm_add_ps(_Etot,*_rE);
1208 
1209  _cmp = _net_cut_ps(_rE, _pe, _Es, _cmp); // apply subnetwork cut
1210 
1211  _msk = _mm_and_ps(_cmp,_one); // 0/1 mask
1212  _Esub = _mm_add_ps(_Esub,_mm_mul_ps(*_rE++,_msk));
1213  *_pE++ = _mm_setzero_ps();
1214  }
1215 
1216  _mm_storeu_ps(etot,_Etot);
1217 }
1218 */
1219 /*
1220  gx = NET.dotx(Fx,Fx)+1.e-24;
1221  gI = NET.dotx(Fp,Fx);
1222  xp = NET.dotx(Fp,am);
1223  xx = NET.dotx(Fx,am);
1224  XP = NET.dotx(Fp,AM);
1225  XX = NET.dotx(Fx,AM);
1226 
1227 // find weak vector 00
1228 
1229  uc = (xp*gx - xx*gI); // u cos of rotation to PCF
1230  us = (xx*gp - xp*gI); // u sin of rotation to PCF
1231  um = NET.rotx(Fp,uc,Fx,us,u); // calculate u and return its norm
1232  et = NET.dotx(am,am);
1233  hh = NET.dotx(am,u,e);
1234  ec = (hh*hh - NET.dotx(e,e))/um;
1235 
1236  if(nn==0) SM.set(n,0.1*hh*hh/NET.dotx(e,e));
1237  else SM.add(n,0.1*hh*hh/NET.dotx(e,e));
1238 
1239  NET.rotx(u,hh/um,e,0.,aa); // normalize aa
1240 // ec = NET.dotx(aa,aa);
1241 
1242 // find weak vector 90
1243 
1244  uc = (XP*gx - XX*gI); // u cos of rotation to PCF
1245  us = (XX*gp - XP*gI); // u sin of rotation to PCF
1246  UM = NET.rotx(Fp,uc,Fx,us,U); // calculate u and return its norm
1247  ET = NET.dotx(AM,AM);
1248  HH = NET.dotx(AM,U,e);
1249  EC = (HH*HH - NET.dotx(e,e))/UM;
1250  NET.rotx(U,HH/UM,e,0.,AA); // normalize AA
1251 // EC = NET.dotx(AA,AA);
1252 // g2->Fill((ec+EC)/(et+ET));
1253 
1254 // transformation to DDF
1255 
1256  gp = NET.dotx(aa,aa)+1.e-24; // fp^2
1257  gx = NET.dotx(AA,AA)+1.e-24; // fx^2
1258  gI = NET.dotx(aa,AA); // fp*fx
1259  gR = (gp-gx)/2.;
1260  gr = (gp+gx)/2.;
1261  gc = sqrt(gR*gR+gI*gI); // norm of complex antenna pattern
1262  b = (gr-gc)/(gr+gc);
1263 
1264 // g0->Fill(sqrt(b));
1265 
1266  cc = sqrt((gc+gR)*(gc+gR)+gI*gI);
1267  ss = NET.rotx(aa,(gc+gR)/cc,AA,gI/cc,s); // s[k]
1268  xx = NET.rotx(am,(gc+gR)/cc,AM,gI/cc,x); // x[k]
1269  cc = sqrt((gc-gR)*(gc-gR)+gI*gI);
1270  SS = NET.rotx(aa,-(gc-gR)/cc,AA,gI/cc,S)+1.e-24; // S[k]
1271  XX = NET.rotx(am,-(gc-gR)/cc,AM,gI/cc,X)+1.e-24; // X[k]
1272 
1273 // cout<<ss<<" "<<NET.dotx(x,s)*NET.dotx(x,s)/ss<<endl;
1274 
1275 // Principle components
1276 
1277  hh = NET.dotx(x,s,e);
1278  ec = (hh*hh - NET.dotx(e,e))/ss;
1279  HH = NET.dotx(X,S,e);
1280  EC = (HH*HH - NET.dotx(e,e))/SS;
1281 
1282 */
1283 
1284 
1285 
1286 #endif // WATSSE_HH
1287 
1288 
1289 
1290 
1291 
1292 
1293 
1294 
1295 
1296 
1297 
1298 
1299 
1300 
1301 
1302 
1303 
static float _sse_abs_ps(__m128 *_a)
Definition: watsse.hh:137
static float _sse_dot_ps(__m128 *_a, __m128 *_b)
Definition: watsse.hh:213
static void _sse_hard4_ps(__m128 *_uu, __m128 *_am, __m128 *_AM, __m128 _c)
Definition: watsse.hh:84
static __m128 _sse_rnorm4_ps(__m128 *_p)
Definition: watsse.hh:189
static __m128 _sse_reg4x_ps(__m128 _L, __m128 *fx, __m128 *am, __m128 *AM, __m128 _D)
Definition: watsse.hh:1058
wavearray< double > a(hp.size())
WSeries< float > v[nIFO]
Definition: cwb_net.C:80
static void _sse_add4_ps(__m128 *_a, __m128 *_b, __m128 _c)
Definition: watsse.hh:262
int n
Definition: cwb_net.C:28
static __m128 _sse_abs4_ps(__m128 *_p)
Definition: watsse.hh:157
static void _sse_zero_ps(__m128 *_p)
Definition: watsse.hh:44
static __m128 _sse_ed4_ps(__m128 *_p, __m128 *_q, __m128 _L)
Definition: watsse.hh:873
static void _sse_dsp4_ps(__m128 *u, __m128 *v, __m128 *_am, __m128 *_AM, __m128 *_u, __m128 *_v)
Definition: watsse.hh:662
static __m128 _sse_ei4xu_ps(__m128 *_x, __m128 *_u, __m128 _L)
Definition: watsse.hh:734
static __m128 _sse_dot4_ps(__m128 *_p, __m128 *_q)
Definition: watsse.hh:223
ofstream out
Definition: cwb_merge.C:214
cout<< endl;cout<< "ts size = "<< ts.size()<< " ts rate = "<< ts.rate()<< endl;tf.Forward(ts, wdm);int levels=tf.getLevel();cout<< "tf size = "<< tf.size()<< endl;double dF=tf.resolution();double dT=1./(2 *dF);cout<< "rate(hz) : "<< RATE<< "\ layers : "<< nLAYERS<< "\ dF(hz) : "<< dF<< "\ dT(ms) : "<< dT *1000.<< endl;int itime=TIME_PIXEL_INDEX;int ifreq=FREQ_PIXEL_INDEX;int index=(levels+1) *itime+ifreq;double time=itime *dT;double freq=(ifreq >0) ? ifreq *dF :dF/4;cout<< endl;cout<< "PIXEL TIME = "<< time<< " sec "<< endl;cout<< "PIXEL FREQ = "<< freq<< " Hz "<< endl;cout<< endl;wavearray< double > x
static void _sse_cpf4_ps(__m128 *_aa, __m128 *_pp)
Definition: watsse.hh:317
static float _sse_rotsub_ps(__m128 *_u, float c, __m128 *_v, float s, __m128 *_a)
Definition: watsse.hh:399
static __m128 _sse_nind4_ps(__m128 *_am, __m128 *_AM)
Definition: watsse.hh:1074
#define _NET(P1, P2)
Definition: wat.hh:82
static void _sse_cpf_ps(float *a, __m128 *_p)
Definition: watsse.hh:307
int m
Definition: cwb_net.C:28
static void _sse_rotm_ps(__m128 *u, float *c, __m128 *v, float *s, __m128 *a)
Definition: watsse.hh:424
static void _sse_rotadd_ps(__m128 *_u, float c, __m128 *_v, float s, __m128 *_a)
Definition: watsse.hh:388
cout<< "SNR "<< xsnr<< endl;wavearray< double > f
Definition: ComputeSNR.C:75
cout<< "Selected Pixels : "<< nPix<< endl;wc.cluster(1, 1);SSeries< double > ss
static float _sse_nrg_ps(__m128 *_u, float c, __m128 *_v, float s, __m128 *_a)
Definition: watsse.hh:372
wavearray< double > xx
Definition: TestFrame1.C:11
static __m128 _sse_ecoh4_ps(__m128 *_p, __m128 *_q, __m128 _L)
Definition: watsse.hh:778
gwavearray< double > * gx
static void _sse_ifcp4_ps(__m128 *_aa, __m128 *_bb, __m128 _c)
Definition: watsse.hh:116
static __m128 _sse_ind4_ps(__m128 *_p, __m128 _L)
Definition: watsse.hh:809
printf("total live time: non-zero lags = %10.1f \, liveTot)
int l
static void _sse_pol4_ps(__m128 *_fp, __m128 *_fx, __m128 *_v, double *r, double *a)
Definition: watsse.hh:1086
static __m128 _sse_like4w_ps(__m128 *fp, __m128 *fx, __m128 *am, __m128 *AM)
Definition: watsse.hh:1039
static void _sse_rot4m_ps(__m128 *_u, __m128 *_c, __m128 *_v, __m128 *_s, __m128 *_a)
Definition: watsse.hh:472
static void _sse_load_ps(__m128 *_p, float a)
Definition: watsse.hh:50
double e
regression r
Definition: Regression_H1.C:44
s s
Definition: cwb_net.C:155
static void _sse_add_ps(__m128 *_a, __m128 *_b)
Definition: watsse.hh:248
static void _sse_mul4_ps(__m128 *_am, __m128 _c)
Definition: watsse.hh:75
static __m128 _sse_ei4_ps(__m128 *_u, __m128 _L)
Definition: watsse.hh:678
static void _sse_print_ps(__m128 *_p)
Definition: watsse.hh:37
static __m128 _sse_ed4i_ps(__m128 *_p, __m128 *_q, __m128 _L)
Definition: watsse.hh:952
static __m128 _sse_sum_ps(__m128 **_p)
Definition: watsse.hh:515
static void _sse_point_ps(__m128 **_p, float **p, short **m, int l, int n)
Definition: watsse.hh:502
static void _sse_mul_ps(__m128 *_a, float b)
Definition: watsse.hh:56
static __m128 _sse_div4_ps(__m128 *_v, __m128 *_u)
Definition: watsse.hh:181
static __m128 _sse_null4_ps(__m128 *_p, __m128 *_q)
Definition: watsse.hh:763
static float _sse_maxE_ps(__m128 *_a, __m128 *_A)
Definition: watsse.hh:554
static void _sse_minSNE_ps(__m128 *_pE, __m128 **_pe, __m128 *_es)
Definition: watsse.hh:540
static __m128 _sse_like4_ps(__m128 *_f, __m128 *_a, __m128 *_A)
Definition: watsse.hh:998
static void _sse_rot4p_ps(__m128 *_u, __m128 *_c, __m128 *_v, __m128 *_s, __m128 *_a)
Definition: watsse.hh:442
static void _sse_rotp_ps(__m128 *u, float *c, __m128 *v, float *s, __m128 *a)
Definition: watsse.hh:416
static void _sse_sub4_ps(__m128 *_a, __m128 *_b, __m128 _c)
Definition: watsse.hh:288
static void _sse_dpf4_ps(__m128 *_Fp, __m128 *_Fx, __m128 *_fp, __m128 *_fx)
Definition: watsse.hh:631
static __m128 _sse_cut_ps(__m128 *_pE, __m128 **_pe, __m128 _Es, __m128 _cmp)
Definition: watsse.hh:528
static void _sse_pnp4_ps(__m128 *_fp, __m128 *_fx, __m128 *_am, __m128 *_AM, __m128 *_u, __m128 *_v)
Definition: watsse.hh:641
static __m128 _sse_ei4xx_ps(__m128 *_x, __m128 *_u, __m128 _L)
Definition: watsse.hh:705
static void _sse_ort4_ps(__m128 *_u, __m128 *_v, __m128 *_s, __m128 *_c)
Definition: watsse.hh:572
static void _sse_sub_ps(__m128 *_a, __m128 *_b)
Definition: watsse.hh:281