Project Ne10
An Open Optimized Software Library Project for the ARM Architecture
NE10_fft_int16.neonintrinsic.c
1 /*
2  * Copyright 2013-15 ARM Limited and Contributors.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of ARM Limited nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 /*
29  * NE10 Library : dsp/NE10_fft_int16.neonintrinsic.c
30  */
31 
32 #include <arm_neon.h>
33 
34 #include "NE10_types.h"
35 #include "NE10_macros.h"
36 #include "NE10_fft.h"
37 
38 #define FFT4_FS_START \
39  ne10_int16_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i; \
40  ne10_int16_t tmp_r, tmp_i;
41 
42 
43 #define FFT4_FS \
44  s2_r = Fin[0].r - Fin[2].r; \
45  s2_i = Fin[0].i - Fin[2].i; \
46  tmp_r = Fin[0].r + Fin[2].r; \
47  tmp_i = Fin[0].i + Fin[2].i; \
48  s0_r = Fin[1].r + Fin[3].r; \
49  s0_i = Fin[1].i + Fin[3].i; \
50  s1_r = Fin[1].r - Fin[3].r; \
51  s1_i = Fin[1].i - Fin[3].i;
52 
53 #define FFT4_FS_SCALED \
54  s2_r = (Fin[0].r - Fin[2].r) >> 2; \
55  s2_i = (Fin[0].i - Fin[2].i) >> 2; \
56  tmp_r = (Fin[0].r + Fin[2].r) >> 2; \
57  tmp_i = (Fin[0].i + Fin[2].i) >> 2; \
58  s0_r = (Fin[1].r + Fin[3].r) >> 2; \
59  s0_i = (Fin[1].i + Fin[3].i) >> 2; \
60  s1_r = (Fin[1].r - Fin[3].r) >> 2; \
61  s1_i = (Fin[1].i - Fin[3].i) >> 2;
62 
63 #define FFT4_FWD_LS \
64  Fout[2].r = tmp_r - s0_r; \
65  Fout[2].i = tmp_i - s0_i; \
66  Fout[0].r = tmp_r + s0_r; \
67  Fout[0].i = tmp_i + s0_i; \
68  Fout[1].r = s2_r + s1_i; \
69  Fout[1].i = s2_i - s1_r; \
70  Fout[3].r = s2_r - s1_i; \
71  Fout[3].i = s2_i + s1_r;
72 
73 #define FFT4_INV_LS \
74  Fout[2].r = tmp_r - s0_r; \
75  Fout[2].i = tmp_i - s0_i; \
76  Fout[0].r = tmp_r + s0_r; \
77  Fout[0].i = tmp_i + s0_i; \
78  Fout[1].r = s2_r - s1_i; \
79  Fout[1].i = s2_i + s1_r; \
80  Fout[3].r = s2_r + s1_i; \
81  Fout[3].i = s2_i - s1_r;
82 
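/*
 * The FFT4_* macros implement a single 4-point DFT. FFT4_FS forms the
 * stage-1 sums and differences, and FFT4_FWD_LS / FFT4_INV_LS recombine
 * them, applying the -j (forward) or +j (inverse) rotation to the odd
 * branch. The *_SCALED first stage pre-shifts by 2 bits, i.e. divides by
 * the transform length 4, so the int16 results cannot overflow. As a
 * quick sanity check, an impulse input Fin = {(1,0),(0,0),(0,0),(0,0)}
 * produces the flat spectrum Fout = {(1,0),(1,0),(1,0),(1,0)} through the
 * unscaled forward path.
 */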
83 static inline void ne10_fft4_forward_int16_unscaled (ne10_fft_cpx_int16_t * Fout,
84  ne10_fft_cpx_int16_t * Fin)
85 
86 {
87  FFT4_FS_START
88  FFT4_FS
89  FFT4_FWD_LS
90 }
91 
92 static inline void ne10_fft4_backward_int16_unscaled (ne10_fft_cpx_int16_t * Fout,
93  ne10_fft_cpx_int16_t * Fin)
94 
95 {
96  FFT4_FS_START
97  FFT4_FS
98  FFT4_INV_LS
99 }
100 static inline void ne10_fft4_forward_int16_scaled (ne10_fft_cpx_int16_t * Fout,
101  ne10_fft_cpx_int16_t * Fin)
102 
103 {
104  FFT4_FS_START
105  FFT4_FS_SCALED
106  FFT4_FWD_LS
107 }
108 
109 static inline void ne10_fft4_backward_int16_scaled (ne10_fft_cpx_int16_t * Fout,
110  ne10_fft_cpx_int16_t * Fin)
111 
112 {
113  FFT4_FS_START
114  FFT4_FS_SCALED
115  FFT4_INV_LS
116 }
117 
118 #define FFT8_FS_START \
119  ne10_int16_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i; \
120  ne10_int16_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i; \
121  const ne10_int16_t TW_81 = 23169;
122 
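/*
 * TW_81 is the Q15 representation of cos(pi/4) = sin(pi/4):
 * 23169 / 32768 is approximately 0.7071. It is the only non-trivial
 * twiddle magnitude an 8-point FFT needs, and it is applied below with a
 * widening multiply followed by a right shift of NE10_F2I16_SHIFT bits.
 */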
123 #define FFT8_FS \
124  s0_r = Fin[0].r + Fin[4].r; \
125  s0_i = Fin[0].i + Fin[4].i; \
126  s1_r = Fin[0].r - Fin[4].r; \
127  s1_i = Fin[0].i - Fin[4].i; \
128  s2_r = Fin[1].r + Fin[5].r; \
129  s2_i = Fin[1].i + Fin[5].i; \
130  s3_r = Fin[1].r - Fin[5].r; \
131  s3_i = Fin[1].i - Fin[5].i; \
132  s4_r = Fin[2].r + Fin[6].r; \
133  s4_i = Fin[2].i + Fin[6].i; \
134  s5_r = Fin[2].r - Fin[6].r; \
135  s5_i = Fin[2].i - Fin[6].i; \
136  s6_r = Fin[3].r + Fin[7].r; \
137  s6_i = Fin[3].i + Fin[7].i; \
138  s7_r = Fin[3].r - Fin[7].r; \
139  s7_i = Fin[3].i - Fin[7].i;
140 
141 #define FFT8_FS_SCALED \
142  s0_r = (Fin[0].r + Fin[4].r) >> 3; \
143  s0_i = (Fin[0].i + Fin[4].i) >> 3; \
144  s1_r = (Fin[0].r - Fin[4].r) >> 3; \
145  s1_i = (Fin[0].i - Fin[4].i) >> 3; \
146  s2_r = (Fin[1].r + Fin[5].r) >> 3; \
147  s2_i = (Fin[1].i + Fin[5].i) >> 3; \
148  s3_r = (Fin[1].r - Fin[5].r) >> 3; \
149  s3_i = (Fin[1].i - Fin[5].i) >> 3; \
150  s4_r = (Fin[2].r + Fin[6].r) >> 3; \
151  s4_i = (Fin[2].i + Fin[6].i) >> 3; \
152  s5_r = (Fin[2].r - Fin[6].r) >> 3; \
153  s5_i = (Fin[2].i - Fin[6].i) >> 3; \
154  s6_r = (Fin[3].r + Fin[7].r) >> 3; \
155  s6_i = (Fin[3].i + Fin[7].i) >> 3; \
156  s7_r = (Fin[3].r - Fin[7].r) >> 3; \
157  s7_i = (Fin[3].i - Fin[7].i) >> 3;
158 
159 
160 #define FFT8_FWD_LS \
161  t0_r = s0_r - s4_r; \
162  t0_i = s0_i - s4_i; \
163  t1_r = s0_r + s4_r; \
164  t1_i = s0_i + s4_i; \
165  t2_r = s2_r + s6_r; \
166  t2_i = s2_i + s6_i; \
167  t3_r = s2_r - s6_r; \
168  t3_i = s2_i - s6_i; \
169  Fout[0].r = t1_r + t2_r; \
170  Fout[0].i = t1_i + t2_i; \
171  Fout[4].r = t1_r - t2_r; \
172  Fout[4].i = t1_i - t2_i; \
173  Fout[2].r = t0_r + t3_i; \
174  Fout[2].i = t0_i - t3_r; \
175  Fout[6].r = t0_r - t3_i; \
176  Fout[6].i = t0_i + t3_r; \
177  t4_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r + s3_i) * TW_81) >> NE10_F2I16_SHIFT); \
178  t4_i = - (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r - s3_i) * TW_81) >> NE10_F2I16_SHIFT); \
179  t5_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r - s7_i) * TW_81) >> NE10_F2I16_SHIFT); \
180  t5_i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r + s7_i) * TW_81) >> NE10_F2I16_SHIFT); \
181  t0_r = s1_r - s5_i; \
182  t0_i = s1_i + s5_r; \
183  t1_r = s1_r + s5_i; \
184  t1_i = s1_i - s5_r; \
185  t2_r = t4_r - t5_r; \
186  t2_i = t4_i - t5_i; \
187  t3_r = t4_r + t5_r; \
188  t3_i = t4_i + t5_i; \
189  Fout[1].r = t1_r + t2_r; \
190  Fout[1].i = t1_i + t2_i; \
191  Fout[5].r = t1_r - t2_r; \
192  Fout[5].i = t1_i - t2_i; \
193  Fout[3].r = t0_r + t3_i; \
194  Fout[3].i = t0_i - t3_r; \
195  Fout[7].r = t0_r - t3_i; \
196  Fout[7].i = t0_i + t3_r;
197 
198 #define FFT8_INV_LS \
199  t0_r = s0_r - s4_r; \
200  t0_i = s0_i - s4_i; \
201  t1_r = s0_r + s4_r; \
202  t1_i = s0_i + s4_i; \
203  t2_r = s2_r + s6_r; \
204  t2_i = s2_i + s6_i; \
205  t3_r = s2_r - s6_r; \
206  t3_i = s2_i - s6_i; \
207  Fout[0].r = t1_r + t2_r; \
208  Fout[0].i = t1_i + t2_i; \
209  Fout[4].r = t1_r - t2_r; \
210  Fout[4].i = t1_i - t2_i; \
211  Fout[2].r = t0_r - t3_i; \
212  Fout[2].i = t0_i + t3_r; \
213  Fout[6].r = t0_r + t3_i; \
214  Fout[6].i = t0_i - t3_r; \
215  t4_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r - s3_i) * TW_81) >> NE10_F2I16_SHIFT); \
216  t4_i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r + s3_i) * TW_81) >> NE10_F2I16_SHIFT); \
217  t5_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r + s7_i) * TW_81) >> NE10_F2I16_SHIFT); \
218  t5_i = - (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r - s7_i) * TW_81) >> NE10_F2I16_SHIFT); \
219  t0_r = s1_r + s5_i; \
220  t0_i = s1_i - s5_r; \
221  t1_r = s1_r - s5_i; \
222  t1_i = s1_i + s5_r; \
223  t2_r = t4_r - t5_r; \
224  t2_i = t4_i - t5_i; \
225  t3_r = t4_r + t5_r; \
226  t3_i = t4_i + t5_i; \
227  Fout[1].r = t1_r + t2_r; \
228  Fout[1].i = t1_i + t2_i; \
229  Fout[5].r = t1_r - t2_r; \
230  Fout[5].i = t1_i - t2_i; \
231  Fout[3].r = t0_r - t3_i; \
232  Fout[3].i = t0_i + t3_r; \
233  Fout[7].r = t0_r + t3_i; \
234  Fout[7].i = t0_i - t3_r;
235 
236 static inline void ne10_fft8_forward_int16_unscaled (ne10_fft_cpx_int16_t * Fout,
237  ne10_fft_cpx_int16_t * Fin)
238 
239 {
240  FFT8_FS_START
241  FFT8_FS
242  FFT8_FWD_LS
243 }
244 
245 static inline void ne10_fft8_backward_int16_unscaled (ne10_fft_cpx_int16_t * Fout,
246  ne10_fft_cpx_int16_t * Fin)
247 
248 {
249  FFT8_FS_START
250  FFT8_FS
251  FFT8_INV_LS
252 }
253 static inline void ne10_fft8_forward_int16_scaled (ne10_fft_cpx_int16_t * Fout,
254  ne10_fft_cpx_int16_t * Fin)
255 
256 {
257  FFT8_FS_START
258  FFT8_FS_SCALED
259  FFT8_FWD_LS
260 }
261 
262 static inline void ne10_fft8_backward_int16_scaled (ne10_fft_cpx_int16_t * Fout,
263  ne10_fft_cpx_int16_t * Fin)
264 
265 {
266  FFT8_FS_START
267  FFT8_FS_SCALED
268  FFT8_INV_LS
269 }
270 
271 #define RADIX8x4_START \
272  ne10_int32_t f_count; \
273  ne10_int32_t src_step = stride << 1; \
274  const ne10_int16_t TW_81 = 23169; \
275  const ne10_int16_t TW_81N = -23169; \
276  int16_t *p_src, *p_dst; \
277  int16x4x2_t d2_in0, d2_in1, d2_in2, d2_in3, d2_in4, d2_in5, d2_in6, d2_in7; \
278  int16x4_t d_sin0_r, d_sin0_i, d_sin1_r, d_sin1_i, d_sin2_r, d_sin2_i, d_sin3_r, d_sin3_i; \
279  int16x4_t d_sin4_r, d_sin4_i, d_sin5_r, d_sin5_i, d_sin6_r, d_sin6_i, d_sin7_r, d_sin7_i; \
280  int16x4_t d_s3_r, d_s3_i, d_s5_r, d_s5_i, d_s7_r, d_s7_i; \
281  int16x4_t d_s8_r, d_s8_i, d_s9_r, d_s9_i, d_s10_r, d_s10_i, d_s11_r, d_s11_i; \
282  int16x4_t d_s12_r, d_s12_i, d_s13_r, d_s13_i, d_s14_r, d_s14_i, d_s15_r, d_s15_i; \
283  int16x4_t d_out0_r, d_out0_i, d_out1_r, d_out1_i, d_out2_r, d_out2_i, d_out3_r, d_out3_i; \
284  int16x4_t d_out4_r, d_out4_i, d_out5_r, d_out5_i, d_out6_r, d_out6_i, d_out7_r, d_out7_i; \
285  int16x4x2_t d2_out0, d2_out1, d2_out2, d2_out3, d2_out4, d2_out5, d2_out6, d2_out7; \
286  int16x8x2_t q2_tmp0, q2_tmp1, q2_tmp2, q2_tmp3; \
287  int32x4x2_t q2_tmp4, q2_tmp5, q2_tmp6, q2_tmp7; \
288  int16x4_t d_tw_81, d_tw_81n; \
289  p_src = (int16_t *) Fin; \
290  p_dst = (int16_t *) Fout;
291 
292 
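/*
 * The RADIX8x4 kernel computes a radix-8 butterfly for four data columns
 * per loop iteration, one column in each lane of the int16x4 vectors.
 * src_step is stride * 2 because every complex sample occupies two int16
 * values, and vld2_s16 de-interleaves the loaded data into separate real
 * (val[0]) and imaginary (val[1]) vectors.
 */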
293 #define RADIX8x4_LOAD \
294  d2_in0 = vld2_s16 (p_src); \
295  p_src += src_step; \
296  d2_in2 = vld2_s16 (p_src); \
297  p_src += src_step; \
298  d2_in4 = vld2_s16 (p_src); \
299  p_src += src_step; \
300  d2_in6 = vld2_s16 (p_src); \
301  p_src += src_step; \
302  d2_in1 = vld2_s16 (p_src); \
303  p_src += src_step; \
304  d2_in3 = vld2_s16 (p_src); \
305  p_src += src_step; \
306  d2_in5 = vld2_s16 (p_src); \
307  p_src += src_step; \
308  d2_in7 = vld2_s16 (p_src); \
309  p_src += src_step;
310 
311 #define RADIX8x4_STORE \
312  q2_tmp0 = vtrnq_s16 (vcombine_s16(d_out0_r, d_out0_i), vcombine_s16(d_out1_r, d_out1_i)); \
313  q2_tmp1 = vtrnq_s16 (vcombine_s16(d_out2_r, d_out2_i), vcombine_s16(d_out3_r, d_out3_i)); \
314  q2_tmp2 = vtrnq_s16 (vcombine_s16(d_out4_r, d_out4_i), vcombine_s16(d_out5_r, d_out5_i)); \
315  q2_tmp3 = vtrnq_s16 (vcombine_s16(d_out6_r, d_out6_i), vcombine_s16(d_out7_r, d_out7_i)); \
316  q2_tmp4 = vtrnq_s32 (vreinterpretq_s32_s16(q2_tmp0.val[0]), vreinterpretq_s32_s16(q2_tmp1.val[0])); \
317  q2_tmp5 = vtrnq_s32 (vreinterpretq_s32_s16(q2_tmp0.val[1]), vreinterpretq_s32_s16(q2_tmp1.val[1])); \
318  q2_tmp6 = vtrnq_s32 (vreinterpretq_s32_s16(q2_tmp2.val[0]), vreinterpretq_s32_s16(q2_tmp3.val[0])); \
319  q2_tmp7 = vtrnq_s32 (vreinterpretq_s32_s16(q2_tmp2.val[1]), vreinterpretq_s32_s16(q2_tmp3.val[1])); \
320  d2_out0.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp4.val[0])); \
321  d2_out0.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp4.val[0])); \
322  d2_out1.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp6.val[0])); \
323  d2_out1.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp6.val[0])); \
324  d2_out2.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp5.val[0])); \
325  d2_out2.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp5.val[0])); \
326  d2_out3.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp7.val[0])); \
327  d2_out3.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp7.val[0])); \
328  d2_out4.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp4.val[1])); \
329  d2_out4.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp4.val[1])); \
330  d2_out5.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp6.val[1])); \
331  d2_out5.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp6.val[1])); \
332  d2_out6.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp5.val[1])); \
333  d2_out6.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp5.val[1])); \
334  d2_out7.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp7.val[1])); \
335  d2_out7.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp7.val[1])); \
336  vst2_s16 (p_dst, d2_out0); \
337  p_dst += 8; \
338  vst2_s16 (p_dst, d2_out1); \
339  p_dst += 8; \
340  vst2_s16 (p_dst, d2_out2); \
341  p_dst += 8; \
342  vst2_s16 (p_dst, d2_out3); \
343  p_dst += 8; \
344  vst2_s16 (p_dst, d2_out4); \
345  p_dst += 8; \
346  vst2_s16 (p_dst, d2_out5); \
347  p_dst += 8; \
348  vst2_s16 (p_dst, d2_out6); \
349  p_dst += 8; \
350  vst2_s16 (p_dst, d2_out7); \
351  p_dst += 8; \
352  p_src = p_src - src_step * 8 + 8;
353 
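/*
 * RADIX8x4_STORE transposes the per-lane results with vtrnq_s16/vtrnq_s32
 * so that each vst2_s16 writes four consecutive complex outputs and the
 * eight outputs of every column land contiguously in Fout. The final
 * pointer adjustment rewinds p_src past the eight rows just consumed and
 * advances it by 8 int16 values (4 complex samples) to the next group of
 * four columns.
 */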
354 #define RADIX8x4_FS_S0 \
355  d_sin0_r = vadd_s16 (d2_in0.val[0], d2_in1.val[0]); \
356  d_sin0_i = vadd_s16 (d2_in0.val[1], d2_in1.val[1]); \
357  d_sin1_r = vsub_s16 (d2_in0.val[0], d2_in1.val[0]); \
358  d_sin1_i = vsub_s16 (d2_in0.val[1], d2_in1.val[1]); \
359  d_sin2_r = vadd_s16 (d2_in2.val[0], d2_in3.val[0]); \
360  d_sin2_i = vadd_s16 (d2_in2.val[1], d2_in3.val[1]); \
361  d_sin3_r = vsub_s16 (d2_in2.val[0], d2_in3.val[0]); \
362  d_sin3_i = vsub_s16 (d2_in2.val[1], d2_in3.val[1]); \
363  d_sin4_r = vadd_s16 (d2_in4.val[0], d2_in5.val[0]); \
364  d_sin4_i = vadd_s16 (d2_in4.val[1], d2_in5.val[1]); \
365  d_sin5_r = vsub_s16 (d2_in4.val[0], d2_in5.val[0]); \
366  d_sin5_i = vsub_s16 (d2_in4.val[1], d2_in5.val[1]); \
367  d_sin6_r = vadd_s16 (d2_in6.val[0], d2_in7.val[0]); \
368  d_sin6_i = vadd_s16 (d2_in6.val[1], d2_in7.val[1]); \
369  d_sin7_r = vsub_s16 (d2_in6.val[0], d2_in7.val[0]); \
370  d_sin7_i = vsub_s16 (d2_in6.val[1], d2_in7.val[1]);
371 
372 #define RADIX8x4_FWD_S357 \
373  d_tw_81 = vdup_n_s16 (TW_81); \
374  d_tw_81n = vdup_n_s16 (TW_81N); \
375  d_s5_r = d_sin5_i; \
376  d_s5_i = vneg_s16 (d_sin5_r); \
377  d_s3_r = vadd_s16 (d_sin3_r, d_sin3_i); \
378  d_s3_i = vsub_s16 (d_sin3_i, d_sin3_r); \
379  d_s7_r = vsub_s16 (d_sin7_r, d_sin7_i); \
380  d_s7_i = vadd_s16 (d_sin7_i, d_sin7_r); \
381  d_s3_r = vqdmulh_s16 (d_s3_r, d_tw_81); \
382  d_s3_i = vqdmulh_s16 (d_s3_i, d_tw_81); \
383  d_s7_r = vqdmulh_s16 (d_s7_r, d_tw_81n); \
384  d_s7_i = vqdmulh_s16 (d_s7_i, d_tw_81n);
385 
386 #define RADIX8x4_INV_S357 \
387  d_tw_81 = vdup_n_s16 (TW_81); \
388  d_tw_81n = vdup_n_s16 (TW_81N); \
389  d_s5_r = vneg_s16 (d_sin5_i); \
390  d_s5_i = d_sin5_r; \
391  d_s3_r = vsub_s16 (d_sin3_r, d_sin3_i); \
392  d_s3_i = vadd_s16 (d_sin3_i, d_sin3_r); \
393  d_s7_r = vadd_s16 (d_sin7_r, d_sin7_i); \
394  d_s7_i = vsub_s16 (d_sin7_i, d_sin7_r); \
395  d_s3_r = vqdmulh_s16 (d_s3_r, d_tw_81); \
396  d_s3_i = vqdmulh_s16 (d_s3_i, d_tw_81); \
397  d_s7_r = vqdmulh_s16 (d_s7_r, d_tw_81n); \
398  d_s7_i = vqdmulh_s16 (d_s7_i, d_tw_81n);
399 
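/*
 * In the S357 steps the +/-45 degree twiddles are applied with
 * vqdmulh_s16, which computes (a * b * 2) >> 16, i.e. a Q15 fixed-point
 * multiply, so multiplying the rotated sums by TW_81 (or TW_81N) scales
 * them by roughly +/-0.7071 without leaving the int16 domain.
 */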
400 #define RADIX8x4_LS_02 \
401  d_s8_r = vadd_s16 (d_sin0_r, d_sin4_r); \
402  d_s8_i = vadd_s16 (d_sin0_i, d_sin4_i); \
403  d_s9_r = vadd_s16 (d_sin1_r, d_s5_r); \
404  d_s9_i = vadd_s16 (d_sin1_i, d_s5_i); \
405  d_s10_r = vsub_s16 (d_sin0_r, d_sin4_r); \
406  d_s10_i = vsub_s16 (d_sin0_i, d_sin4_i); \
407  d_s11_r = vsub_s16 (d_sin1_r, d_s5_r); \
408  d_s11_i = vsub_s16 (d_sin1_i, d_s5_i); \
409  d_s12_r = vadd_s16 (d_sin2_r, d_sin6_r); \
410  d_s12_i = vadd_s16 (d_sin2_i, d_sin6_i); \
411  d_s13_r = vadd_s16 (d_s3_r, d_s7_r); \
412  d_s13_i = vadd_s16 (d_s3_i, d_s7_i); \
413  d_s14_r = vsub_s16 (d_sin2_r, d_sin6_r); \
414  d_s14_i = vsub_s16 (d_sin2_i, d_sin6_i); \
415  d_s15_r = vsub_s16 (d_s3_r, d_s7_r); \
416  d_s15_i = vsub_s16 (d_s3_i, d_s7_i); \
417  d_out4_r = vsub_s16 (d_s8_r, d_s12_r); \
418  d_out4_i = vsub_s16 (d_s8_i, d_s12_i); \
419  d_out5_r = vsub_s16 (d_s9_r, d_s13_r); \
420  d_out5_i = vsub_s16 (d_s9_i, d_s13_i); \
421  d_out0_r = vadd_s16 (d_s8_r, d_s12_r); \
422  d_out0_i = vadd_s16 (d_s8_i, d_s12_i); \
423  d_out1_r = vadd_s16 (d_s9_r, d_s13_r); \
424  d_out1_i = vadd_s16 (d_s9_i, d_s13_i);
425 
426 #define RADIX8x4_FS_S0_SCALED \
427  d_sin0_r = vhadd_s16 (d2_in0.val[0], d2_in1.val[0]); \
428  d_sin0_i = vhadd_s16 (d2_in0.val[1], d2_in1.val[1]); \
429  d_sin1_r = vhsub_s16 (d2_in0.val[0], d2_in1.val[0]); \
430  d_sin1_i = vhsub_s16 (d2_in0.val[1], d2_in1.val[1]); \
431  d_sin2_r = vhadd_s16 (d2_in2.val[0], d2_in3.val[0]); \
432  d_sin2_i = vhadd_s16 (d2_in2.val[1], d2_in3.val[1]); \
433  d_sin3_r = vhsub_s16 (d2_in2.val[0], d2_in3.val[0]); \
434  d_sin3_i = vhsub_s16 (d2_in2.val[1], d2_in3.val[1]); \
435  d_sin4_r = vhadd_s16 (d2_in4.val[0], d2_in5.val[0]); \
436  d_sin4_i = vhadd_s16 (d2_in4.val[1], d2_in5.val[1]); \
437  d_sin5_r = vhsub_s16 (d2_in4.val[0], d2_in5.val[0]); \
438  d_sin5_i = vhsub_s16 (d2_in4.val[1], d2_in5.val[1]); \
439  d_sin6_r = vhadd_s16 (d2_in6.val[0], d2_in7.val[0]); \
440  d_sin6_i = vhadd_s16 (d2_in6.val[1], d2_in7.val[1]); \
441  d_sin7_r = vhsub_s16 (d2_in6.val[0], d2_in7.val[0]); \
442  d_sin7_i = vhsub_s16 (d2_in6.val[1], d2_in7.val[1]);
443 
444 #define RADIX8x4_LS_02_SCALED \
445  d_s8_r = vhadd_s16 (d_sin0_r, d_sin4_r); \
446  d_s8_i = vhadd_s16 (d_sin0_i, d_sin4_i); \
447  d_s9_r = vhadd_s16 (d_sin1_r, d_s5_r); \
448  d_s9_i = vhadd_s16 (d_sin1_i, d_s5_i); \
449  d_s10_r = vhsub_s16 (d_sin0_r, d_sin4_r); \
450  d_s10_i = vhsub_s16 (d_sin0_i, d_sin4_i); \
451  d_s11_r = vhsub_s16 (d_sin1_r, d_s5_r); \
452  d_s11_i = vhsub_s16 (d_sin1_i, d_s5_i); \
453  d_s12_r = vhadd_s16 (d_sin2_r, d_sin6_r); \
454  d_s12_i = vhadd_s16 (d_sin2_i, d_sin6_i); \
455  d_s13_r = vhadd_s16 (d_s3_r, d_s7_r); \
456  d_s13_i = vhadd_s16 (d_s3_i, d_s7_i); \
457  d_s14_r = vhsub_s16 (d_sin2_r, d_sin6_r); \
458  d_s14_i = vhsub_s16 (d_sin2_i, d_sin6_i); \
459  d_s15_r = vhsub_s16 (d_s3_r, d_s7_r); \
460  d_s15_i = vhsub_s16 (d_s3_i, d_s7_i); \
461  d_out4_r = vhsub_s16 (d_s8_r, d_s12_r); \
462  d_out4_i = vhsub_s16 (d_s8_i, d_s12_i); \
463  d_out5_r = vhsub_s16 (d_s9_r, d_s13_r); \
464  d_out5_i = vhsub_s16 (d_s9_i, d_s13_i); \
465  d_out0_r = vhadd_s16 (d_s8_r, d_s12_r); \
466  d_out0_i = vhadd_s16 (d_s8_i, d_s12_i); \
467  d_out1_r = vhadd_s16 (d_s9_r, d_s13_r); \
468  d_out1_i = vhadd_s16 (d_s9_i, d_s13_i);
469 
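/*
 * The *_SCALED variants replace vadd/vsub with vhadd/vhsub (halving add
 * and subtract), dividing by 2 at each butterfly level. Together with the
 * halving adds in the function bodies below, the radix-8 stage applies an
 * overall 1/8 scaling, which keeps the fixed-point results in range.
 */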
470 
471 static inline void ne10_radix8x4_forward_unscaled_neon (ne10_fft_cpx_int16_t * Fout,
472  ne10_fft_cpx_int16_t * Fin,
473  ne10_int32_t stride)
474 {
475  RADIX8x4_START
476 
477  for (f_count = 0; f_count < stride; f_count += 4)
478  {
479  RADIX8x4_LOAD
480  RADIX8x4_FS_S0
481 
482 
483  // radix 4 butterfly without twiddles
484  RADIX8x4_FWD_S357
485  RADIX8x4_LS_02
486 
487  d_out2_r = vadd_s16 (d_s10_r, d_s14_i);
488  d_out2_i = vsub_s16 (d_s10_i, d_s14_r);
489  d_out3_r = vadd_s16 (d_s11_r, d_s15_i);
490  d_out3_i = vsub_s16 (d_s11_i, d_s15_r);
491  d_out6_r = vsub_s16 (d_s10_r, d_s14_i);
492  d_out6_i = vadd_s16 (d_s10_i, d_s14_r);
493  d_out7_r = vsub_s16 (d_s11_r, d_s15_i);
494  d_out7_i = vadd_s16 (d_s11_i, d_s15_r);
495 
496  RADIX8x4_STORE
497  } // f_count
498 }
499 
500 static inline void ne10_radix8x4_backward_unscaled_neon (ne10_fft_cpx_int16_t * Fout,
501  ne10_fft_cpx_int16_t * Fin,
502  ne10_int32_t stride)
503 {
504  RADIX8x4_START
505 
506  for (f_count = 0; f_count < stride; f_count += 4)
507  {
508  RADIX8x4_LOAD
509  RADIX8x4_FS_S0
510 
511  // radix 4 butterfly without twiddles
512  RADIX8x4_INV_S357
513  RADIX8x4_LS_02
514 
515  d_out2_r = vsub_s16 (d_s10_r, d_s14_i);
516  d_out2_i = vadd_s16 (d_s10_i, d_s14_r);
517  d_out3_r = vsub_s16 (d_s11_r, d_s15_i);
518  d_out3_i = vadd_s16 (d_s11_i, d_s15_r);
519  d_out6_r = vadd_s16 (d_s10_r, d_s14_i);
520  d_out6_i = vsub_s16 (d_s10_i, d_s14_r);
521  d_out7_r = vadd_s16 (d_s11_r, d_s15_i);
522  d_out7_i = vsub_s16 (d_s11_i, d_s15_r);
523 
524  RADIX8x4_STORE
525  } // f_count
526 }
527 static inline void ne10_radix8x4_forward_scaled_neon (ne10_fft_cpx_int16_t * Fout,
528  ne10_fft_cpx_int16_t * Fin,
529  ne10_int32_t stride)
530 {
531  RADIX8x4_START
532 
533  for (f_count = 0; f_count < stride; f_count += 4)
534  {
535  RADIX8x4_LOAD
536  RADIX8x4_FS_S0_SCALED
537 
538  // radix 4 butterfly without twiddles
539  RADIX8x4_FWD_S357
540  RADIX8x4_LS_02_SCALED
541 
542  d_out2_r = vhadd_s16 (d_s10_r, d_s14_i);
543  d_out2_i = vhsub_s16 (d_s10_i, d_s14_r);
544  d_out3_r = vhadd_s16 (d_s11_r, d_s15_i);
545  d_out3_i = vhsub_s16 (d_s11_i, d_s15_r);
546  d_out6_r = vhsub_s16 (d_s10_r, d_s14_i);
547  d_out6_i = vhadd_s16 (d_s10_i, d_s14_r);
548  d_out7_r = vhsub_s16 (d_s11_r, d_s15_i);
549  d_out7_i = vhadd_s16 (d_s11_i, d_s15_r);
550 
551  RADIX8x4_STORE
552  } // f_count
553 }
554 
555 static inline void ne10_radix8x4_backward_scaled_neon (ne10_fft_cpx_int16_t * Fout,
556  ne10_fft_cpx_int16_t * Fin,
557  ne10_int32_t stride)
558 {
559  RADIX8x4_START
560 
561  for (f_count = 0; f_count < stride; f_count += 4)
562  {
563  RADIX8x4_LOAD
564  RADIX8x4_FS_S0_SCALED
565 
566  // radix 4 butterfly without twiddles
567  RADIX8x4_INV_S357
568  RADIX8x4_LS_02_SCALED
569 
570  d_out2_r = vhsub_s16 (d_s10_r, d_s14_i);
571  d_out2_i = vhadd_s16 (d_s10_i, d_s14_r);
572  d_out3_r = vhsub_s16 (d_s11_r, d_s15_i);
573  d_out3_i = vhadd_s16 (d_s11_i, d_s15_r);
574  d_out6_r = vhadd_s16 (d_s10_r, d_s14_i);
575  d_out6_i = vhsub_s16 (d_s10_i, d_s14_r);
576  d_out7_r = vhadd_s16 (d_s11_r, d_s15_i);
577  d_out7_i = vhsub_s16 (d_s11_i, d_s15_r);
578 
579  RADIX8x4_STORE
580  } // f_count
581 }
582 
583 #define RADIX4x4_WITHOUT_TW_START \
584  ne10_int32_t f_count; \
585  ne10_int32_t src_step = stride << 1; \
586  int16_t *p_src, *p_dst; \
587  int16x4x2_t d2_in0, d2_in1, d2_in2, d2_in3; \
588  int16x4_t d_s0_r, d_s0_i, d_s1_r, d_s1_i, d_s2_r, d_s2_i, d_s3_r, d_s3_i; \
589  int16x4_t d_out0_r, d_out0_i, d_out1_r, d_out1_i, d_out2_r, d_out2_i, d_out3_r, d_out3_i; \
590  int16x4x2_t d2_out0, d2_out1, d2_out2, d2_out3; \
591  int16x8x2_t q2_tmp0, q2_tmp1; \
592  int32x4x2_t q2_tmp2, q2_tmp3; \
593  p_src = (int16_t *) Fin; \
594  p_dst = (int16_t *) Fout;
595 
596 #define RADIX4x4_WITHOUT_TW_LOAD \
597  d2_in0 = vld2_s16 (p_src); \
598  p_src += src_step; \
599  d2_in1 = vld2_s16 (p_src); \
600  p_src += src_step; \
601  d2_in2 = vld2_s16 (p_src); \
602  p_src += src_step; \
603  d2_in3 = vld2_s16 (p_src); \
604  p_src += src_step;
605 
606 #define RADIX4x4_WITHOUT_TW_STORE \
607  q2_tmp0 = vtrnq_s16 (vcombine_s16(d_out0_r, d_out0_i), vcombine_s16(d_out1_r, d_out1_i)); \
608  q2_tmp1 = vtrnq_s16 (vcombine_s16(d_out2_r, d_out2_i), vcombine_s16(d_out3_r, d_out3_i)); \
609  q2_tmp2 = vtrnq_s32 (vreinterpretq_s32_s16(q2_tmp0.val[0]), vreinterpretq_s32_s16(q2_tmp1.val[0])); \
610  q2_tmp3 = vtrnq_s32 (vreinterpretq_s32_s16(q2_tmp0.val[1]), vreinterpretq_s32_s16(q2_tmp1.val[1])); \
611  d2_out0.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp2.val[0])); \
612  d2_out0.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp2.val[0])); \
613  d2_out1.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp3.val[0])); \
614  d2_out1.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp3.val[0])); \
615  d2_out2.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp2.val[1])); \
616  d2_out2.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp2.val[1])); \
617  d2_out3.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp3.val[1])); \
618  d2_out3.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp3.val[1])); \
619  vst2_s16 (p_dst, d2_out0); \
620  p_dst += 8; \
621  vst2_s16 (p_dst, d2_out1); \
622  p_dst += 8; \
623  vst2_s16 (p_dst, d2_out2); \
624  p_dst += 8; \
625  vst2_s16 (p_dst, d2_out3); \
626  p_dst += 8; \
627  p_src = p_src - src_step * 4 + 8;
628 
629 #define RADIX4x4_WITHOUT_TW_S0 \
630  d_s0_r = vadd_s16 (d2_in0.val[0], d2_in2.val[0]); \
631  d_s0_i = vadd_s16 (d2_in0.val[1], d2_in2.val[1]); \
632  d_s1_r = vsub_s16 (d2_in0.val[0], d2_in2.val[0]); \
633  d_s1_i = vsub_s16 (d2_in0.val[1], d2_in2.val[1]); \
634  d_s2_r = vadd_s16 (d2_in1.val[0], d2_in3.val[0]); \
635  d_s2_i = vadd_s16 (d2_in1.val[1], d2_in3.val[1]); \
636  d_s3_r = vsub_s16 (d2_in1.val[0], d2_in3.val[0]); \
637  d_s3_i = vsub_s16 (d2_in1.val[1], d2_in3.val[1]); \
638  d_out2_r = vsub_s16 (d_s0_r, d_s2_r); \
639  d_out2_i = vsub_s16 (d_s0_i, d_s2_i); \
640  d_out0_r = vadd_s16 (d_s0_r, d_s2_r); \
641  d_out0_i = vadd_s16 (d_s0_i, d_s2_i);
642 
643 #define RADIX4x4_WITHOUT_TW_S0_SCALED \
644  d_s0_r = vhadd_s16 (d2_in0.val[0], d2_in2.val[0]); \
645  d_s0_i = vhadd_s16 (d2_in0.val[1], d2_in2.val[1]); \
646  d_s1_r = vhsub_s16 (d2_in0.val[0], d2_in2.val[0]); \
647  d_s1_i = vhsub_s16 (d2_in0.val[1], d2_in2.val[1]); \
648  d_s2_r = vhadd_s16 (d2_in1.val[0], d2_in3.val[0]); \
649  d_s2_i = vhadd_s16 (d2_in1.val[1], d2_in3.val[1]); \
650  d_s3_r = vhsub_s16 (d2_in1.val[0], d2_in3.val[0]); \
651  d_s3_i = vhsub_s16 (d2_in1.val[1], d2_in3.val[1]); \
652  d_out2_r = vhsub_s16 (d_s0_r, d_s2_r); \
653  d_out2_i = vhsub_s16 (d_s0_i, d_s2_i); \
654  d_out0_r = vhadd_s16 (d_s0_r, d_s2_r); \
655  d_out0_i = vhadd_s16 (d_s0_i, d_s2_i);
656 
657 
658 static inline void ne10_radix4x4_without_twiddles_forward_unscaled_neon (ne10_fft_cpx_int16_t * Fout,
659  ne10_fft_cpx_int16_t * Fin,
660  ne10_int32_t stride)
661 {
662  RADIX4x4_WITHOUT_TW_START
663 
664  for (f_count = 0; f_count < stride; f_count += 4)
665  {
666  // load
667  RADIX4x4_WITHOUT_TW_LOAD
668 
669  // radix 4 butterfly without twiddles
670  RADIX4x4_WITHOUT_TW_S0
671 
672  d_out1_r = vadd_s16 (d_s1_r, d_s3_i);
673  d_out1_i = vsub_s16 (d_s1_i, d_s3_r);
674  d_out3_r = vsub_s16 (d_s1_r, d_s3_i);
675  d_out3_i = vadd_s16 (d_s1_i, d_s3_r);
676 
677  RADIX4x4_WITHOUT_TW_STORE
678  }
679 }
680 
681 static inline void ne10_radix4x4_without_twiddles_backward_unscaled_neon (ne10_fft_cpx_int16_t * Fout,
682  ne10_fft_cpx_int16_t * Fin,
683  ne10_int32_t stride)
684 {
685  RADIX4x4_WITHOUT_TW_START
686 
687  for (f_count = 0; f_count < stride; f_count += 4)
688  {
689  // load
690  RADIX4x4_WITHOUT_TW_LOAD
691 
692  // radix 4 butterfly without twiddles
693  RADIX4x4_WITHOUT_TW_S0
694 
695  d_out1_r = vsub_s16 (d_s1_r, d_s3_i);
696  d_out1_i = vadd_s16 (d_s1_i, d_s3_r);
697  d_out3_r = vadd_s16 (d_s1_r, d_s3_i);
698  d_out3_i = vsub_s16 (d_s1_i, d_s3_r);
699 
700  RADIX4x4_WITHOUT_TW_STORE
701  }
702 }
703 
704 static inline void ne10_radix4x4_without_twiddles_forward_scaled_neon (ne10_fft_cpx_int16_t * Fout,
705  ne10_fft_cpx_int16_t * Fin,
706  ne10_int32_t stride)
707 {
708  RADIX4x4_WITHOUT_TW_START
709 
710  for (f_count = 0; f_count < stride; f_count += 4)
711  {
712  // load
713  RADIX4x4_WITHOUT_TW_LOAD
714 
715  // radix 4 butterfly without twiddles
716  RADIX4x4_WITHOUT_TW_S0_SCALED
717 
718  d_out1_r = vhadd_s16 (d_s1_r, d_s3_i);
719  d_out1_i = vhsub_s16 (d_s1_i, d_s3_r);
720  d_out3_r = vhsub_s16 (d_s1_r, d_s3_i);
721  d_out3_i = vhadd_s16 (d_s1_i, d_s3_r);
722 
723  RADIX4x4_WITHOUT_TW_STORE
724  }
725 }
726 
727 static inline void ne10_radix4x4_without_twiddles_backward_scaled_neon (ne10_fft_cpx_int16_t * Fout,
728  ne10_fft_cpx_int16_t * Fin,
729  ne10_int32_t stride)
730 {
731  RADIX4x4_WITHOUT_TW_START
732 
733  for (f_count = 0; f_count < stride; f_count += 4)
734  {
735  // load
736  RADIX4x4_WITHOUT_TW_LOAD
737 
738  // radix 4 butterfly without twiddles
739  RADIX4x4_WITHOUT_TW_S0_SCALED
740 
741  d_out1_r = vhsub_s16 (d_s1_r, d_s3_i);
742  d_out1_i = vhadd_s16 (d_s1_i, d_s3_r);
743  d_out3_r = vhadd_s16 (d_s1_r, d_s3_i);
744  d_out3_i = vhsub_s16 (d_s1_i, d_s3_r);
745 
746  RADIX4x4_WITHOUT_TW_STORE
747  }
748 }
749 
750 #define RADIX4x4_WITH_TW_START \
751  ne10_int32_t m_count; \
752  ne10_int32_t src_step = src_stride << 1; \
753  ne10_int32_t dst_step = dst_stride << 1; \
754  ne10_int32_t tw_step = mstride << 1; \
755  int16_t *p_src, *p_dst, *p_tw; \
756  int16x4x2_t d2_in0, d2_in1, d2_in2, d2_in3; \
757  int16x4x2_t d2_tw0, d2_tw1, d2_tw2; \
758  int16x4_t d_s1_r, d_s1_i, d_s2_r, d_s2_i, d_s3_r, d_s3_i; \
759  int16x4_t d_tmp0, d_tmp1, d_tmp2, d_tmp3, d_tmp4, d_tmp5; \
760  int16x4_t d_s4_r, d_s4_i, d_s5_r, d_s5_i, d_s6_r, d_s6_i, d_s7_r, d_s7_i; \
761  int16x4x2_t d2_out0, d2_out1, d2_out2, d2_out3; \
762  p_src = (int16_t *) Fin; \
763  p_dst = (int16_t *) Fout; \
764  p_tw = (int16_t *) tw;
765 
766 #define RADIX4x4_WITH_TW_LOAD \
767  d2_in0 = vld2_s16 (p_src); \
768  p_src += src_step; \
769  d2_in1 = vld2_s16 (p_src); \
770  p_src += src_step; \
771  d2_in2 = vld2_s16 (p_src); \
772  p_src += src_step; \
773  d2_in3 = vld2_s16 (p_src); \
774  p_src += src_step; \
775  d2_tw0 = vld2_s16 (p_tw); \
776  p_tw += tw_step; \
777  d2_tw1 = vld2_s16 (p_tw); \
778  p_tw += tw_step; \
779  d2_tw2 = vld2_s16 (p_tw); \
780  d_s1_r = vqdmulh_s16 (d2_in1.val[0], d2_tw0.val[0]); \
781  d_s1_i = vqdmulh_s16 (d2_in1.val[1], d2_tw0.val[0]); \
782  d_s2_r = vqdmulh_s16 (d2_in2.val[0], d2_tw1.val[0]); \
783  d_s2_i = vqdmulh_s16 (d2_in2.val[1], d2_tw1.val[0]); \
784  d_s3_r = vqdmulh_s16 (d2_in3.val[0], d2_tw2.val[0]); \
785  d_s3_i = vqdmulh_s16 (d2_in3.val[1], d2_tw2.val[0]); \
786  d_tmp0 = vqdmulh_s16 (d2_in1.val[1], d2_tw0.val[1]); \
787  d_tmp1 = vqdmulh_s16 (d2_in1.val[0], d2_tw0.val[1]); \
788  d_tmp2 = vqdmulh_s16 (d2_in2.val[1], d2_tw1.val[1]); \
789  d_tmp3 = vqdmulh_s16 (d2_in2.val[0], d2_tw1.val[1]); \
790  d_tmp4 = vqdmulh_s16 (d2_in3.val[1], d2_tw2.val[1]); \
791  d_tmp5 = vqdmulh_s16 (d2_in3.val[0], d2_tw2.val[1]);
792 
793 #define RADIX4x4_WITH_TW_STORE \
794  vst2_s16 (p_dst, d2_out0); \
795  p_dst += dst_step; \
796  vst2_s16 (p_dst, d2_out1); \
797  p_dst += dst_step; \
798  vst2_s16 (p_dst, d2_out2); \
799  p_dst += dst_step; \
800  vst2_s16 (p_dst, d2_out3); \
801  p_dst += dst_step; \
802  p_src = p_src - src_step * 4 + 8; \
803  p_dst = p_dst - dst_step * 4 + 8; \
804  p_tw = p_tw - tw_step * 2 + 8;
805 
806 #define RADIX4x4_WITH_TW_S1_FWD \
807  d_s1_r = vsub_s16 (d_s1_r, d_tmp0); \
808  d_s1_i = vadd_s16 (d_s1_i, d_tmp1); \
809  d_s2_r = vsub_s16 (d_s2_r, d_tmp2); \
810  d_s2_i = vadd_s16 (d_s2_i, d_tmp3); \
811  d_s3_r = vsub_s16 (d_s3_r, d_tmp4); \
812  d_s3_i = vadd_s16 (d_s3_i, d_tmp5);
813 
814 #define RADIX4x4_WITH_TW_S1_INV \
815  d_s1_r = vadd_s16 (d_s1_r, d_tmp0); \
816  d_s1_i = vsub_s16 (d_s1_i, d_tmp1); \
817  d_s2_r = vadd_s16 (d_s2_r, d_tmp2); \
818  d_s2_i = vsub_s16 (d_s2_i, d_tmp3); \
819  d_s3_r = vadd_s16 (d_s3_r, d_tmp4); \
820  d_s3_i = vsub_s16 (d_s3_i, d_tmp5);
821 
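/*
 * The twiddle multiply is split across the LOAD and S1 steps: the LOAD
 * macro forms the four Q15 partial products re*tw_r, im*tw_r, im*tw_i and
 * re*tw_i with vqdmulh_s16, and S1_FWD combines them into the complex
 * product (re*tw_r - im*tw_i) + j(im*tw_r + re*tw_i). S1_INV applies the
 * conjugate twiddle by flipping the signs of the cross terms.
 */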
822 
823 #define RADIX4x4_WITH_TW_LS_02 \
824  d_s4_r = vadd_s16 (d2_in0.val[0], d_s2_r); \
825  d_s4_i = vadd_s16 (d2_in0.val[1], d_s2_i); \
826  d_s5_r = vsub_s16 (d2_in0.val[0], d_s2_r); \
827  d_s5_i = vsub_s16 (d2_in0.val[1], d_s2_i); \
828  d_s6_r = vadd_s16 (d_s1_r, d_s3_r); \
829  d_s6_i = vadd_s16 (d_s1_i, d_s3_i); \
830  d_s7_r = vsub_s16 (d_s1_r, d_s3_r); \
831  d_s7_i = vsub_s16 (d_s1_i, d_s3_i); \
832  d2_out2.val[0] = vsub_s16 (d_s4_r, d_s6_r); \
833  d2_out2.val[1] = vsub_s16 (d_s4_i, d_s6_i); \
834  d2_out0.val[0] = vadd_s16 (d_s4_r, d_s6_r); \
835  d2_out0.val[1] = vadd_s16 (d_s4_i, d_s6_i);
836 
837 #define RADIX4x4_WITH_TW_LS_02_SCALED \
838  d_s4_r = vhadd_s16 (d2_in0.val[0], d_s2_r); \
839  d_s4_i = vhadd_s16 (d2_in0.val[1], d_s2_i); \
840  d_s5_r = vhsub_s16 (d2_in0.val[0], d_s2_r); \
841  d_s5_i = vhsub_s16 (d2_in0.val[1], d_s2_i); \
842  d_s6_r = vhadd_s16 (d_s1_r, d_s3_r); \
843  d_s6_i = vhadd_s16 (d_s1_i, d_s3_i); \
844  d_s7_r = vhsub_s16 (d_s1_r, d_s3_r); \
845  d_s7_i = vhsub_s16 (d_s1_i, d_s3_i); \
846  d2_out2.val[0] = vhsub_s16 (d_s4_r, d_s6_r); \
847  d2_out2.val[1] = vhsub_s16 (d_s4_i, d_s6_i); \
848  d2_out0.val[0] = vhadd_s16 (d_s4_r, d_s6_r); \
849  d2_out0.val[1] = vhadd_s16 (d_s4_i, d_s6_i);
850 
851 
852 static inline void ne10_radix4x4_with_twiddles_forward_unscaled_neon (ne10_fft_cpx_int16_t * Fout,
853  ne10_fft_cpx_int16_t * Fin,
854  ne10_fft_cpx_int16_t * tw,
855  ne10_int32_t src_stride,
856  ne10_int32_t dst_stride,
857  ne10_int32_t mstride)
858 {
859  RADIX4x4_WITH_TW_START
860 
861  for (m_count = 0; m_count < mstride; m_count += 4)
862  {
863  // load
864  RADIX4x4_WITH_TW_LOAD
865  RADIX4x4_WITH_TW_S1_FWD
866 
867  RADIX4x4_WITH_TW_LS_02
868 
869  d2_out1.val[0] = vadd_s16 (d_s5_r, d_s7_i);
870  d2_out1.val[1] = vsub_s16 (d_s5_i, d_s7_r);
871  d2_out3.val[0] = vsub_s16 (d_s5_r, d_s7_i);
872  d2_out3.val[1] = vadd_s16 (d_s5_i, d_s7_r);
873 
874  // store
875  RADIX4x4_WITH_TW_STORE
876  }
877 }
878 
879 
880 static inline void ne10_radix4x4_with_twiddles_backward_unscaled_neon (ne10_fft_cpx_int16_t * Fout,
881  ne10_fft_cpx_int16_t * Fin,
882  ne10_fft_cpx_int16_t * tw,
883  ne10_int32_t src_stride,
884  ne10_int32_t dst_stride,
885  ne10_int32_t mstride)
886 {
887  RADIX4x4_WITH_TW_START
888 
889  for (m_count = 0; m_count < mstride; m_count += 4)
890  {
891  // load
892  RADIX4x4_WITH_TW_LOAD
893  RADIX4x4_WITH_TW_S1_INV
894 
895  RADIX4x4_WITH_TW_LS_02
896 
897  d2_out1.val[0] = vsub_s16 (d_s5_r, d_s7_i);
898  d2_out1.val[1] = vadd_s16 (d_s5_i, d_s7_r);
899  d2_out3.val[0] = vadd_s16 (d_s5_r, d_s7_i);
900  d2_out3.val[1] = vsub_s16 (d_s5_i, d_s7_r);
901 
902  // store
903  RADIX4x4_WITH_TW_STORE
904  }
905 }
906 
907 
908 
909 static inline void ne10_radix4x4_with_twiddles_forward_scaled_neon (ne10_fft_cpx_int16_t * Fout,
910  ne10_fft_cpx_int16_t * Fin,
911  ne10_fft_cpx_int16_t * tw,
912  ne10_int32_t src_stride,
913  ne10_int32_t dst_stride,
914  ne10_int32_t mstride)
915 {
916  RADIX4x4_WITH_TW_START
917 
918  for (m_count = 0; m_count < mstride; m_count += 4)
919  {
920  // load
921  RADIX4x4_WITH_TW_LOAD
922  RADIX4x4_WITH_TW_S1_FWD
923 
924  RADIX4x4_WITH_TW_LS_02_SCALED
925 
926  d2_out1.val[0] = vhadd_s16 (d_s5_r, d_s7_i);
927  d2_out1.val[1] = vhsub_s16 (d_s5_i, d_s7_r);
928  d2_out3.val[0] = vhsub_s16 (d_s5_r, d_s7_i);
929  d2_out3.val[1] = vhadd_s16 (d_s5_i, d_s7_r);
930 
931  // store
932  RADIX4x4_WITH_TW_STORE
933  }
934 }
935 
936 static inline void ne10_radix4x4_with_twiddles_backward_scaled_neon (ne10_fft_cpx_int16_t * Fout,
937  ne10_fft_cpx_int16_t * Fin,
938  ne10_fft_cpx_int16_t * tw,
939  ne10_int32_t src_stride,
940  ne10_int32_t dst_stride,
941  ne10_int32_t mstride)
942 {
943  RADIX4x4_WITH_TW_START
944 
945  for (m_count = 0; m_count < mstride; m_count += 4)
946  {
947  // load
948  RADIX4x4_WITH_TW_LOAD
949  RADIX4x4_WITH_TW_S1_INV
950 
951  RADIX4x4_WITH_TW_LS_02_SCALED
952 
953  d2_out1.val[0] = vhsub_s16 (d_s5_r, d_s7_i);
954  d2_out1.val[1] = vhadd_s16 (d_s5_i, d_s7_r);
955  d2_out3.val[0] = vhadd_s16 (d_s5_r, d_s7_i);
956  d2_out3.val[1] = vhsub_s16 (d_s5_i, d_s7_r);
957 
958  // store
959  RADIX4x4_WITH_TW_STORE
960  }
961 }
962 
963 
964 #define ne10_mixed_radix_fft_forward_int16_neon(scaled) \
965 void ne10_mixed_radix_fft_forward_int16_##scaled##_neon (ne10_fft_cpx_int16_t * Fout, \
966  ne10_fft_cpx_int16_t * Fin, \
967  ne10_int32_t * factors, \
968  ne10_fft_cpx_int16_t * twiddles, \
969  ne10_fft_cpx_int16_t * buffer) \
970 { \
971  ne10_int32_t fstride, mstride, N; \
972  ne10_int32_t fstride1; \
973  ne10_int32_t f_count; \
974  ne10_int32_t stage_count; \
975  \
976  ne10_fft_cpx_int16_t *Fin1, *Fout1; \
977  ne10_fft_cpx_int16_t *Fout_ls = Fout; \
978  ne10_fft_cpx_int16_t *Ftmp; \
979  ne10_fft_cpx_int16_t *tw, *tw1; \
980  \
981  /* init fstride, mstride, N */ \
982  stage_count = factors[0]; \
983  fstride = factors[1]; \
984  mstride = factors[ (stage_count << 1) - 1 ]; \
985  N = factors[ stage_count << 1 ]; \
986  \
987  /* the first stage */ \
988  Fin1 = Fin; \
989  Fout1 = Fout; \
990  if (N == 2) \
991  { \
992  N = fstride >> 1;\
993  tw = twiddles; \
994  fstride1 = fstride >> 2; \
995  ne10_radix8x4_forward_##scaled##_neon (Fout, Fin, fstride1);\
996  \
997  tw += 6; \
998  mstride <<= 2; \
999  fstride >>= 4; \
1000  stage_count -= 2; \
1001  \
1002  Ftmp = Fin; \
1003  Fin = Fout; \
1004  Fout = Ftmp; \
1005  } \
1006  else if (N == 4) \
1007  { \
1008  ne10_radix4x4_without_twiddles_forward_##scaled##_neon (Fout, Fin, fstride); \
1009  N = fstride; \
1010  Ftmp = Fin; \
1011  Fin = Fout; \
1012  Fout = Ftmp; \
1013  /* update address for other stages*/ \
1014  stage_count--; \
1015  tw = twiddles; \
1016  fstride >>= 2; \
1017  } \
1018  /* others but the last one*/ \
1019  for (; stage_count > 1 ; stage_count--) \
1020  { \
1021  Fin1 = Fin; \
1022  for (f_count = 0; f_count < fstride; f_count ++) \
1023  { \
1024  Fout1 = & Fout[ f_count * mstride << 2 ]; \
1025  tw1 = tw; \
1026  ne10_radix4x4_with_twiddles_forward_##scaled##_neon (Fout1, Fin1, tw1, N, mstride, mstride); \
1027  Fin1 += mstride; \
1028  } \
1029  tw += mstride * 3; \
1030  mstride <<= 2; \
1031  Ftmp = Fin; \
1032  Fin = Fout; \
1033  Fout = Ftmp; \
1034  fstride >>= 2; \
1035  }\
1036  /* the last one*/ \
1037  if (stage_count) \
1038  { \
1039  Fin1 = Fin; \
1040  Fout1 = Fout_ls; \
1041  for (f_count = 0; f_count < fstride; f_count ++) \
1042  { \
1043  tw1 = tw; \
1044  ne10_radix4x4_with_twiddles_forward_##scaled##_neon (Fout1, Fin1, tw1, N, N, mstride); \
1045  Fin1 += mstride; \
1046  Fout1 += mstride; \
1047  } \
1048  } \
1049 }
1050 
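/*
 * The mixed-radix driver reads the stage layout from factors[]: factors[0]
 * is the stage count, factors[1] the initial fstride, and the trailing
 * entries give the first-stage mstride and radix N. When the first radix
 * is 2, the code merges it with the following radix-4 stage into a single
 * radix-8x4 pass (stage_count -= 2); when it is 4, a plain radix-4 pass is
 * used. Every remaining stage is a radix-4 butterfly with twiddles,
 * quadrupling mstride and consuming 3 * mstride twiddles per stage, with
 * the ping-pong buffers swapped so the last stage writes back into the
 * caller's Fout.
 */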
1051 #define ne10_mixed_radix_fft_backward_int16_neon(scaled) \
1052 void ne10_mixed_radix_fft_backward_int16_##scaled##_neon (ne10_fft_cpx_int16_t * Fout, \
1053  ne10_fft_cpx_int16_t * Fin, \
1054  ne10_int32_t * factors, \
1055  ne10_fft_cpx_int16_t * twiddles, \
1056  ne10_fft_cpx_int16_t * buffer) \
1057 { \
1058  ne10_int32_t fstride, mstride, N; \
1059  ne10_int32_t fstride1; \
1060  ne10_int32_t f_count; \
1061  ne10_int32_t stage_count; \
1062  \
1063  ne10_fft_cpx_int16_t *Fin1, *Fout1; \
1064  ne10_fft_cpx_int16_t *Fout_ls = Fout; \
1065  ne10_fft_cpx_int16_t *Ftmp; \
1066  ne10_fft_cpx_int16_t *tw, *tw1; \
1067  \
1068  /* init fstride, mstride, N */ \
1069  stage_count = factors[0]; \
1070  fstride = factors[1]; \
1071  mstride = factors[ (stage_count << 1) - 1 ]; \
1072  N = factors[ stage_count << 1 ]; \
1073  \
1074  /* the first stage */ \
1075  Fin1 = Fin; \
1076  Fout1 = Fout; \
1077  if (N == 2) \
1078  { \
1079  N = fstride >> 1;\
1080  tw = twiddles; \
1081  fstride1 = fstride >> 2; \
1082  ne10_radix8x4_backward_##scaled##_neon (Fout, Fin, fstride1);\
1083  \
1084  tw += 6; \
1085  mstride <<= 2; \
1086  fstride >>= 4; \
1087  stage_count -= 2; \
1088  \
1089  Ftmp = Fin; \
1090  Fin = Fout; \
1091  Fout = Ftmp; \
1092  } \
1093  else if (N == 4) \
1094  { \
1095  ne10_radix4x4_without_twiddles_backward_##scaled##_neon (Fout, Fin, fstride); \
1096  N = fstride; \
1097  Ftmp = Fin; \
1098  Fin = Fout; \
1099  Fout = Ftmp; \
1100  /* update address for other stages*/ \
1101  stage_count--; \
1102  tw = twiddles; \
1103  fstride >>= 2; \
1104  } \
1105  /* others but the last one*/ \
1106  for (; stage_count > 1 ; stage_count--) \
1107  { \
1108  Fin1 = Fin; \
1109  for (f_count = 0; f_count < fstride; f_count ++) \
1110  { \
1111  Fout1 = & Fout[ f_count * mstride << 2 ]; \
1112  tw1 = tw; \
1113  ne10_radix4x4_with_twiddles_backward_##scaled##_neon (Fout1, Fin1, tw1, N, mstride, mstride); \
1114  Fin1 += mstride; \
1115  } \
1116  tw += mstride * 3; \
1117  mstride <<= 2; \
1118  Ftmp = Fin; \
1119  Fin = Fout; \
1120  Fout = Ftmp; \
1121  fstride >>= 2; \
1122  }\
1123  /* the last one*/ \
1124  if (stage_count) \
1125  { \
1126  Fin1 = Fin; \
1127  Fout1 = Fout_ls; \
1128  for (f_count = 0; f_count < fstride; f_count ++) \
1129  { \
1130  tw1 = tw; \
1131  ne10_radix4x4_with_twiddles_backward_##scaled##_neon (Fout1, Fin1, tw1, N, N, mstride); \
1132  Fin1 += mstride; \
1133  Fout1 += mstride; \
1134  } \
1135  } \
1136 }
1137 
1138 
1139 ne10_mixed_radix_fft_forward_int16_neon (unscaled)
1140 ne10_mixed_radix_fft_forward_int16_neon (scaled)
1141 ne10_mixed_radix_fft_backward_int16_neon (unscaled)
1142 ne10_mixed_radix_fft_backward_int16_neon (scaled)
1143 
1144 
1145 static void ne10_fft_split_r2c_1d_int16_neon (ne10_fft_cpx_int16_t *dst,
1146  const ne10_fft_cpx_int16_t *src,
1147  ne10_fft_cpx_int16_t *twiddles,
1148  ne10_int32_t ncfft,
1149  ne10_int32_t scaled_flag)
1150 {
1151  ne10_int32_t k;
1152  ne10_int32_t count = ncfft / 2;
1153  ne10_fft_cpx_int16_t fpnk, fpk, f1k, f2k, tw, tdc;
1154  int16x8x2_t q2_fpk, q2_fpnk, q2_tw, q2_dst, q2_dst2;
1155  int16x8_t q_fpnk_r, q_fpnk_i;
1156  int16x8_t q_f1k_r, q_f1k_i, q_f2k_r, q_f2k_i;
1157  int16x8_t q_tw_r, q_tw_i;
1158  int16x8_t q_tmp0, q_tmp1, q_tmp2, q_tmp3;
1159  int16x8_t q_dst2_r, q_dst2_i;
1160  int16_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;
1161 
1162  tdc.r = src[0].r;
1163  tdc.i = src[0].i;
1164 
1165  if (scaled_flag)
1166  NE10_F2I16_FIXDIV (tdc, 2);
1167 
1168  dst[0].r = tdc.r + tdc.i;
1169  dst[ncfft].r = tdc.r - tdc.i;
1170  dst[ncfft].i = dst[0].i = 0;
1171  if (count >= 8)
1172  {
1173 
1174  if (scaled_flag)
1175  {
1176  for (k = 1; k <= count ; k += 8)
1177  {
1178  p_src = (int16_t*) (& (src[k]));
1179  p_src2 = (int16_t*) (& (src[ncfft - k - 7]));
1180  p_twiddles = (int16_t*) (& (twiddles[k - 1]));
1181  p_dst = (int16_t*) (& (dst[k]));
1182  p_dst2 = (int16_t*) (& (dst[ncfft - k - 7]));
1183 
1184  q2_fpk = vld2q_s16 (p_src);
1185  q2_fpnk = vld2q_s16 (p_src2);
1186 
1187  q2_tw = vld2q_s16 (p_twiddles);
1188  q2_fpnk.val[0] = vrev32q_s16 (q2_fpnk.val[0]);
1189  q2_fpnk.val[1] = vrev32q_s16 (q2_fpnk.val[1]);
1190  q2_fpnk.val[0] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fpnk.val[0])));
1191  q2_fpnk.val[1] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fpnk.val[1])));
1192  q_fpnk_r = vcombine_s16 (vget_high_s16 (q2_fpnk.val[0]), vget_low_s16 (q2_fpnk.val[0]));
1193  q_fpnk_i = vcombine_s16 (vget_high_s16 (q2_fpnk.val[1]), vget_low_s16 (q2_fpnk.val[1]));
1194  q_fpnk_i = vnegq_s16 (q_fpnk_i);
1195 
1196  q_f1k_r = vhaddq_s16 (q2_fpk.val[0], q_fpnk_r);
1197  q_f1k_i = vhaddq_s16 (q2_fpk.val[1], q_fpnk_i);
1198 
1199  q_f2k_r = vhsubq_s16 (q2_fpk.val[0], q_fpnk_r);
1200  q_f2k_i = vhsubq_s16 (q2_fpk.val[1], q_fpnk_i);
1201 
1202  q_tmp0 = vqdmulhq_s16 (q_f2k_r, q2_tw.val[0]);
1203  q_tmp1 = vqdmulhq_s16 (q_f2k_i, q2_tw.val[1]);
1204  q_tmp2 = vqdmulhq_s16 (q_f2k_r, q2_tw.val[1]);
1205  q_tmp3 = vqdmulhq_s16 (q_f2k_i, q2_tw.val[0]);
1206  q_tw_r = vsubq_s16 (q_tmp0, q_tmp1);
1207  q_tw_i = vaddq_s16 (q_tmp2, q_tmp3);
1208 
1209  q_dst2_r = vhsubq_s16 (q_f1k_r, q_tw_r);
1210  q_dst2_i = vhsubq_s16 (q_tw_i, q_f1k_i);
1211  q2_dst.val[0] = vhaddq_s16 (q_f1k_r, q_tw_r);
1212  q2_dst.val[1] = vhaddq_s16 (q_f1k_i, q_tw_i);
1213  q_dst2_r = vrev32q_s16 (q_dst2_r);
1214  q_dst2_i = vrev32q_s16 (q_dst2_i);
1215  q_dst2_r = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_r))) ;
1216  q_dst2_i = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_i)));
1217  q2_dst2.val[0] = vcombine_s16 (vget_high_s16 (q_dst2_r), vget_low_s16 (q_dst2_r));
1218  q2_dst2.val[1] = vcombine_s16 (vget_high_s16 (q_dst2_i), vget_low_s16 (q_dst2_i));
1219  vst2q_s16 (p_dst, q2_dst);
1220  vst2q_s16 (p_dst2, q2_dst2);
1221 
1222  }
1223  }
1224  else
1225  {
1226  for (k = 1; k <= count ; k += 8)
1227  {
1228  p_src = (int16_t*) (& (src[k]));
1229  p_src2 = (int16_t*) (& (src[ncfft - k - 7]));
1230  p_twiddles = (int16_t*) (& (twiddles[k - 1]));
1231  p_dst = (int16_t*) (& (dst[k]));
1232  p_dst2 = (int16_t*) (& (dst[ncfft - k - 7]));
1233 
1234  q2_fpk = vld2q_s16 (p_src);
1235  q2_fpnk = vld2q_s16 (p_src2);
1236 
1237  q2_tw = vld2q_s16 (p_twiddles);
1238  q2_fpnk.val[0] = vrev32q_s16 (q2_fpnk.val[0]);
1239  q2_fpnk.val[1] = vrev32q_s16 (q2_fpnk.val[1]);
1240  q2_fpnk.val[0] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fpnk.val[0])));
1241  q2_fpnk.val[1] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fpnk.val[1])));
1242  q_fpnk_r = vcombine_s16 (vget_high_s16 (q2_fpnk.val[0]), vget_low_s16 (q2_fpnk.val[0]));
1243  q_fpnk_i = vcombine_s16 (vget_high_s16 (q2_fpnk.val[1]), vget_low_s16 (q2_fpnk.val[1]));
1244  q_fpnk_i = vnegq_s16 (q_fpnk_i);
1245 
1246  q_f1k_r = vaddq_s16 (q2_fpk.val[0], q_fpnk_r);
1247  q_f1k_i = vaddq_s16 (q2_fpk.val[1], q_fpnk_i);
1248 
1249  q_f2k_r = vsubq_s16 (q2_fpk.val[0], q_fpnk_r);
1250  q_f2k_i = vsubq_s16 (q2_fpk.val[1], q_fpnk_i);
1251 
1252  q_tmp0 = vqdmulhq_s16 (q_f2k_r, q2_tw.val[0]);
1253  q_tmp1 = vqdmulhq_s16 (q_f2k_i, q2_tw.val[1]);
1254  q_tmp2 = vqdmulhq_s16 (q_f2k_r, q2_tw.val[1]);
1255  q_tmp3 = vqdmulhq_s16 (q_f2k_i, q2_tw.val[0]);
1256  q_tw_r = vsubq_s16 (q_tmp0, q_tmp1);
1257  q_tw_i = vaddq_s16 (q_tmp2, q_tmp3);
1258 
1259  q_dst2_r = vhsubq_s16 (q_f1k_r, q_tw_r);
1260  q_dst2_i = vhsubq_s16 (q_tw_i, q_f1k_i);
1261  q2_dst.val[0] = vhaddq_s16 (q_f1k_r, q_tw_r);
1262  q2_dst.val[1] = vhaddq_s16 (q_f1k_i, q_tw_i);
1263  q_dst2_r = vrev32q_s16 (q_dst2_r);
1264  q_dst2_i = vrev32q_s16 (q_dst2_i);
1265  q_dst2_r = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_r))) ;
1266  q_dst2_i = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_i)));
1267  q2_dst2.val[0] = vcombine_s16 (vget_high_s16 (q_dst2_r), vget_low_s16 (q_dst2_r));
1268  q2_dst2.val[1] = vcombine_s16 (vget_high_s16 (q_dst2_i), vget_low_s16 (q_dst2_i));
1269  vst2q_s16 (p_dst, q2_dst);
1270  vst2q_s16 (p_dst2, q2_dst2);
1271 
1272  }
1273  }
1274  }
1275  else
1276  {
1277 
1278  for (k = 1; k <= ncfft / 2 ; ++k)
1279  {
1280  fpk = src[k];
1281  fpnk.r = src[ncfft - k].r;
1282  fpnk.i = - src[ncfft - k].i;
1283  if (scaled_flag)
1284  {
1285  NE10_F2I16_FIXDIV (fpk, 2);
1286  NE10_F2I16_FIXDIV (fpnk, 2);
1287  }
1288 
1289  f1k.r = fpk.r + fpnk.r;
1290  f1k.i = fpk.i + fpnk.i;
1291 
1292  f2k.r = fpk.r - fpnk.r;
1293  f2k.i = fpk.i - fpnk.i;
1294 
1295  tw.r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) f2k.r * (twiddles[k - 1]).r
1296  - (NE10_F2I16_SAMPPROD) f2k.i * (twiddles[k - 1]).i) >> NE10_F2I16_SHIFT);
1297  tw.i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) f2k.r * (twiddles[k - 1]).i
1298  + (NE10_F2I16_SAMPPROD) f2k.i * (twiddles[k - 1]).r) >> NE10_F2I16_SHIFT);
1299 
1300  dst[k].r = (f1k.r + tw.r) >> 1;
1301  dst[k].i = (f1k.i + tw.i) >> 1;
1302  dst[ncfft - k].r = (f1k.r - tw.r) >> 1;
1303  dst[ncfft - k].i = (tw.i - f1k.i) >> 1;
1304  }
1305  }
1306 }
1307 
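/*
 * The split step turns the length-ncfft complex FFT of the packed real
 * input into the spectrum of the length-2*ncfft real signal. With
 * fpk = Z[k] and fpnk = conj(Z[ncfft - k]), it computes
 *   dst[k]         = (f1k + tw) / 2,  where f1k = fpk + fpnk,
 *   dst[ncfft - k] = conj(f1k - tw) / 2,
 * with tw = f2k * twiddles[k - 1] and f2k = fpk - fpnk. The NEON path does
 * the same arithmetic eight bins at a time, using the vrev/vcombine
 * shuffles to read src[ncfft - k - 7 .. ncfft - k] in reversed order.
 */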
1308 static void ne10_fft_split_c2r_1d_int16_neon (ne10_fft_cpx_int16_t *dst,
1309  const ne10_fft_cpx_int16_t *src,
1310  ne10_fft_cpx_int16_t *twiddles,
1311  ne10_int32_t ncfft,
1312  ne10_int32_t scaled_flag)
1313 {
1314 
1315  ne10_int32_t k;
1316  ne10_int32_t count = ncfft / 2;
1317  ne10_fft_cpx_int16_t fk, fnkc, fek, fok, tmp;
1318  int16x8x2_t q2_fk, q2_fnkc, q2_tw, q2_dst, q2_dst2;
1319  int16x8_t q_fnkc_r, q_fnkc_i;
1320  int16x8_t q_fek_r, q_fek_i, q_fok_r, q_fok_i;
1321  int16x8_t q_tmp0, q_tmp1, q_tmp2, q_tmp3;
1322  int16x8_t q_dst2_r, q_dst2_i;
1323  int16_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;
1324 
1325 
1326  dst[0].r = src[0].r + src[ncfft].r;
1327  dst[0].i = src[0].r - src[ncfft].r;
1328 
1329  if (scaled_flag)
1330  NE10_F2I16_FIXDIV (dst[0], 2);
1331  if (count >= 8)
1332  {
1333  if (scaled_flag)
1334  {
1335  for (k = 1; k <= count ; k += 8)
1336  {
1337  p_src = (int16_t*) (& (src[k]));
1338  p_src2 = (int16_t*) (& (src[ncfft - k - 7]));
1339  p_twiddles = (int16_t*) (& (twiddles[k - 1]));
1340  p_dst = (int16_t*) (& (dst[k]));
1341  p_dst2 = (int16_t*) (& (dst[ncfft - k - 7]));
1342 
1343  q2_fk = vld2q_s16 (p_src);
1344  q2_fnkc = vld2q_s16 (p_src2);
1345  q2_tw = vld2q_s16 (p_twiddles);
1346  q2_fnkc.val[0] = vrev32q_s16 (q2_fnkc.val[0]);
1347  q2_fnkc.val[1] = vrev32q_s16 (q2_fnkc.val[1]);
1348  q2_fnkc.val[0] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fnkc.val[0])));
1349  q2_fnkc.val[1] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fnkc.val[1])));
1350  q_fnkc_r = vcombine_s16 (vget_high_s16 (q2_fnkc.val[0]), vget_low_s16 (q2_fnkc.val[0]));
1351  q_fnkc_i = vcombine_s16 (vget_high_s16 (q2_fnkc.val[1]), vget_low_s16 (q2_fnkc.val[1]));
1352  q_fnkc_i = vnegq_s16 (q_fnkc_i);
1353 
1354  q_fek_r = vhaddq_s16 (q2_fk.val[0], q_fnkc_r);
1355  q_fek_i = vhaddq_s16 (q2_fk.val[1], q_fnkc_i);
1356  q_tmp0 = vhsubq_s16 (q2_fk.val[0], q_fnkc_r);
1357  q_tmp1 = vhsubq_s16 (q2_fk.val[1], q_fnkc_i);
1358 
1359  q_fok_r = vqdmulhq_s16 (q_tmp0, q2_tw.val[0]);
1360  q_fok_i = vqdmulhq_s16 (q_tmp1, q2_tw.val[0]);
1361  q_tmp2 = vqdmulhq_s16 (q_tmp1, q2_tw.val[1]);
1362  q_tmp3 = vqdmulhq_s16 (q_tmp0, q2_tw.val[1]);
1363  q_fok_r = vaddq_s16 (q_fok_r, q_tmp2);
1364  q_fok_i = vsubq_s16 (q_fok_i, q_tmp3);
1365 
1366  q_dst2_r = vsubq_s16 (q_fek_r, q_fok_r);
1367  q_dst2_i = vsubq_s16 (q_fok_i, q_fek_i);
1368  q2_dst.val[0] = vaddq_s16 (q_fek_r, q_fok_r);
1369  q2_dst.val[1] = vaddq_s16 (q_fek_i, q_fok_i);
1370  q_dst2_r = vrev32q_s16 (q_dst2_r);
1371  q_dst2_i = vrev32q_s16 (q_dst2_i);
1372  q_dst2_r = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_r))) ;
1373  q_dst2_i = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_i)));
1374  q2_dst2.val[0] = vcombine_s16 (vget_high_s16 (q_dst2_r), vget_low_s16 (q_dst2_r));
1375  q2_dst2.val[1] = vcombine_s16 (vget_high_s16 (q_dst2_i), vget_low_s16 (q_dst2_i));
1376  vst2q_s16 (p_dst, q2_dst);
1377  vst2q_s16 (p_dst2, q2_dst2);
1378 
1379  }
1380 
1381  }
1382  else
1383  {
1384  for (k = 1; k <= count ; k += 8)
1385  {
1386  p_src = (int16_t*) (& (src[k]));
1387  p_src2 = (int16_t*) (& (src[ncfft - k - 7]));
1388  p_twiddles = (int16_t*) (& (twiddles[k - 1]));
1389  p_dst = (int16_t*) (& (dst[k]));
1390  p_dst2 = (int16_t*) (& (dst[ncfft - k - 7]));
1391 
1392  q2_fk = vld2q_s16 (p_src);
1393  q2_fnkc = vld2q_s16 (p_src2);
1394  q2_tw = vld2q_s16 (p_twiddles);
1395  q2_fnkc.val[0] = vrev32q_s16 (q2_fnkc.val[0]);
1396  q2_fnkc.val[1] = vrev32q_s16 (q2_fnkc.val[1]);
1397  q2_fnkc.val[0] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fnkc.val[0])));
1398  q2_fnkc.val[1] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fnkc.val[1])));
1399  q_fnkc_r = vcombine_s16 (vget_high_s16 (q2_fnkc.val[0]), vget_low_s16 (q2_fnkc.val[0]));
1400  q_fnkc_i = vcombine_s16 (vget_high_s16 (q2_fnkc.val[1]), vget_low_s16 (q2_fnkc.val[1]));
1401  q_fnkc_i = vnegq_s16 (q_fnkc_i);
1402 
1403  q_fek_r = vaddq_s16 (q2_fk.val[0], q_fnkc_r);
1404  q_fek_i = vaddq_s16 (q2_fk.val[1], q_fnkc_i);
1405  q_tmp0 = vsubq_s16 (q2_fk.val[0], q_fnkc_r);
1406  q_tmp1 = vsubq_s16 (q2_fk.val[1], q_fnkc_i);
1407 
1408  q_fok_r = vqdmulhq_s16 (q_tmp0, q2_tw.val[0]);
1409  q_fok_i = vqdmulhq_s16 (q_tmp1, q2_tw.val[0]);
1410  q_tmp2 = vqdmulhq_s16 (q_tmp1, q2_tw.val[1]);
1411  q_tmp3 = vqdmulhq_s16 (q_tmp0, q2_tw.val[1]);
1412  q_fok_r = vaddq_s16 (q_fok_r, q_tmp2);
1413  q_fok_i = vsubq_s16 (q_fok_i, q_tmp3);
1414 
1415  q_dst2_r = vsubq_s16 (q_fek_r, q_fok_r);
1416  q_dst2_i = vsubq_s16 (q_fok_i, q_fek_i);
1417  q2_dst.val[0] = vaddq_s16 (q_fek_r, q_fok_r);
1418  q2_dst.val[1] = vaddq_s16 (q_fek_i, q_fok_i);
1419  q_dst2_r = vrev32q_s16 (q_dst2_r);
1420  q_dst2_i = vrev32q_s16 (q_dst2_i);
1421  q_dst2_r = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_r))) ;
1422  q_dst2_i = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_i)));
1423  q2_dst2.val[0] = vcombine_s16 (vget_high_s16 (q_dst2_r), vget_low_s16 (q_dst2_r));
1424  q2_dst2.val[1] = vcombine_s16 (vget_high_s16 (q_dst2_i), vget_low_s16 (q_dst2_i));
1425  vst2q_s16 (p_dst, q2_dst);
1426  vst2q_s16 (p_dst2, q2_dst2);
1427 
1428  }
1429  }
1430  }
1431  else
1432  {
1433 
1434  for (k = 1; k <= ncfft / 2; k++)
1435  {
1436  fk = src[k];
1437  fnkc.r = src[ncfft - k].r;
1438  fnkc.i = -src[ncfft - k].i;
1439  if (scaled_flag)
1440  {
1441  NE10_F2I16_FIXDIV (fk, 2);
1442  NE10_F2I16_FIXDIV (fnkc, 2);
1443  }
1444 
1445  fek.r = fk.r + fnkc.r;
1446  fek.i = fk.i + fnkc.i;
1447 
1448  tmp.r = fk.r - fnkc.r;
1449  tmp.i = fk.i - fnkc.i;
1450 
1451  fok.r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) tmp.r * (twiddles[k - 1]).r
1452  + (NE10_F2I16_SAMPPROD) tmp.i * (twiddles[k - 1]).i) >> NE10_F2I16_SHIFT);
1453  fok.i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) tmp.i * (twiddles[k - 1]).r
1454  - (NE10_F2I16_SAMPPROD) tmp.r * (twiddles[k - 1]).i) >> NE10_F2I16_SHIFT);
1455 
1456  dst[k].r = fek.r + fok.r;
1457  dst[k].i = fek.i + fok.i;
1458 
1459  dst[ncfft - k].r = fek.r - fok.r;
1460  dst[ncfft - k].i = fok.i - fek.i;
1461  }
1462  }
1463 }
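/*
 * The inverse split step mirrors the forward one: it rebuilds the
 * half-length complex spectrum from the real-signal spectrum, with
 * fek = fk + conj(fnkc) recovering the even part and
 * fok = (fk - conj(fnkc)) * conj(twiddles[k - 1]) the odd part, so that
 * dst[k] = fek + fok and dst[ncfft - k] = conj(fek - fok).
 */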
1464 
1465 
1487 void ne10_fft_c2c_1d_int16_neon (ne10_fft_cpx_int16_t *fout,
1488  ne10_fft_cpx_int16_t *fin,
1489  ne10_fft_cfg_int16_t cfg,
1490  ne10_int32_t inverse_fft,
1491  ne10_int32_t scaled_flag)
1492 {
1493  if (scaled_flag)
1494  {
1495  if (inverse_fft)
1496  {
1497  switch (cfg->nfft)
1498  {
1499  case 4:
1500  ne10_fft4_backward_int16_scaled (fout, fin);
1501  break;
1502  case 8:
1503  ne10_fft8_backward_int16_scaled (fout, fin);
1504  break;
1505  default:
1506  ne10_mixed_radix_fft_backward_int16_scaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
1507  break;
1508  }
1509  }
1510  else
1511  {
1512  switch (cfg->nfft)
1513  {
1514  case 4:
1515  ne10_fft4_forward_int16_scaled (fout, fin);
1516  break;
1517  case 8:
1518  ne10_fft8_forward_int16_scaled (fout, fin);
1519  break;
1520  default:
1521  ne10_mixed_radix_fft_forward_int16_scaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
1522  break;
1523  }
1524  }
1525  }
1526  else
1527  {
1528  if (inverse_fft)
1529  {
1530  switch (cfg->nfft)
1531  {
1532  case 4:
1533  ne10_fft4_backward_int16_unscaled (fout, fin);
1534  break;
1535  case 8:
1536  ne10_fft8_backward_int16_unscaled (fout, fin);
1537  break;
1538  default:
1539  ne10_mixed_radix_fft_backward_int16_unscaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
1540  break;
1541  }
1542  }
1543  else
1544  {
1545  switch (cfg->nfft)
1546  {
1547  case 4:
1548  ne10_fft4_forward_int16_unscaled (fout, fin);
1549  break;
1550  case 8:
1551  ne10_fft8_forward_int16_unscaled (fout, fin);
1552  break;
1553  default:
1554  ne10_mixed_radix_fft_forward_int16_unscaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
1555  break;
1556  }
1557  }
1558  }
1559 }
1560  //end of C2C_FFT_IFFT group
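/*
 * A minimal usage sketch (not part of this file), assuming the allocation
 * helper ne10_fft_alloc_c2c_int16 from the NE10 DSP headers and the
 * NE10_FREE macro from NE10_macros.h: a forward, scaled transform of 1024
 * complex int16 samples would look like
 *
 *   ne10_fft_cpx_int16_t in[1024], out[1024];
 *   ne10_fft_cfg_int16_t cfg = ne10_fft_alloc_c2c_int16 (1024);
 *   if (cfg != NULL)
 *   {
 *       ne10_fft_c2c_1d_int16_neon (out, in, cfg, 0, 1); // forward, scaled
 *       NE10_FREE (cfg);
 *   }
 */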
1564 
1584 void ne10_fft_r2c_1d_int16_neon (ne10_fft_cpx_int16_t *fout,
1585  ne10_int16_t *fin,
1586  ne10_fft_r2c_cfg_int16_t cfg,
1587  ne10_int32_t scaled_flag)
1588 {
1589  ne10_fft_cpx_int16_t * tmpbuf1 = cfg->buffer;
1590  ne10_fft_cpx_int16_t * tmpbuf2 = cfg->buffer + cfg->ncfft;
1591  ne10_fft_state_int16_t c2c_state;
1592 
1593  c2c_state.nfft = cfg->ncfft;
1594  c2c_state.factors = cfg->factors;
1595  c2c_state.twiddles = cfg->twiddles;
1596  c2c_state.buffer = tmpbuf2;
1597 
1598  ne10_fft_c2c_1d_int16_neon (tmpbuf1, (ne10_fft_cpx_int16_t*) fin, &c2c_state, 0, scaled_flag);
1599  ne10_fft_split_r2c_1d_int16_neon (fout, tmpbuf1, cfg->super_twiddles, cfg->ncfft, scaled_flag);
1600 }
1601 
1616 void ne10_fft_c2r_1d_int16_neon (ne10_int16_t *fout,
1617  ne10_fft_cpx_int16_t *fin,
1618  ne10_fft_r2c_cfg_int16_t cfg,
1619  ne10_int32_t scaled_flag)
1620 {
1621  ne10_fft_cpx_int16_t * tmpbuf1 = cfg->buffer;
1622  ne10_fft_cpx_int16_t * tmpbuf2 = cfg->buffer + cfg->ncfft;
1623  ne10_fft_state_int16_t c2c_state;
1624 
1625  c2c_state.nfft = cfg->ncfft;
1626  c2c_state.factors = cfg->factors;
1627  c2c_state.twiddles = cfg->twiddles;
1628  c2c_state.buffer = tmpbuf2;
1629 
1630  ne10_fft_split_c2r_1d_int16_neon (tmpbuf1, fin, cfg->super_twiddles, cfg->ncfft, scaled_flag);
1631  ne10_fft_c2c_1d_int16_neon ( (ne10_fft_cpx_int16_t*) fout, tmpbuf1, &c2c_state, 1, scaled_flag);
1632 }
1633 