Project Ne10
An Open Optimized Software Library Project for the ARM Architecture
NE10_rfft_float32.neonintrinsic.c
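This page lists the NEON implementation of Ne10's single-precision real FFT. The file contains the radix-8x4 and radix-4x4 base kernels, the radix-4 stages with twiddle factors, the mixed-radix forward (real-to-complex) and inverse (complex-to-real) butterfly drivers, and the public entry points ne10_fft_r2c_1d_float32_neon and ne10_fft_c2r_1d_float32_neon. Several inner loops have optional AArch64 inline-assembly variants selected by NE10_INLINE_ASM_OPT.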
1 /*
2  * Copyright 2014-15 ARM Limited and Contributors.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of ARM Limited nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 /* license of Kiss FFT */
29 /*
30 Copyright (c) 2003-2010, Mark Borgerding
31 
32 All rights reserved.
33 
34 Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
35 
36  * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
37  * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
38  * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission.
39 
40 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
41 */
42 
43 /*
44  * NE10 Library : dsp/NE10_rfft_float32.neonintrinsic.c
45  */
46 
47 #include <arm_neon.h>
48 
49 #include "NE10_types.h"
50 #include "NE10_macros.h"
51 #include "NE10_fft.h"
52 #include "NE10_dsp.h"
53 #include "NE10_fft.neonintrinsic.h"
54 
55 NE10_INLINE void ne10_radix8x4_r2c_neon (ne10_fft_cpx_float32_t *Fout,
56  const ne10_fft_cpx_float32_t *Fin,
57  const ne10_int32_t fstride,
58  const ne10_int32_t mstride,
59  const ne10_int32_t nfft)
60 {
61  ne10_int32_t f_count;
62 
63  NE10_DECLARE_8(float32x4_t,q_in);
64  NE10_DECLARE_8(float32x4_t,q_out);
65 
66  const float32x4_t *Fin_neon = (float32x4_t*) Fin; // 8 x fstride
67  float32x4_t *Fout_neon = (float32x4_t*) Fout; // fstride x 8
68 
69  for (f_count = fstride; f_count > 0; f_count --)
70  {
71  // load 8 float32x4_t from Fin_neon into q_in0 ~ q_in7, with stride = fstride
72  NE10_RADIX8x4_R2C_NEON_LOAD(Fin_neon,q_in,fstride);
73 
74  // print q_in0 ~ q_in7
75  // NE10_PRINT_Qx8_VECTOR(q_in);
76 
77  // do r2c fft, size = 8
78  NE10_RADIX8x4_R2C_NEON_KERNEL(q_out,q_in);
79 
80  // print q_out0 ~ q_out7
81  // NE10_PRINT_Qx8_VECTOR(q_out);
82 
83  // store q_out0 ~ q_out7 to Fout_neon, with stride = 1
84  NE10_RADIX8x4_R2C_NEON_STORE(Fout_neon,q_out,1);
85 
86  Fin_neon = Fin_neon - fstride * 8 + 1;
87  Fout_neon += 8; // next column
88  }
89 }
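/*
 * Note: each float32x4_t above carries four independent butterflies, one per NEON
 * lane, so NE10_RADIX8x4_R2C_NEON_KERNEL computes four length-8 real-to-complex
 * butterflies at once. Inputs are loaded with a stride of fstride (the "8 x fstride"
 * view) and results are stored contiguously (the "fstride x 8" view), which
 * transposes the data for the following stages.
 */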
90 
91 NE10_INLINE void ne10_radix8x4_c2r_neon (ne10_fft_cpx_float32_t *Fout,
92  const ne10_fft_cpx_float32_t *Fin,
93  const ne10_int32_t fstride,
94  const ne10_int32_t mstride,
95  const ne10_int32_t nfft)
96 {
97  ne10_int32_t f_count;
98 
99  NE10_DECLARE_8(float32x4_t,q_in);
100  NE10_DECLARE_8(float32x4_t,q_out);
101 
102  const ne10_float32_t one_by_N = 0.25 / nfft;
103  const float32x4_t one_by_N_neon = vdupq_n_f32(one_by_N);
104 
105  const float32x4_t *Fin_neon = (float32x4_t*) Fin;
106  float32x4_t *Fout_neon = (float32x4_t*) Fout;
107 
108  for (f_count = fstride; f_count > 0; f_count --)
109  {
110  // load 8 float32x4_t from Fin_neon into q_in0 ~ q_in7, with stride = 1
111  NE10_RADIX8x4_R2C_NEON_LOAD(Fin_neon,q_in,1);
112 
113  // NE10_PRINT_Qx8_VECTOR(q_in);
114 
115  NE10_RADIX8x4_C2R_NEON_KERNEL(q_out,q_in);
116 
117  // NE10_PRINT_Qx8_VECTOR(q_out);
118 
119 #ifdef NE10_DSP_RFFT_SCALING
120  q_out0 = vmulq_f32(q_out0,one_by_N_neon);
121  q_out1 = vmulq_f32(q_out1,one_by_N_neon);
122  q_out2 = vmulq_f32(q_out2,one_by_N_neon);
123  q_out3 = vmulq_f32(q_out3,one_by_N_neon);
124  q_out4 = vmulq_f32(q_out4,one_by_N_neon);
125  q_out5 = vmulq_f32(q_out5,one_by_N_neon);
126  q_out6 = vmulq_f32(q_out6,one_by_N_neon);
127  q_out7 = vmulq_f32(q_out7,one_by_N_neon);
128 #endif
129 
130  // store
131  NE10_RADIX8x4_R2C_NEON_STORE(Fout_neon,q_out,fstride);
132 
133  Fout_neon ++;
134  }
135 }
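/*
 * Note on scaling: when NE10_DSP_RFFT_SCALING is defined, the c2r (inverse) kernels
 * scale their outputs by one_by_N = 0.25 / nfft. The nfft passed into these kernels
 * is a quarter of the full real transform length (see the "not the real nfft"
 * comments in the drivers below, hence the 0.25 factor and the name one_by_N), so
 * the scaling amounts to 1/N of the full transform and makes a forward transform
 * followed by an inverse transform unity-gain.
 */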
136 
137 NE10_INLINE void ne10_radix4x4_r2c_neon (ne10_fft_cpx_float32_t *Fout,
138  const ne10_fft_cpx_float32_t *Fin,
139  const ne10_int32_t fstride,
140  const ne10_int32_t mstride,
141  const ne10_int32_t nfft)
142 {
143  ne10_int32_t f_count;
144 
145  const float32x4_t *Fin_neon = (float32x4_t*) Fin;
146  float32x4_t *Fout_neon = (float32x4_t*) Fout;
147 
148  for (f_count = 0; f_count < fstride; f_count ++)
149  {
150  NE10_DECLARE_4(float32x4_t,q_in);
151  NE10_DECLARE_4(float32x4_t,q_out);
152 
153  // load
154  NE10_RADIX4x4_R2C_NEON_LOAD(Fin_neon,q_in,fstride);
155 
156  NE10_RADIX4x4_R2C_NEON_KERNEL(q_out,q_in)
157 
158  // store
159  NE10_RADIX4x4_R2C_NEON_STORE(Fout_neon,q_out,1);
160 
161  Fin_neon = Fin_neon - 4*fstride + 1;
162  Fout_neon += 4;
163  }
164 }
165 
166 NE10_INLINE void ne10_radix4x4_c2r_neon (ne10_fft_cpx_float32_t *Fout,
167  const ne10_fft_cpx_float32_t *Fin,
168  const ne10_int32_t fstride,
169  const ne10_int32_t mstride,
170  const ne10_int32_t nfft)
171 {
172  ne10_int32_t f_count;
173 
174  const float32x4_t *Fin_neon = (float32x4_t*) Fin;
175  float32x4_t *Fout_neon = (float32x4_t*) Fout;
176 
177  const ne10_float32_t one_by_N = 0.25 / nfft;
178  const float32x4_t one_by_N_neon = vdupq_n_f32(one_by_N);
179 
180  for (f_count = 0; f_count < fstride; f_count ++)
181  {
182  NE10_DECLARE_4(float32x4_t,q_in);
183  NE10_DECLARE_4(float32x4_t,q_out);
184 
185  // load
186  NE10_RADIX4x4_R2C_NEON_LOAD(Fin_neon,q_in,1);
187 
188  // NE10_PRINT_Qx4_VECTOR(q_in);
189 
190  NE10_RADIX4x4_C2R_NEON_KERNEL(q_out,q_in)
191 
192  // NE10_PRINT_Qx4_VECTOR(q_out);
193 
194 #ifdef NE10_DSP_RFFT_SCALING
195  q_out0 = vmulq_f32(q_out0,one_by_N_neon);
196  q_out1 = vmulq_f32(q_out1,one_by_N_neon);
197  q_out2 = vmulq_f32(q_out2,one_by_N_neon);
198  q_out3 = vmulq_f32(q_out3,one_by_N_neon);
199 #endif
200 
201  // store
202  NE10_RADIX4x4_R2C_NEON_STORE(Fout_neon,q_out,fstride);
203 
204  Fout_neon ++;
205  }
206 }
207 
208 NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_first_butterfly_neon (float32x4_t *Fout_neon,
209  const float32x4_t *Fin_neon,
210  const ne10_int32_t out_step,
211  const ne10_int32_t in_step,
212  const ne10_fft_cpx_float32_t *twiddles)
213 {
214  NE10_DECLARE_4(float32x4_t,q_in);
215  NE10_DECLARE_4(float32x4_t,q_out);
216 
217  // load
218  NE10_RADIX4x4_R2C_NEON_LOAD(Fin_neon,q_in,in_step);
219 
220  NE10_RADIX4x4_R2C_NEON_KERNEL(q_out,q_in);
221 
222  // store
223  vst1q_f32( (ne10_float32_t*) (Fout_neon ), q_out0);
224  vst1q_f32( (ne10_float32_t*) (Fout_neon + (out_step << 1) - 1), q_out1);
225  vst1q_f32( (ne10_float32_t*) (Fout_neon + (out_step << 1) ), q_out2);
226  vst1q_f32( (ne10_float32_t*) (Fout_neon + 2 * (out_step << 1) - 1), q_out3);
227 }
228 
229 NE10_INLINE void ne10_radix4x4_c2r_with_twiddles_first_butterfly_neon (float32x4_t *Fout_neon,
230  const float32x4_t *Fin_neon,
231  const ne10_int32_t out_step,
232  const ne10_int32_t in_step,
233  const ne10_fft_cpx_float32_t *twiddles)
234 {
235  NE10_DECLARE_4(float32x4_t,q_in);
236  NE10_DECLARE_4(float32x4_t,q_out);
237 
238  // load
239  q_in0 = vld1q_f32( (ne10_float32_t*) (Fin_neon ) );
240  q_in1 = vld1q_f32( (ne10_float32_t*) (Fin_neon + (out_step << 1) - 1) );
241  q_in2 = vld1q_f32( (ne10_float32_t*) (Fin_neon + (out_step << 1) ) );
242  q_in3 = vld1q_f32( (ne10_float32_t*) (Fin_neon + 2 * (out_step << 1) - 1) );
243 
244  // NE10_PRINT_Qx4_VECTOR(q_in);
245 
246  NE10_RADIX4x4_C2R_NEON_KERNEL(q_out,q_in);
247 
248  // NE10_PRINT_Qx4_VECTOR(q_out);
249 
250  // store
251  NE10_RADIX4x4_R2C_NEON_STORE(Fout_neon,q_out,in_step);
252 }
253 
254 NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_other_butterfly_neon (float32x4_t *Fout_neon,
255  const float32x4_t *Fin_neon,
256  const ne10_int32_t out_step,
257  const ne10_int32_t in_step,
258  const ne10_fft_cpx_float32_t *twiddles)
259 {
260  ne10_int32_t m_count;
261  ne10_int32_t loop_count = (out_step>>1) -1;
262  float32x4_t *Fout_b = Fout_neon + (((out_step<<1)-1)<<1) - 2; // reversed
263 
264  NE10_DECLARE_3(float32x4x2_t,q2_tw);
265  NE10_DECLARE_4(float32x4x2_t,q2_in);
266  NE10_DECLARE_4(float32x4x2_t,q2_out);
267 
268  for (m_count = loop_count; m_count > 0; m_count -- )
269  {
270 #ifndef NE10_INLINE_ASM_OPT
271  // load
272  q2_in0.val[0] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 0*in_step ) );
273  q2_in0.val[1] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 0*in_step + 1) );
274 
275  q2_in1.val[0] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 1*in_step ) );
276  q2_in1.val[1] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 1*in_step + 1) );
277 
278  q2_in2.val[0] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 2*in_step ) );
279  q2_in2.val[1] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 2*in_step + 1) );
280 
281  q2_in3.val[0] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 3*in_step ) );
282  q2_in3.val[1] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 3*in_step + 1) );
283 
284  q2_tw0.val[0] = vdupq_n_f32(twiddles[0].r);
285  q2_tw0.val[1] = vdupq_n_f32(twiddles[0].i);
286 
287  q2_tw1.val[0] = vdupq_n_f32(twiddles[1].r);
288  q2_tw1.val[1] = vdupq_n_f32(twiddles[1].i);
289 
290  q2_tw2.val[0] = vdupq_n_f32(twiddles[2].r);
291  q2_tw2.val[1] = vdupq_n_f32(twiddles[2].i);
292 
293  // R2C TW KERNEL
294  NE10_RADIX4x4_R2C_TW_MUL_NEON (q2_out, q2_in, q2_tw);
295 #else // NE10_INLINE_ASM_OPT
296 #ifndef __aarch64__
297 #error Currently, inline assembly optimizations are only available on AArch64.
298 #else // __aarch64__
299  const ne10_float32_t *ptr_inr = ((const ne10_float32_t *) Fin_neon);
300  const ne10_float32_t *ptr_ini = ((const ne10_float32_t *) Fin_neon) + 4;
301  asm volatile (
302  "ld1 {%[q2_out0r].4s}, [%[ptr_inr]], %[offset_in] \n\t"
303  "ld1 {%[q2_out0i].4s}, [%[ptr_ini]] \n\t"
304  "ld1 {v10.4s, v11.4s}, [%[ptr_inr]], %[offset_in] \n\t"
305  "ld1 {v12.4s, v13.4s}, [%[ptr_inr]], %[offset_in] \n\t"
306  "ld1 {v14.4s, v15.4s}, [%[ptr_inr]] \n\t"
307  "ld1 {v0.1d, v1.1d, v2.1d}, [%[ptr_tw]] \n\t"
308 
309  "fmul %[q2_out1r].4s, v10.4s, v0.4s[0] \n\t" // RR
310  "fmul %[q2_out1i].4s, v10.4s, v0.4s[1] \n\t" // RI
311  "fmls %[q2_out1r].4s, v11.4s, v0.4s[1] \n\t" // RR - II
312  "fmla %[q2_out1i].4s, v11.4s, v0.4s[0] \n\t" // RI + IR
313 
314  "fmul %[q2_out2r].4s, v12.4s, v1.4s[0] \n\t" // RR
315  "fmul %[q2_out2i].4s, v12.4s, v1.4s[1] \n\t" // RI
316  "fmls %[q2_out2r].4s, v13.4s, v1.4s[1] \n\t" // RR - II
317  "fmla %[q2_out2i].4s, v13.4s, v1.4s[0] \n\t" // RI + IR
318 
319  "fmul %[q2_out3r].4s, v14.4s, v2.4s[0] \n\t" // RR
320  "fmul %[q2_out3i].4s, v14.4s, v2.4s[1] \n\t" // RI
321  "fmls %[q2_out3r].4s, v15.4s, v2.4s[1] \n\t" // RR - II
322  "fmla %[q2_out3i].4s, v15.4s, v2.4s[0] \n\t" // RI + IR
323  : [q2_out0r]"+w"(q2_out0.val[0]),
324  [q2_out0i]"+w"(q2_out0.val[1]),
325  [q2_out1r]"+w"(q2_out1.val[0]),
326  [q2_out1i]"+w"(q2_out1.val[1]),
327  [q2_out2r]"+w"(q2_out2.val[0]),
328  [q2_out2i]"+w"(q2_out2.val[1]),
329  [q2_out3r]"+w"(q2_out3.val[0]),
330  [q2_out3i]"+w"(q2_out3.val[1]),
331  [ptr_inr]"+r"(ptr_inr),
332  [ptr_ini]"+r"(ptr_ini)
333  : [offset_in]"r"(in_step * 16),
334  [ptr_tw]"r"(twiddles)
335  : "memory", "v0", "v1", "v2",
336  "v10", "v11", "v12", "v13", "v14", "v15"
337  );
338 #endif // __aarch64__
339 #endif // NE10_INLINE_ASM_OPT
340 
341  NE10_RADIX4x4_R2C_TW_NEON_KERNEL_S1 (q2_in, q2_out);
342  NE10_RADIX4x4_R2C_TW_NEON_KERNEL_S2 (q2_out, q2_in);
343 
344  // store
345  vst1q_f32( (ne10_float32_t*) ( Fout_neon ), q2_out0.val[0] );
346  vst1q_f32( (ne10_float32_t*) ( Fout_neon + 1), q2_out0.val[1] );
347 
348  vst1q_f32( (ne10_float32_t*) ( Fout_neon + (out_step << 1) ), q2_out1.val[0] );
349  vst1q_f32( (ne10_float32_t*) ( Fout_neon + (out_step << 1) + 1), q2_out1.val[1] );
350 
351  vst1q_f32( (ne10_float32_t*) ( Fout_b ), q2_out2.val[0] );
352  vst1q_f32( (ne10_float32_t*) ( Fout_b + 1), q2_out2.val[1] );
353 
354  vst1q_f32( (ne10_float32_t*) ( Fout_b - (out_step << 1) ), q2_out3.val[0] );
355  vst1q_f32( (ne10_float32_t*) ( Fout_b - (out_step << 1) + 1), q2_out3.val[1] );
356 
357  // update pointers
358  Fin_neon += 2;
359  Fout_neon += 2;
360  Fout_b -= 2;
361  twiddles += 3;
362  }
363 }
364 
365 NE10_INLINE void ne10_radix4x4_c2r_with_twiddles_other_butterfly_neon (float32x4_t *Fout_neon,
366  const float32x4_t *Fin_neon,
367  const ne10_int32_t out_step,
368  const ne10_int32_t in_step,
369  const ne10_fft_cpx_float32_t *twiddles)
370 {
371  ne10_int32_t m_count;
372  ne10_int32_t loop_count = (out_step>>1) -1;
373  const float32x4_t *Fin_b = Fin_neon + (((out_step<<1)-1)<<1) - 2; // reversed
374 
375  NE10_DECLARE_3(float32x4x2_t,q2_tw);
376  NE10_DECLARE_4(float32x4x2_t,q2_in);
377  NE10_DECLARE_4(float32x4x2_t,q2_out);
378 
379  for (m_count = loop_count; m_count > 0; m_count -- )
380  {
381  // load
382  q2_in0.val[0] = vld1q_f32( (ne10_float32_t*) ( Fin_neon ) );
383  q2_in0.val[1] = vld1q_f32( (ne10_float32_t*) ( Fin_neon + 1) );
384 
385  q2_in1.val[0] = vld1q_f32( (ne10_float32_t*) ( Fin_neon + (out_step << 1) ) );
386  q2_in1.val[1] = vld1q_f32( (ne10_float32_t*) ( Fin_neon + (out_step << 1) + 1) );
387 
388  q2_in2.val[0] = vld1q_f32( (ne10_float32_t*) ( Fin_b ) );
389  q2_in2.val[1] = vld1q_f32( (ne10_float32_t*) ( Fin_b + 1) );
390 
391  q2_in3.val[0] = vld1q_f32( (ne10_float32_t*) ( Fin_b - (out_step << 1) ) );
392  q2_in3.val[1] = vld1q_f32( (ne10_float32_t*) ( Fin_b - (out_step << 1) + 1) );
393 
394  q2_tw0.val[0] = vdupq_n_f32(twiddles[0].r);
395  q2_tw0.val[1] = vdupq_n_f32(twiddles[0].i);
396 
397  q2_tw1.val[0] = vdupq_n_f32(twiddles[1].r);
398  q2_tw1.val[1] = vdupq_n_f32(twiddles[1].i);
399 
400  q2_tw2.val[0] = vdupq_n_f32(twiddles[2].r);
401  q2_tw2.val[1] = vdupq_n_f32(twiddles[2].i);
402 
403  // NE10_PRINT_Q2x4_VECTOR(q2_in);
404 
405  // C2R TW KERNEL
406  NE10_RADIX4x4_C2R_TW_NEON_KERNEL(q2_out,q2_in,q2_tw);
407 
408  // NE10_PRINT_Q2x4_VECTOR(q2_out);
409 
410  // store
411  vst1q_f32( (ne10_float32_t*) (Fout_neon + 0*in_step ), q2_out0.val[0] );
412  vst1q_f32( (ne10_float32_t*) (Fout_neon + 0*in_step + 1), q2_out0.val[1] );
413 
414  vst1q_f32( (ne10_float32_t*) (Fout_neon + 1*in_step ), q2_out1.val[0] );
415  vst1q_f32( (ne10_float32_t*) (Fout_neon + 1*in_step + 1), q2_out1.val[1] );
416 
417  vst1q_f32( (ne10_float32_t*) (Fout_neon + 2*in_step ), q2_out2.val[0] );
418  vst1q_f32( (ne10_float32_t*) (Fout_neon + 2*in_step + 1), q2_out2.val[1] );
419 
420  vst1q_f32( (ne10_float32_t*) (Fout_neon + 3*in_step ), q2_out3.val[0] );
421  vst1q_f32( (ne10_float32_t*) (Fout_neon + 3*in_step + 1), q2_out3.val[1] );
422 
423  // update pointers
424  Fin_neon += 2;
425  Fout_neon += 2;
426  Fin_b -= 2;
427  twiddles += 3;
428  }
429 }
430 
431 NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_last_butterfly_neon (float32x4_t *Fout_neon,
432  const float32x4_t *Fin_neon,
433  const ne10_int32_t out_step,
434  const ne10_int32_t in_step,
435  const ne10_fft_cpx_float32_t *twiddles)
436 {
437  NE10_DECLARE_4(float32x4_t,q_in);
438  NE10_DECLARE_4(float32x4_t,q_out);
439 
440  // load
441  NE10_RADIX4x4_R2C_NEON_LOAD(Fin_neon,q_in,in_step);
442 
443  NE10_RADIX4x4_R2C_TW_NEON_KERNEL_LAST(q_out,q_in);
444 
445  // store
446  vst1q_f32( (ne10_float32_t*) (Fout_neon ), q_out0);
447  vst1q_f32( (ne10_float32_t*) (Fout_neon + 1), q_out1);
448  vst1q_f32( (ne10_float32_t*) (Fout_neon + (out_step << 1) ), q_out2);
449  vst1q_f32( (ne10_float32_t*) (Fout_neon + (out_step << 1) + 1), q_out3);
450 }
451 
452 NE10_INLINE void ne10_radix4x4_c2r_with_twiddles_last_butterfly_neon (float32x4_t *Fout_neon,
453  const float32x4_t *Fin_neon,
454  const ne10_int32_t out_step,
455  const ne10_int32_t in_step,
456  const ne10_fft_cpx_float32_t *twiddles)
457 {
458  NE10_DECLARE_4(float32x4_t,q_in);
459  NE10_DECLARE_4(float32x4_t,q_out);
460 
461  // load
462  q_in0 = vld1q_f32( (ne10_float32_t*) (Fin_neon ) );
463  q_in1 = vld1q_f32( (ne10_float32_t*) (Fin_neon + 1) );
464  q_in2 = vld1q_f32( (ne10_float32_t*) (Fin_neon + (out_step << 1) ) );
465  q_in3 = vld1q_f32( (ne10_float32_t*) (Fin_neon + (out_step << 1) + 1) );
466 
467  // NE10_PRINT_Qx4_VECTOR(q_in);
468 
469  NE10_RADIX4x4_C2R_TW_NEON_KERNEL_LAST(q_out,q_in);
470 
471  // NE10_PRINT_Qx4_VECTOR(q_out);
472 
473  // store
474  NE10_RADIX4x4_R2C_NEON_STORE(Fout_neon,q_out,in_step);
475 }
476 
477 NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_neon (ne10_fft_cpx_float32_t *Fout,
478  const ne10_fft_cpx_float32_t *Fin,
479  const ne10_int32_t fstride,
480  const ne10_int32_t mstride,
481  const ne10_int32_t nfft,
482  const ne10_fft_cpx_float32_t *twiddles)
483 {
484  ne10_int32_t f_count;
485  const ne10_int32_t in_step = nfft >> 2;
486  const ne10_int32_t out_step = mstride;
487 
488  const float32x4_t *Fin_neon = (float32x4_t*) Fin;
489  float32x4_t *Fout_neon = (float32x4_t*) Fout;
490  const ne10_fft_cpx_float32_t *tw;
491 
492  for (f_count = fstride; f_count; f_count --)
493  {
494  tw = twiddles + 3;
495 
496  // first butterfly
497  ne10_radix4x4_r2c_with_twiddles_first_butterfly_neon ( Fout_neon, Fin_neon, out_step, in_step, NULL);
498 
499  Fin_neon ++;
500  Fout_neon ++;
501 
502  // other butterfly
503  // Twiddle tables are transposed to avoid large-stride memory accesses.
504  ne10_radix4x4_r2c_with_twiddles_other_butterfly_neon ( Fout_neon, Fin_neon, out_step, in_step, tw);
505 
506  // update Fin_neon, Fout_neon, twiddles
507  Fin_neon += 2 * ( (out_step >> 1) - 1);
508  Fout_neon += 2 * ( (out_step >> 1) - 1);
509 
510  // last butterfly
511  ne10_radix4x4_r2c_with_twiddles_last_butterfly_neon (Fout_neon, Fin_neon, out_step, in_step, NULL);
512  Fin_neon ++;
513  Fout_neon ++;
514 
515  Fout_neon = Fout_neon + 3 * out_step;
516  } // f_count
517 }
518 
519 NE10_INLINE void ne10_radix4x4_c2r_with_twiddles_neon (ne10_fft_cpx_float32_t *Fout,
520  const ne10_fft_cpx_float32_t *Fin,
521  const ne10_int32_t fstride,
522  const ne10_int32_t mstride,
523  const ne10_int32_t nfft,
524  const ne10_fft_cpx_float32_t *twiddles)
525 {
526  ne10_int32_t f_count;
527  const ne10_int32_t in_step = nfft >> 2;
528  const ne10_int32_t out_step = mstride;
529 
530  const float32x4_t *Fin_neon = (float32x4_t*) Fin;
531  float32x4_t *Fout_neon = (float32x4_t*) Fout;
532  const ne10_fft_cpx_float32_t *tw;
533 
534  for (f_count = fstride; f_count; f_count --)
535  {
536  tw = twiddles + 3;
537 
538  // first butterfly
539  ne10_radix4x4_c2r_with_twiddles_first_butterfly_neon ( Fout_neon, Fin_neon, out_step, in_step, NULL);
540 
541  Fin_neon ++;
542  Fout_neon ++;
543 
544  // other butterfly
545  // Twiddle tables are transposed to avoid large-stride memory accesses.
546  ne10_radix4x4_c2r_with_twiddles_other_butterfly_neon ( Fout_neon, Fin_neon, out_step, in_step, tw);
547 
548  // update Fin_neon, Fout_neon, twiddles
549  Fin_neon += 2 * ( (out_step >> 1) - 1);
550  Fout_neon += 2 * ( (out_step >> 1) - 1);
551 
552  // last butterfly
553  ne10_radix4x4_c2r_with_twiddles_last_butterfly_neon (Fout_neon, Fin_neon, out_step, in_step, NULL);
554  Fin_neon ++;
555  Fout_neon ++;
556 
557  Fin_neon = Fin_neon + 3 * out_step;
558  } // f_count
559 }
560 
561 NE10_INLINE void ne10_mixed_radix_r2c_butterfly_float32_neon (ne10_fft_cpx_float32_t * Fout,
562  const ne10_fft_cpx_float32_t * Fin,
563  const ne10_int32_t * factors,
564  const ne10_fft_cpx_float32_t * twiddles,
565  ne10_fft_cpx_float32_t * buffer)
566 {
567  ne10_int32_t fstride, mstride, nfft;
568  ne10_int32_t radix;
569  ne10_int32_t stage_count;
570 
571  // PRINT_STAGE_INFO;
572 
573  // init fstride, mstride, radix, nfft
574  stage_count = factors[0];
575  fstride = factors[1];
576  mstride = factors[ (stage_count << 1) - 1 ];
577  radix = factors[ stage_count << 1 ];
578  nfft = radix * fstride; // not the real nfft
579 
580  // PRINT_STAGE_INFO;
581 
582  if (radix == 2)
583  {
584  // combine one radix-4 and one radix-2 into one radix-8
585  mstride <<= 2;
586  fstride >>= 2;
587  twiddles += 6;
588  stage_count --;
589  }
590 
591  if (stage_count % 2 == 1) // since there is another stage outside
592  {
593  ne10_swap_ptr (buffer, Fout);
594  }
595 
596  // the first stage
597  if (radix == 2) // length of FFT is 2^n (n is odd)
598  {
599  ne10_radix8x4_r2c_neon (Fout, Fin, fstride, mstride, nfft);
600  }
601  else if (radix == 4) // length of FFT is 2^n (n is even)
602  {
603  ne10_radix4x4_r2c_neon (Fout, Fin, fstride, mstride, nfft);
604  }
605  // end of first stage
606 
607  // others
608  for (; fstride > 1;)
609  {
610  fstride >>= 2;
611  ne10_swap_ptr (buffer, Fout);
612 
613  ne10_radix4x4_r2c_with_twiddles_neon (Fout, buffer, fstride, mstride, nfft, twiddles);
614  twiddles += 3 * mstride;
615  mstride <<= 2;
616  } // other stage
617 }
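/*
 * Stage walk of the forward driver above: factors[0] holds the stage count,
 * factors[1] the initial fstride, and factors[stage_count << 1] the radix of the
 * first stage. A trailing radix-2 is merged with a radix-4 into the radix-8 first
 * stage (FFT length 2^n with n odd); otherwise the first stage is radix-4. Each
 * further stage is a radix-4 pass with twiddles that divides fstride by 4,
 * multiplies mstride by 4 and consumes 3 * mstride twiddle factors. The initial
 * pointer swap on an odd stage count makes the Fout/buffer ping-pong end with the
 * data in the caller's temporary buffer, which the separate last stage
 * (ne10_radix4_r2c_with_twiddles_last_stage) then reads.
 */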
618 
619 NE10_INLINE void ne10_mixed_radix_c2r_butterfly_float32_neon (ne10_fft_cpx_float32_t * Fout,
620  const ne10_fft_cpx_float32_t * Fin,
621  const ne10_int32_t * factors,
622  const ne10_fft_cpx_float32_t * twiddles,
623  ne10_fft_cpx_float32_t * buffer)
624 {
625  ne10_int32_t fstride, mstride, nfft;
626  ne10_int32_t radix;
627  ne10_int32_t stage_count;
628 
629  // PRINT_STAGE_INFO;
630 
631  // init fstride, mstride, radix, nfft
632  stage_count = factors[0];
633  fstride = factors[1];
634 
635  mstride = factors[ (stage_count << 1) - 1 ];
636  radix = factors[ stage_count << 1 ];
637  nfft = radix * fstride; // not the real nfft
638 
639  // fstride, mstride for the last stage
640  fstride = 1;
641  mstride = nfft >> 2;
642 
643  if (radix == 2)
644  {
645  // combine one radix-4 and one radix-2 into one radix-8
646  stage_count --;
647  }
648 
649  if (stage_count % 2 == 0)
650  {
651  ne10_swap_ptr(Fout,buffer);
652  }
653 
654  // others but the first stage
655  for (; stage_count > 1;)
656  {
657  twiddles -= 3 * mstride;
658 
659  // PRINT_STAGE_INFO;
660  // PRINT_POINTERS_INFO(Fin,Fout,buffer,twiddles);
661  ne10_radix4x4_c2r_with_twiddles_neon (Fout, buffer, fstride, mstride, nfft, twiddles);
662 
663  fstride <<= 2;
664  mstride >>= 2;
665  stage_count --;
666  ne10_swap_ptr (buffer, Fout);
667  }
668 
669  // the (forward) first stage, performed last in the inverse transform
670  if (radix == 2) // length of FFT is 2^n (n is odd)
671  {
672  mstride >>= 1;
673 
674  // PRINT_STAGE_INFO;
675  // PRINT_POINTERS_INFO(Fin,Fout,buffer,twiddles);
676  ne10_radix8x4_c2r_neon (Fout, buffer, fstride, mstride, nfft);
677  }
678  else if (radix == 4) // length of FFT is 2^n (n is even)
679  {
680  // PRINT_STAGE_INFO;
681  // PRINT_POINTERS_INFO(Fin,Fout,buffer,twiddles);
682  ne10_radix4x4_c2r_neon (Fout, buffer, fstride, mstride, nfft);
683  }
684 }
685 
686 NE10_INLINE void ne10_radix4_r2c_with_twiddles_last_stage_first_butterfly (ne10_fft_cpx_float32_t *dst,
687  const ne10_fft_cpx_float32_t *src,
688  const ne10_fft_cpx_float32_t *twiddles,
689  const ne10_int32_t nfft)
690 {
691  // b0
692  {
693  ne10_float32_t q_4r_out[4];
694  const ne10_float32_t *p_src_r = (const ne10_float32_t*) src;
695 
696  NE10_FFT_R2C_4R_RCR(q_4r_out,p_src_r);
697 
698  dst[0].r = q_4r_out[0];
699  dst[0].i = q_4r_out[3];
700  dst += (nfft>>2);
701  dst[0].r = q_4r_out[1];
702  dst[0].i = q_4r_out[2];
703  dst -= (nfft>>2);
704  }
705 
706  // b2
707  {
708  const ne10_float32_t *p_src_r = (const ne10_float32_t*) (src);
709  p_src_r += nfft;
710  p_src_r -= 4;
711 
712  ne10_float32_t q_4r_out[4];
713 
714  NE10_FFT_R2C_4R_CC(q_4r_out,p_src_r);
715 
716  dst += (nfft>>3);
717  dst[0].r = q_4r_out[0];
718  dst[0].i = q_4r_out[1];
719  dst += (nfft>>2);
720  dst[0].r = q_4r_out[2];
721  dst[0].i = q_4r_out[3];
722  dst -= (nfft>>3);
723  dst -= (nfft>>2);
724  }
725 
726  // b1
727  ne10_fft_cpx_float32_t cc_out[4];
728  ne10_fft_cpx_float32_t cc_in [4];
729  const ne10_float32_t *p_src_r = (const ne10_float32_t*) src;
730  p_src_r += 4;
731 
732  cc_out[0].r = *(p_src_r ++);
733  cc_out[1].r = *(p_src_r ++);
734  cc_out[2].r = *(p_src_r ++);
735  cc_out[3].r = *(p_src_r ++);
736 
737  cc_out[0].i = *(p_src_r ++);
738  cc_out[1].i = *(p_src_r ++);
739  cc_out[2].i = *(p_src_r ++);
740  cc_out[3].i = *(p_src_r ++);
741 
742  // NE10_PRINT_Q2_VECTOR(cc_out);
743 
744  // twiddles[0] = ( 1.0, 0.0);
745  // NE10_CPX_MUL_F32(cc_in[0],cc_out[0],twiddles[0]);
746  cc_in[0] = cc_out[0];
747  twiddles ++;
748 
749  NE10_CPX_MUL_F32(cc_in[1],cc_out[1],twiddles[0]);
750  twiddles ++;
751 
752  NE10_CPX_MUL_F32(cc_in[2],cc_out[2],twiddles[0]);
753  twiddles ++;
754 
755  NE10_CPX_MUL_F32(cc_in[3],cc_out[3],twiddles[0]);
756 
757  // NE10_PRINT_Q2_VECTOR(cc_in);
758 
759  NE10_FFT_R2C_CC_CC(cc_out,cc_in);
760 
761  // NE10_PRINT_Q2_VECTOR(cc_out);
762 
763  dst[1] = cc_out[0];
764  dst += (nfft>>2);
765  dst[ 1] = cc_out[1];
766  dst[-1] = cc_out[3];
767  dst += (nfft>>2);
768  dst[-1] = cc_out[2];
769 }
770 
771 NE10_INLINE void ne10_radix4_c2r_with_twiddles_first_stage_first_butterfly (ne10_fft_cpx_float32_t *dst,
772  const ne10_fft_cpx_float32_t *src,
773  const ne10_fft_cpx_float32_t *twiddles,
774  const ne10_int32_t nfft)
775 {
776  // b0
777  {
778  ne10_float32_t q_4r_in[4];
779  ne10_float32_t *p_dst_r = (ne10_float32_t*) dst;
780 
781  q_4r_in[0] = src[0].r;
782  q_4r_in[3] = src[0].i;
783  src += (nfft>>2);
784  q_4r_in[1] = src[0].r;
785  q_4r_in[2] = src[0].i;
786  src -= (nfft>>2);
787 
788  NE10_FFT_C2R_RCR_4R(p_dst_r,q_4r_in);
789  }
790 
791  // b2
792  {
793  // float32x4_t q_in;
794  ne10_float32_t *p_dst_r = (ne10_float32_t*) (dst);
795  p_dst_r += nfft;
796  p_dst_r -= 4;
797 
798  ne10_float32_t q_4r_in[4];
799  src += (nfft>>3);
800  q_4r_in[0] = src[0].r;
801  q_4r_in[1] = src[0].i;
802  src += (nfft>>2);
803  q_4r_in[2] = src[0].r;
804  q_4r_in[3] = src[0].i;
805  src -= (nfft>>3);
806  src -= (nfft>>2);
807 
808  NE10_FFT_C2R_CC_4R(p_dst_r,q_4r_in);
809  }
810 
811  // b1
812  ne10_fft_cpx_float32_t cc_out[4];
813  ne10_fft_cpx_float32_t cc_in [4];
814  ne10_float32_t *p_dst_r = (ne10_float32_t*) dst;
815  p_dst_r += 4;
816 
817  // load
818  cc_out[0] = src[1];
819  src += (nfft>>2);
820  cc_out[2] = src[ 1];
821  cc_out[3] = src[-1];
822  src += (nfft>>2);
823  cc_out[1] = src[-1];
824 
825  // NE10_PRINT_Q2_VECTOR(cc_out);
826 
827  NE10_FFT_C2R_CC_CC(cc_in,cc_out);
828 
829  // NE10_PRINT_Q2_VECTOR(cc_in);
830 
831  // twiddles[0] = ( 1.0, 0.0);
832  // NE10_CPX_MUL_F32(cc_in[0],cc_out[0],twiddles[0]);
833  cc_out[0] = cc_in[0];
834  twiddles ++;
835 
836  NE10_CPX_CONJ_MUL_F32(cc_out[1],cc_in[1],twiddles[0]);
837  twiddles ++;
838 
839  NE10_CPX_CONJ_MUL_F32(cc_out[2],cc_in[2],twiddles[0]);
840  twiddles ++;
841 
842  NE10_CPX_CONJ_MUL_F32(cc_out[3],cc_in[3],twiddles[0]);
843 
844  // NE10_PRINT_Q2_VECTOR(cc_out);
845 
846  *(p_dst_r ++) = cc_out[0].r;
847  *(p_dst_r ++) = cc_out[1].r;
848  *(p_dst_r ++) = cc_out[2].r;
849  *(p_dst_r ++) = cc_out[3].r;
850 
851  *(p_dst_r ++) = cc_out[0].i;
852  *(p_dst_r ++) = cc_out[1].i;
853  *(p_dst_r ++) = cc_out[2].i;
854  *(p_dst_r ++) = cc_out[3].i;
855 }
856 
857 NE10_INLINE void ne10_radix4_r2c_with_twiddles_last_stage_second_butterfly (ne10_fft_cpx_float32_t *dst,
858  const ne10_fft_cpx_float32_t *src,
859  const ne10_fft_cpx_float32_t *twiddles,
860  const ne10_int32_t nfft)
861 {
862  // assert ( nfft % 4 == 0 );
863  const ne10_float32_t *fin_r = (const ne10_float32_t*) src + 12;
864  ne10_float32_t *fout_r = (ne10_float32_t*) dst;
865  const ne10_float32_t *tw = (const ne10_float32_t*) twiddles + 8;
866 
867  ne10_float32_t q_in0[4], q_out0[4],
868  q_in1[4], q_out1[4],
869  q_in2[4], q_out2[4],
870  q_in3[4], q_out3[4];
871 
872  ne10_float32_t q2_tw0[2][4],
873  q2_tw1[2][4];
874 
875  /* INPUT & OUTPUT
876  * 0R 1R 2R 3R Q0
877  * 0I 1I 2I 3I Q1
878  * 4R 5R 6R 7R Q2
879  * 4I 5I 6I 7I Q3
880  */
881 
882  q_in0[0] = *(fin_r++);
883  q_in0[1] = *(fin_r++);
884  q_in0[2] = *(fin_r++);
885  q_in0[3] = *(fin_r++);
886  q_in1[0] = *(fin_r++);
887  q_in1[1] = *(fin_r++);
888  q_in1[2] = *(fin_r++);
889  q_in1[3] = *(fin_r++);
890  q_in2[0] = *(fin_r++);
891  q_in2[1] = *(fin_r++);
892  q_in2[2] = *(fin_r++);
893  q_in2[3] = *(fin_r++);
894  q_in3[0] = *(fin_r++);
895  q_in3[1] = *(fin_r++);
896  q_in3[2] = *(fin_r++);
897  q_in3[3] = *(fin_r++);
898 
899  // NE10_PRINT_Q_VECTOR(q_in0);
900  // NE10_PRINT_Q_VECTOR(q_in1);
901  // NE10_PRINT_Q_VECTOR(q_in2);
902  // NE10_PRINT_Q_VECTOR(q_in3);
903 
904  q2_tw0[0][0] = tw[0];
905  q2_tw0[0][1] = tw[2];
906  q2_tw0[0][2] = tw[4];
907  q2_tw0[0][3] = tw[6];
908  q2_tw0[1][0] = tw[1];
909  q2_tw0[1][1] = tw[3];
910  q2_tw0[1][2] = tw[5];
911  q2_tw0[1][3] = tw[7];
912 
913  q2_tw1[0][0] = tw[0+8];
914  q2_tw1[0][1] = tw[2+8];
915  q2_tw1[0][2] = tw[4+8];
916  q2_tw1[0][3] = tw[6+8];
917  q2_tw1[1][0] = tw[1+8];
918  q2_tw1[1][1] = tw[3+8];
919  q2_tw1[1][2] = tw[5+8];
920  q2_tw1[1][3] = tw[7+8];
921 
922  // TW: in->out
923  q_out0[0] = q_in0[0];
924  q_out1[0] = q_in1[0];
925  q_out2[0] = q_in2[0];
926  q_out3[0] = q_in3[0];
927 
928  //----------------------------------------------------------//
929  // first 2 lines
930  // R R R I I
931  q_out0[1] = q_in0[1] * q2_tw0[0][1] - q_in1[1] * q2_tw0[1][1];
932  // I R I I R
933  q_out1[1] = q_in0[1] * q2_tw0[1][1] + q_in1[1] * q2_tw0[0][1];
934 
935  // R R R I I
936  q_out0[2] = q_in0[2] * q2_tw0[0][2] - q_in1[2] * q2_tw0[1][2];
937  // I R I I R
938  q_out1[2] = q_in0[2] * q2_tw0[1][2] + q_in1[2] * q2_tw0[0][2];
939 
940  // R R R I I
941  q_out0[3] = q_in0[3] * q2_tw0[0][3] - q_in1[3] * q2_tw0[1][3];
942  // I R I I R
943  q_out1[3] = q_in0[3] * q2_tw0[1][3] + q_in1[3] * q2_tw0[0][3];
944 
945  //---------------------------------------------------------//
946  // second 2 lines
947  // R R R I I
948  q_out2[1] = q_in2[1] * q2_tw1[0][1] - q_in3[1] * q2_tw1[1][1];
949  // I R I I R
950  q_out3[1] = q_in2[1] * q2_tw1[1][1] + q_in3[1] * q2_tw1[0][1];
951 
952  // R R R I I
953  q_out2[2] = q_in2[2] * q2_tw1[0][2] - q_in3[2] * q2_tw1[1][2];
954  // I R I I R
955  q_out3[2] = q_in2[2] * q2_tw1[1][2] + q_in3[2] * q2_tw1[0][2];
956 
957  // R R R I I
958  q_out2[3] = q_in2[3] * q2_tw1[0][3] - q_in3[3] * q2_tw1[1][3];
959  // I R I I R
960  q_out3[3] = q_in2[3] * q2_tw1[1][3] + q_in3[3] * q2_tw1[0][3];
961 
962  // NE10_PRINT_Q_VECTOR(q_out0);
963  // NE10_PRINT_Q_VECTOR(q_out1);
964  // NE10_PRINT_Q_VECTOR(q_out2);
965  // NE10_PRINT_Q_VECTOR(q_out3);
966 
967  // BUTTERFLY - radix 4x2
968  // STAGE
969  // q_out -> q_in
970  // R i R j R k
971  q_in0[0] = q_out0[0] + q_out0[2];
972  q_in1[0] = q_out1[0] + q_out1[2];
973 
974  q_in0[1] = q_out0[0] - q_out0[2];
975  q_in1[1] = q_out1[0] - q_out1[2];
976 
977  // R i R j R k
978  q_in0[2] = q_out0[1] + q_out0[3];
979  q_in1[2] = q_out1[1] + q_out1[3];
980 
981  q_in0[3] = q_out0[1] - q_out0[3];
982  q_in1[3] = q_out1[1] - q_out1[3];
983 
984  // R i R j R k
985  q_in2[0] = q_out2[0] + q_out2[2];
986  q_in3[0] = q_out3[0] + q_out3[2];
987 
988  q_in2[1] = q_out2[0] - q_out2[2];
989  q_in3[1] = q_out3[0] - q_out3[2];
990 
991  // R i R j R k
992  q_in2[2] = q_out2[1] + q_out2[3];
993  q_in3[2] = q_out3[1] + q_out3[3];
994 
995  q_in2[3] = q_out2[1] - q_out2[3];
996  q_in3[3] = q_out3[1] - q_out3[3];
997 
998  // NE10_PRINT_Q_VECTOR(q_in0);
999  // NE10_PRINT_Q_VECTOR(q_in1);
1000  // NE10_PRINT_Q_VECTOR(q_in2);
1001  // NE10_PRINT_Q_VECTOR(q_in3);
1002 
1003  // STAGE
1004  // q_in -> q_out
1005  // and transpose
1006  // R i R j R k
1007  q_out0[0] = q_in0[0] + q_in0[2];
1008  q_out0[1] = q_in1[0] + q_in1[2];
1009 
1010  q_out2[2] = q_in0[0] - q_in0[2];
1011  q_out2[3] = - q_in1[0] + q_in1[2];// CONJ
1012 
1013  // R i R j R k
1014  q_out3[2] = q_in0[1] - q_in1[3];
1015  q_out3[3] = - q_in1[1] - q_in0[3];// CONJ
1016 
1017  q_out1[0] = q_in0[1] + q_in1[3];
1018  q_out1[1] = q_in1[1] - q_in0[3];
1019 
1020  // R i R j R k
1021  q_out0[2] = q_in2[0] + q_in2[2];
1022  q_out0[3] = q_in3[0] + q_in3[2];
1023 
1024  q_out2[0] = q_in2[0] - q_in2[2];
1025  q_out2[1] = - q_in3[0] + q_in3[2];// CONJ
1026 
1027  // R i R j R k
1028  q_out3[0] = q_in2[1] - q_in3[3];
1029  q_out3[1] = - q_in3[1] - q_in2[3]; // CONJ
1030 
1031  q_out1[2] = q_in2[1] + q_in3[3];
1032  q_out1[3] = q_in3[1] - q_in2[3];
1033 
1034  // NE10_PRINT_Q_VECTOR(q_out0);
1035  // NE10_PRINT_Q_VECTOR(q_out1);
1036  // NE10_PRINT_Q_VECTOR(q_out2);
1037  // NE10_PRINT_Q_VECTOR(q_out3);
1038 
1039  // STORE
1040  fout_r += 4;
1041  fout_r[0] = q_out0[0];
1042  fout_r[1] = q_out0[1];
1043  fout_r[2] = q_out0[2];
1044  fout_r[3] = q_out0[3];
1045 
1046  fout_r += (nfft>>1);
1047  fout_r[0] = q_out1[0];
1048  fout_r[1] = q_out1[1];
1049  fout_r[2] = q_out1[2];
1050  fout_r[3] = q_out1[3];
1051 
1052  fout_r -= 10;
1053  fout_r[0] = q_out3[0];
1054  fout_r[1] = q_out3[1];
1055  fout_r[2] = q_out3[2];
1056  fout_r[3] = q_out3[3];
1057 
1058  fout_r += (nfft>>1);
1059  fout_r[0] = q_out2[0];
1060  fout_r[1] = q_out2[1];
1061  fout_r[2] = q_out2[2];
1062  fout_r[3] = q_out2[3];
1063 }
1064 
1065 NE10_INLINE void ne10_radix4_c2r_with_twiddles_first_stage_second_butterfly (ne10_fft_cpx_float32_t *dst,
1066  const ne10_fft_cpx_float32_t *src,
1067  const ne10_fft_cpx_float32_t *twiddles,
1068  const ne10_int32_t nfft)
1069 {
1070  const ne10_float32_t *fin_r = (const ne10_float32_t*) src;
1071  ne10_float32_t *fout_r = (ne10_float32_t*) dst + 12;
1072  const ne10_float32_t *tw = (const ne10_float32_t*) twiddles + 8;
1073 
1074  ne10_float32_t q_in0[4], q_out0[4],
1075  q_in1[4], q_out1[4],
1076  q_in2[4], q_out2[4],
1077  q_in3[4], q_out3[4];
1078 
1079  ne10_float32_t q2_tw0[2][4],
1080  q2_tw1[2][4];
1081 
1082  /* INPUT & OUTPUT
1083  * 0R 1R 2R 3R Q0
1084  * 0I 1I 2I 3I Q1
1085  * 4R 5R 6R 7R Q2
1086  * 4I 5I 6I 7I Q3
1087  */
1088 
1089  // load
1090  fin_r += 4;
1091  q_in0[0] = fin_r[0];
1092  q_in0[1] = fin_r[1];
1093  q_in0[2] = fin_r[2];
1094  q_in0[3] = fin_r[3];
1095 
1096  fin_r += (nfft>>1);
1097  q_in1[0] = fin_r[0];
1098  q_in1[1] = fin_r[1];
1099  q_in1[2] = fin_r[2];
1100  q_in1[3] = fin_r[3];
1101 
1102  fin_r -= 10;
1103  q_in3[0] = fin_r[0];
1104  q_in3[1] = fin_r[1];
1105  q_in3[2] = fin_r[2];
1106  q_in3[3] = fin_r[3];
1107 
1108  fin_r += (nfft>>1);
1109  q_in2[0] = fin_r[0];
1110  q_in2[1] = fin_r[1];
1111  q_in2[2] = fin_r[2];
1112  q_in2[3] = fin_r[3];
1113 
1114  // NE10_PRINT_Q_VECTOR(q_in0);
1115  // NE10_PRINT_Q_VECTOR(q_in1);
1116  // NE10_PRINT_Q_VECTOR(q_in2);
1117  // NE10_PRINT_Q_VECTOR(q_in3);
1118 
1119  // OUTPUT
1120  // INPUT
1121 #define NE10_INV_BUTTERFLY_TMP(I1,I2,J1,J2,K1,K2,S1,S2) do { \
1122  q_out ## I1 [I2] = ( q_in ## K1 [K2] + q_in ## S1 [S2] ); \
1123  q_out ## J1 [J2] = ( q_in ## K1 [K2] - q_in ## S1 [S2] ); \
1124 } while(0);
1125 
1126  // STAGE
1127  // q_in -> q_out
1128  // and transpose
1129  NE10_INV_BUTTERFLY_TMP( 0,0, 0,2,
1130  0,0, 2,2);
1131 
1132  NE10_INV_BUTTERFLY_TMP( 1,2, 1,0,
1133  0,1, 2,3);
1134 
1135  NE10_INV_BUTTERFLY_TMP( 0,1, 1,3,
1136  1,0, 3,2);
1137 
1138  q_in3[3] *= - 1.0f;
1139  NE10_INV_BUTTERFLY_TMP( 1,1, 0,3,
1140  3,3, 1,1);
1141 
1142  NE10_INV_BUTTERFLY_TMP( 2,0, 2,2,
1143  0,2, 2,0);
1144 
1145  NE10_INV_BUTTERFLY_TMP( 3,2, 3,0,
1146  0,3, 2,1);
1147 
1148  NE10_INV_BUTTERFLY_TMP( 2,1, 3,3,
1149  1,2, 3,0);
1150 
1151  q_in3[1] *= - 1.0f;
1152  NE10_INV_BUTTERFLY_TMP( 3,1, 2,3,
1153  3,1, 1,3);
1154 #undef NE10_INV_BUTTERFLY_TMP
1155 
1156  // NE10_PRINT_Q_VECTOR(q_out0);
1157  // NE10_PRINT_Q_VECTOR(q_out1);
1158  // NE10_PRINT_Q_VECTOR(q_out2);
1159  // NE10_PRINT_Q_VECTOR(q_out3);
1160 
1161  // BUTTERFLY - radix 4x2
1162  // STAGE
1163  // q_out -> q_in
1164 
1165  // OUTPUT
1166  // INPUT
1167 #define NE10_INV_BUTTERFLY_TMP(I1,I2,J1,J2,K1,K2,S1,S2) do { \
1168  q_in ## I1 [I2] = ( q_out ## K1 [K2] + q_out ## S1 [S2] ); \
1169  q_in ## J1 [J2] = ( q_out ## K1 [K2] - q_out ## S1 [S2] ); \
1170 } while(0);
1171 
1172  NE10_INV_BUTTERFLY_TMP(0,0, 0,2,
1173  0,0, 0,1);
1174 
1175  NE10_INV_BUTTERFLY_TMP(1,0, 1,2,
1176  1,0, 1,1);
1177 
1178  NE10_INV_BUTTERFLY_TMP(0,1, 0,3,
1179  0,2, 0,3);
1180 
1181  NE10_INV_BUTTERFLY_TMP(1,1, 1,3,
1182  1,2, 1,3);
1183 
1184  NE10_INV_BUTTERFLY_TMP(2,0, 2,2,
1185  2,0, 2,1);
1186 
1187  NE10_INV_BUTTERFLY_TMP(3,0, 3,2,
1188  3,0, 3,1);
1189 
1190 
1191  NE10_INV_BUTTERFLY_TMP(2,1, 2,3,
1192  2,2, 2,3);
1193 
1194  NE10_INV_BUTTERFLY_TMP(3,1, 3,3,
1195  3,2, 3,3);
1196 
1197  // NE10_PRINT_Q_VECTOR(q_in0);
1198  // NE10_PRINT_Q_VECTOR(q_in1);
1199  // NE10_PRINT_Q_VECTOR(q_in2);
1200  // NE10_PRINT_Q_VECTOR(q_in3);
1201 #undef NE10_INV_BUTTERFLY_TMP
1202 
1203  // load tw
1204  q2_tw0[0][0] = tw[0];
1205  q2_tw0[0][1] = tw[2];
1206  q2_tw0[0][2] = tw[4];
1207  q2_tw0[0][3] = tw[6];
1208  q2_tw0[1][0] = tw[1];
1209  q2_tw0[1][1] = tw[3];
1210  q2_tw0[1][2] = tw[5];
1211  q2_tw0[1][3] = tw[7];
1212 
1213  q2_tw1[0][0] = tw[0+8];
1214  q2_tw1[0][1] = tw[2+8];
1215  q2_tw1[0][2] = tw[4+8];
1216  q2_tw1[0][3] = tw[6+8];
1217  q2_tw1[1][0] = tw[1+8];
1218  q2_tw1[1][1] = tw[3+8];
1219  q2_tw1[1][2] = tw[5+8];
1220  q2_tw1[1][3] = tw[7+8];
1221 
1222  // TW: in->out
1223  q_out0[0] = q_in0[0];
1224  q_out1[0] = q_in1[0];
1225  q_out2[0] = q_in2[0];
1226  q_out3[0] = q_in3[0];
1227 
1228  //----------------------------------------------------------//
1229  // first 2 lines
1230  // R R R I I
1231  q_out0[1] = q_in0[1] * q2_tw0[0][1] + q_in1[1] * q2_tw0[1][1];
1232  // I R I I R
1233  q_out1[1] = q_in0[1] * q2_tw0[1][1] - q_in1[1] * q2_tw0[0][1];
1234 
1235  // R R R I I
1236  q_out0[2] = q_in0[2] * q2_tw0[0][2] + q_in1[2] * q2_tw0[1][2];
1237  // I R I I R
1238  q_out1[2] = q_in0[2] * q2_tw0[1][2] - q_in1[2] * q2_tw0[0][2];
1239 
1240  // R R R I I
1241  q_out0[3] = q_in0[3] * q2_tw0[0][3] + q_in1[3] * q2_tw0[1][3];
1242  // I R I I R
1243  q_out1[3] = q_in0[3] * q2_tw0[1][3] - q_in1[3] * q2_tw0[0][3];
1244 
1245  //----------------------------------------------------------//
1246  // second 2 lines
1247  // R R R I I
1248  q_out2[1] = q_in2[1] * q2_tw1[0][1] + q_in3[1] * q2_tw1[1][1];
1249  // I R I I R
1250  q_out3[1] = q_in2[1] * q2_tw1[1][1] - q_in3[1] * q2_tw1[0][1];
1251 
1252  // R R R I I
1253  q_out2[2] = q_in2[2] * q2_tw1[0][2] + q_in3[2] * q2_tw1[1][2];
1254  // I R I I R
1255  q_out3[2] = q_in2[2] * q2_tw1[1][2] - q_in3[2] * q2_tw1[0][2];
1256 
1257  // R R R I I
1258  q_out2[3] = q_in2[3] * q2_tw1[0][3] + q_in3[3] * q2_tw1[1][3];
1259  // I R I I R
1260  q_out3[3] = q_in2[3] * q2_tw1[1][3] - q_in3[3] * q2_tw1[0][3];
1261 
1262  // STORE
1263  *(fout_r++) = q_out0[0];
1264  *(fout_r++) = q_out0[1];
1265  *(fout_r++) = q_out0[2];
1266  *(fout_r++) = q_out0[3];
1267  *(fout_r++) = q_out1[0];
1268  *(fout_r++) = - q_out1[1];
1269  *(fout_r++) = - q_out1[2];
1270  *(fout_r++) = - q_out1[3];
1271  *(fout_r++) = q_out2[0];
1272  *(fout_r++) = q_out2[1];
1273  *(fout_r++) = q_out2[2];
1274  *(fout_r++) = q_out2[3];
1275  *(fout_r++) = q_out3[0];
1276  *(fout_r++) = - q_out3[1];
1277  *(fout_r++) = - q_out3[2];
1278  *(fout_r++) = - q_out3[3];
1279 }
1280 
1281 NE10_INLINE void ne10_radix4_r2c_with_twiddles_last_stage_other_butterfly (ne10_fft_cpx_float32_t *dst,
1282  const ne10_fft_cpx_float32_t *src,
1283  const ne10_fft_cpx_float32_t *twiddles,
1284  const ne10_int32_t nfft)
1285 {
1286  const ne10_float32_t *fin_r = ((const ne10_float32_t*) src) + 12 + 16;
1287  ne10_float32_t *fout_r = (ne10_float32_t*) dst + 8;
1288  ne10_float32_t *fout_b = (ne10_float32_t*) dst - 14;
1289  const ne10_float32_t *tw = ((const ne10_float32_t*) twiddles) + 8 + 16;
1290 
1291  // Take 4 elements as a set.
1292  // The first 8 sets have already been transformed by the first and second butterflies.
1293  // This function transforms 8 sets per loop iteration.
1294  ne10_int32_t loop_count = ((nfft >> 2) - 8) >> 3;
1295 
1296  for (; loop_count > 0; loop_count--)
1297  {
1298  NE10_DECLARE_4 (float32x4x2_t, q2_in); // 8Q
1299  NE10_DECLARE_3 (float32x4x2_t, q2_tw); // 6Q
1300  NE10_DECLARE_4 (float32x4x2_t, q2_out); // 8Q
1301 
1302  /* INPUT
1303  * 0R 1R 2R 3R Q0
1304  * 0I 1I 2I 3I Q1
1305  * 4R 5R 6R 7R Q2
1306  * 4I 5I 6I 7I Q3
1307  * 8R 9R aR bR Q4
1308  * 8I 9I aI bI Q5
1309  * cR dR eR fR Q6
1310  * cI dI eI fI Q7
1311  */
1312 
1313  // transpose
1314  // q2_out -> q2_in
1315  /*
1316  * val[0]
1317  * 0R 4R 8R cR Q0
1318  * 1R 5R 9R dR Q2
1319  * 2R 6R aR eR Q4
1320  * 3R 7R bR fR Q6
1321  *
1322  * val[1]
1323  * 0I 4I 8I cI Q1
1324  * 1I 5I 9I dI Q3
1325  * 2I 6I aI eI Q5
1326  * 3I 7I bI fI Q7
1327  */
1328 
1329 #ifndef NE10_INLINE_ASM_OPT
1330  q2_out0.val[0] = vld1q_f32 (fin_r);
1331  fin_r += 4;
1332  q2_out0.val[1] = vld1q_f32 (fin_r);
1333  fin_r += 4;
1334  q2_out1.val[0] = vld1q_f32 (fin_r);
1335  fin_r += 4;
1336  q2_out1.val[1] = vld1q_f32 (fin_r);
1337  fin_r += 4;
1338  q2_out2.val[0] = vld1q_f32 (fin_r);
1339  fin_r += 4;
1340  q2_out2.val[1] = vld1q_f32 (fin_r);
1341  fin_r += 4;
1342  q2_out3.val[0] = vld1q_f32 (fin_r);
1343  fin_r += 4;
1344  q2_out3.val[1] = vld1q_f32 (fin_r);
1345  fin_r += 4;
1346 
1347  NE10_RADIX4X4C_TRANSPOSE_NEON (q2_in, q2_out);
1348 #else // NE10_INLINE_ASM_OPT
1349 #ifndef __aarch64__
1350 #error Currently, inline assembly optimizations are only available on AArch64.
1351 #else // __aarch64__
1352  asm volatile (
1353  "ld1 {v0.4s}, [%[fin_r]], 16 \n\t" // q2_in0.val[0]
1354  "ld1 {v4.4s}, [%[fin_r]], 16 \n\t" // q2_in0.val[1]
1355  "ld1 {v1.4s}, [%[fin_r]], 16 \n\t" // q2_in1.val[0]
1356  "ld1 {v5.4s}, [%[fin_r]], 16 \n\t" // q2_in1.val[1]
1357  "ld1 {v2.4s}, [%[fin_r]], 16 \n\t" // q2_in2.val[0]
1358  "ld1 {v6.4s}, [%[fin_r]], 16 \n\t" // q2_in2.val[1]
1359  "ld1 {v3.4s}, [%[fin_r]], 16 \n\t" // q2_in3.val[0]
1360  "ld1 {v7.4s}, [%[fin_r]], 16 \n\t" // q2_in3.val[1]
1361  // NE10_RADIX4X4C_TRANSPOSE_NEON (q2_in,q2_out);
1362  "trn1 v8.4s, v0.4s, v1.4s \n\t"
1363  "trn2 v9.4s, v0.4s, v1.4s \n\t"
1364  "trn1 v10.4s, v2.4s, v3.4s \n\t"
1365  "trn2 v11.4s, v2.4s, v3.4s \n\t"
1366 
1367  "trn1 %[q2_in0r].2d, v8.2d, v10.2d \n\t"
1368  "trn1 %[q2_in1r].2d, v9.2d, v11.2d \n\t"
1369  "trn2 %[q2_in2r].2d, v8.2d, v10.2d \n\t"
1370  "trn2 %[q2_in3r].2d, v9.2d, v11.2d \n\t"
1371 
1372  "trn1 v8.4s, v4.4s, v5.4s \n\t"
1373  "trn2 v9.4s, v4.4s, v5.4s \n\t"
1374  "trn1 v10.4s, v6.4s, v7.4s \n\t"
1375  "trn2 v11.4s, v6.4s, v7.4s \n\t"
1376 
1377  "trn1 %[q2_in0i].2d, v8.2d, v10.2d \n\t"
1378  "trn1 %[q2_in1i].2d, v9.2d, v11.2d \n\t"
1379  "trn2 %[q2_in2i].2d, v8.2d, v10.2d \n\t"
1380  "trn2 %[q2_in3i].2d, v9.2d, v11.2d \n\t"
1381 
1382  : [q2_in0r]"+w"(q2_in0.val[0]),
1383  [q2_in0i]"+w"(q2_in0.val[1]),
1384  [q2_in1r]"+w"(q2_in1.val[0]),
1385  [q2_in1i]"+w"(q2_in1.val[1]),
1386  [q2_in2r]"+w"(q2_in2.val[0]),
1387  [q2_in2i]"+w"(q2_in2.val[1]),
1388  [q2_in3r]"+w"(q2_in3.val[0]),
1389  [q2_in3i]"+w"(q2_in3.val[1]),
1390  [fin_r]"+r"(fin_r)
1391  :
1392  : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1393  "v8", "v9", "v10", "v11"
1394  );
1395 #endif // __aarch64__
1396 #endif // NE10_INLINE_ASM_OPT
1397 
1398 #ifndef NE10_INLINE_ASM_OPT
1399  // Load twiddles
1400  q2_tw0 = vld2q_f32 (tw);
1401  tw += 8;
1402  q2_tw1 = vld2q_f32 (tw);
1403  tw += 8;
1404  q2_tw2 = vld2q_f32 (tw);
1405  tw += 8;
1406 
1407  // tw
1408  // q2_in -> q2_out
1409  q2_out0 = q2_in0;
1410  NE10_CPX_MUL_NEON_F32 (q2_out1, q2_in1, q2_tw0);
1411  NE10_CPX_MUL_NEON_F32 (q2_out2, q2_in2, q2_tw1);
1412  NE10_CPX_MUL_NEON_F32 (q2_out3, q2_in3, q2_tw2);
1413 #else // NE10_INLINE_ASM_OPT
1414 #ifndef __aarch64__
1415 #error Currently, inline assembly optimizations are only available on AArch64.
1416 #else // __aarch64__
1417  asm volatile (
1418  // Load twiddles
1419  "ld2 {v0.4s, v1.4s}, [%[tw0]] \n\t" // q2_tw0
1420  "ld2 {v2.4s, v3.4s}, [%[tw1]] \n\t" // q2_tw1
1421  "ld2 {v4.4s, v5.4s}, [%[tw2]] \n\t" // q2_tw2
1422  // tw
1423  // q2_in -> q2_out
1424  // NE10_CPX_MUL_NEON_F32(q2_out1,q2_in1,q2_tw0);
1425  "fmul %[q2_out1r].4s, v0.4s, %[q2_in1r].4s \n\t" // RR
1426  "fmul %[q2_out1i].4s, v0.4s, %[q2_in1i].4s \n\t" // RI
1427  "fmls %[q2_out1r].4s, v1.4s, %[q2_in1i].4s \n\t" // RR - II
1428  "fmla %[q2_out1i].4s, v1.4s, %[q2_in1r].4s \n\t" // RI + IR
1429  // NE10_CPX_MUL_NEON_F32(q2_out2,q2_in2,q2_tw1);
1430  "fmul %[q2_out2r].4s, v2.4s, %[q2_in2r].4s \n\t" // RR
1431  "fmul %[q2_out2i].4s, v2.4s, %[q2_in2i].4s \n\t" // RI
1432  "fmls %[q2_out2r].4s, v3.4s, %[q2_in2i].4s \n\t" // RR - II
1433  "fmla %[q2_out2i].4s, v3.4s, %[q2_in2r].4s \n\t" // RI + IR
1434  // NE10_CPX_MUL_NEON_F32(q2_out3,q2_in3,q2_tw2);
1435  "fmul %[q2_out3r].4s, v4.4s, %[q2_in3r].4s \n\t" // RR
1436  "fmul %[q2_out3i].4s, v4.4s, %[q2_in3i].4s \n\t" // RI
1437  "fmls %[q2_out3r].4s, v5.4s, %[q2_in3i].4s \n\t" // RR - II
1438  "fmla %[q2_out3i].4s, v5.4s, %[q2_in3r].4s \n\t" // RI + IR
1439  : [q2_out1r]"+w"(q2_out1.val[0]),
1440  [q2_out1i]"+w"(q2_out1.val[1]),
1441  [q2_out2r]"+w"(q2_out2.val[0]),
1442  [q2_out2i]"+w"(q2_out2.val[1]),
1443  [q2_out3r]"+w"(q2_out3.val[0]),
1444  [q2_out3i]"+w"(q2_out3.val[1])
1445  : [tw0]"r"(tw),
1446  [tw1]"r"(tw + 8),
1447  [tw2]"r"(tw + 16),
1448  [q2_in1r]"w"(q2_in1.val[0]),
1449  [q2_in1i]"w"(q2_in1.val[1]),
1450  [q2_in2r]"w"(q2_in2.val[0]),
1451  [q2_in2i]"w"(q2_in2.val[1]),
1452  [q2_in3r]"w"(q2_in3.val[0]),
1453  [q2_in3i]"w"(q2_in3.val[1])
1454  : "memory", "v0", "v1", "v2", "v3", "v4", "v5"
1455  );
1456  q2_out0 = q2_in0;
1457  tw += 24;
1458 #endif // __aarch64__
1459 #endif // NE10_INLINE_ASM_OPT
1460 
1461  // butterfly
1462  // out -> in
1463  q2_in0.val[0] = vaddq_f32 (q2_out0.val[0], q2_out2.val[0]);
1464  q2_in0.val[1] = vaddq_f32 (q2_out0.val[1], q2_out2.val[1]);
1465  q2_in1.val[0] = vsubq_f32 (q2_out0.val[0], q2_out2.val[0]);
1466  q2_in1.val[1] = vsubq_f32 (q2_out0.val[1], q2_out2.val[1]);
1467  q2_in2.val[0] = vaddq_f32 (q2_out1.val[0], q2_out3.val[0]);
1468  q2_in2.val[1] = vaddq_f32 (q2_out1.val[1], q2_out3.val[1]);
1469  q2_in3.val[0] = vsubq_f32 (q2_out1.val[0], q2_out3.val[0]);
1470  q2_in3.val[1] = vsubq_f32 (q2_out1.val[1], q2_out3.val[1]);
1471 
1472  // in -> out
1473  q2_out2.val[0] = vsubq_f32 (q2_in0.val[0], q2_in2.val[0]);
1474  q2_out2.val[1] = vsubq_f32 (q2_in0.val[1], q2_in2.val[1]);
1475  q2_out3.val[0] = vsubq_f32 (q2_in1.val[0], q2_in3.val[1]);
1476  q2_out3.val[1] = vaddq_f32 (q2_in1.val[1], q2_in3.val[0]);
1477 
1478  q2_out3.val[1] = vnegq_f32 (q2_out3.val[1]);
1479  q2_out2.val[1] = vnegq_f32 (q2_out2.val[1]);
1480 
1481 #ifndef NE10_INLINE_ASM_OPT
1482  q2_out0.val[0] = vaddq_f32 (q2_in0.val[0], q2_in2.val[0]);
1483  q2_out0.val[1] = vaddq_f32 (q2_in0.val[1], q2_in2.val[1]);
1484 
1485  q2_out1.val[0] = vaddq_f32 (q2_in1.val[0], q2_in3.val[1]);
1486  q2_out1.val[1] = vsubq_f32 (q2_in1.val[1], q2_in3.val[0]);
1487 
1488  // reverse -- CONJ
1489  NE10_REVERSE_FLOAT32X4 (q2_out2.val[0]);
1490  NE10_REVERSE_FLOAT32X4 (q2_out2.val[1]);
1491  NE10_REVERSE_FLOAT32X4 (q2_out3.val[0]);
1492  NE10_REVERSE_FLOAT32X4 (q2_out3.val[1]);
1493 
1494  // store
1495  vst2q_f32 (fout_r, q2_out0);
1496  vst2q_f32 (fout_r + (nfft >> 1), q2_out1);
1497  vst2q_f32 (fout_b + (nfft >> 1), q2_out3);
1498  vst2q_f32 (fout_b + nfft, q2_out2);
1499 #else // NE10_INLINE_ASM_OPT
1500 #ifndef __aarch64__
1501 #error Currently, inline assembly optimizations are only available on AArch64.
1502 #else // __aarch64__
1503  asm volatile (
1504  "fadd v0.4s, %[q2_in0r].4s, %[q2_in2r].4s \n\t"
1505  "fadd v1.4s, %[q2_in0i].4s, %[q2_in2i].4s \n\t"
1506  "fadd v2.4s, %[q2_in1r].4s, %[q2_in3i].4s \n\t"
1507  "fsub v3.4s, %[q2_in1i].4s, %[q2_in3r].4s \n\t"
1508  // reverse -- CONJ
1509  "rev64 %[q2_in2r].4s, %[q2_out2r].4s \n\t"
1510  "rev64 %[q2_in2i].4s, %[q2_out2i].4s \n\t"
1511  "rev64 %[q2_in3r].4s, %[q2_out3r].4s \n\t"
1512  "rev64 %[q2_in3i].4s, %[q2_out3i].4s \n\t"
1513  "ext v4.16b, %[q2_in2r].16b, %[q2_in2r].16b, #8 \n\t"
1514  "ext v5.16b, %[q2_in2i].16b, %[q2_in2i].16b, #8 \n\t"
1515  "ext v6.16b, %[q2_in3r].16b, %[q2_in3r].16b, #8 \n\t"
1516  "ext v7.16b, %[q2_in3i].16b, %[q2_in3i].16b, #8 \n\t"
1517  // store
1518  "st2 {v0.4s, v1.4s}, [%[fout0]] \n\t"
1519  "st2 {v2.4s, v3.4s}, [%[fout1]] \n\t"
1520  "st2 {v4.4s, v5.4s}, [%[fout2]] \n\t"
1521  "st2 {v6.4s, v7.4s}, [%[fout3]] \n\t"
1522  :
1523  : [fout0]"r"(fout_r),
1524  [fout1]"r"(fout_r + (nfft>>1)),
1525  [fout2]"r"(fout_b + nfft),
1526  [fout3]"r"(fout_b + (nfft>>1)),
1527  [q2_out2r]"w"(q2_out2.val[0]),
1528  [q2_out2i]"w"(q2_out2.val[1]),
1529  [q2_out3r]"w"(q2_out3.val[0]),
1530  [q2_out3i]"w"(q2_out3.val[1]),
1531  [q2_in0r]"w"(q2_in0.val[0]),
1532  [q2_in0i]"w"(q2_in0.val[1]),
1533  [q2_in1r]"w"(q2_in1.val[0]),
1534  [q2_in1i]"w"(q2_in1.val[1]),
1535  [q2_in2r]"w"(q2_in2.val[0]),
1536  [q2_in2i]"w"(q2_in2.val[1]),
1537  [q2_in3r]"w"(q2_in3.val[0]),
1538  [q2_in3i]"w"(q2_in3.val[1])
1539  : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
1540  );
1541 #endif // __aarch64__
1542 #endif // NE10_INLINE_ASM_OPT
1543 
1544  fout_r += 8;
1545  fout_b -= 8;
1546  }
1547 }
1548 
1549 NE10_INLINE void ne10_radix4_c2r_with_twiddles_first_stage_other_butterfly (ne10_fft_cpx_float32_t *dst,
1550  const ne10_fft_cpx_float32_t *src,
1551  const ne10_fft_cpx_float32_t *twiddles,
1552  const ne10_int32_t nfft)
1553 {
1554  ne10_float32_t *fout_r = ((ne10_float32_t*) dst ) + 12 + 16 ;
1555  const ne10_float32_t *fin_r = (const ne10_float32_t*) src + 8;
1556  const ne10_float32_t *fin_b = (const ne10_float32_t*) src - 14;
1557  const ne10_float32_t *tw = ((const ne10_float32_t*) twiddles) + 8 + 16;
1558  ne10_int32_t loop_count = ((nfft>>2)-8)>>3;
1559 
1560  for ( ; loop_count>0; loop_count -- )
1561  {
1562  NE10_DECLARE_4(float32x4x2_t,q2_in); // 8Q
1563  NE10_DECLARE_3(float32x4x2_t,q2_tw); // 6Q
1564  NE10_DECLARE_4(float32x4x2_t,q2_out); // 8Q
1565 
1566  /* INPUT
1567  * 0R 1R 2R 3R Q0
1568  * 0I 1I 2I 3I Q1
1569  * 4R 5R 6R 7R Q2
1570  * 4I 5I 6I 7I Q3
1571  * 8R 9R aR bR Q4
1572  * 8I 9I aI bI Q5
1573  * cR dR eR fR Q6
1574  * cI dI eI fI Q7
1575  */
1576 
1577  q2_in0 = vld2q_f32(fin_r );
1578  q2_in1 = vld2q_f32(fin_r + (nfft>>1));
1579  fin_r += 8;
1580 
1581  q2_in3 = vld2q_f32(fin_b + (nfft>>1));
1582  q2_in2 = vld2q_f32(fin_b + nfft );
1583  fin_b -= 8;
1584 
1585  q2_tw0 = vld2q_f32(tw);
1586  tw += 8;
1587  q2_tw1 = vld2q_f32(tw);
1588  tw += 8;
1589  q2_tw2 = vld2q_f32(tw);
1590  tw += 8;
1591 
1592  // reverse -- CONJ
1593  NE10_REVERSE_FLOAT32X4( q2_in3.val[0] );
1594  NE10_REVERSE_FLOAT32X4( q2_in3.val[1] );
1595  NE10_REVERSE_FLOAT32X4( q2_in2.val[0] );
1596  NE10_REVERSE_FLOAT32X4( q2_in2.val[1] );
1597 
1598  q2_in2.val[1] = vnegq_f32( q2_in2.val[1] );
1599  q2_in3.val[1] = vnegq_f32( q2_in3.val[1] );
1600 
1601  // in -> out
1602  q2_out0.val[0] = vaddq_f32 (q2_in0.val[0], q2_in2.val[0]);
1603  q2_out2.val[0] = vsubq_f32 (q2_in0.val[0], q2_in2.val[0]);
1604 
1605  q2_out0.val[1] = vaddq_f32 (q2_in0.val[1], q2_in2.val[1]);
1606  q2_out2.val[1] = vsubq_f32 (q2_in0.val[1], q2_in2.val[1]);
1607 
1608  q2_out1.val[0] = vaddq_f32 (q2_in1.val[0], q2_in3.val[0]);
1609  q2_out3.val[1] = vsubq_f32 (q2_in1.val[0], q2_in3.val[0]);
1610 
1611  q2_out1.val[1] = vaddq_f32 (q2_in3.val[1], q2_in1.val[1]);
1612  q2_out3.val[0] = vsubq_f32 (q2_in3.val[1], q2_in1.val[1]);
1613 
1614  // out -> in
1615  q2_in0.val[0] = vaddq_f32 (q2_out0.val[0], q2_out1.val[0]);
1616  q2_in2.val[0] = vsubq_f32 (q2_out0.val[0], q2_out1.val[0]);
1617 
1618  q2_in0.val[1] = vaddq_f32 (q2_out0.val[1], q2_out1.val[1]);
1619  q2_in2.val[1] = vsubq_f32 (q2_out0.val[1], q2_out1.val[1]);
1620 
1621  q2_in1.val[0] = vaddq_f32 (q2_out2.val[0], q2_out3.val[0]);
1622  q2_in3.val[0] = vsubq_f32 (q2_out2.val[0], q2_out3.val[0]);
1623 
1624  q2_in1.val[1] = vaddq_f32 (q2_out2.val[1], q2_out3.val[1]);
1625  q2_in3.val[1] = vsubq_f32 (q2_out2.val[1], q2_out3.val[1]);
1626 
1627  // tw
1628  // q2_in -> q2_out
1629  q2_out0 = q2_in0;
1630  NE10_CPX_MUL_INV_NEON_F32(q2_out1,q2_in1,q2_tw0);
1631  NE10_CPX_MUL_INV_NEON_F32(q2_out2,q2_in2,q2_tw1);
1632  NE10_CPX_MUL_INV_NEON_F32(q2_out3,q2_in3,q2_tw2);
1633 
1634  // transpose
1635  // q2_out -> q2_in
1636  NE10_RADIX4X4C_TRANSPOSE_NEON (q2_in,q2_out);
1637 
1638  // store
1639  vst1q_f32(fout_r, q2_in0.val[0]);
1640  fout_r += 4;
1641  vst1q_f32(fout_r, q2_in0.val[1]);
1642  fout_r += 4;
1643  vst1q_f32(fout_r, q2_in1.val[0]);
1644  fout_r += 4;
1645  vst1q_f32(fout_r, q2_in1.val[1]);
1646  fout_r += 4;
1647  vst1q_f32(fout_r, q2_in2.val[0]);
1648  fout_r += 4;
1649  vst1q_f32(fout_r, q2_in2.val[1]);
1650  fout_r += 4;
1651  vst1q_f32(fout_r, q2_in3.val[0]);
1652  fout_r += 4;
1653  vst1q_f32(fout_r, q2_in3.val[1]);
1654  fout_r += 4;
1655  }
1656 }
1657 
1658 NE10_INLINE void ne10_radix4_r2c_with_twiddles_last_stage( ne10_fft_cpx_float32_t *dst,
1659  const ne10_fft_cpx_float32_t *src,
1660  const ne10_fft_cpx_float32_t *twiddles,
1661  const ne10_int32_t nfft)
1662 {
1663  ne10_radix4_r2c_with_twiddles_last_stage_first_butterfly(dst,src,twiddles,nfft);
1664 
1665  if (nfft==16)
1666  {
1667  return;
1668  }
1669 
1670  ne10_radix4_r2c_with_twiddles_last_stage_second_butterfly(dst,src,twiddles,nfft);
1671 
1672  if (nfft==32)
1673  {
1674  return;
1675  }
1676 
1677  ne10_radix4_r2c_with_twiddles_last_stage_other_butterfly(dst,src,twiddles,nfft);
1678 }
1679 
1680 NE10_INLINE void ne10_radix4_c2r_with_twiddles_first_stage( ne10_fft_cpx_float32_t *dst,
1681  const ne10_fft_cpx_float32_t *src,
1682  const ne10_fft_cpx_float32_t *twiddles,
1683  const ne10_int32_t nfft)
1684 {
1685  ne10_radix4_c2r_with_twiddles_first_stage_first_butterfly(dst,src,twiddles,nfft);
1686 
1687  if (nfft==16)
1688  {
1689  return;
1690  }
1691 
1692  ne10_radix4_c2r_with_twiddles_first_stage_second_butterfly(dst,src,twiddles,nfft);
1693 
1694  if (nfft==32)
1695  {
1696  return;
1697  }
1698 
1699  ne10_radix4_c2r_with_twiddles_first_stage_other_butterfly(dst,src,twiddles,nfft);
1700 }
1701 
1717 void ne10_fft_r2c_1d_float32_neon (ne10_fft_cpx_float32_t *fout,
1718  ne10_float32_t *fin,
1719  ne10_fft_r2c_cfg_float32_t cfg)
1720 {
1721  typedef ne10_float32_t REAL;
1722  typedef ne10_fft_cpx_float32_t CPLX;
1723 
1724  ne10_fft_cpx_float32_t * tmpbuf = cfg->buffer;
1725  ne10_float32_t *fout_r = (ne10_float32_t*) fout;
1726 
1727  switch (cfg->nfft)
1728  {
1729  case 8:
1730  ne10_radix8_r2c_c ( (CPLX*) fout_r, (const CPLX*) fin, 1, 1, 8);
1731  fout[0].r = fout[0].i;
1732  break;
1733  default:
1734  ne10_mixed_radix_r2c_butterfly_float32_neon (fout, (CPLX*) fin, cfg->r_factors_neon, cfg->r_twiddles_neon, tmpbuf);
1735  ne10_radix4_r2c_with_twiddles_last_stage(fout, tmpbuf, cfg->r_super_twiddles_neon, cfg->nfft);
1736  fout[cfg->nfft / 2].r = fout[0].i;
1737  break;
1738  }
1739  fout[0].i = fout[cfg->nfft / 2].i = 0.0f;
1740 }
1741 
1752 void ne10_fft_c2r_1d_float32_neon (ne10_float32_t *fout,
1753  ne10_fft_cpx_float32_t *fin,
1754  ne10_fft_r2c_cfg_float32_t cfg)
1755 {
1756  typedef ne10_float32_t REAL;
1757  typedef ne10_fft_cpx_float32_t CPLX;
1758 
1759  ne10_fft_cpx_float32_t * tmpbuf = cfg->buffer;
1760  ne10_fft_cpx_float32_t * fout_c;
1761  ne10_int32_t stage_count;
1762  ne10_int32_t radix;
1763 
1764  switch (cfg->nfft)
1765  {
1766  case 8:
1767  fin[0].i = fin[0].r;
1768  fin[0].r = 0.0f;
1769  ne10_radix8_c2r_c ( (CPLX*) fout, (const CPLX*) &fin[0].i, 1, 1, 8);
1770  fin[0].r = fin[0].i;
1771  break;
1772  default:
1773  stage_count = cfg->r_factors_neon[0];
1774  radix = cfg->r_factors_neon[ stage_count << 1 ];
1775  if (radix==2)
1776  {
1777  stage_count --;
1778  }
1779  fin[0].i = fin[cfg->nfft>>1].r;
1780  fout_c = (stage_count % 2==1) ? tmpbuf : (CPLX*)fout;
1781  ne10_radix4_c2r_with_twiddles_first_stage( (CPLX*) fout_c, fin, cfg->r_super_twiddles_neon, cfg->nfft);
1782  ne10_mixed_radix_c2r_butterfly_float32_neon ( (CPLX*) fout, (CPLX*) NULL, cfg->r_factors_neon, cfg->r_twiddles_neon_backward, tmpbuf);
1783  break;
1784  }
1785  fin[0].i = 0.0f;
1786 }
1787 
Referenced symbols
ne10_fft_cpx_float32_t: defined in NE10_types.h:230
ne10_fft_r2c_state_float32_t: defined in NE10_types.h:272
void ne10_fft_r2c_1d_float32_neon(ne10_fft_cpx_float32_t *fout, ne10_float32_t *fin, ne10_fft_r2c_cfg_float32_t cfg)
  Mixed radix-2/4 FFT (real to complex) of float(32-bit) data. Defined at NE10_rfft_float32.neonintrinsic.c:1717
void ne10_fft_c2r_1d_float32_neon(ne10_float32_t *fout, ne10_fft_cpx_float32_t *fin, ne10_fft_r2c_cfg_float32_t cfg)
  Mixed radix-2/4 IFFT (complex to real) of float(32-bit) data. Defined at NE10_rfft_float32.neonintrinsic.c:1752
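For reference, below is a minimal usage sketch of the two entry points defined in this file. It is not part of the library source: the allocation helpers ne10_fft_alloc_r2c_float32 / ne10_fft_destroy_r2c_float32 and the nfft/2 + 1 output-size convention are assumptions taken from the wider Ne10 FFT API (NE10_dsp.h), and nfft is assumed to be a power of two supported by the real FFT.

#include <stdlib.h>
#include "NE10_dsp.h"

/* Sketch: forward then inverse real FFT of a float32 signal. Error handling omitted. */
void example_rfft_roundtrip (ne10_float32_t *time_in, ne10_float32_t *time_out, ne10_int32_t nfft)
{
    /* One cfg serves both directions; it owns the twiddles, factors and work buffer. */
    ne10_fft_r2c_cfg_float32_t cfg = ne10_fft_alloc_r2c_float32 (nfft);

    /* The r2c output holds nfft/2 + 1 complex bins (DC up to Nyquist). */
    ne10_fft_cpx_float32_t *freq = (ne10_fft_cpx_float32_t*)
        malloc ((nfft / 2 + 1) * sizeof (ne10_fft_cpx_float32_t));

    ne10_fft_r2c_1d_float32_neon (freq, time_in, cfg);   /* real -> complex */
    ne10_fft_c2r_1d_float32_neon (time_out, freq, cfg);  /* complex -> real */

    /* Without NE10_DSP_RFFT_SCALING, time_out is time_in scaled by nfft; with it
       defined, the c2r kernels apply the 1/nfft factor and the round trip is unity-gain. */

    free (freq);
    ne10_fft_destroy_r2c_float32 (cfg);
}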