Project Ne10
An Open Optimized Software Library Project for the ARM Architecture
common
factor.h
1
/*
2
* Copyright 2011-15 ARM Limited and Contributors.
3
* All rights reserved.
4
*
5
* Redistribution and use in source and binary forms, with or without
6
* modification, are permitted provided that the following conditions are met:
7
* * Redistributions of source code must retain the above copyright
8
* notice, this list of conditions and the following disclaimer.
9
* * Redistributions in binary form must reproduce the above copyright
10
* notice, this list of conditions and the following disclaimer in the
11
* documentation and/or other materials provided with the distribution.
12
* * Neither the name of ARM Limited nor the
13
* names of its contributors may be used to endorse or promote products
14
* derived from this software without specific prior written permission.
15
*
16
* THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
17
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
* DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
20
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
*/
27
28
/*
29
* NE10 Library : common/factor.h
30
*/
31
32
// Typebuilding MACROs
33
// - Slight difference between toolchain versions on intrinsics
34
/*
 * Brace initializer for an aggregate holding three 2-lane float vectors,
 * given as the pairs (x1,y1), (x2,y2), (x3,y3). It is used in this file to
 * zero-initialise float32x2x3_t temporaries.
 */
#define FLOAT32_2x3(x1,y1,x2,y2,x3,y3) \
    {{ \
        {x1, y1}, {x2, y2}, {x3, y3} \
    }}
38
39
// Unit test use this macro to index into their function table
40
// "opc" stands for operation's code (which function),
41
// and "imp" stands for implementation (which implementation of the function)
42
/*
 * Flat index into the unit-test function table for the 1-based pair
 * (opc, imp). Each operation owns IMPL_COUNT consecutive slots.
 * The arguments and IMPL_COUNT are parenthesised so that expression
 * arguments (e.g. "a | b") are evaluated before the "- 1".
 */
#define FTBL_IDX(opc, imp) ((((opc)-1)*(IMPL_COUNT))+((imp)-1))
43
44
// This macro helps measure the performance of the code passed to it through the "code" argument
45
// It is used in the unit tests
46
/*
 * Times the statements in "code" with gettimeofday() and assigns the elapsed
 * time, in seconds as a double, to "res".
 *
 * The caller must have "before", "after", "lapsed" (objects with tv_sec and
 * tv_usec members) and "zone" in scope; all four are written by the macro.
 * "code" is pasted verbatim, so it must carry its own trailing semicolon.
 */
#define MEASURE(res, code) \
    { \
        gettimeofday (&before, &zone); \
        code \
        gettimeofday (&after, &zone); \
        /* borrow one second when the microsecond field would go negative */ \
        if (before.tv_usec > after.tv_usec) \
        { \
            after.tv_usec += 1000000; \
            after.tv_sec--; \
        } \
        lapsed.tv_usec = after.tv_usec - before.tv_usec; \
        lapsed.tv_sec = after.tv_sec - before.tv_sec; \
        res = lapsed.tv_sec + ((double)lapsed.tv_usec / 1000000.0); \
    }
60
61
// There are several categories of functions that share common code:
62
63
// Different groups of functions take different number of inputs
64
//
65
// Group 1 = Functions that take a dst, a src, and a cst ("DstSrcCst" for short)
66
// Group 2 = Those that take a dst, an acc, a src, and a cst ("DstAccSrcCst" for short)
67
// Group 3 = The ones that take a dst, and a cst only ("DstCst" for short)
68
//
69
// Group 4 = These take a dst, and two src inputs, src1 and src2 ("DstSrc1Src2")
70
// Group 5 = These take a dst, an acc, and two src inputs ("DstAccSrc1Src2")
71
// Group 6 = These take a dst, and a src ("DstSrc")
72
//
73
74
// The naming convention used in the following macros is as follows:
75
// SNAPP_<A>_OPERATION_<T>_<I>
76
// where
77
// <A> Stands for the title of the operation (add, mul, etc) followed by its type (C = const as in addc).
78
// The letter X - if used - means any such operation.
79
// <T> Indicates the type of the operation (float, vec2, etc.)
80
// The letter X - if used - means any type.
81
// <I> This indicates the implementation (it can be C, ASM, or NEON).
82
83
// A few macros to check pointers and their address range to make sure there's
84
// no unwanted overlap between any two of them
85
/*
 * Debug-time overlap check for functions that take "dst", "src" and "count".
 * If the two pointers differ, asserts that the lower buffer ends at or
 * before the start of the higher one. Identical pointers (in-place use)
 * pass the check.
 *
 * The extent of each buffer is count elements, i.e. count * sizeof(*ptr)
 * bytes; the arithmetic is done on char pointers so it does not rely on the
 * GNU void-pointer-arithmetic extension. The whole check is one braced
 * statement so it can be followed by ";" or by another statement.
 */
#define NE10_CHECKPOINTER_DstSrcCst_OPERATION \
    { \
        if ( (const char *)dst < (const char *)src ) \
        { assert ( (const char *)dst + count * sizeof (*dst) <= (const char *)src ); } \
        else if ( (const char *)dst > (const char *)src ) \
        { assert ( (const char *)src + count * sizeof (*src) <= (const char *)dst ); } \
    }
90
91
#define NE10_CHECKPOINTER_DstSrc_OPERATION NE10_CHECKPOINTER_DstSrcCst_OPERATION
92
93
/*
 * Debug-time pairwise overlap check for three buffers of "count" elements
 * each ("count" is taken from the caller's scope). For every pair of distinct
 * pointers, asserts that the lower buffer ends at or before the start of the
 * higher one; equal pointers pass.
 *
 * Buffer extents are count * sizeof(*ptr) bytes and are computed on char
 * pointers (no GNU void-pointer arithmetic). Arguments are parenthesised and
 * the whole check is a single braced statement.
 */
#define NE10_CHECKPOINTER_3POINTER_OPERATION(arg1, arg2, arg3) \
    { \
        /* arg1 vs arg2 */ \
        if ( (const char *)(arg1) < (const char *)(arg2) ) \
        { assert ( (const char *)(arg1) + count * sizeof (*(arg1)) <= (const char *)(arg2) ); } \
        else if ( (const char *)(arg1) > (const char *)(arg2) ) \
        { assert ( (const char *)(arg2) + count * sizeof (*(arg2)) <= (const char *)(arg1) ); } \
        /* arg1 vs arg3 */ \
        if ( (const char *)(arg1) < (const char *)(arg3) ) \
        { assert ( (const char *)(arg1) + count * sizeof (*(arg1)) <= (const char *)(arg3) ); } \
        else if ( (const char *)(arg1) > (const char *)(arg3) ) \
        { assert ( (const char *)(arg3) + count * sizeof (*(arg3)) <= (const char *)(arg1) ); } \
        /* arg3 vs arg2 */ \
        if ( (const char *)(arg3) < (const char *)(arg2) ) \
        { assert ( (const char *)(arg3) + count * sizeof (*(arg3)) <= (const char *)(arg2) ); } \
        else if ( (const char *)(arg3) > (const char *)(arg2) ) \
        { assert ( (const char *)(arg2) + count * sizeof (*(arg2)) <= (const char *)(arg3) ); } \
    }
106
107
/*
 * Debug-time pairwise overlap check for four buffers of "count" elements
 * each ("count" is taken from the caller's scope). The first three pointers
 * are checked by NE10_CHECKPOINTER_3POINTER_OPERATION; the remaining three
 * pairs involving arg4 are checked here. Equal pointers pass.
 *
 * Buffer extents are count * sizeof(*ptr) bytes and are computed on char
 * pointers (no GNU void-pointer arithmetic).
 */
#define NE10_CHECKPOINTER_4POINTER_OPERATION(arg1, arg2, arg3, arg4) \
    { \
        NE10_CHECKPOINTER_3POINTER_OPERATION(arg1, arg2, arg3) \
        /* arg1 vs arg4 */ \
        if ( (const char *)(arg1) < (const char *)(arg4) ) \
        { assert ( (const char *)(arg1) + count * sizeof (*(arg1)) <= (const char *)(arg4) ); } \
        else if ( (const char *)(arg1) > (const char *)(arg4) ) \
        { assert ( (const char *)(arg4) + count * sizeof (*(arg4)) <= (const char *)(arg1) ); } \
        /* arg2 vs arg4 */ \
        if ( (const char *)(arg2) < (const char *)(arg4) ) \
        { assert ( (const char *)(arg2) + count * sizeof (*(arg2)) <= (const char *)(arg4) ); } \
        else if ( (const char *)(arg2) > (const char *)(arg4) ) \
        { assert ( (const char *)(arg4) + count * sizeof (*(arg4)) <= (const char *)(arg2) ); } \
        /* arg4 vs arg3 */ \
        if ( (const char *)(arg4) < (const char *)(arg3) ) \
        { assert ( (const char *)(arg4) + count * sizeof (*(arg4)) <= (const char *)(arg3) ); } \
        else if ( (const char *)(arg4) > (const char *)(arg3) ) \
        { assert ( (const char *)(arg3) + count * sizeof (*(arg3)) <= (const char *)(arg4) ); } \
    }
121
122
123
124
/* Overlap check for "DstAccSrcCst" functions: runs the three-pointer check on dst, acc and src. */
#define NE10_CHECKPOINTER_DstAccSrcCst_OPERATION { \
    NE10_CHECKPOINTER_3POINTER_OPERATION(dst, acc, src); }
126
127
#define NE10_CHECKPOINTER_DstCst_OPERATION {}
128
129
/* Overlap check for "DstSrc1Src2" functions: runs the three-pointer check on dst, src1 and src2. */
#define NE10_CHECKPOINTER_DstSrc1Src2_OPERATION { \
    NE10_CHECKPOINTER_3POINTER_OPERATION(dst, src1, src2); }
131
132
/* Overlap check for "DstAccSrc1Src2" functions: runs the four-pointer check on dst, acc, src1 and src2. */
#define NE10_CHECKPOINTER_DstAccSrc1Src2_OPERATION { \
    NE10_CHECKPOINTER_4POINTER_OPERATION(dst, acc, src1, src2); }
134
135
// These macros generalise implementation of the functions.
136
137
// Macros used in C implementations
138
/*
 * Function-body template for the plain C implementations.
 *
 * Expands to a complete braced body that declares "res" (initialised to
 * NE10_OK) and the loop index "itr", runs "checkPointer" once, executes
 * "loopCode" for itr = 0 .. count-1, and returns "res". "count" comes from
 * the enclosing function; "loopCode" may refer to "itr" and "res".
 */
#define NE10_TEMPLATE_XC_OPERATION_X_C(checkPointer, loopCode) { \
    ne10_result_t res = NE10_OK; \
    unsigned int itr = 0; \
    checkPointer; \
    for ( itr = 0; itr < count; itr++ ) \
    { \
        loopCode; /* one item is processed per iteration */ \
    } \
    return res; \
}
147
148
// macros used in the NEON implementations
149
150
// Main Loop = The loop where the number of items to be processed is exactly the
151
// number that we can process in a single iteration.
152
//
153
// Secondary Loop = The loop that follows a Main Loop to fill in the entries that
154
// did not fit into the Main Loop. This is needed when the number of
155
// input items is not a multiple of the number of items that we
156
// process in every iteration of the Main Loop.
157
158
159
/****************************************************
160
* *
161
* The "DstSrcCst" group of functions *
162
* *
163
****************************************************/
164
166
167
/*
 * Main-loop body, float "DstSrcCst" functions: processes four floats.
 * Loads four floats from src into n_src, advances src, runs loopCode (which
 * is expected to produce n_dst), stores n_dst to dst and advances dst.
 */
#define NE10_DstSrcCst_MAINLOOP_FLOAT_NEON(loopCode) { \
    n_src = vld1q_f32( (float32_t*)src ); /* load 4 values */ \
    src += 4; /* move to the next 4 float items; 4*float */ \
    loopCode; /* the actual operation is placed here... */ \
    vst1q_f32 ( (float32_t*)dst , n_dst ); /* store the results back */ \
    dst += 4; /* move to the next items; 4*float */ \
}
175
176
/*
 * Secondary-loop body, float "DstSrcCst" functions: processes one leftover
 * float (used when count is not a multiple of 4).
 * Loads *src into lane 0 of n_tmp_src, runs loopCode with n_tmp_cst = {cst, cst}
 * available, then stores lane 0 of n_tmp_src to *dst - so loopCode must
 * leave its result in n_tmp_src. Advances src and dst by one item.
 */
#define NE10_DstSrcCst_SECONDLOOP_FLOAT_NEON(loopCode) { \
    float32x2_t n_tmp_src = { 0.0f , 0.0f }; /* temporary storage to be used with NEON load/store intrinsics */ \
    float32x2_t n_tmp_cst = { cst, cst }; /* temporary constant value for use in the main NEON operation */ \
    n_tmp_src = vld1_lane_f32 ( (float32_t*)src, n_tmp_src, 0); /* load into the first lane */ \
    loopCode; /* the actual operation is placed here ... */ \
    vst1_lane_f32( (float32_t*)dst, n_tmp_src, 0); /* store the lane back into the memory */ \
    /* move to the next item in the stream */ \
    src++; \
    dst++; \
}
186
187
/*
 * Function-body template, float NEON implementations.
 *
 * Declares res, n_src and n_dst, runs checkPointer, then executes loopCode1
 * once per group of four items (decrementing count by 4 each time) and
 * loopCode2 once per leftover item (count % 4 of them). Returns res.
 * "count" comes from the enclosing function and is modified.
 */
#define NE10_DstSrcCst_OPERATION_FLOAT_NEON(checkPointer, loopCode1, loopCode2) { \
    ne10_result_t res = NE10_OK; \
    float32x4_t n_src; \
    float32x4_t n_dst; \
    checkPointer; \
    int dif = 0; \
    dif = count % 4; /* either 0 or one of 1,2,3; in the latter cases the second path is taken */ \
    for (; count > dif; count -= 4) { \
        loopCode1; \
    } \
    if ( 0 != dif ) { \
        unsigned int idx; \
        for ( idx = 0 ; idx < dif; idx++ ) { \
            loopCode2; \
        } \
    } \
    return res; \
}
205
207
208
/*
 * Main-loop body, 2D-vector "DstSrcCst" functions: processes two 2D vectors
 * (four floats). Loads them into n_src, advances src by two elements, runs
 * loopCode (expected to produce n_dst), stores n_dst and advances dst.
 */
#define NE10_DstSrcCst_MAINLOOP_VEC2F_NEON(loopCode) { \
    n_src = vld1q_f32( (float32_t*)src ); /* load two vectors */ \
    src += 2; /* move to the next two vectors */ \
    loopCode; /* actual operation */ \
    vst1q_f32 ( (float32_t*)dst , n_dst ); /* store back */ \
    dst += 2; /* move to the next 2 vectors */ \
}
215
216
/*
 * Tail body, 2D-vector "DstSrcCst" functions: processes the single leftover
 * vector when count is odd. Loads one vector into n_tmp_src, runs loopCode
 * with n_tmp_cst = {cst->x, cst->y} available, and stores n_tmp_src to dst.
 * The pointers are not advanced.
 */
#define NE10_DstSrcCst_SECONDLOOP_VEC2F_NEON(loopCode) { \
    float32x2_t n_tmp_src; \
    float32x2_t n_tmp_cst = { cst->x, cst->y }; \
    n_tmp_src = vld1_f32( (float32_t*)src ); \
    loopCode; /* exceptional cases where the count isn't a multiple of 2 */ \
    vst1_f32( (float32_t*)dst, n_tmp_src); \
}
223
224
/*
 * Function-body template, 2D-vector NEON implementations with a constant.
 *
 * Declares res, n_cst = {x, y, x, y} built from *cst, n_src and n_dst; runs
 * checkPointer; executes loopCode1 once per pair of vectors (count -= 2) and
 * loopCode2 once if count was odd. Returns res. "count" is modified.
 */
#define NE10_DstSrcCst_OPERATION_VEC2F_NEON(checkPointer, loopCode1, loopCode2) { \
    ne10_result_t res = NE10_OK; \
    float32x4_t n_cst = { cst->x, cst->y, cst->x, cst->y }; \
    float32x4_t n_src; \
    float32x4_t n_dst; \
    checkPointer; \
    int dif = count % 2; \
    for (; count > dif; count -= 2) { \
        loopCode1; \
    } \
    if ( 0 != dif ) { \
        loopCode2; \
    } \
    return res; \
}
239
241
242
/*
 * Main-loop body, 3D-vector "DstSrcCst" functions.
 * Loads three q-registers (12 floats, i.e. four 3D vectors) from src into
 * n_src1..n_src3, runs loopCode (expected to produce n_dst1..n_dst3) and
 * stores the three results to dst. src and dst are advanced by 4 floats
 * after each load/store; the byte arithmetic is done through char* so it
 * does not depend on GNU void-pointer arithmetic.
 */
#define NE10_DstSrcCst_MAINLOOP_VEC3F_NEON(loopCode) { \
    n_src1 = vld1q_f32( (float32_t*)src ); \
    src = (void *)((char *)src + (4*sizeof(ne10_float32_t))); \
    n_src2 = vld1q_f32( (float32_t*)src ); \
    src = (void *)((char *)src + (4*sizeof(ne10_float32_t))); \
    n_src3 = vld1q_f32( (float32_t*)src ); \
    src = (void *)((char *)src + (4*sizeof(ne10_float32_t))); \
    loopCode; /* one iteration covers 12 floats */ \
    vst1q_f32 ( (float32_t*)dst , n_dst1 ); \
    dst = (void *)((char *)dst + (4*sizeof(ne10_float32_t))); \
    vst1q_f32 ( (float32_t*)dst , n_dst2 ); \
    dst = (void *)((char *)dst + (4*sizeof(ne10_float32_t))); \
    vst1q_f32 ( (float32_t*)dst , n_dst3 ); \
    dst = (void *)((char *)dst + (4*sizeof(ne10_float32_t))); \
}
257
258
/*
 * Secondary-loop body, 3D-vector "DstSrcCst" functions: processes one
 * leftover vector. De-interleaves one x/y/z triple from src into lane 0 of
 * n_tmp_src, runs loopCode with n_tmp_cst (cst->x, cst->y, cst->z in lane 0)
 * available, and stores lane 0 of n_tmp_src to dst - loopCode must leave its
 * result in n_tmp_src. Advances src and dst by one element.
 */
#define NE10_DstSrcCst_SECONDLOOP_VEC3F_NEON(loopCode) { \
    float32x2x3_t n_tmp_src = FLOAT32_2x3( \
        0.0f, 0.0f, 0.0f , 0.0f, 0.0f , 0.0f); \
    float32x2x3_t n_tmp_cst = { (const float32x2_t){cst->x, 0}, \
        (const float32x2_t){cst->y, 0}, (const float32x2_t){cst->z, 0} }; \
    n_tmp_src = vld3_lane_f32 ( (float32_t*)src, n_tmp_src, 0); \
    loopCode; /* leftover vectors that did not fit the main loop */ \
    vst3_lane_f32( (float32_t*)dst, n_tmp_src, 0); \
    src++; \
    dst++; \
}
269
270
/*
 * Function-body template, 3D-vector NEON implementations with a constant.
 *
 * Declares res, three rotated copies of the constant (n_cst1 = x,y,z,x;
 * n_cst2 = y,z,x,y; n_cst3 = z,x,y,z), n_src1..3 and n_dst1..3; runs
 * checkPointer; executes loopCode1 once per group of four vectors
 * (count -= 4) and loopCode2 once per leftover vector (count % 4 of them).
 * Returns res. "count" is modified.
 */
#define NE10_DstSrcCst_OPERATION_VEC3F_NEON(checkPointer, loopCode1, loopCode2) { \
    ne10_result_t res = NE10_OK; \
    float32x4_t n_cst1 = { cst->x, cst->y, cst->z, cst->x }; \
    float32x4_t n_cst2 = { cst->y, cst->z, cst->x, cst->y }; \
    float32x4_t n_cst3 = { cst->z, cst->x, cst->y, cst->z }; \
    float32x4_t n_src1, n_src2, n_src3; \
    float32x4_t n_dst1, n_dst2, n_dst3; \
    checkPointer; \
    int dif = count % 4; \
    for (; count > dif; count -= 4) { \
        loopCode1; \
    } \
    if ( 0 != dif ) { \
        unsigned int idx; \
        for ( idx = 0 ; idx < dif; idx++ ) { \
            loopCode2; \
        } \
    } \
    return res; \
}
290
292
293
/* Note that for the VEC4* types, we do not need a second loop as the number
294
of input items is always a multiple of four. */
295
296
/*
 * Loop body, 4D-vector "DstSrcCst" functions: processes one 4D vector.
 * Loads it into n_src, advances src by one element, runs loopCode (expected
 * to produce n_dst), stores n_dst and advances dst by one element.
 */
#define NE10_DstSrcCst_MAINLOOP_VEC4F_NEON(loopCode) { \
    n_src = vld1q_f32( (float32_t*)src ); \
    src ++; \
    loopCode; \
    vst1q_f32 ( (float32_t*)dst , n_dst ); /* one 4D vector per iteration */ \
    dst ++; \
}
303
304
/*
 * Function-body template, 4D-vector NEON implementations with a constant.
 * Declares res, n_cst = {x, y, z, w} from *cst, n_src and n_dst; runs
 * checkPointer; executes loopCode once per vector, counting "count" down to
 * zero. Returns res. No tail loop is needed: one vector fills a q-register.
 */
#define NE10_DstSrcCst_OPERATION_VEC4F_NEON(checkPointer, loopCode) { \
    ne10_result_t res = NE10_OK; \
    float32x4_t n_cst = { cst->x, cst->y, cst->z, cst->w }; \
    float32x4_t n_src; \
    float32x4_t n_dst; \
    checkPointer; \
    for (; count != 0; count --) { \
        loopCode; \
    } \
    return res; \
}
315
316
/****************************************************
317
* *
318
* The "DstAccSrcCst" group of functions *
319
* *
320
****************************************************/
321
323
324
/*
 * Main-loop body, float "DstAccSrcCst" functions: processes four floats.
 * Loads four floats from acc into n_acc and from src into n_src, advances
 * both, runs loopCode (expected to produce n_dst), stores n_dst to dst and
 * advances dst.
 */
#define NE10_DstAccSrcCst_MAINLOOP_FLOAT_NEON(loopCode) { \
    /* load 4 values from each input */ \
    n_acc = vld1q_f32( (float32_t*)acc ); \
    n_src = vld1q_f32( (float32_t*)src ); \
    acc += 4; /* move to the next 4 float items; 4*float */ \
    src += 4; \
    loopCode; /* the actual operation is placed here... */ \
    vst1q_f32 ( (float32_t*)dst , n_dst ); /* store the results back */ \
    dst += 4; /* move to the next items; 4*float */ \
}
334
335
/*
 * Secondary-loop body, float "DstAccSrcCst" functions: processes one
 * leftover float. Loads *acc and *src into lane 0 of n_tmp_acc / n_tmp_src,
 * runs loopCode with n_tmp_cst = {cst, cst} available, then stores lane 0 of
 * n_tmp_src to *dst - loopCode must leave its result in n_tmp_src.
 * Advances acc, src and dst by one item.
 */
#define NE10_DstAccSrcCst_SECONDLOOP_FLOAT_NEON(loopCode) { \
    float32x2_t n_tmp_acc = { 0.0f , 0.0f }; /* temporary storage to be used with NEON load/store intrinsics */ \
    float32x2_t n_tmp_src = { 0.0f , 0.0f }; /* temporary storage to be used with NEON load/store intrinsics */ \
    float32x2_t n_tmp_cst = { cst, cst }; /* temporary constant value for use in the main NEON operation */ \
    n_tmp_acc = vld1_lane_f32 ( (float32_t*)acc, n_tmp_acc, 0); /* load the accumulator into the first lane */ \
    n_tmp_src = vld1_lane_f32 ( (float32_t*)src, n_tmp_src, 0); /* load the source into the first lane */ \
    loopCode; /* the actual operation is placed here ... */ \
    vst1_lane_f32( (float32_t*)dst, n_tmp_src, 0); /* store the lane back into the memory */ \
    /* move to the next item in the stream */ \
    acc++; \
    src++; \
    dst++; \
}
348
349
/*
 * The float "DstAccSrcCst" driver is the DstSrcCst driver.
 * NOTE(review): that driver declares n_src and n_dst only, while the
 * DstAccSrcCst main loop also assigns n_acc - confirm n_acc is declared by
 * the caller.
 */
#define NE10_DstAccSrcCst_OPERATION_FLOAT_NEON NE10_DstSrcCst_OPERATION_FLOAT_NEON
350
352
353
/*
 * Main-loop body, 2D-vector "DstAccSrcCst" functions: processes two 2D
 * vectors. Loads them from acc into n_acc and from src into n_src, advances
 * both by two elements, runs loopCode (expected to produce n_dst), stores
 * n_dst and advances dst.
 */
#define NE10_DstAccSrcCst_MAINLOOP_VEC2F_NEON(loopCode) { \
    n_acc = vld1q_f32( (float32_t*)acc ); /* load two vectors */ \
    n_src = vld1q_f32( (float32_t*)src ); /* load two vectors */ \
    acc += 2; /* move to the next two vectors */ \
    src += 2; \
    loopCode; /* actual operation */ \
    vst1q_f32 ( (float32_t*)dst , n_dst ); /* store back */ \
    dst += 2; /* move to the next 2 vectors */ \
}
362
363
/*
 * Tail body, 2D-vector "DstAccSrcCst" functions: processes the single
 * leftover vector when count is odd. Loads one vector from acc and one from
 * src, runs loopCode with n_tmp_cst = {cst->x, cst->y} available, and stores
 * n_tmp_src to dst. The pointers are not advanced.
 */
#define NE10_DstAccSrcCst_SECONDLOOP_VEC2F_NEON(loopCode) { \
    float32x2_t n_tmp_acc; \
    float32x2_t n_tmp_src; \
    float32x2_t n_tmp_cst = { cst->x, cst->y }; \
    n_tmp_acc = vld1_f32( (float32_t*)acc ); \
    n_tmp_src = vld1_f32( (float32_t*)src ); \
    loopCode; /* exceptional cases where the count isn't a multiple of 2 */ \
    vst1_f32( (float32_t*)dst, n_tmp_src); \
}
372
373
/*
 * The 2D-vector "DstAccSrcCst" driver is the DstSrcCst driver.
 * NOTE(review): that driver does not declare n_acc, which the DstAccSrcCst
 * main loop assigns - confirm it is declared by the caller.
 */
#define NE10_DstAccSrcCst_OPERATION_VEC2F_NEON NE10_DstSrcCst_OPERATION_VEC2F_NEON
374
376
377
/*
 * Main-loop body, 3D-vector "DstAccSrcCst" functions.
 * Loads three q-registers (12 floats, i.e. four 3D vectors) from acc into
 * n_acc1..3 and from src into n_src1..3, runs loopCode (expected to produce
 * n_dst1..3) and stores the three results to dst. Each pointer is advanced
 * by 4 floats after every load/store; the byte arithmetic is done through
 * char* so it does not depend on GNU void-pointer arithmetic.
 */
#define NE10_DstAccSrcCst_MAINLOOP_VEC3F_NEON(loopCode) { \
    /* Load accumulator values */ \
    n_acc1 = vld1q_f32( (float32_t*)acc ); \
    acc = (void *)((char *)acc + (4*sizeof(ne10_float32_t))); \
    n_acc2 = vld1q_f32( (float32_t*)acc ); \
    acc = (void *)((char *)acc + (4*sizeof(ne10_float32_t))); \
    n_acc3 = vld1q_f32( (float32_t*)acc ); \
    acc = (void *)((char *)acc + (4*sizeof(ne10_float32_t))); \
    /* Load source values */ \
    n_src1 = vld1q_f32( (float32_t*)src ); \
    src = (void *)((char *)src + (4*sizeof(ne10_float32_t))); \
    n_src2 = vld1q_f32( (float32_t*)src ); \
    src = (void *)((char *)src + (4*sizeof(ne10_float32_t))); \
    n_src3 = vld1q_f32( (float32_t*)src ); \
    src = (void *)((char *)src + (4*sizeof(ne10_float32_t))); \
    loopCode; /* one iteration covers 12 floats */ \
    /* Store the results back into the memory */ \
    vst1q_f32 ( (float32_t*)dst , n_dst1 ); \
    dst = (void *)((char *)dst + (4*sizeof(ne10_float32_t))); \
    vst1q_f32 ( (float32_t*)dst , n_dst2 ); \
    dst = (void *)((char *)dst + (4*sizeof(ne10_float32_t))); \
    vst1q_f32 ( (float32_t*)dst , n_dst3 ); \
    dst = (void *)((char *)dst + (4*sizeof(ne10_float32_t))); \
}
398
399
/*
 * Secondary-loop body, 3D-vector "DstAccSrcCst" functions: processes one
 * leftover vector. De-interleaves one x/y/z triple from acc and one from src
 * into lane 0 of n_tmp_acc / n_tmp_src, runs loopCode with n_tmp_cst
 * (cst->x, cst->y, cst->z in lane 0) available, and stores lane 0 of
 * n_tmp_src to dst - loopCode must leave its result in n_tmp_src.
 * Advances acc, src and dst by one element.
 */
#define NE10_DstAccSrcCst_SECONDLOOP_VEC3F_NEON(loopCode) { \
    float32x2x3_t n_tmp_acc = FLOAT32_2x3( \
        0.0f, 0.0f, \
        0.0f, 0.0f, \
        0.0f, 0.0f \
    ); \
    float32x2x3_t n_tmp_src = FLOAT32_2x3( \
        0.0f, 0.0f, \
        0.0f, 0.0f, \
        0.0f, 0.0f \
    ); \
    float32x2x3_t n_tmp_cst = { (const float32x2_t){cst->x, 0}, \
                                (const float32x2_t){cst->y, 0}, \
                                (const float32x2_t){cst->z, 0} }; \
    n_tmp_acc = vld3_lane_f32 ( (float32_t*)acc, n_tmp_acc, 0); \
    n_tmp_src = vld3_lane_f32 ( (float32_t*)src, n_tmp_src, 0); \
    loopCode; /* leftover vectors that did not fit the main loop */ \
    vst3_lane_f32( (float32_t*)dst, n_tmp_src, 0); \
    acc++; \
    src++; \
    dst++; \
}
421
422
/*
 * The 3D-vector "DstAccSrcCst" driver is the DstSrcCst driver.
 * NOTE(review): that driver does not declare n_acc1..n_acc3, which the
 * DstAccSrcCst main loop assigns - confirm they are declared by the caller.
 */
#define NE10_DstAccSrcCst_OPERATION_VEC3F_NEON NE10_DstSrcCst_OPERATION_VEC3F_NEON
423
425
426
/*
 * Loop body, 4D-vector "DstAccSrcCst" functions: processes one 4D vector.
 * Loads one vector from acc into n_acc and one from src into n_src, advances
 * both by one element, runs loopCode (expected to produce n_dst), stores
 * n_dst and advances dst by one element.
 */
#define NE10_DstAccSrcCst_MAINLOOP_VEC4F_NEON(loopCode) { \
    n_acc = vld1q_f32( (float32_t*)acc ); \
    n_src = vld1q_f32( (float32_t*)src ); \
    acc ++; \
    src ++; \
    loopCode; \
    vst1q_f32 ( (float32_t*)dst , n_dst ); /* one 4D vector per iteration */ \
    dst ++; \
}
435
436
/*
 * The 4D-vector "DstAccSrcCst" driver is the DstSrcCst driver.
 * NOTE(review): that driver does not declare n_acc, which the DstAccSrcCst
 * loop body assigns - confirm it is declared by the caller.
 */
#define NE10_DstAccSrcCst_OPERATION_VEC4F_NEON NE10_DstSrcCst_OPERATION_VEC4F_NEON
437
438
/****************************************************
439
* *
440
* The "DstCst" group of functions *
441
* *
442
****************************************************/
443
445
446
/*
 * Main-loop body, float "DstCst" functions: writes four floats.
 * Runs loopCode, stores n_cst to dst and advances dst by four floats.
 * There is no source buffer, so nothing is loaded here; n_cst must be
 * provided by the surrounding code.
 */
#define NE10_DstCst_MAINLOOP_FLOAT_NEON(loopCode) { \
    loopCode; /* the actual operation is placed here... */ \
    vst1q_f32 ( (float32_t*)dst , n_cst ); /* store the results back */ \
    dst += 4; /* move to the next items; 4*float */ \
}
452
453
/*
 * Secondary-loop body, float "DstCst" functions: writes one leftover float.
 * Builds n_tmp_cst = {cst, cst}, runs loopCode, stores lane 0 of n_tmp_cst
 * to *dst and advances dst by one item.
 */
#define NE10_DstCst_SECONDLOOP_FLOAT_NEON(loopCode) { \
    float32x2_t n_tmp_cst = { cst, cst }; /* temporary constant value for use in the main NEON operation */ \
    loopCode; /* the actual operation is placed here ... */ \
    vst1_lane_f32( (float32_t*)dst, n_tmp_cst, 0); /* store the lane back into the memory */ \
    /* move to the next item in the stream */ \
    dst++; \
}
460
461
/*
 * Function-body template, float "DstCst" NEON implementations.
 *
 * Declares res, runs checkPointer, then executes loopCode1 once per group of
 * four items (count -= 4) and loopCode2 once per leftover item (count % 4 of
 * them). Returns res. "count" comes from the enclosing function and is
 * modified. No NEON variables are declared here.
 */
#define NE10_DstCst_OPERATION_FLOAT_NEON(checkPointer, loopCode1, loopCode2) { \
    ne10_result_t res = NE10_OK; \
    checkPointer; \
    int dif = 0; \
    dif = count % 4; /* either 0 or one of 1,2,3; in the latter cases the second path is taken */ \
    for (; count > dif; count -= 4) { \
        loopCode1; \
    } \
    if ( 0 != dif ) { \
        unsigned int idx; \
        for ( idx = 0 ; idx < dif; idx++ ) { \
            loopCode2; \
        } \
    } \
    return res; \
}
477
479
480
481
/*
 * Main-loop body, 2D-vector "DstCst" functions: writes two 2D vectors.
 * Runs loopCode, stores n_cst to dst and advances dst by two elements.
 */
#define NE10_DstCst_MAINLOOP_VEC2F_NEON(loopCode) { \
    loopCode; /* actual operation */ \
    vst1q_f32 ( (float32_t*)dst , n_cst ); /* store back */ \
    dst += 2; /* move to the next 2 vectors */ \
}
486
487
/*
 * Tail body, 2D-vector "DstCst" functions: writes the single leftover vector
 * when count is odd. Builds n_tmp_cst = {cst->x, cst->y}, runs loopCode and
 * stores n_tmp_cst to dst. dst is not advanced.
 */
#define NE10_DstCst_SECONDLOOP_VEC2F_NEON(loopCode) { \
    float32x2_t n_tmp_cst = { cst->x, cst->y }; \
    loopCode; /* exceptional cases where the count isn't a multiple of 2 */ \
    vst1_f32( (float32_t*)dst, n_tmp_cst); \
}
492
493
/*
 * Function-body template, 2D-vector "DstCst" NEON implementations.
 * Declares res and n_cst = {x, y, x, y} from *cst; runs checkPointer;
 * executes loopCode1 once per pair of vectors (count -= 2) and loopCode2
 * once if count was odd. Returns res. "count" is modified.
 */
#define NE10_DstCst_OPERATION_VEC2F_NEON(checkPointer, loopCode1, loopCode2) { \
    ne10_result_t res = NE10_OK; \
    float32x4_t n_cst = { cst->x, cst->y, cst->x, cst->y }; \
    checkPointer; \
    int dif = count % 2; \
    for (; count > dif; count -= 2) { \
        loopCode1; \
    } \
    if ( 0 != dif ) { \
        loopCode2; \
    } \
    return res; \
}
506
508
509
/*
 * Main-loop body, 3D-vector "DstCst" functions.
 * Runs loopCode, then stores n_cst1, n_cst2 and n_cst3 back to back
 * (12 floats, i.e. four 3D vectors), advancing dst by 4 floats after each
 * store. The byte arithmetic is done through char* so it does not depend on
 * GNU void-pointer arithmetic.
 */
#define NE10_DstCst_MAINLOOP_VEC3F_NEON(loopCode) { \
    loopCode; /* one iteration covers 12 floats */ \
    vst1q_f32 ( (float32_t*)dst , n_cst1 ); \
    dst = (void *)((char *)dst + (4*sizeof(ne10_float32_t))); \
    vst1q_f32 ( (float32_t*)dst , n_cst2 ); \
    dst = (void *)((char *)dst + (4*sizeof(ne10_float32_t))); \
    vst1q_f32 ( (float32_t*)dst , n_cst3 ); \
    dst = (void *)((char *)dst + (4*sizeof(ne10_float32_t))); \
}
518
519
/*
 * Secondary-loop body, 3D-vector "DstCst" functions: writes one leftover
 * vector. Builds n_tmp_cst with cst->x, cst->y, cst->z in lane 0, runs
 * loopCode, stores lane 0 of n_tmp_cst (interleaved) to dst and advances dst
 * by one element.
 */
#define NE10_DstCst_SECONDLOOP_VEC3F_NEON(loopCode) { \
    float32x2x3_t n_tmp_cst = { (const float32x2_t){cst->x, 0}, \
        (const float32x2_t){cst->y, 0}, (const float32x2_t){cst->z, 0} }; \
    loopCode; /* leftover vectors that did not fit the main loop */ \
    vst3_lane_f32( (float32_t*)dst, n_tmp_cst, 0); \
    dst++; \
}
526
527
/*
 * Function-body template, 3D-vector "DstCst" NEON implementations.
 * Declares res and three rotated copies of the constant (n_cst1 = x,y,z,x;
 * n_cst2 = y,z,x,y; n_cst3 = z,x,y,z); runs checkPointer; executes loopCode1
 * once per group of four vectors (count -= 4) and loopCode2 once per
 * leftover vector (count % 4 of them). Returns res. "count" is modified.
 */
#define NE10_DstCst_OPERATION_VEC3F_NEON(checkPointer, loopCode1, loopCode2) { \
    ne10_result_t res = NE10_OK; \
    float32x4_t n_cst1 = { cst->x, cst->y, cst->z, cst->x }; \
    float32x4_t n_cst2 = { cst->y, cst->z, cst->x, cst->y }; \
    float32x4_t n_cst3 = { cst->z, cst->x, cst->y, cst->z }; \
    checkPointer; \
    int dif = count % 4; \
    for (; count > dif; count -= 4) { \
        loopCode1; \
    } \
    if ( 0 != dif ) { \
        unsigned int idx; \
        for ( idx = 0 ; idx < dif; idx++ ) { \
            loopCode2; \
        } \
    } \
    return res; \
}
545
547
548
/*
 * Loop body, 4D-vector "DstCst" functions: writes one 4D vector.
 * Runs loopCode, stores n_cst to dst and advances dst by one element.
 */
#define NE10_DstCst_MAINLOOP_VEC4F_NEON(loopCode) { \
    loopCode; \
    vst1q_f32 ( (float32_t*)dst , n_cst ); /* one 4D vector per iteration */ \
    dst ++; \
}
553
554
/*
 * Function-body template, 4D-vector "DstCst" NEON implementations.
 * Declares res and n_cst = {x, y, z, w} from *cst; runs checkPointer;
 * executes loopCode once per vector, counting "count" down to zero.
 * Returns res.
 */
#define NE10_DstCst_OPERATION_VEC4F_NEON(checkPointer, loopCode) { \
    ne10_result_t res = NE10_OK; \
    float32x4_t n_cst = { cst->x, cst->y, cst->z, cst->w }; \
    checkPointer; \
    for (; count != 0; count --) { \
        loopCode; \
    } \
    return res; \
}
563
564
/****************************************************
565
* *
566
* The "DstSrc1Src2" group of functions *
567
* *
568
****************************************************/
569
571
572
/*
 * Main-loop body, float "DstSrc1Src2" functions: processes four floats.
 * Loads four floats from src1 into n_src and from src2 into n_src2,
 * advancing each pointer, runs loopCode (expected to produce n_dst), stores
 * n_dst to dst and advances dst.
 */
#define NE10_DstSrc1Src2_MAINLOOP_FLOAT_NEON(loopCode) { \
    /* load 4 values from each source */ \
    n_src = vld1q_f32( (float32_t*)src1 ); \
    src1 += 4; /* move to the next 4 float items; 4*float */ \
    n_src2 = vld1q_f32( (float32_t*)src2 ); \
    src2 += 4; /* move to the next 4 float items; 4*float */ \
    loopCode; /* the actual operation is placed here... */ \
    vst1q_f32 ( (float32_t*)dst , n_dst ); /* store the results back */ \
    dst += 4; /* move to the next items; 4*float */ \
}
582
583
/*
 * Secondary-loop body, float "DstSrc1Src2" functions: processes one leftover
 * float. Loads *src1 into lane 0 of n_tmp_src and *src2 into lane 0 of
 * n_tmp_src2, runs loopCode, then stores lane 0 of n_tmp_src to *dst -
 * loopCode must leave its result in n_tmp_src. Advances src1, src2 and dst
 * by one item.
 *
 * The src2 load uses n_tmp_src2 as its own base vector (previously it passed
 * n_tmp_src), matching the DstAccSrc1Src2 variant; lane 1 is 0.0f either way.
 */
#define NE10_DstSrc1Src2_SECONDLOOP_FLOAT_NEON(loopCode) { \
    float32x2_t n_tmp_src = { 0.0f , 0.0f }; /* temporary storage to be used with NEON load/store intrinsics */ \
    float32x2_t n_tmp_src2 = { 0.0f , 0.0f }; \
    n_tmp_src = vld1_lane_f32 ( (float32_t*)src1, n_tmp_src, 0); /* load the first operand into lane 0 */ \
    n_tmp_src2 = vld1_lane_f32 ( (float32_t*)src2, n_tmp_src2, 0); /* load the second operand into lane 0 */ \
    loopCode; /* the actual operation is placed here ... */ \
    vst1_lane_f32( (float32_t*)dst, n_tmp_src, 0); /* store the lane back into the memory */ \
    /* move to the next item in the stream */ \
    src1++; \
    src2++; \
    dst++; \
}
595
596
/*
 * The float "DstSrc1Src2" driver is the DstSrcCst driver.
 * NOTE(review): that driver declares n_src and n_dst only, while the
 * DstSrc1Src2 main loop also assigns n_src2 - confirm n_src2 is declared by
 * the caller.
 */
#define NE10_DstSrc1Src2_OPERATION_FLOAT_NEON NE10_DstSrcCst_OPERATION_FLOAT_NEON
597
598
/****************************************************
599
* *
600
* The "DstAccSrc1Src2" group of functions *
601
* *
602
****************************************************/
603
605
606
/*
 * Main-loop body, float "DstAccSrc1Src2" functions: processes four floats.
 * Loads four floats from acc, src1 and src2 into n_acc, n_src and n_src2,
 * advances the three pointers, runs loopCode (expected to produce n_dst),
 * stores n_dst to dst and advances dst.
 */
#define NE10_DstAccSrc1Src2_MAINLOOP_FLOAT_NEON(loopCode) { \
    /* load 4 values from each input */ \
    n_acc = vld1q_f32( (float32_t*)acc ); \
    n_src = vld1q_f32( (float32_t*)src1 ); \
    n_src2 = vld1q_f32( (float32_t*)src2 ); \
    acc += 4; /* move to the next 4 float items; 4*float */ \
    src1 += 4; \
    src2 += 4; \
    loopCode; /* the actual operation is placed here... */ \
    vst1q_f32 ( (float32_t*)dst , n_dst ); /* store the results back */ \
    dst += 4; /* move to the next items; 4*float */ \
}
618
619
/*
 * Secondary-loop body, float "DstAccSrc1Src2" functions: processes one
 * leftover float. Loads *acc, *src1 and *src2 into lane 0 of n_tmp_acc,
 * n_tmp_src and n_tmp_src2, runs loopCode, then stores lane 0 of n_tmp_src
 * to *dst - loopCode must leave its result in n_tmp_src. Advances acc, src1,
 * src2 and dst by one item.
 */
#define NE10_DstAccSrc1Src2_SECONDLOOP_FLOAT_NEON(loopCode) { \
    float32x2_t n_tmp_acc = { 0.0f , 0.0f }; /* temporary storage to be used with NEON load/store intrinsics */ \
    float32x2_t n_tmp_src = { 0.0f , 0.0f }; \
    float32x2_t n_tmp_src2 = { 0.0f, 0.0f }; \
    n_tmp_acc = vld1_lane_f32 ( (float32_t*)acc, n_tmp_acc, 0); /* load the accumulator into lane 0 */ \
    n_tmp_src = vld1_lane_f32 ( (float32_t*)src1, n_tmp_src, 0); /* load the first operand into lane 0 */ \
    n_tmp_src2 = vld1_lane_f32 ( (float32_t*)src2, n_tmp_src2, 0); /* load the second operand into lane 0 */ \
    loopCode; /* the actual operation is placed here ... */ \
    vst1_lane_f32( (float32_t*)dst, n_tmp_src, 0); /* store the lane back into the memory */ \
    /* move to the next item in the stream */ \
    acc++; \
    src1++; \
    src2++; \
    dst++; \
}
634
635
/*
 * The float "DstAccSrc1Src2" driver is the DstAccSrcCst driver (itself an
 * alias of the DstSrcCst driver).
 * NOTE(review): that driver declares n_src and n_dst only, while the
 * DstAccSrc1Src2 main loop also assigns n_acc and n_src2 - confirm they are
 * declared by the caller.
 */
#define NE10_DstAccSrc1Src2_OPERATION_FLOAT_NEON NE10_DstAccSrcCst_OPERATION_FLOAT_NEON
636
637
/****************************************************
638
* *
639
* The "DstSrc" group of functions *
640
* *
641
****************************************************/
642
644
645
#define NE10_DstSrc_MAINLOOP_FLOAT_NEON NE10_DstSrcCst_MAINLOOP_FLOAT_NEON
646
647
/*
 * Float "DstSrc" secondary loop: reuses the DstSrcCst one.
 * NOTE(review): the DstSrcCst secondary loop initialises n_tmp_cst from
 * "cst" - confirm a "cst" is in scope wherever this alias is used.
 */
#define NE10_DstSrc_SECONDLOOP_FLOAT_NEON NE10_DstSrcCst_SECONDLOOP_FLOAT_NEON
648
649
#define NE10_DstSrc_OPERATION_FLOAT_NEON NE10_DstSrcCst_OPERATION_FLOAT_NEON
650
652
653
/*
 * Main-loop body, 2D-vector "DstSrc" functions: processes two 2D vectors.
 * De-interleaves them from src into n_src with vld2_f32 and advances src by
 * two elements, then runs loopCode. Nothing is stored here: loopCode must
 * store the results and advance dst itself.
 */
#define NE10_DstSrc_MAINLOOP_VEC2F_NEON(loopCode) { \
    n_src = vld2_f32( (float32_t*)src ); /* load two vectors */ \
    src += 2; /* move to the next two vectors */ \
    loopCode; /* actual operation; stores the results and increments dst */ \
}
659
660
/*
 * Tail body, 2D-vector "DstSrc" functions (count not a multiple of 2).
 * A plain wrapper: loading, computing and storing are all done by loopCode.
 */
#define NE10_DstSrc_SECONDLOOP_VEC2F_NEON(loopCode) { \
    loopCode; /* exceptional cases where the count isn't a multiple of 2; stores its own results */ \
}
664
665
/*
 * Function-body template, 2D-vector "DstSrc" NEON implementations.
 * Declares res, n_src (float32x2x2_t) and n_dst (float32x2_t); runs
 * checkPointer; executes loopCode1 once per pair of vectors (count -= 2) and
 * loopCode2 once if count was odd. Returns res. "count" is modified.
 */
#define NE10_DstSrc_OPERATION_VEC2F_NEON(checkPointer, loopCode1, loopCode2) { \
    ne10_result_t res = NE10_OK; \
    float32x2x2_t n_src; \
    float32x2_t n_dst; \
    checkPointer; \
    int dif = count % 2; \
    for (; count > dif; count -= 2) { \
        loopCode1; \
    } \
    if ( 0 != dif ) { \
        loopCode2; \
    } \
    return res; \
}
679
681
682
/*
 * Main-loop body, 3D-vector "DstSrc" functions: processes four 3D vectors.
 * De-interleaves 12 floats from src into n_src with vld3q_f32 and advances
 * src by 12 floats, then runs loopCode. Nothing is stored here: loopCode
 * must store the results and advance dst itself. The byte arithmetic is done
 * through char* so it does not depend on GNU void-pointer arithmetic.
 */
#define NE10_DstSrc_MAINLOOP_VEC3F_NEON(loopCode) { \
    n_src = vld3q_f32( (float32_t*)src ); \
    src = (void *)((char *)src + (12*sizeof(ne10_float32_t))); \
    loopCode; /* four 3D vectors per iteration; stores the results and increments dst */ \
}
688
689
/*
 * Secondary-loop body, 3D-vector "DstSrc" functions (count not a multiple
 * of 4). A plain wrapper: loading, computing and storing are all done by
 * loopCode.
 */
#define NE10_DstSrc_SECONDLOOP_VEC3F_NEON(loopCode) { \
    loopCode; /* exceptional cases where the count isn't a multiple of 4; stores its own results */ \
}
693
694
/*
 * Function-body template, 3D-vector "DstSrc" NEON implementations.
 * Declares res, n_src (float32x4x3_t) and n_dst (float32x4_t); runs
 * checkPointer; executes loopCode1 once per group of four vectors
 * (count -= 4) and loopCode2 once per leftover vector (count % 4 of them).
 * Returns res. "count" is modified.
 */
#define NE10_DstSrc_OPERATION_VEC3F_NEON(checkPointer, loopCode1, loopCode2) { \
    ne10_result_t res = NE10_OK; \
    float32x4x3_t n_src; \
    float32x4_t n_dst; \
    checkPointer; \
    int dif = count % 4; \
    for (; count > dif; count -= 4) { \
        loopCode1; \
    } \
    if ( 0 != dif ) { \
        unsigned int idx; \
        for ( idx = 0 ; idx < dif; idx++ ) { \
            loopCode2; \
        } \
    } \
    return res; \
}
711
713
714
/* Note that for the VEC4* types, we do not need a second loop as the number
715
of input items is always a multiple of four. */
716
717
/*
 * Loop body, 4D-vector "DstSrc" functions: processes one 4D vector.
 * Loads it into n_src and advances src by one element, then runs loopCode.
 * Nothing is stored here: loopCode must store the results and advance dst
 * itself.
 */
#define NE10_DstSrc_MAINLOOP_VEC4F_NEON(loopCode) { \
    n_src = vld1q_f32( (float32_t*)src ); \
    src ++; \
    loopCode; /* stores the results and increments dst */ \
}
723
724
/*
 * Function-body template, 4D-vector "DstSrc" NEON implementations.
 * Declares res and n_src; runs checkPointer; executes loopCode once per
 * vector, counting "count" down to zero. Returns res.
 */
#define NE10_DstSrc_OPERATION_VEC4F_NEON(checkPointer, loopCode) { \
    ne10_result_t res = NE10_OK; \
    float32x4_t n_src; \
    checkPointer; \
    for (; count != 0; count --) { \
        loopCode; \
    } \
    return res; \
}
733
Generated on Tue Mar 24 2020 14:08:13 for Project Ne10 by
1.8.17