-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmatrix_impl.cpp
474 lines (375 loc) · 19.5 KB
/
matrix_impl.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
/*
VFP math library for the iPhone / iPod touch
Copyright (c) 2007-2008 Wolfgang Engel and Matthias Grundmann
http://code.google.com/p/vfpmathlibrary/
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising
from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must
not claim that you wrote the original software. If you use this
software in a product, an acknowledgment in the product documentation
would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must
not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifdef __APPLE__
#include <TargetConditionals.h>
#if (TARGET_IPHONE_SIMULATOR == 0) && (TARGET_OS_IPHONE == 1)
#include "matrix_impl.h"
#include "common_macros.h"
#include "vfp_clobbers.h"
void Matrix4Mul(const float* src_mat_1, const float* src_mat_2, float* dst_mat) {
asm volatile (VFP_SWITCH_TO_ARM
VFP_VECTOR_LENGTH(3)
// Interleaving loads and adds/muls for faster calculation.
// Let A:=src_ptr_1, B:=src_ptr_2, then
// function computes A*B as (B^T * A^T)^T.
// Load the whole matrix into memory.
"fldmias %2, {s8-s23} \n\t"
// Load first column to scalar bank.
"fldmias %1!, {s0-s3} \n\t"
// First column times matrix.
"fmuls s24, s8, s0 \n\t"
"fmacs s24, s12, s1 \n\t"
// Load second column to scalar bank.
"fldmias %1!, {s4-s7} \n\t"
"fmacs s24, s16, s2 \n\t"
"fmacs s24, s20, s3 \n\t"
// Save first column.
"fstmias %0!, {s24-s27} \n\t"
// Second column times matrix.
"fmuls s28, s8, s4 \n\t"
"fmacs s28, s12, s5 \n\t"
// Load third column to scalar bank.
"fldmias %1!, {s0-s3} \n\t"
"fmacs s28, s16, s6 \n\t"
"fmacs s28, s20, s7 \n\t"
// Save second column.
"fstmias %0!, {s28-s31} \n\t"
// Third column times matrix.
"fmuls s24, s8, s0 \n\t"
"fmacs s24, s12, s1 \n\t"
// Load fourth column to scalar bank.
"fldmias %1, {s4-s7} \n\t"
"fmacs s24, s16, s2 \n\t"
"fmacs s24, s20, s3 \n\t"
// Save third column.
"fstmias %0!, {s24-s27} \n\t"
// Fourth column times matrix.
"fmuls s28, s8, s4 \n\t"
"fmacs s28, s12, s5 \n\t"
"fmacs s28, s16, s6 \n\t"
"fmacs s28, s20, s7 \n\t"
// Save fourth column.
"fstmias %0!, {s28-s31} \n\t"
VFP_VECTOR_LENGTH_ZERO
VFP_SWITCH_TO_THUMB
: "=r" (dst_mat), "=r" (src_mat_2)
: "r" (src_mat_1), "0" (dst_mat), "1" (src_mat_2)
: "r0", "cc", "memory", VFP_CLOBBER_S0_S31
);
}
// Load the matrix transposed into memory.
// Uses registers 24 - 31 as scratch.
// Increments sources address in register %0.
#define VFP_LOAD_MATRIX0_T_TO_S8_S23 "fldmias %0!, {s8-s11} \n\t" \
"fcpys s12, s9 \n\t" \
"fcpys s16, s10 \n\t" \
"fldmias %0!, {s24-s27} \n\t" \
"fcpys s20, s11 \n\t" \
"fcpys s9, s24 \n\t" \
"fcpys s13, s25 \n\t" \
"fldmias %0!, {s28-s31} \n\t" \
"fcpys s17, s26 \n\t" \
"fcpys s21, s27 \n\t" \
\
"fcpys s10, s28 \n\t" \
"fcpys s14, s29 \n\t" \
"fldmias %0!, {s24-s27} \n\t" \
"fcpys s18, s30 \n\t" \
"fcpys s22, s31 \n\t" \
\
"fcpys s11, s24 \n\t" \
"fcpys s15, s25 \n\t" \
\
"fcpys s19, s26 \n\t" \
"fcpys s23, s27 \n\t"
// s0 s1 s2 s3
// s4 s5 s6 s7
// s8 s9 s10 s11
// s12 s13 s14 s15
// Load the matrix transposed into memory.
// Uses registers 24 - 31 as scratch.
// Increments sources address in register %0.
#define VFP_LOAD_MATRIX0_T_TO_S0_S15 "fldmias %0!, {s0-s3} \n\t" \
"fcpys s4, s1 \n\t" \
"fcpys s8, s2 \n\t" \
"fldmias %0!, {s24-s27} \n\t" \
"fcpys s12, s3 \n\t" \
\
"fcpys s1, s24 \n\t" \
"fcpys s5, s25 \n\t" \
"fldmias %0!, {s28-s31} \n\t" \
"fcpys s9, s26 \n\t" \
"fcpys s13, s27 \n\t" \
\
"fcpys s2, s28 \n\t" \
"fcpys s6, s29 \n\t" \
"fldmias %0!, {s24-s27} \n\t" \
"fcpys s10, s30 \n\t" \
"fcpys s14, s31 \n\t" \
\
"fcpys s3, s24 \n\t" \
"fcpys s7, s25 \n\t" \
"fcpys s11, s26 \n\t" \
"fcpys s15, s27 \n\t"
void Matrix4Vector4Mul(const float* src_mat, const float* src_vec, float* dst_vec) {
asm volatile (VFP_SWITCH_TO_ARM
// Load the whole matrix.
"fldmias %0, {s8-s23} \n\t"
// Load vector to scalar bank.
"fldmias %1, {s0-s3} \n\t"
VFP_VECTOR_LENGTH(3)
// First column times matrix.
"fmuls s24, s8, s0 \n\t"
"fmacs s24, s12, s1 \n\t"
"fmacs s24, s16, s2 \n\t"
"fmacs s24, s20, s3 \n\t"
// Save vector.
"fstmias %2, {s24-s27} \n\t"
VFP_VECTOR_LENGTH_ZERO
VFP_SWITCH_TO_THUMB
:
: "r" (src_mat), "r" (src_vec), "r" (dst_vec)
: "r0", "cc", "memory", VFP_CLOBBER_S0_S3, VFP_CLOBBER_S8_S27
);
}
void Matrix4Vector3Mul(const float* src_mat, const float* src_vec, float* dst_vec) {
asm volatile (VFP_SWITCH_TO_ARM
// Load the whole matrix.
"fldmias %0, {s8-s23} \n\t"
// Load vector to scalar bank.
"fldmias %1, {s0-s2} \n\t"
VFP_VECTOR_LENGTH(3)
// First column times matrix.
"fmuls s24, s8, s0 \n\t"
"fmacs s24, s12, s1 \n\t"
"fmacs s24, s16, s2 \n\t"
"fadds s24, s24, s20 \n\t"
// Save vector.
"fstmias %2, {s24-s27} \n\t"
VFP_VECTOR_LENGTH_ZERO
VFP_SWITCH_TO_THUMB
:
: "r" (src_mat), "r" (src_vec), "r" (dst_vec)
: "r0", "cc", "memory", VFP_CLOBBER_S0_S2, VFP_CLOBBER_S8_S27
);
}
void Matrix4Vector3Mul(const float* src_mat, const float* src_vec, float w, float* dst_vec) {
asm volatile (VFP_SWITCH_TO_ARM
// Load the whole matrix.
"fldmias %0, {s8-s23} \n\t"
// Load vector to scalar bank.
"fldmias %1, {s0-s2} \n\t"
// Load w.
"fmsr s3, %3 \n\t"
VFP_VECTOR_LENGTH(3)
// First column times matrix.
"fmuls s24, s8, s0 \n\t"
"fmacs s24, s12, s1 \n\t"
"fmacs s24, s16, s2 \n\t"
"fmacs s24, s20, s3 \n\t"
// Save vector.
"fstmias %2, {s24-s27} \n\t"
VFP_VECTOR_LENGTH_ZERO
VFP_SWITCH_TO_THUMB
:
: "r" (src_mat), "r" (src_vec), "r" (dst_vec), "r" (w)
: "r0", "cc", "memory", VFP_CLOBBER_S0_S3, VFP_CLOBBER_S8_S27
);
}
void Matrix4Vector3ArrayMul(int num, const float* src_mat, int src_stride,
const float* src_vec_array, int dst_stride,
float* dst_vec_array) {
asm volatile (VFP_SWITCH_TO_ARM
// Load the whole matrix.
"fldmias %2, {s8-s23} \n\t"
VFP_VECTOR_LENGTH(3)
"L2000: \n\t"
// Load vector to scalar bank.
"fldmias %0, {s0-s2} \n\t"
"adds %0, %0, %3 \n\t"
// First column times matrix.
"fmuls s24, s8, s0 \n\t"
"fmacs s24, s12, s1 \n\t"
"fmacs s24, s16, s2 \n\t"
"fadds s24, s24, s20 \n\t"
// Save vector.
"fstmias %1, {s24-s27} \n\t"
"adds %1, %1, %4 \n\t"
"subs %5, %5, #1 \n\t"
"bne L2000 \n\t"
VFP_VECTOR_LENGTH_ZERO
VFP_SWITCH_TO_THUMB
: "=r" (src_vec_array), "=r" (dst_vec_array)
: "r" (src_mat), "r" (src_stride), "r" (dst_stride), "r" (num),
"0" (src_vec_array), "1" (dst_vec_array)
: "r0", "cc", "memory", VFP_CLOBBER_S0_S2, VFP_CLOBBER_S8_S27
);
}
void Matrix4Vector3ArrayMul(int num, const float* src_mat, float w, int src_stride,
const float* src_vec_array, int dst_stride,
float* dst_vec_array) {
asm volatile (VFP_SWITCH_TO_ARM
// Load the whole matrix.
"fldmias %2, {s8-s23} \n\t"
"fmsr s3, %6 \n\t"
VFP_VECTOR_LENGTH(3)
"L2010: \n\t"
// Load vector to scalar bank.
"fldmias %0, {s0-s2} \n\t"
"adds %0, %0, %3 \n\t"
// First column times matrix.
"fmuls s24, s8, s0 \n\t"
"fmacs s24, s12, s1 \n\t"
"fmacs s24, s16, s2 \n\t"
"fmacs s24, s20, s3 \n\t"
// Save vector.
"fstmias %1, {s24-s27} \n\t"
"adds %1, %1, %4 \n\t"
"subs %5, %5, #1 \n\t"
"bne L2010 \n\t"
VFP_VECTOR_LENGTH_ZERO
VFP_SWITCH_TO_THUMB
: "=r" (src_vec_array), "=r" (dst_vec_array)
: "r" (src_mat), "r" (src_stride), "r" (dst_stride), "r" (num), "r" (w),
"0" (src_vec_array), "1" (dst_vec_array)
: "r0", "cc", "memory", VFP_CLOBBER_S0_S3, VFP_CLOBBER_S8_S27
);
}
// Computes the inverse of a 4x4 matrix by using the Laplace Extension Theorem
// in 94 multiplications and 1 division.
// Comments in the code refer to David Eberly's definitions
// http://www.geometrictools.com/Documentation/LaplaceExpansionTheorem.pdf
// page 9 (retrieved Feb. 20, 2009).
void Matrix4Invert(const float* src_mat, float* dst_mat) {
const float one = 1.0;
asm volatile (VFP_SWITCH_TO_ARM
// Load the whole matrix transposed.
// Transposing is necessary to be able to execute inversion
// with only 32 fpregs.
VFP_LOAD_MATRIX0_T_TO_S0_S15
// Destination matrix will be {s8 - s23}.
// Mapping of matrix elements is
// s0 s1 s2 s3 a00 a01 a02 a03
// s4 s5 s6 s7 <= a10 a11 a12 a13
// s8 s9 s10 s11 a20 a21 a22 a23
// s12 s13 s14 s15 a30 a31 a32 a33
// Compute s_0 to s_5 (in s25 - s30).
"fmuls s25, s0, s5 \n\t"
"fnmacs s25, s4, s1 \n\t"
"fmuls s26, s0, s6 \n\t"
"fnmacs s26, s4, s2 \n\t"
"fmuls s27, s0, s7 \n\t"
"fnmacs s27, s4, s3 \n\t"
"fmuls s28, s1, s6 \n\t"
"fnmacs s28, s5, s2 \n\t"
"fmuls s29, s1, s7 \n\t"
"fnmacs s29, s5, s3 \n\t"
"fmuls s30, s2, s7 \n\t"
"fnmacs s30, s6, s3 \n\t"
// Process last two columns -> s16-s23.
// Last column.
"fnmuls s20, s9, s30 \n\t"
"fmacs s20, s10, s29 \n\t"
"fnmacs s20, s11, s28 \n\t"
"fmuls s21, s8, s30 \n\t"
"fnmacs s21, s10, s27 \n\t"
"fmacs s21, s11, s26 \n\t"
"fnmuls s22, s8, s29 \n\t"
"fmacs s22, s9, s27 \n\t"
"fnmacs s22, s11, s25 \n\t"
"fmuls s23, s8, s28 \n\t"
"fnmacs s23, s9, s26 \n\t"
"fmacs s23, s10, s25 \n\t"
// Third column.
"fmuls s16, s13, s30 \n\t"
"fnmacs s16, s14, s29 \n\t"
"fmacs s16, s15, s28 \n\t"
"fnmuls s17, s12, s30 \n\t"
"fmacs s17, s14, s27 \n\t"
"fnmacs s17, s15, s26 \n\t"
"fmuls s18, s12, s29 \n\t"
"fnmacs s18, s13, s27 \n\t"
"fmacs s18, s15, s25 \n\t"
"fnmuls s19, s12, s28 \n\t"
"fmacs s19, s13, s26 \n\t"
"fnmacs s19, s14, s25 \n\t"
// Compute c_5 to c_0 (in s24 - s29).
// Determinant will be in s31.
"fmuls s24, s10, s15 \n\t"
"fnmacs s24, s14, s11 \n\t"
"fmuls s31, s24, s25 \n\t"
"fmuls s25, s9, s15 \n\t"
"fnmacs s25, s13, s11 \n\t"
"fnmacs s31, s25, s26 \n\t"
"fmuls s26, s9, s14 \n\t"
"fnmacs s26, s13, s10 \n\t"
"fmacs s31, s26, s27 \n\t"
"fmuls s27, s8, s15 \n\t"
"fnmacs s27, s12, s11 \n\t"
"fmacs s31, s27, s28 \n\t"
"fmuls s28, s8, s14 \n\t"
"fnmacs s28, s12, s10 \n\t"
"fnmacs s31, s28, s29 \n\t"
"fmuls s29, s8, s13 \n\t"
"fnmacs s29, s12, s9 \n\t"
"fmacs s31, s29, s30 \n\t"
// Second column.
"fnmuls s12, s1, s24 \n\t"
"fmacs s12, s2, s25 \n\t"
"fnmacs s12, s3, s26 \n\t"
"fmuls s13, s0, s24 \n\t"
"fnmacs s13, s2, s27 \n\t"
"fmacs s13, s3, s28 \n\t"
"fnmuls s14, s0, s25 \n\t"
"fmacs s14, s1, s27 \n\t"
"fnmacs s14, s3, s29 \n\t"
"fmuls s15, s0, s26 \n\t"
"fnmacs s15, s1, s28 \n\t"
"fmacs s15, s2, s29 \n\t"
// First column.
"fmuls s8, s5, s24 \n\t"
"fnmacs s8, s6, s25 \n\t"
"fmacs s8, s7, s26 \n\t"
"fnmuls s9, s4, s24 \n\t"
"fmacs s9, s6, s27 \n\t"
"fnmacs s9, s7, s28 \n\t"
"fmuls s10, s4, s25 \n\t"
"fnmacs s10, s5, s27 \n\t"
"fmacs s10, s7, s29 \n\t"
"fnmuls s11, s4, s26 \n\t"
"fmacs s11, s5, s28 \n\t"
"fnmacs s11, s6, s29 \n\t"
// Multiply with inverse of determinant.
"fmsr s1, %2 \n\t"
"fdivs s0, s1, s31 \n\t"
VFP_VECTOR_LENGTH(7)
// Remember: destination matrix is in {s8 - s23}
"fmuls s24, s8, s0 \n\t"
"fstmias %1!, {s24-s31} \n\t"
"fmuls s8, s16, s0 \n\t"
"fstmias %1, {s8-s15} \n\t"
VFP_VECTOR_LENGTH(0)
VFP_SWITCH_TO_THUMB
: "=r" (src_mat), "=r" (dst_mat)
: "r" (one), "0" (src_mat), "1" (dst_mat)
: "r0", "cc", "memory", VFP_CLOBBER_S0_S31
);
}
#endif
#endif