6 #ifndef _EULE_NO_INTRINSICS_
15 for (std::size_t
i = 0;
i < 4;
i++)
16 for (std::size_t
j = 0;
j < 4;
j++)
17 v[
i][
j] =
double(
i ==
j);
30 v = std::move(other.v);
39 #ifndef _EULE_NO_INTRINSICS_
45 __m256d __va1 = _mm256_set_pd(
v[0][0],
v[0][0],
v[0][0],
v[1][0]);
46 __m256d __va2 = _mm256_set_pd(
v[1][0],
v[1][0],
v[2][0],
v[2][0]);
48 __m256d __oa1 = _mm256_set_pd(other[0][0], other[0][1], other[0][2], other[0][0]);
49 __m256d __oa2 = _mm256_set_pd(other[0][1], other[0][2], other[0][0], other[0][1]);
51 __m256d __vb1 = _mm256_set_pd(
v[0][1],
v[0][1],
v[0][1],
v[1][1]);
52 __m256d __vb2 = _mm256_set_pd(
v[1][1],
v[1][1],
v[2][1],
v[2][1]);
54 __m256d __ob1 = _mm256_set_pd(other[1][0], other[1][1], other[1][2], other[1][0]);
55 __m256d __ob2 = _mm256_set_pd(other[1][1], other[1][2], other[1][0], other[1][1]);
57 __m256d __vc1 = _mm256_set_pd(
v[0][2],
v[0][2],
v[0][2],
v[1][2]);
58 __m256d __vc2 = _mm256_set_pd(
v[1][2],
v[1][2],
v[2][2],
v[2][2]);
60 __m256d __oc1 = _mm256_set_pd(other[2][0], other[2][1], other[2][2], other[2][0]);
61 __m256d __oc2 = _mm256_set_pd(other[2][1], other[2][2], other[2][0], other[2][1]);
64 __m256d __sum1 = _mm256_set1_pd(0);
65 __m256d __sum2 = _mm256_set1_pd(0);
69 __sum1 = _mm256_fmadd_pd(__va1, __oa1, __sum1);
70 __sum1 = _mm256_fmadd_pd(__vb1, __ob1, __sum1);
71 __sum1 = _mm256_fmadd_pd(__vc1, __oc1, __sum1);
74 __sum2 = _mm256_fmadd_pd(__va2, __oa2, __sum2);
75 __sum2 = _mm256_fmadd_pd(__vb2, __ob2, __sum2);
76 __sum2 = _mm256_fmadd_pd(__vc2, __oc2, __sum2);
82 _mm256_storeu_pd(sum1, __sum1);
83 _mm256_storeu_pd(sum2, __sum2);
87 newMatrix[0][0] = sum1[3];
88 newMatrix[0][1] = sum1[2];
89 newMatrix[0][2] = sum1[1];
90 newMatrix[1][0] = sum1[0];
93 newMatrix[1][1] = sum2[3];
94 newMatrix[1][2] = sum2[2];
95 newMatrix[2][0] = sum2[1];
96 newMatrix[2][1] = sum2[0];
99 newMatrix[2][2] = (
v[2][0] * other[0][2]) + (
v[2][1] * other[1][2]) + (
v[2][2] * other[2][2]);
105 __m256d __transSelf = _mm256_set_pd(0,
l,
h,
d);
106 __m256d __transOther = _mm256_set_pd(0, other.l, other.h, other.d);
109 __m256d __sum = _mm256_add_pd(__transSelf, __transOther);
113 _mm256_storeu_pd(sum, __sum);
116 newMatrix.
d = sum[0];
117 newMatrix.
h = sum[1];
118 newMatrix.
l = sum[2];
124 newMatrix[0][0] = (
v[0][0] * other[0][0]) + (
v[0][1] * other[1][0]) + (
v[0][2] * other[2][0]);
125 newMatrix[0][1] = (
v[0][0] * other[0][1]) + (
v[0][1] * other[1][1]) + (
v[0][2] * other[2][1]);
126 newMatrix[0][2] = (
v[0][0] * other[0][2]) + (
v[0][1] * other[1][2]) + (
v[0][2] * other[2][2]);
128 newMatrix[1][0] = (
v[1][0] * other[0][0]) + (
v[1][1] * other[1][0]) + (
v[1][2] * other[2][0]);
129 newMatrix[1][1] = (
v[1][0] * other[0][1]) + (
v[1][1] * other[1][1]) + (
v[1][2] * other[2][1]);
130 newMatrix[1][2] = (
v[1][0] * other[0][2]) + (
v[1][1] * other[1][2]) + (
v[1][2] * other[2][2]);
132 newMatrix[2][0] = (
v[2][0] * other[0][0]) + (
v[2][1] * other[1][0]) + (
v[2][2] * other[2][0]);
133 newMatrix[2][1] = (
v[2][0] * other[0][1]) + (
v[2][1] * other[1][1]) + (
v[2][2] * other[2][1]);
134 newMatrix[2][2] = (
v[2][0] * other[0][2]) + (
v[2][1] * other[1][2]) + (
v[2][2] * other[2][2]);
138 newMatrix[0][3] =
v[0][3] + other[0][3];
139 newMatrix[1][3] =
v[1][3] + other[1][3];
140 newMatrix[2][3] =
v[2][3] + other[2][3];
149 *
this = *
this * other;
168 #ifndef _EULE_NO_INTRINSICS_
171 __m256d __row0 = _mm256_set_pd(
v[0][3],
v[0][2],
v[0][1],
v[0][0]);
172 __m256d __row1 = _mm256_set_pd(
v[1][3],
v[1][2],
v[1][1],
v[1][0]);
173 __m256d __row2 = _mm256_set_pd(
v[2][3],
v[2][2],
v[2][1],
v[2][0]);
174 __m256d __row3 = _mm256_set_pd(
v[3][3],
v[3][2],
v[3][1],
v[3][0]);
177 __m256d __scalar = _mm256_set1_pd(scalar);
180 __m256d __sr0 = _mm256_mul_pd(__row0, __scalar);
181 __m256d __sr1 = _mm256_mul_pd(__row1, __scalar);
182 __m256d __sr2 = _mm256_mul_pd(__row2, __scalar);
183 __m256d __sr3 = _mm256_mul_pd(__row3, __scalar);
186 _mm256_storeu_pd(
m.v[0].data(), __sr0);
187 _mm256_storeu_pd(
m.v[1].data(), __sr1);
188 _mm256_storeu_pd(
m.v[2].data(), __sr2);
189 _mm256_storeu_pd(
m.v[3].data(), __sr3);
193 for (std::size_t x = 0; x < 4; x++)
194 for (std::size_t y = 0; y < 4; y++)
195 m[x][y] =
v[x][y] * scalar;
204 *
this = *
this * scalar;
210 const double precomputeDivision = 1.0 / denominator;
212 return *
this * precomputeDivision;
217 *
this = *
this / denominator;
225 #ifndef _EULE_NO_INTRINSICS_
228 __m256d __row0a = _mm256_set_pd(
v[0][3],
v[0][2],
v[0][1],
v[0][0]);
229 __m256d __row1a = _mm256_set_pd(
v[1][3],
v[1][2],
v[1][1],
v[1][0]);
230 __m256d __row2a = _mm256_set_pd(
v[2][3],
v[2][2],
v[2][1],
v[2][0]);
231 __m256d __row3a = _mm256_set_pd(
v[3][3],
v[3][2],
v[3][1],
v[3][0]);
233 __m256d __row0b = _mm256_set_pd(other[0][3], other[0][2], other[0][1], other[0][0]);
234 __m256d __row1b = _mm256_set_pd(other[1][3], other[1][2], other[1][1], other[1][0]);
235 __m256d __row2b = _mm256_set_pd(other[2][3], other[2][2], other[2][1], other[2][0]);
236 __m256d __row3b = _mm256_set_pd(other[3][3], other[3][2], other[3][1], other[3][0]);
239 __m256d __sr0 = _mm256_add_pd(__row0a, __row0b);
240 __m256d __sr1 = _mm256_add_pd(__row1a, __row1b);
241 __m256d __sr2 = _mm256_add_pd(__row2a, __row2b);
242 __m256d __sr3 = _mm256_add_pd(__row3a, __row3b);
245 _mm256_storeu_pd(
m.v[0].data(), __sr0);
246 _mm256_storeu_pd(
m.v[1].data(), __sr1);
247 _mm256_storeu_pd(
m.v[2].data(), __sr2);
248 _mm256_storeu_pd(
m.v[3].data(), __sr3);
252 for (std::size_t x = 0; x < 4; x++)
253 for (std::size_t y = 0; y < 4; y++)
254 m[x][y] =
v[x][y] + other[x][y];
263 #ifndef _EULE_NO_INTRINSICS_
267 __m256d __row0a = _mm256_set_pd(
v[0][3],
v[0][2],
v[0][1],
v[0][0]);
268 __m256d __row1a = _mm256_set_pd(
v[1][3],
v[1][2],
v[1][1],
v[1][0]);
269 __m256d __row2a = _mm256_set_pd(
v[2][3],
v[2][2],
v[2][1],
v[2][0]);
270 __m256d __row3a = _mm256_set_pd(
v[3][3],
v[3][2],
v[3][1],
v[3][0]);
272 __m256d __row0b = _mm256_set_pd(other[0][3], other[0][2], other[0][1], other[0][0]);
273 __m256d __row1b = _mm256_set_pd(other[1][3], other[1][2], other[1][1], other[1][0]);
274 __m256d __row2b = _mm256_set_pd(other[2][3], other[2][2], other[2][1], other[2][0]);
275 __m256d __row3b = _mm256_set_pd(other[3][3], other[3][2], other[3][1], other[3][0]);
278 __m256d __sr0 = _mm256_add_pd(__row0a, __row0b);
279 __m256d __sr1 = _mm256_add_pd(__row1a, __row1b);
280 __m256d __sr2 = _mm256_add_pd(__row2a, __row2b);
281 __m256d __sr3 = _mm256_add_pd(__row3a, __row3b);
284 _mm256_storeu_pd(
v[0].data(), __sr0);
285 _mm256_storeu_pd(
v[1].data(), __sr1);
286 _mm256_storeu_pd(
v[2].data(), __sr2);
287 _mm256_storeu_pd(
v[3].data(), __sr3);
291 *
this = *
this + other;
302 #ifndef _EULE_NO_INTRINSICS_
305 __m256d __row0a = _mm256_set_pd(
v[0][3],
v[0][2],
v[0][1],
v[0][0]);
306 __m256d __row1a = _mm256_set_pd(
v[1][3],
v[1][2],
v[1][1],
v[1][0]);
307 __m256d __row2a = _mm256_set_pd(
v[2][3],
v[2][2],
v[2][1],
v[2][0]);
308 __m256d __row3a = _mm256_set_pd(
v[3][3],
v[3][2],
v[3][1],
v[3][0]);
310 __m256d __row0b = _mm256_set_pd(other[0][3], other[0][2], other[0][1], other[0][0]);
311 __m256d __row1b = _mm256_set_pd(other[1][3], other[1][2], other[1][1], other[1][0]);
312 __m256d __row2b = _mm256_set_pd(other[2][3], other[2][2], other[2][1], other[2][0]);
313 __m256d __row3b = _mm256_set_pd(other[3][3], other[3][2], other[3][1], other[3][0]);
316 __m256d __sr0 = _mm256_sub_pd(__row0a, __row0b);
317 __m256d __sr1 = _mm256_sub_pd(__row1a, __row1b);
318 __m256d __sr2 = _mm256_sub_pd(__row2a, __row2b);
319 __m256d __sr3 = _mm256_sub_pd(__row3a, __row3b);
322 _mm256_storeu_pd(
m.v[0].data(), __sr0);
323 _mm256_storeu_pd(
m.v[1].data(), __sr1);
324 _mm256_storeu_pd(
m.v[2].data(), __sr2);
325 _mm256_storeu_pd(
m.v[3].data(), __sr3);
329 for (std::size_t x = 0; x < 4; x++)
330 for (std::size_t y = 0; y < 4; y++)
331 m[x][y] =
v[x][y] - other[x][y];
340 #ifndef _EULE_NO_INTRINSICS_
344 __m256d __row0a = _mm256_set_pd(
v[0][3],
v[0][2],
v[0][1],
v[0][0]);
345 __m256d __row1a = _mm256_set_pd(
v[1][3],
v[1][2],
v[1][1],
v[1][0]);
346 __m256d __row2a = _mm256_set_pd(
v[2][3],
v[2][2],
v[2][1],
v[2][0]);
347 __m256d __row3a = _mm256_set_pd(
v[3][3],
v[3][2],
v[3][1],
v[3][0]);
349 __m256d __row0b = _mm256_set_pd(other[0][3], other[0][2], other[0][1], other[0][0]);
350 __m256d __row1b = _mm256_set_pd(other[1][3], other[1][2], other[1][1], other[1][0]);
351 __m256d __row2b = _mm256_set_pd(other[2][3], other[2][2], other[2][1], other[2][0]);
352 __m256d __row3b = _mm256_set_pd(other[3][3], other[3][2], other[3][1], other[3][0]);
355 __m256d __sr0 = _mm256_sub_pd(__row0a, __row0b);
356 __m256d __sr1 = _mm256_sub_pd(__row1a, __row1b);
357 __m256d __sr2 = _mm256_sub_pd(__row2a, __row2b);
358 __m256d __sr3 = _mm256_sub_pd(__row3a, __row3b);
361 _mm256_storeu_pd(
v[0].data(), __sr0);
362 _mm256_storeu_pd(
v[1].data(), __sr1);
363 _mm256_storeu_pd(
v[2].data(), __sr2);
364 _mm256_storeu_pd(
v[3].data(), __sr3);
368 *
this = *
this - other;
393 v = std::move(other.v);
433 for (std::size_t
i = 0;
i < 3;
i++)
434 for (std::size_t
j = 0;
j < 3;
j++)
435 trans[
j][
i] =
v[
i][
j];
444 for (std::size_t
i = 0;
i < 4;
i++)
445 for (std::size_t
j = 0;
j < 4;
j++)
446 trans[
j][
i] =
v[
i][
j];
455 m[0][0] = (
v[0][0]*
o[0][0]) + (
v[0][1]*
o[1][0]) + (
v[0][2]*
o[2][0]) + (
v[0][3]*
o[3][0]);
456 m[0][1] = (
v[0][0]*
o[0][1]) + (
v[0][1]*
o[1][1]) + (
v[0][2]*
o[2][1]) + (
v[0][3]*
o[3][1]);
457 m[0][2] = (
v[0][0]*
o[0][2]) + (
v[0][1]*
o[1][2]) + (
v[0][2]*
o[2][2]) + (
v[0][3]*
o[3][2]);
458 m[0][3] = (
v[0][0]*
o[0][3]) + (
v[0][1]*
o[1][3]) + (
v[0][2]*
o[2][3]) + (
v[0][3]*
o[3][3]);
460 m[1][0] = (
v[1][0]*
o[0][0]) + (
v[1][1]*
o[1][0]) + (
v[1][2]*
o[2][0]) + (
v[1][3]*
o[3][0]);
461 m[1][1] = (
v[1][0]*
o[0][1]) + (
v[1][1]*
o[1][1]) + (
v[1][2]*
o[2][1]) + (
v[1][3]*
o[3][1]);
462 m[1][2] = (
v[1][0]*
o[0][2]) + (
v[1][1]*
o[1][2]) + (
v[1][2]*
o[2][2]) + (
v[1][3]*
o[3][2]);
463 m[1][3] = (
v[1][0]*
o[0][3]) + (
v[1][1]*
o[1][3]) + (
v[1][2]*
o[2][3]) + (
v[1][3]*
o[3][3]);
465 m[2][0] = (
v[2][0]*
o[0][0]) + (
v[2][1]*
o[1][0]) + (
v[2][2]*
o[2][0]) + (
v[2][3]*
o[3][0]);
466 m[2][1] = (
v[2][0]*
o[0][1]) + (
v[2][1]*
o[1][1]) + (
v[2][2]*
o[2][1]) + (
v[2][3]*
o[3][1]);
467 m[2][2] = (
v[2][0]*
o[0][2]) + (
v[2][1]*
o[1][2]) + (
v[2][2]*
o[2][2]) + (
v[2][3]*
o[3][2]);
468 m[2][3] = (
v[2][0]*
o[0][3]) + (
v[2][1]*
o[1][3]) + (
v[2][2]*
o[2][3]) + (
v[2][3]*
o[3][3]);
470 m[3][0] = (
v[3][0]*
o[0][0]) + (
v[3][1]*
o[1][0]) + (
v[3][2]*
o[2][0]) + (
v[3][3]*
o[3][0]);
471 m[3][1] = (
v[3][0]*
o[0][1]) + (
v[3][1]*
o[1][1]) + (
v[3][2]*
o[2][1]) + (
v[3][3]*
o[3][1]);
472 m[3][2] = (
v[3][0]*
o[0][2]) + (
v[3][1]*
o[1][2]) + (
v[3][2]*
o[2][2]) + (
v[3][3]*
o[3][2]);
473 m[3][3] = (
v[3][0]*
o[0][3]) + (
v[3][1]*
o[1][3]) + (
v[3][2]*
o[2][3]) + (
v[3][3]*
o[3][3]);
481 throw std::runtime_error(
"Dimension out of range! 0 <= n <= 4");
488 for (std::size_t y = 0; y <
n; y++)
489 for (std::size_t x = 0; x <
n; x++)
491 if ((y !=
p) && (x != q))
493 cofs[
i][
j] =
v[y][x];
514 throw std::runtime_error(
"Dimension out of range! 0 <= n <= 4");
522 for (std::size_t x = 0; x <
n; x++)
536 throw std::runtime_error(
"Dimension out of range! 0 <= n <= 4");
541 for (std::size_t
i = 0;
i <
n;
i++)
542 for (std::size_t
j = 0;
j <
n;
j++)
548 sign = ((
i +
j) % 2 == 0) ? 1 : -1;
564 throw std::runtime_error(
"Matrix3x3 not inversible!");
568 for (std::size_t
i = 0;
i < 3;
i++)
569 for (std::size_t
j = 0;
j < 3;
j++)
570 inv[
i][
j] = adj[
i][
j] / det;
583 throw std::runtime_error(
"Matrix4x4 not inversible!");
587 for (std::size_t
i = 0;
i < 4;
i++)
588 for (std::size_t
j = 0;
j < 4;
j++)
589 inv[
i][
j] = adj[
i][
j] / det;
610 for (std::size_t
i = 0;
i < 4;
i++)
611 for (std::size_t
j = 0;
j < 4;
j++)
624 for (std::size_t y = 0; y < 4; y++)
626 for (std::size_t x = 0; x < 4; x++)
627 os <<
" | " << m[y][x];
629 os <<
" |" << std::endl;
639 for (std::size_t y = 0; y < 4; y++)
641 for (std::size_t x = 0; x < 4; x++)
642 os << L
" | " << m[y][x];
644 os << L
" |" << std::endl;