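// 4x4 matrix inverse specializations for SSE (float) and SSE2 (double).
// Both kernels invert the matrix blockwise: the 4x4 matrix is split into
// four 2x2 blocks whose determinants and adjugate products are combined
// into the inverse, entirely in SIMD registers. These specializations of
// internal::compute_inverse_size4 stand in for the generic size-4 inverse
// kernel when SSE vectorization is enabled.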
#ifndef EIGEN_INVERSE_SSE_H
#define EIGEN_INVERSE_SSE_H

namespace Eigen {

namespace internal {

template<typename MatrixType, typename ResultType>
struct compute_inverse_size4<Architecture::SSE, float, MatrixType, ResultType>
{
  enum {
    MatrixAlignment    = bool(MatrixType::Flags&AlignedBit),
    ResultAlignment    = bool(ResultType::Flags&AlignedBit),
    StorageOrdersMatch = (MatrixType::Flags&RowMajorBit) == (ResultType::Flags&RowMajorBit)
  };

  static void run(const MatrixType& matrix, ResultType& result)
  {
    // Sign mask (+,-,-,+) applied to the reciprocal of the determinant below.
    EIGEN_ALIGN16 const unsigned int _Sign_PNNP[4] = { 0x00000000, 0x80000000, 0x80000000, 0x00000000 };

    // Load the full 4x4 matrix, four coefficients per SSE register.
    __m128 _L1 = matrix.template packet<MatrixAlignment>( 0);
    __m128 _L2 = matrix.template packet<MatrixAlignment>( 4);
    __m128 _L3 = matrix.template packet<MatrixAlignment>( 8);
    __m128 _L4 = matrix.template packet<MatrixAlignment>(12);
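
    // The inverse is computed with a divide-and-conquer scheme: the 4x4 matrix
    // is split into four 2x2 blocks, each held in a single SSE register. In the
    // comments below, X# denotes the adjugate of a 2x2 block X and |X| its
    // determinant, so that X*X# = X#*X = |X|*I. The determinant of the full
    // matrix is
    //   det = |A|*|D| + |B|*|C| - trace(A#*B * D#*C)
    // and each 2x2 block of the inverse is assembled from terms such as
    // A*|D| - B*D#*C, scaled by 1/det.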
    // Split into the four 2x2 sub-matrices. If the storage orders of the input
    // and the result differ, the blocks are extracted from the transposed
    // matrix (transposition commutes with inversion).
    __m128 A, B, C, D;
    if(!StorageOrdersMatch)
    {
      A = _mm_unpacklo_ps(_L1, _L2);
      B = _mm_unpacklo_ps(_L3, _L4);
      C = _mm_unpackhi_ps(_L1, _L2);
      D = _mm_unpackhi_ps(_L3, _L4);
    }
    else
    {
      A = _mm_movelh_ps(_L1, _L2);
      B = _mm_movehl_ps(_L2, _L1);
      C = _mm_movelh_ps(_L3, _L4);
      D = _mm_movehl_ps(_L4, _L3);
    }

    __m128 iA, iB, iC, iD,  // partial inverses of the sub-matrices
           DC, AB;          // adjugate products D#*C and A#*B
    __m128 dA, dB, dC, dD;  // determinants of the sub-matrices
    __m128 det, d, d1, d2;
    __m128 rd;              // reciprocal of the determinant

    // AB = A# * B
    AB = _mm_mul_ps(_mm_shuffle_ps(A,A,0x0F), B);
    AB = _mm_sub_ps(AB,_mm_mul_ps(_mm_shuffle_ps(A,A,0xA5), _mm_shuffle_ps(B,B,0x4E)));
    // DC = D# * C
    DC = _mm_mul_ps(_mm_shuffle_ps(D,D,0x0F), C);
    DC = _mm_sub_ps(DC,_mm_mul_ps(_mm_shuffle_ps(D,D,0xA5), _mm_shuffle_ps(C,C,0x4E)));

    // dA = |A|
    dA = _mm_mul_ps(_mm_shuffle_ps(A, A, 0x5F),A);
    dA = _mm_sub_ss(dA, _mm_movehl_ps(dA,dA));
    // dB = |B|
    dB = _mm_mul_ps(_mm_shuffle_ps(B, B, 0x5F),B);
    dB = _mm_sub_ss(dB, _mm_movehl_ps(dB,dB));

    // dC = |C|
    dC = _mm_mul_ps(_mm_shuffle_ps(C, C, 0x5F),C);
    dC = _mm_sub_ss(dC, _mm_movehl_ps(dC,dC));
    // dD = |D|
    dD = _mm_mul_ps(_mm_shuffle_ps(D, D, 0x5F),D);
    dD = _mm_sub_ss(dD, _mm_movehl_ps(dD,dD));

    // d = trace(AB*DC) = trace(A#*B*D#*C), element-wise products first
    d = _mm_mul_ps(_mm_shuffle_ps(DC,DC,0xD8),AB);

    // iD = C*A#*B
    iD = _mm_mul_ps(_mm_shuffle_ps(C,C,0xA0), _mm_movelh_ps(AB,AB));
    iD = _mm_add_ps(iD,_mm_mul_ps(_mm_shuffle_ps(C,C,0xF5), _mm_movehl_ps(AB,AB)));
    // iA = B*D#*C
    iA = _mm_mul_ps(_mm_shuffle_ps(B,B,0xA0), _mm_movelh_ps(DC,DC));
    iA = _mm_add_ps(iA,_mm_mul_ps(_mm_shuffle_ps(B,B,0xF5), _mm_movehl_ps(DC,DC)));

    // finish the trace by summing the four lanes of d
    d  = _mm_add_ps(d, _mm_movehl_ps(d, d));
    d  = _mm_add_ss(d, _mm_shuffle_ps(d, d, 1));
    d1 = _mm_mul_ss(dA,dD);  // d1 = |A|*|D|
    d2 = _mm_mul_ss(dB,dC);  // d2 = |B|*|C|

    // iD = D*|A| - C*A#*B
    iD = _mm_sub_ps(_mm_mul_ps(D,_mm_shuffle_ps(dA,dA,0)), iD);

    // iA = A*|D| - B*D#*C
    iA = _mm_sub_ps(_mm_mul_ps(A,_mm_shuffle_ps(dD,dD,0)), iA);

    // det = |A|*|D| + |B|*|C| - trace(A#*B*D#*C)
    det = _mm_sub_ss(_mm_add_ss(d1,d2),d);
    rd  = _mm_div_ss(_mm_set_ss(1.0f), det);
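    // Note: no singularity check is performed; if det == 0 the division
    // produces inf and the resulting coefficients are meaningless.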

    // iB = D * (A#*B)# = D*B#*A
    iB = _mm_mul_ps(D, _mm_shuffle_ps(AB,AB,0x33));
    iB = _mm_sub_ps(iB, _mm_mul_ps(_mm_shuffle_ps(D,D,0xB1), _mm_shuffle_ps(AB,AB,0x66)));
    // iC = A * (D#*C)# = A*C#*D
    iC = _mm_mul_ps(A, _mm_shuffle_ps(DC,DC,0x33));
    iC = _mm_sub_ps(iC, _mm_mul_ps(_mm_shuffle_ps(A,A,0xB1), _mm_shuffle_ps(DC,DC,0x66)));

    // broadcast 1/det and flip the sign of the two middle lanes
    rd = _mm_shuffle_ps(rd,rd,0);
    rd = _mm_xor_ps(rd, _mm_load_ps((float*)_Sign_PNNP));

    // iB = C*|B| - D*B#*A
    iB = _mm_sub_ps(_mm_mul_ps(C,_mm_shuffle_ps(dB,dB,0)), iB);

    // iC = B*|C| - A*C#*D
    iC = _mm_sub_ps(_mm_mul_ps(B,_mm_shuffle_ps(dC,dC,0)), iC);

    // iX = iX / det
    iA = _mm_mul_ps(rd,iA);
    iB = _mm_mul_ps(rd,iB);
    iC = _mm_mul_ps(rd,iC);
    iD = _mm_mul_ps(rd,iD);

    // Shuffle the four partial blocks back into the output storage order.
    result.template writePacket<ResultAlignment>( 0, _mm_shuffle_ps(iA,iB,0x77));
    result.template writePacket<ResultAlignment>( 4, _mm_shuffle_ps(iA,iB,0x22));
    result.template writePacket<ResultAlignment>( 8, _mm_shuffle_ps(iC,iD,0x77));
    result.template writePacket<ResultAlignment>(12, _mm_shuffle_ps(iC,iD,0x22));
  }

};

template<typename MatrixType, typename ResultType>
struct compute_inverse_size4<Architecture::SSE, double, MatrixType, ResultType>
{
  enum {
    MatrixAlignment    = bool(MatrixType::Flags&AlignedBit),
    ResultAlignment    = bool(ResultType::Flags&AlignedBit),
    StorageOrdersMatch = (MatrixType::Flags&RowMajorBit) == (ResultType::Flags&RowMajorBit)
  };

  static void run(const MatrixType& matrix, ResultType& result)
  {
    // Sign masks: _Sign_NP flips the sign of lane 0, _Sign_PN flips lane 1.
    const __m128d _Sign_NP = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0));
    const __m128d _Sign_PN = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
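    // Same divide-and-conquer scheme as the float path, but in double precision
    // each 2x2 block X is spread over two SSE registers X1 and X2 (two
    // coefficients each). As before, X# denotes the adjugate of a 2x2 block
    // and |X| its determinant.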
    __m128d A1, A2, B1, B2, C1, C2, D1, D2; // the four 2x2 sub-matrices

    if(StorageOrdersMatch)
    {
      A1 = matrix.template packet<MatrixAlignment>( 0); B1 = matrix.template packet<MatrixAlignment>( 2);
      A2 = matrix.template packet<MatrixAlignment>( 4); B2 = matrix.template packet<MatrixAlignment>( 6);
      C1 = matrix.template packet<MatrixAlignment>( 8); D1 = matrix.template packet<MatrixAlignment>(10);
      C2 = matrix.template packet<MatrixAlignment>(12); D2 = matrix.template packet<MatrixAlignment>(14);
    }
    else
    {
      // storage orders differ: load the blocks and transpose them on the fly
      __m128d tmp;
      A1 = matrix.template packet<MatrixAlignment>( 0); C1 = matrix.template packet<MatrixAlignment>( 2);
      A2 = matrix.template packet<MatrixAlignment>( 4); C2 = matrix.template packet<MatrixAlignment>( 6);
      tmp = A1;
      A1 = _mm_unpacklo_pd(A1,A2);
      A2 = _mm_unpackhi_pd(tmp,A2);
      tmp = C1;
      C1 = _mm_unpacklo_pd(C1,C2);
      C2 = _mm_unpackhi_pd(tmp,C2);

      B1 = matrix.template packet<MatrixAlignment>( 8); D1 = matrix.template packet<MatrixAlignment>(10);
      B2 = matrix.template packet<MatrixAlignment>(12); D2 = matrix.template packet<MatrixAlignment>(14);
      tmp = B1;
      B1 = _mm_unpacklo_pd(B1,B2);
      B2 = _mm_unpackhi_pd(tmp,B2);
      tmp = D1;
      D1 = _mm_unpacklo_pd(D1,D2);
      D2 = _mm_unpackhi_pd(tmp,D2);
    }

    __m128d iA1, iA2, iB1, iB2, iC1, iC2, iD1, iD2, // partial inverses of the sub-matrices
            DC1, DC2, AB1, AB2;                     // adjugate products D#*C and A#*B
    __m128d dA, dB, dC, dD;                         // determinants of the sub-matrices
    __m128d det, d1, d2, rd;                        // determinant terms and reciprocal

    // dA = |A|
    dA = _mm_shuffle_pd(A2, A2, 1);
    dA = _mm_mul_pd(A1, dA);
    dA = _mm_sub_sd(dA, _mm_shuffle_pd(dA,dA,3));
    // dB = |B|
    dB = _mm_shuffle_pd(B2, B2, 1);
    dB = _mm_mul_pd(B1, dB);
    dB = _mm_sub_sd(dB, _mm_shuffle_pd(dB,dB,3));

    // AB = A# * B
    AB1 = _mm_mul_pd(B1, _mm_shuffle_pd(A2,A2,3));
    AB2 = _mm_mul_pd(B2, _mm_shuffle_pd(A1,A1,0));
    AB1 = _mm_sub_pd(AB1, _mm_mul_pd(B2, _mm_shuffle_pd(A1,A1,3)));
    AB2 = _mm_sub_pd(AB2, _mm_mul_pd(B1, _mm_shuffle_pd(A2,A2,0)));

    // dC = |C|
    dC = _mm_shuffle_pd(C2, C2, 1);
    dC = _mm_mul_pd(C1, dC);
    dC = _mm_sub_sd(dC, _mm_shuffle_pd(dC,dC,3));
    // dD = |D|
    dD = _mm_shuffle_pd(D2, D2, 1);
    dD = _mm_mul_pd(D1, dD);
    dD = _mm_sub_sd(dD, _mm_shuffle_pd(dD,dD,3));

    // DC = D# * C
    DC1 = _mm_mul_pd(C1, _mm_shuffle_pd(D2,D2,3));
    DC2 = _mm_mul_pd(C2, _mm_shuffle_pd(D1,D1,0));
    DC1 = _mm_sub_pd(DC1, _mm_mul_pd(C2, _mm_shuffle_pd(D1,D1,3)));
    DC2 = _mm_sub_pd(DC2, _mm_mul_pd(C1, _mm_shuffle_pd(D2,D2,0)));

    // rd = trace(AB*DC) = trace(A#*B*D#*C)
    d1 = _mm_mul_pd(AB1, _mm_shuffle_pd(DC1, DC2, 0));
    d2 = _mm_mul_pd(AB2, _mm_shuffle_pd(DC1, DC2, 3));
    rd = _mm_add_pd(d1, d2);
    rd = _mm_add_sd(rd, _mm_shuffle_pd(rd, rd,3));

    // iD = C*A#*B
    iD1 = _mm_mul_pd(AB1, _mm_shuffle_pd(C1,C1,0));
    iD2 = _mm_mul_pd(AB1, _mm_shuffle_pd(C2,C2,0));
    iD1 = _mm_add_pd(iD1, _mm_mul_pd(AB2, _mm_shuffle_pd(C1,C1,3)));
    iD2 = _mm_add_pd(iD2, _mm_mul_pd(AB2, _mm_shuffle_pd(C2,C2,3)));

    // iA = B*D#*C
    iA1 = _mm_mul_pd(DC1, _mm_shuffle_pd(B1,B1,0));
    iA2 = _mm_mul_pd(DC1, _mm_shuffle_pd(B2,B2,0));
    iA1 = _mm_add_pd(iA1, _mm_mul_pd(DC2, _mm_shuffle_pd(B1,B1,3)));
    iA2 = _mm_add_pd(iA2, _mm_mul_pd(DC2, _mm_shuffle_pd(B2,B2,3)));

    // iD = D*|A| - C*A#*B
    dA = _mm_shuffle_pd(dA,dA,0);
    iD1 = _mm_sub_pd(_mm_mul_pd(D1, dA), iD1);
    iD2 = _mm_sub_pd(_mm_mul_pd(D2, dA), iD2);

    // iA = A*|D| - B*D#*C
    dD = _mm_shuffle_pd(dD,dD,0);
    iA1 = _mm_sub_pd(_mm_mul_pd(A1, dD), iA1);
    iA2 = _mm_sub_pd(_mm_mul_pd(A2, dD), iA2);

    d1 = _mm_mul_sd(dA, dD);  // d1 = |A|*|D|
    d2 = _mm_mul_sd(dB, dC);  // d2 = |B|*|C|

    // iB = D * (A#*B)# = D*B#*A
    iB1 = _mm_mul_pd(D1, _mm_shuffle_pd(AB2,AB1,1));
    iB2 = _mm_mul_pd(D2, _mm_shuffle_pd(AB2,AB1,1));
    iB1 = _mm_sub_pd(iB1, _mm_mul_pd(_mm_shuffle_pd(D1,D1,1), _mm_shuffle_pd(AB2,AB1,2)));
    iB2 = _mm_sub_pd(iB2, _mm_mul_pd(_mm_shuffle_pd(D2,D2,1), _mm_shuffle_pd(AB2,AB1,2)));

    // det = |A|*|D| + |B|*|C| - trace(A#*B*D#*C)
    det = _mm_add_sd(d1, d2);
    det = _mm_sub_sd(det, rd);

    // iC = A * (D#*C)# = A*C#*D
    iC1 = _mm_mul_pd(A1, _mm_shuffle_pd(DC2,DC1,1));
    iC2 = _mm_mul_pd(A2, _mm_shuffle_pd(DC2,DC1,1));
    iC1 = _mm_sub_pd(iC1, _mm_mul_pd(_mm_shuffle_pd(A1,A1,1), _mm_shuffle_pd(DC2,DC1,2)));
    iC2 = _mm_sub_pd(iC2, _mm_mul_pd(_mm_shuffle_pd(A2,A2,1), _mm_shuffle_pd(DC2,DC1,2)));

    // rd = 1/det, broadcast to both lanes (no singularity check, as in the float path)
    rd = _mm_div_sd(_mm_set_sd(1.0), det);
    rd = _mm_shuffle_pd(rd,rd,0);

    // iB = C*|B| - D*B#*A
    dB = _mm_shuffle_pd(dB,dB,0);
    iB1 = _mm_sub_pd(_mm_mul_pd(C1, dB), iB1);
    iB2 = _mm_sub_pd(_mm_mul_pd(C2, dB), iB2);

    // signed copies of 1/det used to scale the output blocks
    d1 = _mm_xor_pd(rd, _Sign_PN);
    d2 = _mm_xor_pd(rd, _Sign_NP);

    // iC = B*|C| - A*C#*D
    dC = _mm_shuffle_pd(dC,dC,0);
    iC1 = _mm_sub_pd(_mm_mul_pd(B1, dC), iC1);
    iC2 = _mm_sub_pd(_mm_mul_pd(B2, dC), iC2);

    // Scale by the signed 1/det and shuffle each block back into the output
    // storage order (each 2x2 block of the result spans two packets).
    result.template writePacket<ResultAlignment>( 0, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 3), d1));
    result.template writePacket<ResultAlignment>( 4, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 0), d2));
    result.template writePacket<ResultAlignment>( 2, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 3), d1));
    result.template writePacket<ResultAlignment>( 6, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 0), d2));
    result.template writePacket<ResultAlignment>( 8, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 3), d1));
    result.template writePacket<ResultAlignment>(12, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 0), d2));
    result.template writePacket<ResultAlignment>(10, _mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 3), d1));
    result.template writePacket<ResultAlignment>(14, _mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 0), d2));
  }
};

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_INVERSE_SSE_H