#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
#define EIGEN_GENERAL_BLOCK_PANEL_H

namespace Eigen {

namespace internal {

template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
class gebp_traits;

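// Returns 'a' if it is a valid (positive) cache size, otherwise falls back to the default 'b'.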
inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b)
{
  return a<=0 ? b : a;
}

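// Keeps the L1 and last-level cache sizes in function-local statics and lets callers
// either query them (GetAction) or override them (SetAction). When the CPU query
// reports nothing useful, conservative defaults of 8KB (L1) and 1MB (last level) are used.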
inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0)
{
  static std::ptrdiff_t m_l1CacheSize = manage_caching_sizes_helper(queryL1CacheSize(),8 * 1024);
  static std::ptrdiff_t m_l2CacheSize = manage_caching_sizes_helper(queryTopLevelCacheSize(),1*1024*1024);

  if(action==SetAction)
  {
    eigen_internal_assert(l1!=0 && l2!=0);
    m_l1CacheSize = *l1;
    m_l2CacheSize = *l2;
  }
  else if(action==GetAction)
  {
    eigen_internal_assert(l1!=0 && l2!=0);
    *l1 = m_l1CacheSize;
    *l2 = m_l2CacheSize;
  }
  else
  {
    eigen_internal_assert(false);
  }
}

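// Computes the blocking sizes for a packed block x panel matrix product:
//  - k (the depth of the panels) is reduced so that a panel of nr rhs columns,
//    unpacked to broadcast packets, stays within the L1 cache;
//  - m (the number of lhs rows per block) is reduced so that a k x m lhs block fits
//    in a fraction of the last-level cache, and is rounded down to a multiple of mr.
// n is currently unused by this heuristic.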
template<typename LhsScalar, typename RhsScalar, int KcFactor>
void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrdiff_t& n)
{
  EIGEN_UNUSED_VARIABLE(n);

  std::ptrdiff_t l1, l2;

  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
  enum {
    kdiv = KcFactor * 2 * Traits::nr
         * Traits::RhsProgress * sizeof(RhsScalar),
    mr = gebp_traits<LhsScalar,RhsScalar>::mr,
    mr_mask = (0xffffffff/mr)*mr
  };

  manage_caching_sizes(GetAction, &l1, &l2);
  k = std::min<std::ptrdiff_t>(k, l1/kdiv);
  std::ptrdiff_t _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0;
  if(_m<m) m = _m & mr_mask;
}

template<typename LhsScalar, typename RhsScalar>
inline void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrdiff_t& n)
{
  computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n);
}

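// MADD performs a multiply-accumulate c += a*b, with conjugation handled by the CJ helper.
// When EIGEN_HAS_FUSE_CJMADD is defined, the fused pmadd is used directly; otherwise
// gebp_madd goes through an explicit temporary T, which presumably helps the compiler
// keep the accumulator in a register on targets without a fused multiply-add.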
#ifdef EIGEN_HAS_FUSE_CJMADD
  #define MADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C);
#else

  template<typename CJ, typename A, typename B, typename C, typename T> struct gebp_madd_selector {
    EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& )
    {
      c = cj.pmadd(a,b,c);
    }
  };

  template<typename CJ, typename T> struct gebp_madd_selector<CJ,T,T,T,T> {
    EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t)
    {
      t = b; t = cj.pmul(a,t); c = padd(c,t);
    }
  };

  template<typename CJ, typename A, typename B, typename C, typename T>
  EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t)
  {
    gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
  }

  #define MADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T);

#endif

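/* Vectorization strategy of the block-panel kernel, as implemented by the gebp_traits
 * specializations below:
 *  - real * real: the rhs is unpacked to constant (broadcast) packets and a plain
 *                 packet multiply-add is used;
 *  - cplx * real: same as above, operating on the real packet view of the complex lhs;
 *  - cplx * cplx: the rhs is unpacked into separate real and imaginary broadcast packets
 *                 (DoublePacket); the two half products are accumulated independently
 *                 and recombined in acc();
 *  - real * cplx: the real lhs entries are duplicated (ploaddup) to match the complex
 *                 packet layout.
 */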
template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
class gebp_traits
{
public:
  typedef _LhsScalar LhsScalar;
  typedef _RhsScalar RhsScalar;
  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  enum {
    ConjLhs = _ConjLhs,
    ConjRhs = _ConjRhs,
    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,

    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,

    nr = NumberOfRegisters/4,

    mr = 2 * LhsPacketSize,

    WorkSpaceFactor = nr * RhsPacketSize,

    LhsProgress = LhsPacketSize,
    RhsProgress = RhsPacketSize
  };

  typedef typename packet_traits<LhsScalar>::type _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type _RhsPacket;
  typedef typename packet_traits<ResScalar>::type _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  typedef ResPacket AccPacket;

  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
  {
    p = pset1<ResPacket>(ResScalar(0));
  }

  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
  {
    for(DenseIndex k=0; k<n; k++)
      pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = pload<RhsPacket>(b);
  }

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = pload<LhsPacket>(a);
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, AccPacket& tmp) const
  {
    tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);
  }

  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    r = pmadd(c,alpha,r);
  }

protected:
};

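// Specialization for LhsScalar = std::complex<Real>, RhsScalar = Real: the real rhs
// coefficients are broadcast into real packets and multiplied against the real packet
// view of the complex lhs (a.v); the lhs conjugation is deferred to acc() via conj_helper.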
template<typename RealScalar, bool _ConjLhs>
class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
{
public:
  typedef std::complex<RealScalar> LhsScalar;
  typedef RealScalar RhsScalar;
  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  enum {
    ConjLhs = _ConjLhs,
    ConjRhs = false,
    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,

    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
    nr = NumberOfRegisters/4,
    mr = 2 * LhsPacketSize,
    WorkSpaceFactor = nr*RhsPacketSize,

    LhsProgress = LhsPacketSize,
    RhsProgress = RhsPacketSize
  };

  typedef typename packet_traits<LhsScalar>::type _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type _RhsPacket;
  typedef typename packet_traits<ResScalar>::type _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  typedef ResPacket AccPacket;

  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
  {
    p = pset1<ResPacket>(ResScalar(0));
  }

  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
  {
    for(DenseIndex k=0; k<n; k++)
      pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = pload<RhsPacket>(b);
  }

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = pload<LhsPacket>(a);
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
  {
    madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
  {
    tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& , const false_type&) const
  {
    c += a * b;
  }

  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    r = cj.pmadd(c,alpha,r);
  }

protected:
  conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
};

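// Specialization for complex * complex products. The rhs is unpacked into a pair of
// real packets holding the broadcast real and imaginary parts (DoublePacket), so the
// kernel can accumulate the two half products separately; acc() then recombines them,
// applying the conjugation pattern required by ConjLhs/ConjRhs via pcplxflip/pconj.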
template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
{
public:
  typedef std::complex<RealScalar> Scalar;
  typedef std::complex<RealScalar> LhsScalar;
  typedef std::complex<RealScalar> RhsScalar;
  typedef std::complex<RealScalar> ResScalar;

  enum {
    ConjLhs = _ConjLhs,
    ConjRhs = _ConjRhs,
    Vectorizable = packet_traits<RealScalar>::Vectorizable
                && packet_traits<Scalar>::Vectorizable,
    RealPacketSize = Vectorizable ? packet_traits<RealScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,

    nr = 2,
    mr = 2 * ResPacketSize,
    WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr,

    LhsProgress = ResPacketSize,
    RhsProgress = Vectorizable ? 2*ResPacketSize : 1
  };

  typedef typename packet_traits<RealScalar>::type RealPacket;
  typedef typename packet_traits<Scalar>::type ScalarPacket;
  struct DoublePacket
  {
    RealPacket first;
    RealPacket second;
  };

  typedef typename conditional<Vectorizable,RealPacket, Scalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
  typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type AccPacket;

  EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }

  EIGEN_STRONG_INLINE void initAcc(DoublePacket& p)
  {
    p.first = pset1<RealPacket>(RealScalar(0));
    p.second = pset1<RealPacket>(RealScalar(0));
  }

  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const Scalar* rhs, Scalar* b)
  {
    for(DenseIndex k=0; k<n; k++)
    {
      if(Vectorizable)
      {
        pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+0], real(rhs[k]));
        pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+ResPacketSize], imag(rhs[k]));
      }
      else
        b[k] = rhs[k];
    }
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { dest = *b; }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const
  {
    dest.first = pload<RealPacket>((const RealScalar*)b);
    dest.second = pload<RealPacket>((const RealScalar*)(b+ResPacketSize));
  }

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacket& c, RhsPacket& ) const
  {
    c.first = padd(pmul(a,b.first), c.first);
    c.second = padd(pmul(a,b.second),c.second);
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& ) const
  {
    c = cj.pmadd(a,b,c);
  }

  EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }

  EIGEN_STRONG_INLINE void acc(const DoublePacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    ResPacket tmp;
    if((!ConjLhs)&&(!ConjRhs))
    {
      tmp = pcplxflip(pconj(ResPacket(c.second)));
      tmp = padd(ResPacket(c.first),tmp);
    }
    else if((!ConjLhs)&&(ConjRhs))
    {
      tmp = pconj(pcplxflip(ResPacket(c.second)));
      tmp = padd(ResPacket(c.first),tmp);
    }
    else if((ConjLhs)&&(!ConjRhs))
    {
      tmp = pcplxflip(ResPacket(c.second));
      tmp = padd(pconj(ResPacket(c.first)),tmp);
    }
    else if((ConjLhs)&&(ConjRhs))
    {
      tmp = pcplxflip(ResPacket(c.second));
      tmp = psub(pconj(ResPacket(c.first)),tmp);
    }

    r = pmadd(tmp,alpha,r);
  }

protected:
  conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
};

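// Specialization for a real lhs and a complex rhs: the real lhs coefficients are
// duplicated with ploaddup so that one real value covers both the real and imaginary
// lanes of the complex rhs/result packets.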
template<typename RealScalar, bool _ConjRhs>
class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
{
public:
  typedef std::complex<RealScalar> Scalar;
  typedef RealScalar LhsScalar;
  typedef Scalar RhsScalar;
  typedef Scalar ResScalar;

  enum {
    ConjLhs = false,
    ConjRhs = _ConjRhs,
    Vectorizable = packet_traits<RealScalar>::Vectorizable
                && packet_traits<Scalar>::Vectorizable,
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,

    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
    nr = 4,
    mr = 2*ResPacketSize,
    WorkSpaceFactor = nr*RhsPacketSize,

    LhsProgress = ResPacketSize,
    RhsProgress = ResPacketSize
  };

  typedef typename packet_traits<LhsScalar>::type _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type _RhsPacket;
  typedef typename packet_traits<ResScalar>::type _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  typedef ResPacket AccPacket;

  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
  {
    p = pset1<ResPacket>(ResScalar(0));
  }

  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
  {
    for(DenseIndex k=0; k<n; k++)
      pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = pload<RhsPacket>(b);
  }

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = ploaddup<LhsPacket>(a);
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
  {
    madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
  {
    tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& , const false_type&) const
  {
    c += a * b;
  }

  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    r = cj.pmadd(alpha,c,r);
  }

protected:
  conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
};

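/* General packed Block x packed Panel product kernel (gebp).
 *
 * Computes res += alpha * blockA * blockB, where blockA (micro panels of mr rows x depth)
 * and blockB (micro panels of depth x nr columns) have been packed by gemm_pack_lhs and
 * gemm_pack_rhs. The column loop is blocked by nr, the row loop by mr (then by one packet,
 * then by single scalars), and the depth loop is peeled by four to keep the accumulators
 * in registers.
 */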
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel
{
  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
  typedef typename Traits::ResScalar ResScalar;
  typedef typename Traits::LhsPacket LhsPacket;
  typedef typename Traits::RhsPacket RhsPacket;
  typedef typename Traits::ResPacket ResPacket;
  typedef typename Traits::AccPacket AccPacket;

  enum {
    Vectorizable = Traits::Vectorizable,
    LhsProgress = Traits::LhsProgress,
    RhsProgress = Traits::RhsProgress,
    ResPacketSize = Traits::ResPacketSize
  };

  EIGEN_DONT_INLINE EIGEN_FLATTEN_ATTRIB
  void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB = 0)
  {
    Traits traits;

    if(strideA==-1) strideA = depth;
    if(strideB==-1) strideB = depth;
    conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;

    Index packet_cols = (cols/nr) * nr;
    const Index peeled_mc = (rows/mr)*mr;

    const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= LhsProgress ? LhsProgress : 0);
    const Index peeled_kc = (depth/4)*4;

    if(unpackedB==0)
      unpackedB = const_cast<RhsScalar*>(blockB - strideB * nr * RhsProgress);

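    // loop over the micro vertical panels of the rhs (depth x nr);
    // each panel is first unpacked (broadcast) into unpackedB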
    for(Index j2=0; j2<packet_cols; j2+=nr)
    {
      traits.unpackRhs(depth*nr,&blockB[j2*strideB+offsetB*nr],unpackedB);

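      // loop over the micro horizontal panels of the lhs (mr x depth);
      // each mr x nr result block is held in the accumulator registers C0..C7
      // (only C0, C1, C4, C5 are used when nr==2)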
      for(Index i=0; i<peeled_mc; i+=mr)
      {
        const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
        prefetch(&blA[0]);

        AccPacket C0, C1, C2, C3, C4, C5, C6, C7;
        traits.initAcc(C0);
        traits.initAcc(C1);
        if(nr==4) traits.initAcc(C2);
        if(nr==4) traits.initAcc(C3);
        traits.initAcc(C4);
        traits.initAcc(C5);
        if(nr==4) traits.initAcc(C6);
        if(nr==4) traits.initAcc(C7);

        ResScalar* r0 = &res[(j2+0)*resStride + i];
        ResScalar* r1 = r0 + resStride;
        ResScalar* r2 = r1 + resStride;
        ResScalar* r3 = r2 + resStride;

        prefetch(r0+16);
        prefetch(r1+16);
        prefetch(r2+16);
        prefetch(r3+16);

        const RhsScalar* blB = unpackedB;
        for(Index k=0; k<peeled_kc; k+=4)
        {
          if(nr==2)
          {
            LhsPacket A0, A1;
            RhsPacket B_0;
            RhsPacket T0;

            EIGEN_ASM_COMMENT("mybegin2");
            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadLhs(&blA[1*LhsProgress], A1);
            traits.loadRhs(&blB[0*RhsProgress], B_0);
            traits.madd(A0,B_0,C0,T0);
            traits.madd(A1,B_0,C4,B_0);
            traits.loadRhs(&blB[1*RhsProgress], B_0);
            traits.madd(A0,B_0,C1,T0);
            traits.madd(A1,B_0,C5,B_0);

            traits.loadLhs(&blA[2*LhsProgress], A0);
            traits.loadLhs(&blA[3*LhsProgress], A1);
            traits.loadRhs(&blB[2*RhsProgress], B_0);
            traits.madd(A0,B_0,C0,T0);
            traits.madd(A1,B_0,C4,B_0);
            traits.loadRhs(&blB[3*RhsProgress], B_0);
            traits.madd(A0,B_0,C1,T0);
            traits.madd(A1,B_0,C5,B_0);

            traits.loadLhs(&blA[4*LhsProgress], A0);
            traits.loadLhs(&blA[5*LhsProgress], A1);
            traits.loadRhs(&blB[4*RhsProgress], B_0);
            traits.madd(A0,B_0,C0,T0);
            traits.madd(A1,B_0,C4,B_0);
            traits.loadRhs(&blB[5*RhsProgress], B_0);
            traits.madd(A0,B_0,C1,T0);
            traits.madd(A1,B_0,C5,B_0);

            traits.loadLhs(&blA[6*LhsProgress], A0);
            traits.loadLhs(&blA[7*LhsProgress], A1);
            traits.loadRhs(&blB[6*RhsProgress], B_0);
            traits.madd(A0,B_0,C0,T0);
            traits.madd(A1,B_0,C4,B_0);
            traits.loadRhs(&blB[7*RhsProgress], B_0);
            traits.madd(A0,B_0,C1,T0);
            traits.madd(A1,B_0,C5,B_0);
            EIGEN_ASM_COMMENT("myend");
          }
          else
          {
            EIGEN_ASM_COMMENT("mybegin4");
            LhsPacket A0, A1;
            RhsPacket B_0, B1, B2, B3;
            RhsPacket T0;

            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadLhs(&blA[1*LhsProgress], A1);
            traits.loadRhs(&blB[0*RhsProgress], B_0);
            traits.loadRhs(&blB[1*RhsProgress], B1);

            traits.madd(A0,B_0,C0,T0);
            traits.loadRhs(&blB[2*RhsProgress], B2);
            traits.madd(A1,B_0,C4,B_0);
            traits.loadRhs(&blB[3*RhsProgress], B3);
            traits.loadRhs(&blB[4*RhsProgress], B_0);
            traits.madd(A0,B1,C1,T0);
            traits.madd(A1,B1,C5,B1);
            traits.loadRhs(&blB[5*RhsProgress], B1);
            traits.madd(A0,B2,C2,T0);
            traits.madd(A1,B2,C6,B2);
            traits.loadRhs(&blB[6*RhsProgress], B2);
            traits.madd(A0,B3,C3,T0);
            traits.loadLhs(&blA[2*LhsProgress], A0);
            traits.madd(A1,B3,C7,B3);
            traits.loadLhs(&blA[3*LhsProgress], A1);
            traits.loadRhs(&blB[7*RhsProgress], B3);
            traits.madd(A0,B_0,C0,T0);
            traits.madd(A1,B_0,C4,B_0);
            traits.loadRhs(&blB[8*RhsProgress], B_0);
            traits.madd(A0,B1,C1,T0);
            traits.madd(A1,B1,C5,B1);
            traits.loadRhs(&blB[9*RhsProgress], B1);
            traits.madd(A0,B2,C2,T0);
            traits.madd(A1,B2,C6,B2);
            traits.loadRhs(&blB[10*RhsProgress], B2);
            traits.madd(A0,B3,C3,T0);
            traits.loadLhs(&blA[4*LhsProgress], A0);
            traits.madd(A1,B3,C7,B3);
            traits.loadLhs(&blA[5*LhsProgress], A1);
            traits.loadRhs(&blB[11*RhsProgress], B3);

            traits.madd(A0,B_0,C0,T0);
            traits.madd(A1,B_0,C4,B_0);
            traits.loadRhs(&blB[12*RhsProgress], B_0);
            traits.madd(A0,B1,C1,T0);
            traits.madd(A1,B1,C5,B1);
            traits.loadRhs(&blB[13*RhsProgress], B1);
            traits.madd(A0,B2,C2,T0);
            traits.madd(A1,B2,C6,B2);
            traits.loadRhs(&blB[14*RhsProgress], B2);
            traits.madd(A0,B3,C3,T0);
            traits.loadLhs(&blA[6*LhsProgress], A0);
            traits.madd(A1,B3,C7,B3);
            traits.loadLhs(&blA[7*LhsProgress], A1);
            traits.loadRhs(&blB[15*RhsProgress], B3);
            traits.madd(A0,B_0,C0,T0);
            traits.madd(A1,B_0,C4,B_0);
            traits.madd(A0,B1,C1,T0);
            traits.madd(A1,B1,C5,B1);
            traits.madd(A0,B2,C2,T0);
            traits.madd(A1,B2,C6,B2);
            traits.madd(A0,B3,C3,T0);
            traits.madd(A1,B3,C7,B3);
          }

          blB += 4*nr*RhsProgress;
          blA += 4*mr;
        }

        for(Index k=peeled_kc; k<depth; k++)
        {
          if(nr==2)
          {
            LhsPacket A0, A1;
            RhsPacket B_0;
            RhsPacket T0;

            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadLhs(&blA[1*LhsProgress], A1);
            traits.loadRhs(&blB[0*RhsProgress], B_0);
            traits.madd(A0,B_0,C0,T0);
            traits.madd(A1,B_0,C4,B_0);
            traits.loadRhs(&blB[1*RhsProgress], B_0);
            traits.madd(A0,B_0,C1,T0);
            traits.madd(A1,B_0,C5,B_0);
          }
          else
          {
            LhsPacket A0, A1;
            RhsPacket B_0, B1, B2, B3;
            RhsPacket T0;

            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadLhs(&blA[1*LhsProgress], A1);
            traits.loadRhs(&blB[0*RhsProgress], B_0);
            traits.loadRhs(&blB[1*RhsProgress], B1);

            traits.madd(A0,B_0,C0,T0);
            traits.loadRhs(&blB[2*RhsProgress], B2);
            traits.madd(A1,B_0,C4,B_0);
            traits.loadRhs(&blB[3*RhsProgress], B3);
            traits.madd(A0,B1,C1,T0);
            traits.madd(A1,B1,C5,B1);
            traits.madd(A0,B2,C2,T0);
            traits.madd(A1,B2,C6,B2);
            traits.madd(A0,B3,C3,T0);
            traits.madd(A1,B3,C7,B3);
          }

          blB += nr*RhsProgress;
          blA += mr;
        }

        if(nr==4)
        {
          ResPacket R0, R1, R2, R3, R4, R5, R6;
          ResPacket alphav = pset1<ResPacket>(alpha);

          R0 = ploadu<ResPacket>(r0);
          R1 = ploadu<ResPacket>(r1);
          R2 = ploadu<ResPacket>(r2);
          R3 = ploadu<ResPacket>(r3);
          R4 = ploadu<ResPacket>(r0 + ResPacketSize);
          R5 = ploadu<ResPacket>(r1 + ResPacketSize);
          R6 = ploadu<ResPacket>(r2 + ResPacketSize);
          traits.acc(C0, alphav, R0);
          pstoreu(r0, R0);
          R0 = ploadu<ResPacket>(r3 + ResPacketSize);

          traits.acc(C1, alphav, R1);
          traits.acc(C2, alphav, R2);
          traits.acc(C3, alphav, R3);
          traits.acc(C4, alphav, R4);
          traits.acc(C5, alphav, R5);
          traits.acc(C6, alphav, R6);
          traits.acc(C7, alphav, R0);

          pstoreu(r1, R1);
          pstoreu(r2, R2);
          pstoreu(r3, R3);
          pstoreu(r0 + ResPacketSize, R4);
          pstoreu(r1 + ResPacketSize, R5);
          pstoreu(r2 + ResPacketSize, R6);
          pstoreu(r3 + ResPacketSize, R0);
        }
        else
        {
          ResPacket R0, R1, R4;
          ResPacket alphav = pset1<ResPacket>(alpha);

          R0 = ploadu<ResPacket>(r0);
          R1 = ploadu<ResPacket>(r1);
          R4 = ploadu<ResPacket>(r0 + ResPacketSize);
          traits.acc(C0, alphav, R0);
          pstoreu(r0, R0);
          R0 = ploadu<ResPacket>(r1 + ResPacketSize);
          traits.acc(C1, alphav, R1);
          traits.acc(C4, alphav, R4);
          traits.acc(C5, alphav, R0);
          pstoreu(r1, R1);
          pstoreu(r0 + ResPacketSize, R4);
          pstoreu(r1 + ResPacketSize, R0);
        }

      }

      if(rows-peeled_mc>=LhsProgress)
      {
        Index i = peeled_mc;
        const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress];
        prefetch(&blA[0]);

        AccPacket C0, C1, C2, C3;
        traits.initAcc(C0);
        traits.initAcc(C1);
        if(nr==4) traits.initAcc(C2);
        if(nr==4) traits.initAcc(C3);

        const RhsScalar* blB = unpackedB;
        for(Index k=0; k<peeled_kc; k+=4)
        {
          if(nr==2)
          {
            LhsPacket A0;
            RhsPacket B_0, B1;

            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadRhs(&blB[0*RhsProgress], B_0);
            traits.loadRhs(&blB[1*RhsProgress], B1);
            traits.madd(A0,B_0,C0,B_0);
            traits.loadRhs(&blB[2*RhsProgress], B_0);
            traits.madd(A0,B1,C1,B1);
            traits.loadLhs(&blA[1*LhsProgress], A0);
            traits.loadRhs(&blB[3*RhsProgress], B1);
            traits.madd(A0,B_0,C0,B_0);
            traits.loadRhs(&blB[4*RhsProgress], B_0);
            traits.madd(A0,B1,C1,B1);
            traits.loadLhs(&blA[2*LhsProgress], A0);
            traits.loadRhs(&blB[5*RhsProgress], B1);
            traits.madd(A0,B_0,C0,B_0);
            traits.loadRhs(&blB[6*RhsProgress], B_0);
            traits.madd(A0,B1,C1,B1);
            traits.loadLhs(&blA[3*LhsProgress], A0);
            traits.loadRhs(&blB[7*RhsProgress], B1);
            traits.madd(A0,B_0,C0,B_0);
            traits.madd(A0,B1,C1,B1);
          }
          else
          {
            LhsPacket A0;
            RhsPacket B_0, B1, B2, B3;

            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadRhs(&blB[0*RhsProgress], B_0);
            traits.loadRhs(&blB[1*RhsProgress], B1);

            traits.madd(A0,B_0,C0,B_0);
            traits.loadRhs(&blB[2*RhsProgress], B2);
            traits.loadRhs(&blB[3*RhsProgress], B3);
            traits.loadRhs(&blB[4*RhsProgress], B_0);
            traits.madd(A0,B1,C1,B1);
            traits.loadRhs(&blB[5*RhsProgress], B1);
            traits.madd(A0,B2,C2,B2);
            traits.loadRhs(&blB[6*RhsProgress], B2);
            traits.madd(A0,B3,C3,B3);
            traits.loadLhs(&blA[1*LhsProgress], A0);
            traits.loadRhs(&blB[7*RhsProgress], B3);
            traits.madd(A0,B_0,C0,B_0);
            traits.loadRhs(&blB[8*RhsProgress], B_0);
            traits.madd(A0,B1,C1,B1);
            traits.loadRhs(&blB[9*RhsProgress], B1);
            traits.madd(A0,B2,C2,B2);
            traits.loadRhs(&blB[10*RhsProgress], B2);
            traits.madd(A0,B3,C3,B3);
            traits.loadLhs(&blA[2*LhsProgress], A0);
            traits.loadRhs(&blB[11*RhsProgress], B3);

            traits.madd(A0,B_0,C0,B_0);
            traits.loadRhs(&blB[12*RhsProgress], B_0);
            traits.madd(A0,B1,C1,B1);
            traits.loadRhs(&blB[13*RhsProgress], B1);
            traits.madd(A0,B2,C2,B2);
            traits.loadRhs(&blB[14*RhsProgress], B2);
            traits.madd(A0,B3,C3,B3);

            traits.loadLhs(&blA[3*LhsProgress], A0);
            traits.loadRhs(&blB[15*RhsProgress], B3);
            traits.madd(A0,B_0,C0,B_0);
            traits.madd(A0,B1,C1,B1);
            traits.madd(A0,B2,C2,B2);
            traits.madd(A0,B3,C3,B3);
          }

          blB += nr*4*RhsProgress;
          blA += 4*LhsProgress;
        }

        for(Index k=peeled_kc; k<depth; k++)
        {
          if(nr==2)
          {
            LhsPacket A0;
            RhsPacket B_0, B1;

            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadRhs(&blB[0*RhsProgress], B_0);
            traits.loadRhs(&blB[1*RhsProgress], B1);
            traits.madd(A0,B_0,C0,B_0);
            traits.madd(A0,B1,C1,B1);
          }
          else
          {
            LhsPacket A0;
            RhsPacket B_0, B1, B2, B3;

            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadRhs(&blB[0*RhsProgress], B_0);
            traits.loadRhs(&blB[1*RhsProgress], B1);
            traits.loadRhs(&blB[2*RhsProgress], B2);
            traits.loadRhs(&blB[3*RhsProgress], B3);

            traits.madd(A0,B_0,C0,B_0);
            traits.madd(A0,B1,C1,B1);
            traits.madd(A0,B2,C2,B2);
            traits.madd(A0,B3,C3,B3);
          }

          blB += nr*RhsProgress;
          blA += LhsProgress;
        }

        ResPacket R0, R1, R2, R3;
        ResPacket alphav = pset1<ResPacket>(alpha);

        ResScalar* r0 = &res[(j2+0)*resStride + i];
        ResScalar* r1 = r0 + resStride;
        ResScalar* r2 = r1 + resStride;
        ResScalar* r3 = r2 + resStride;

        R0 = ploadu<ResPacket>(r0);
        R1 = ploadu<ResPacket>(r1);
        if(nr==4) R2 = ploadu<ResPacket>(r2);
        if(nr==4) R3 = ploadu<ResPacket>(r3);

        traits.acc(C0, alphav, R0);
        traits.acc(C1, alphav, R1);
        if(nr==4) traits.acc(C2, alphav, R2);
        if(nr==4) traits.acc(C3, alphav, R3);

        pstoreu(r0, R0);
        pstoreu(r1, R1);
        if(nr==4) pstoreu(r2, R2);
        if(nr==4) pstoreu(r3, R3);
      }
      for(Index i=peeled_mc2; i<rows; i++)
      {
        const LhsScalar* blA = &blockA[i*strideA+offsetA];
        prefetch(&blA[0]);

        ResScalar C0(0), C1(0), C2(0), C3(0);

        const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
        for(Index k=0; k<depth; k++)
        {
          if(nr==2)
          {
            LhsScalar A0;
            RhsScalar B_0, B1;

            A0 = blA[k];
            B_0 = blB[0];
            B1 = blB[1];
            MADD(cj,A0,B_0,C0,B_0);
            MADD(cj,A0,B1,C1,B1);
          }
          else
          {
            LhsScalar A0;
            RhsScalar B_0, B1, B2, B3;

            A0 = blA[k];
            B_0 = blB[0];
            B1 = blB[1];
            B2 = blB[2];
            B3 = blB[3];

            MADD(cj,A0,B_0,C0,B_0);
            MADD(cj,A0,B1,C1,B1);
            MADD(cj,A0,B2,C2,B2);
            MADD(cj,A0,B3,C3,B3);
          }

          blB += nr;
        }
        res[(j2+0)*resStride + i] += alpha*C0;
        res[(j2+1)*resStride + i] += alpha*C1;
        if(nr==4) res[(j2+2)*resStride + i] += alpha*C2;
        if(nr==4) res[(j2+3)*resStride + i] += alpha*C3;
      }
    }

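    // process the remaining columns one by one (cols is not necessarily a multiple of nr)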
    for(Index j2=packet_cols; j2<cols; j2++)
    {
      traits.unpackRhs(depth, &blockB[j2*strideB+offsetB], unpackedB);

      for(Index i=0; i<peeled_mc; i+=mr)
      {
        const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
        prefetch(&blA[0]);

        AccPacket C0, C4;
        traits.initAcc(C0);
        traits.initAcc(C4);

        const RhsScalar* blB = unpackedB;
        for(Index k=0; k<depth; k++)
        {
          LhsPacket A0, A1;
          RhsPacket B_0;
          RhsPacket T0;

          traits.loadLhs(&blA[0*LhsProgress], A0);
          traits.loadLhs(&blA[1*LhsProgress], A1);
          traits.loadRhs(&blB[0*RhsProgress], B_0);
          traits.madd(A0,B_0,C0,T0);
          traits.madd(A1,B_0,C4,B_0);

          blB += RhsProgress;
          blA += 2*LhsProgress;
        }
        ResPacket R0, R4;
        ResPacket alphav = pset1<ResPacket>(alpha);

        ResScalar* r0 = &res[(j2+0)*resStride + i];

        R0 = ploadu<ResPacket>(r0);
        R4 = ploadu<ResPacket>(r0+ResPacketSize);

        traits.acc(C0, alphav, R0);
        traits.acc(C4, alphav, R4);

        pstoreu(r0, R0);
        pstoreu(r0+ResPacketSize, R4);
      }
      if(rows-peeled_mc>=LhsProgress)
      {
        Index i = peeled_mc;
        const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress];
        prefetch(&blA[0]);

        AccPacket C0;
        traits.initAcc(C0);

        const RhsScalar* blB = unpackedB;
        for(Index k=0; k<depth; k++)
        {
          LhsPacket A0;
          RhsPacket B_0;
          traits.loadLhs(blA, A0);
          traits.loadRhs(blB, B_0);
          traits.madd(A0, B_0, C0, B_0);
          blB += RhsProgress;
          blA += LhsProgress;
        }

        ResPacket alphav = pset1<ResPacket>(alpha);
        ResPacket R0 = ploadu<ResPacket>(&res[(j2+0)*resStride + i]);
        traits.acc(C0, alphav, R0);
        pstoreu(&res[(j2+0)*resStride + i], R0);
      }
      for(Index i=peeled_mc2; i<rows; i++)
      {
        const LhsScalar* blA = &blockA[i*strideA+offsetA];
        prefetch(&blA[0]);

        ResScalar C0(0);

        const RhsScalar* blB = &blockB[j2*strideB+offsetB];
        for(Index k=0; k<depth; k++)
        {
          LhsScalar A0 = blA[k];
          RhsScalar B_0 = blB[k];
          MADD(cj, A0, B_0, C0, B_0);
        }
        res[(j2+0)*resStride + i] += alpha*C0;
      }
    }
  }
};

#undef MADD
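// Packs a block of the lhs into blockA. Panels of Pack1 rows (then Pack2, then single
// rows) are copied so that the kernel can read consecutive lhs coefficients per depth
// step; optional conjugation is applied while packing, and in PanelMode a leading pad
// of 'offset' and a trailing pad of 'stride-offset-depth' entries is skipped per panel.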
template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs
{
  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows,
                                    Index stride=0, Index offset=0)
  {
    typedef typename packet_traits<Scalar>::type Packet;
    enum { PacketSize = packet_traits<Scalar>::size };

    EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
    eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
    eigen_assert( (StorageOrder==RowMajor) || ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) );
    conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
    const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs,lhsStride);
    Index count = 0;
    Index peeled_mc = (rows/Pack1)*Pack1;
    for(Index i=0; i<peeled_mc; i+=Pack1)
    {
      if(PanelMode) count += Pack1 * offset;

      if(StorageOrder==ColMajor)
      {
        for(Index k=0; k<depth; k++)
        {
          Packet A, B, C, D;
          if(Pack1>=1*PacketSize) A = ploadu<Packet>(&lhs(i+0*PacketSize, k));
          if(Pack1>=2*PacketSize) B = ploadu<Packet>(&lhs(i+1*PacketSize, k));
          if(Pack1>=3*PacketSize) C = ploadu<Packet>(&lhs(i+2*PacketSize, k));
          if(Pack1>=4*PacketSize) D = ploadu<Packet>(&lhs(i+3*PacketSize, k));
          if(Pack1>=1*PacketSize) { pstore(blockA+count, cj.pconj(A)); count+=PacketSize; }
          if(Pack1>=2*PacketSize) { pstore(blockA+count, cj.pconj(B)); count+=PacketSize; }
          if(Pack1>=3*PacketSize) { pstore(blockA+count, cj.pconj(C)); count+=PacketSize; }
          if(Pack1>=4*PacketSize) { pstore(blockA+count, cj.pconj(D)); count+=PacketSize; }
        }
      }
      else
      {
        for(Index k=0; k<depth; k++)
        {
          Index w=0;
          for(; w<Pack1-3; w+=4)
          {
            Scalar a(cj(lhs(i+w+0, k))),
                   b(cj(lhs(i+w+1, k))),
                   c(cj(lhs(i+w+2, k))),
                   d(cj(lhs(i+w+3, k)));
            blockA[count++] = a;
            blockA[count++] = b;
            blockA[count++] = c;
            blockA[count++] = d;
          }
          if(Pack1%4)
            for(;w<Pack1;++w)
              blockA[count++] = cj(lhs(i+w, k));
        }
      }
      if(PanelMode) count += Pack1 * (stride-offset-depth);
    }
    if(rows-peeled_mc>=Pack2)
    {
      if(PanelMode) count += Pack2*offset;
      for(Index k=0; k<depth; k++)
        for(Index w=0; w<Pack2; w++)
          blockA[count++] = cj(lhs(peeled_mc+w, k));
      if(PanelMode) count += Pack2 * (stride-offset-depth);
      peeled_mc += Pack2;
    }
    for(Index i=peeled_mc; i<rows; i++)
    {
      if(PanelMode) count += offset;
      for(Index k=0; k<depth; k++)
        blockA[count++] = cj(lhs(i, k));
      if(PanelMode) count += (stride-offset-depth);
    }
  }
};

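// Packs a block of a column-major rhs into blockB: groups of nr adjacent columns are
// interleaved element by element so the kernel reads nr rhs coefficients per depth step;
// the remaining columns are copied one by one. PanelMode padding works as for the lhs.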
template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
{
  typedef typename packet_traits<Scalar>::type Packet;
  enum { PacketSize = packet_traits<Scalar>::size };
  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols,
                                    Index stride=0, Index offset=0)
  {
    EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
    eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
    conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
    Index packet_cols = (cols/nr) * nr;
    Index count = 0;
    for(Index j2=0; j2<packet_cols; j2+=nr)
    {
      if(PanelMode) count += nr * offset;
      const Scalar* b0 = &rhs[(j2+0)*rhsStride];
      const Scalar* b1 = &rhs[(j2+1)*rhsStride];
      const Scalar* b2 = &rhs[(j2+2)*rhsStride];
      const Scalar* b3 = &rhs[(j2+3)*rhsStride];
      for(Index k=0; k<depth; k++)
      {
        blockB[count+0] = cj(b0[k]);
        blockB[count+1] = cj(b1[k]);
        if(nr==4) blockB[count+2] = cj(b2[k]);
        if(nr==4) blockB[count+3] = cj(b3[k]);
        count += nr;
      }

      if(PanelMode) count += nr * (stride-offset-depth);
    }

    for(Index j2=packet_cols; j2<cols; ++j2)
    {
      if(PanelMode) count += offset;
      const Scalar* b0 = &rhs[(j2+0)*rhsStride];
      for(Index k=0; k<depth; k++)
      {
        blockB[count] = cj(b0[k]);
        count += 1;
      }
      if(PanelMode) count += (stride-offset-depth);
    }
  }
};

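// Same packing for a row-major rhs: within each group of nr columns, the coefficients
// of one depth step are already contiguous in memory and are copied directly.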
template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
{
  enum { PacketSize = packet_traits<Scalar>::size };
  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols,
                                    Index stride=0, Index offset=0)
  {
    EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
    eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
    conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
    Index packet_cols = (cols/nr) * nr;
    Index count = 0;
    for(Index j2=0; j2<packet_cols; j2+=nr)
    {
      if(PanelMode) count += nr * offset;
      for(Index k=0; k<depth; k++)
      {
        const Scalar* b0 = &rhs[k*rhsStride + j2];
        blockB[count+0] = cj(b0[0]);
        blockB[count+1] = cj(b0[1]);
        if(nr==4) blockB[count+2] = cj(b0[2]);
        if(nr==4) blockB[count+3] = cj(b0[3]);
        count += nr;
      }

      if(PanelMode) count += nr * (stride-offset-depth);
    }

    for(Index j2=packet_cols; j2<cols; ++j2)
    {
      if(PanelMode) count += offset;
      const Scalar* b0 = &rhs[j2];
      for(Index k=0; k<depth; k++)
      {
        blockB[count] = cj(b0[k*rhsStride]);
        count += 1;
      }
      if(PanelMode) count += stride-offset-depth;
    }
  }
};

}

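/** \returns the currently set level 1 CPU cache size (in bytes) used to estimate the
  * blocking size parameters of matrix products.
  * \sa setCpuCacheSizes */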
inline std::ptrdiff_t l1CacheSize()
{
  std::ptrdiff_t l1, l2;
  internal::manage_caching_sizes(GetAction, &l1, &l2);
  return l1;
}

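/** \returns the currently set level 2 CPU cache size (in bytes) used to estimate the
  * blocking size parameters of matrix products.
  * \sa setCpuCacheSizes */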
inline std::ptrdiff_t l2CacheSize()
{
  std::ptrdiff_t l1, l2;
  internal::manage_caching_sizes(GetAction, &l1, &l2);
  return l2;
}

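/** Sets the L1 and L2 CPU cache sizes (in bytes) used by the blocking heuristics of
  * matrix products. A typical call from user code might look like:
  * \code
  * Eigen::setCpuCacheSizes(32*1024, 1*1024*1024); // 32KB L1, 1MB L2
  * \endcode
  * \sa l1CacheSize(), l2CacheSize() */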
inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2)
{
  internal::manage_caching_sizes(SetAction, &l1, &l2);
}

}

#endif // EIGEN_GENERAL_BLOCK_PANEL_H