00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
00026 #define EIGEN_GENERAL_MATRIX_VECTOR_H
00027
00028 namespace Eigen {
00029
00030 namespace internal {
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045 template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
00046 struct general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
00047 {
00048 typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
00049
00050 enum {
00051 Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
00052 && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
00053 LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
00054 RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
00055 ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
00056 };
00057
00058 typedef typename packet_traits<LhsScalar>::type _LhsPacket;
00059 typedef typename packet_traits<RhsScalar>::type _RhsPacket;
00060 typedef typename packet_traits<ResScalar>::type _ResPacket;
00061
00062 typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
00063 typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
00064 typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
00065
00066 EIGEN_DONT_INLINE static void run(
00067 Index rows, Index cols,
00068 const LhsScalar* lhs, Index lhsStride,
00069 const RhsScalar* rhs, Index rhsIncr,
00070 ResScalar* res, Index
00071 #ifdef EIGEN_INTERNAL_DEBUGGING
00072 resIncr
00073 #endif
00074 , RhsScalar alpha)
00075 {
00076 eigen_internal_assert(resIncr==1);
00077 #ifdef _EIGEN_ACCUMULATE_PACKETS
00078 #error _EIGEN_ACCUMULATE_PACKETS has already been defined
00079 #endif
00080 #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \
00081 pstore(&res[j], \
00082 padd(pload<ResPacket>(&res[j]), \
00083 padd( \
00084 padd(pcj.pmul(EIGEN_CAT(ploa , A0)<LhsPacket>(&lhs0[j]), ptmp0), \
00085 pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs1[j]), ptmp1)), \
00086 padd(pcj.pmul(EIGEN_CAT(ploa , A2)<LhsPacket>(&lhs2[j]), ptmp2), \
00087 pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs3[j]), ptmp3)) )))
00088
00089 conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
00090 conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
00091 if(ConjugateRhs)
00092 alpha = conj(alpha);
00093
00094 enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
00095 const Index columnsAtOnce = 4;
00096 const Index peels = 2;
00097 const Index LhsPacketAlignedMask = LhsPacketSize-1;
00098 const Index ResPacketAlignedMask = ResPacketSize-1;
00099 const Index PeelAlignedMask = ResPacketSize*peels-1;
00100 const Index size = rows;
00101
00102
00103
00104 Index alignedStart = internal::first_aligned(res,size);
00105 Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
00106 const Index peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart;
00107
00108 const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
00109 Index alignmentPattern = alignmentStep==0 ? AllAligned
00110 : alignmentStep==(LhsPacketSize/2) ? EvenAligned
00111 : FirstAligned;
00112
00113
00114 const Index lhsAlignmentOffset = internal::first_aligned(lhs,size);
00115
00116
00117 Index skipColumns = 0;
00118
00119 if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) )
00120 {
00121 alignedSize = 0;
00122 alignedStart = 0;
00123 }
00124 else if (LhsPacketSize>1)
00125 {
00126 eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
00127
00128 while (skipColumns<LhsPacketSize &&
00129 alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
00130 ++skipColumns;
00131 if (skipColumns==LhsPacketSize)
00132 {
00133
00134 alignmentPattern = NoneAligned;
00135 skipColumns = 0;
00136 }
00137 else
00138 {
00139 skipColumns = (std::min)(skipColumns,cols);
00140
00141 }
00142
00143 eigen_internal_assert( (alignmentPattern==NoneAligned)
00144 || (skipColumns + columnsAtOnce >= cols)
00145 || LhsPacketSize > size
00146 || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);
00147 }
00148 else if(Vectorizable)
00149 {
00150 alignedStart = 0;
00151 alignedSize = size;
00152 alignmentPattern = AllAligned;
00153 }
00154
00155 Index offset1 = (FirstAligned && alignmentStep==1?3:1);
00156 Index offset3 = (FirstAligned && alignmentStep==1?1:3);
00157
00158 Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
00159 for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
00160 {
00161 RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[i*rhsIncr]),
00162 ptmp1 = pset1<RhsPacket>(alpha*rhs[(i+offset1)*rhsIncr]),
00163 ptmp2 = pset1<RhsPacket>(alpha*rhs[(i+2)*rhsIncr]),
00164 ptmp3 = pset1<RhsPacket>(alpha*rhs[(i+offset3)*rhsIncr]);
00165
00166
00167 const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride,
00168 *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
00169
00170 if (Vectorizable)
00171 {
00172
00173
00174 for (Index j=0; j<alignedStart; ++j)
00175 {
00176 res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
00177 res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
00178 res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
00179 res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
00180 }
00181
00182 if (alignedSize>alignedStart)
00183 {
00184 switch(alignmentPattern)
00185 {
00186 case AllAligned:
00187 for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
00188 _EIGEN_ACCUMULATE_PACKETS(d,d,d);
00189 break;
00190 case EvenAligned:
00191 for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
00192 _EIGEN_ACCUMULATE_PACKETS(d,du,d);
00193 break;
00194 case FirstAligned:
00195 if(peels>1)
00196 {
00197 LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
00198 ResPacket T0, T1;
00199
00200 A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
00201 A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
00202 A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
00203
00204 for (Index j = alignedStart; j<peeledSize; j+=peels*ResPacketSize)
00205 {
00206 A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11);
00207 A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12);
00208 A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13);
00209
00210 A00 = pload<LhsPacket>(&lhs0[j]);
00211 A10 = pload<LhsPacket>(&lhs0[j+LhsPacketSize]);
00212 T0 = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
00213 T1 = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));
00214
00215 T0 = pcj.pmadd(A01, ptmp1, T0);
00216 A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01);
00217 T0 = pcj.pmadd(A02, ptmp2, T0);
00218 A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02);
00219 T0 = pcj.pmadd(A03, ptmp3, T0);
00220 pstore(&res[j],T0);
00221 A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03);
00222 T1 = pcj.pmadd(A11, ptmp1, T1);
00223 T1 = pcj.pmadd(A12, ptmp2, T1);
00224 T1 = pcj.pmadd(A13, ptmp3, T1);
00225 pstore(&res[j+ResPacketSize],T1);
00226 }
00227 }
00228 for (Index j = peeledSize; j<alignedSize; j+=ResPacketSize)
00229 _EIGEN_ACCUMULATE_PACKETS(d,du,du);
00230 break;
00231 default:
00232 for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
00233 _EIGEN_ACCUMULATE_PACKETS(du,du,du);
00234 break;
00235 }
00236 }
00237 }
00238
00239
00240 for (Index j=alignedSize; j<size; ++j)
00241 {
00242 res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
00243 res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
00244 res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
00245 res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
00246 }
00247 }
00248
00249
00250 Index end = cols;
00251 Index start = columnBound;
00252 do
00253 {
00254 for (Index k=start; k<end; ++k)
00255 {
00256 RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[k*rhsIncr]);
00257 const LhsScalar* lhs0 = lhs + k*lhsStride;
00258
00259 if (Vectorizable)
00260 {
00261
00262
00263 for (Index j=0; j<alignedStart; ++j)
00264 res[j] += cj.pmul(lhs0[j], pfirst(ptmp0));
00265
00266 if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
00267 for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
00268 pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
00269 else
00270 for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
00271 pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
00272 }
00273
00274
00275 for (Index i=alignedSize; i<size; ++i)
00276 res[i] += cj.pmul(lhs0[i], pfirst(ptmp0));
00277 }
00278 if (skipColumns)
00279 {
00280 start = 0;
00281 end = skipColumns;
00282 skipColumns = 0;
00283 }
00284 else
00285 break;
00286 } while(Vectorizable);
00287 #undef _EIGEN_ACCUMULATE_PACKETS
00288 }
00289 };
00290
00291
00292
00293
00294
00295
00296
00297
00298
00299
00300
00301 template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
00302 struct general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
00303 {
00304 typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
00305
00306 enum {
00307 Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
00308 && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
00309 LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
00310 RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
00311 ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
00312 };
00313
00314 typedef typename packet_traits<LhsScalar>::type _LhsPacket;
00315 typedef typename packet_traits<RhsScalar>::type _RhsPacket;
00316 typedef typename packet_traits<ResScalar>::type _ResPacket;
00317
00318 typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
00319 typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
00320 typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
00321
00322 EIGEN_DONT_INLINE static void run(
00323 Index rows, Index cols,
00324 const LhsScalar* lhs, Index lhsStride,
00325 const RhsScalar* rhs, Index rhsIncr,
00326 ResScalar* res, Index resIncr,
00327 ResScalar alpha)
00328 {
00329 EIGEN_UNUSED_VARIABLE(rhsIncr);
00330 eigen_internal_assert(rhsIncr==1);
00331 #ifdef _EIGEN_ACCUMULATE_PACKETS
00332 #error _EIGEN_ACCUMULATE_PACKETS has already been defined
00333 #endif
00334
00335 #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\
00336 RhsPacket b = pload<RhsPacket>(&rhs[j]); \
00337 ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \
00338 ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \
00339 ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \
00340 ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); }
00341
00342 conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
00343 conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
00344
00345 enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
00346 const Index rowsAtOnce = 4;
00347 const Index peels = 2;
00348 const Index RhsPacketAlignedMask = RhsPacketSize-1;
00349 const Index LhsPacketAlignedMask = LhsPacketSize-1;
00350 const Index PeelAlignedMask = RhsPacketSize*peels-1;
00351 const Index depth = cols;
00352
00353
00354
00355
00356 Index alignedStart = internal::first_aligned(rhs, depth);
00357 Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
00358 const Index peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart;
00359
00360 const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
00361 Index alignmentPattern = alignmentStep==0 ? AllAligned
00362 : alignmentStep==(LhsPacketSize/2) ? EvenAligned
00363 : FirstAligned;
00364
00365
00366 const Index lhsAlignmentOffset = internal::first_aligned(lhs,depth);
00367
00368
00369 Index skipRows = 0;
00370
00371 if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) )
00372 {
00373 alignedSize = 0;
00374 alignedStart = 0;
00375 }
00376 else if (LhsPacketSize>1)
00377 {
00378 eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);
00379
00380 while (skipRows<LhsPacketSize &&
00381 alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
00382 ++skipRows;
00383 if (skipRows==LhsPacketSize)
00384 {
00385
00386 alignmentPattern = NoneAligned;
00387 skipRows = 0;
00388 }
00389 else
00390 {
00391 skipRows = (std::min)(skipRows,Index(rows));
00392
00393 }
00394 eigen_internal_assert( alignmentPattern==NoneAligned
00395 || LhsPacketSize==1
00396 || (skipRows + rowsAtOnce >= rows)
00397 || LhsPacketSize > depth
00398 || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);
00399 }
00400 else if(Vectorizable)
00401 {
00402 alignedStart = 0;
00403 alignedSize = depth;
00404 alignmentPattern = AllAligned;
00405 }
00406
00407 Index offset1 = (FirstAligned && alignmentStep==1?3:1);
00408 Index offset3 = (FirstAligned && alignmentStep==1?1:3);
00409
00410 Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
00411 for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
00412 {
00413 EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
00414 ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);
00415
00416
00417 const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride,
00418 *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
00419
00420 if (Vectorizable)
00421 {
00422
00423 ResPacket ptmp0 = pset1<ResPacket>(ResScalar(0)), ptmp1 = pset1<ResPacket>(ResScalar(0)),
00424 ptmp2 = pset1<ResPacket>(ResScalar(0)), ptmp3 = pset1<ResPacket>(ResScalar(0));
00425
00426
00427
00428 for (Index j=0; j<alignedStart; ++j)
00429 {
00430 RhsScalar b = rhs[j];
00431 tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
00432 tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
00433 }
00434
00435 if (alignedSize>alignedStart)
00436 {
00437 switch(alignmentPattern)
00438 {
00439 case AllAligned:
00440 for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
00441 _EIGEN_ACCUMULATE_PACKETS(d,d,d);
00442 break;
00443 case EvenAligned:
00444 for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
00445 _EIGEN_ACCUMULATE_PACKETS(d,du,d);
00446 break;
00447 case FirstAligned:
00448 if (peels>1)
00449 {
00450
00451
00452
00453
00454
00455
00456 LhsPacket A01, A02, A03, A11, A12, A13;
00457 A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
00458 A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
00459 A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
00460
00461 for (Index j = alignedStart; j<peeledSize; j+=peels*RhsPacketSize)
00462 {
00463 RhsPacket b = pload<RhsPacket>(&rhs[j]);
00464 A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11);
00465 A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12);
00466 A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13);
00467
00468 ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), b, ptmp0);
00469 ptmp1 = pcj.pmadd(A01, b, ptmp1);
00470 A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01);
00471 ptmp2 = pcj.pmadd(A02, b, ptmp2);
00472 A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02);
00473 ptmp3 = pcj.pmadd(A03, b, ptmp3);
00474 A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03);
00475
00476 b = pload<RhsPacket>(&rhs[j+RhsPacketSize]);
00477 ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0);
00478 ptmp1 = pcj.pmadd(A11, b, ptmp1);
00479 ptmp2 = pcj.pmadd(A12, b, ptmp2);
00480 ptmp3 = pcj.pmadd(A13, b, ptmp3);
00481 }
00482 }
00483 for (Index j = peeledSize; j<alignedSize; j+=RhsPacketSize)
00484 _EIGEN_ACCUMULATE_PACKETS(d,du,du);
00485 break;
00486 default:
00487 for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
00488 _EIGEN_ACCUMULATE_PACKETS(du,du,du);
00489 break;
00490 }
00491 tmp0 += predux(ptmp0);
00492 tmp1 += predux(ptmp1);
00493 tmp2 += predux(ptmp2);
00494 tmp3 += predux(ptmp3);
00495 }
00496 }
00497
00498
00499
00500 for (Index j=alignedSize; j<depth; ++j)
00501 {
00502 RhsScalar b = rhs[j];
00503 tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
00504 tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
00505 }
00506 res[i*resIncr] += alpha*tmp0;
00507 res[(i+offset1)*resIncr] += alpha*tmp1;
00508 res[(i+2)*resIncr] += alpha*tmp2;
00509 res[(i+offset3)*resIncr] += alpha*tmp3;
00510 }
00511
00512
00513 Index end = rows;
00514 Index start = rowBound;
00515 do
00516 {
00517 for (Index i=start; i<end; ++i)
00518 {
00519 EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
00520 ResPacket ptmp0 = pset1<ResPacket>(tmp0);
00521 const LhsScalar* lhs0 = lhs + i*lhsStride;
00522
00523
00524 for (Index j=0; j<alignedStart; ++j)
00525 tmp0 += cj.pmul(lhs0[j], rhs[j]);
00526
00527 if (alignedSize>alignedStart)
00528 {
00529
00530 if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
00531 for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
00532 ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
00533 else
00534 for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
00535 ptmp0 = pcj.pmadd(ploadu<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
00536 tmp0 += predux(ptmp0);
00537 }
00538
00539
00540
00541 for (Index j=alignedSize; j<depth; ++j)
00542 tmp0 += cj.pmul(lhs0[j], rhs[j]);
00543 res[i*resIncr] += alpha*tmp0;
00544 }
00545 if (skipRows)
00546 {
00547 start = 0;
00548 end = skipRows;
00549 skipRows = 0;
00550 }
00551 else
00552 break;
00553 } while(Vectorizable);
00554
00555 #undef _EIGEN_ACCUMULATE_PACKETS
00556 }
00557 };
00558
00559 }
00560
00561 }
00562
00563 #endif // EIGEN_GENERAL_MATRIX_VECTOR_H