PacketMath.h
Go to the documentation of this file.
00001 // This file is part of Eigen, a lightweight C++ template library
00002 // for linear algebra.
00003 //
00004 // Copyright (C) 2008 Konstantinos Margaritis <markos@codex.gr>
00005 //
00006 // Eigen is free software; you can redistribute it and/or
00007 // modify it under the terms of the GNU Lesser General Public
00008 // License as published by the Free Software Foundation; either
00009 // version 3 of the License, or (at your option) any later version.
00010 //
00011 // Alternatively, you can redistribute it and/or
00012 // modify it under the terms of the GNU General Public License as
00013 // published by the Free Software Foundation; either version 2 of
00014 // the License, or (at your option) any later version.
00015 //
00016 // Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
00017 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
00018 // FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
00019 // GNU General Public License for more details.
00020 //
00021 // You should have received a copy of the GNU Lesser General Public
00022 // License and a copy of the GNU General Public License along with
00023 // Eigen. If not, see <http://www.gnu.org/licenses/>.
00024 
00025 #ifndef EIGEN_PACKET_MATH_ALTIVEC_H
00026 #define EIGEN_PACKET_MATH_ALTIVEC_H
00027 
00028 namespace Eigen {
00029 
00030 namespace internal {
00031 
00032 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
00033 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
00034 #endif
00035 
00036 #ifndef EIGEN_HAS_FUSE_CJMADD
00037 #define EIGEN_HAS_FUSE_CJMADD 1
00038 #endif
00039 
00040 // NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16
00041 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
00042 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
00043 #endif
00044 
00045 typedef __vector float          Packet4f;
00046 typedef __vector int            Packet4i;
00047 typedef __vector unsigned int   Packet4ui;
00048 typedef __vector __bool int     Packet4bi;
00049 typedef __vector short int      Packet8i;
00050 typedef __vector unsigned char  Packet16uc;
00051 
00052 // We don't want to write the same code all the time, but we need to reuse the constants
00053 // and it doesn't really work to declare them global, so we define macros instead
00054 
00055 #define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
00056   Packet4f p4f_##NAME = (Packet4f) vec_splat_s32(X)
00057 
00058 #define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
00059   Packet4i p4i_##NAME = vec_splat_s32(X)
00060 
00061 #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
00062   Packet4f p4f_##NAME = pset1<Packet4f>(X)
00063 
00064 #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
00065   Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int>(X))
00066 
00067 #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
00068   Packet4i p4i_##NAME = pset1<Packet4i>(X)
00069 
00070 #define DST_CHAN 1
00071 #define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))
00072 
00073 // Define global static constants:
00074 static Packet4f p4f_COUNTDOWN = { 3.0, 2.0, 1.0, 0.0 };
00075 static Packet4i p4i_COUNTDOWN = { 3, 2, 1, 0 };
00076 static Packet16uc p16uc_REVERSE = {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3};
00077 static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0);
00078 static Packet16uc p16uc_DUPLICATE = {0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7};
00079 
00080 static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0);
00081 static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0);
00082 static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1);
00083 static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16);
00084 static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1);
00085 static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0);
00086 static Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1);
00087 
00088 template<> struct packet_traits<float>  : default_packet_traits
00089 {
00090   typedef Packet4f type;
00091   enum {
00092     Vectorizable = 1,
00093     AlignedOnScalar = 1,
00094     size=4,
00095 
00096     // FIXME check the Has*
00097     HasSin  = 0,
00098     HasCos  = 0,
00099     HasLog  = 0,
00100     HasExp  = 0,
00101     HasSqrt = 0
00102   };
00103 };
00104 template<> struct packet_traits<int>    : default_packet_traits
00105 {
00106   typedef Packet4i type;
00107   enum {
00108     // FIXME check the Has*
00109     Vectorizable = 1,
00110     AlignedOnScalar = 1,
00111     size=4
00112   };
00113 };
00114 
00115 template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; };
00116 template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4}; };
00117 /*
00118 inline std::ostream & operator <<(std::ostream & s, const Packet4f & v)
00119 {
00120   union {
00121     Packet4f   v;
00122     float n[4];
00123   } vt;
00124   vt.v = v;
00125   s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
00126   return s;
00127 }
00128 
00129 inline std::ostream & operator <<(std::ostream & s, const Packet4i & v)
00130 {
00131   union {
00132     Packet4i   v;
00133     int n[4];
00134   } vt;
00135   vt.v = v;
00136   s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
00137   return s;
00138 }
00139 
00140 inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v)
00141 {
00142   union {
00143     Packet4ui   v;
00144     unsigned int n[4];
00145   } vt;
00146   vt.v = v;
00147   s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
00148   return s;
00149 }
00150 
00151 inline std::ostream & operator <<(std::ostream & s, const Packetbi & v)
00152 {
00153   union {
00154     Packet4bi v;
00155     unsigned int n[4];
00156   } vt;
00157   vt.v = v;
00158   s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
00159   return s;
00160 }
00161 */
00162 template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) {
00163   // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
00164   float EIGEN_ALIGN16 af[4];
00165   af[0] = from;
00166   Packet4f vc = vec_ld(0, af);
00167   vc = vec_splat(vc, 0);
00168   return vc;
00169 }
00170 
00171 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)   {
00172   int EIGEN_ALIGN16 ai[4];
00173   ai[0] = from;
00174   Packet4i vc = vec_ld(0, ai);
00175   vc = vec_splat(vc, 0);
00176   return vc;
00177 }
00178 
00179 template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return vec_add(pset1<Packet4f>(a), p4f_COUNTDOWN); }
00180 template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a)     { return vec_add(pset1<Packet4i>(a), p4i_COUNTDOWN); }
00181 
00182 template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_add(a,b); }
00183 template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_add(a,b); }
00184 
00185 template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_sub(a,b); }
00186 template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_sub(a,b); }
00187 
00188 template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return psub<Packet4f>(p4f_ZERO, a); }
00189 template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return psub<Packet4i>(p4i_ZERO, a); }
00190 
00191 template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b,p4f_ZERO); }
00192 /* Commented out: it's actually slower than processing it scalar
00193  *
00194 template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b)
00195 {
00196   // Detailed in: http://freevec.org/content/32bit_signed_integer_multiplication_altivec
00197   //Set up constants, variables
00198   Packet4i a1, b1, bswap, low_prod, high_prod, prod, prod_, v1sel;
00199 
00200   // Get the absolute values
00201   a1  = vec_abs(a);
00202   b1  = vec_abs(b);
00203 
00204   // Get the signs using xor
00205   Packet4bi sgn = (Packet4bi) vec_cmplt(vec_xor(a, b), p4i_ZERO);
00206 
00207   // Do the multiplication for the asbolute values.
00208   bswap = (Packet4i) vec_rl((Packet4ui) b1, (Packet4ui) p4i_MINUS16 );
00209   low_prod = vec_mulo((Packet8i) a1, (Packet8i)b1);
00210   high_prod = vec_msum((Packet8i) a1, (Packet8i) bswap, p4i_ZERO);
00211   high_prod = (Packet4i) vec_sl((Packet4ui) high_prod, (Packet4ui) p4i_MINUS16);
00212   prod = vec_add( low_prod, high_prod );
00213 
00214   // NOR the product and select only the negative elements according to the sign mask
00215   prod_ = vec_nor(prod, prod);
00216   prod_ = vec_sel(p4i_ZERO, prod_, sgn);
00217 
00218   // Add 1 to the result to get the negative numbers
00219   v1sel = vec_sel(p4i_ZERO, p4i_ONE, sgn);
00220   prod_ = vec_add(prod_, v1sel);
00221 
00222   // Merge the results back to the final vector.
00223   prod = vec_sel(prod, prod_, sgn);
00224 
00225   return prod;
00226 }
00227 */
00228 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
00229 {
00230   Packet4f t, y_0, y_1, res;
00231 
00232   // Altivec does not offer a divide instruction, we have to do a reciprocal approximation
00233   y_0 = vec_re(b);
00234 
00235   // Do one Newton-Raphson iteration to get the needed accuracy
00236   t   = vec_nmsub(y_0, b, p4f_ONE);
00237   y_1 = vec_madd(y_0, t, y_0);
00238 
00239   res = vec_madd(a, y_1, p4f_ZERO);
00240   return res;
00241 }
00242 
00243 template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
00244 { eigen_assert(false && "packet integer division are not supported by AltiVec");
00245   return pset1<Packet4i>(0);
00246 }
00247 
00248 // for some weird raisons, it has to be overloaded for packet of integers
00249 template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a, b, c); }
00250 template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
00251 
00252 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); }
00253 template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
00254 
00255 template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_max(a, b); }
00256 template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
00257 
00258 // Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
00259 template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
00260 template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
00261 
00262 template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_or(a, b); }
00263 template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
00264 
00265 template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); }
00266 template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
00267 
00268 template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
00269 template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); }
00270 
00271 template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
00272 template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
00273 
00274 template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
00275 {
00276   EIGEN_DEBUG_ALIGNED_LOAD
00277   // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
00278   Packet16uc MSQ, LSQ;
00279   Packet16uc mask;
00280   MSQ = vec_ld(0, (unsigned char *)from);          // most significant quadword
00281   LSQ = vec_ld(15, (unsigned char *)from);         // least significant quadword
00282   mask = vec_lvsl(0, from);                        // create the permute mask
00283   return (Packet4f) vec_perm(MSQ, LSQ, mask);           // align the data
00284 
00285 }
00286 template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
00287 {
00288   EIGEN_DEBUG_ALIGNED_LOAD
00289   // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
00290   Packet16uc MSQ, LSQ;
00291   Packet16uc mask;
00292   MSQ = vec_ld(0, (unsigned char *)from);          // most significant quadword
00293   LSQ = vec_ld(15, (unsigned char *)from);         // least significant quadword
00294   mask = vec_lvsl(0, from);                        // create the permute mask
00295   return (Packet4i) vec_perm(MSQ, LSQ, mask);    // align the data
00296 }
00297 
00298 template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*   from)
00299 {
00300   Packet4f p;
00301   if((ptrdiff_t(&from) % 16) == 0)  p = pload<Packet4f>(from);
00302   else                              p = ploadu<Packet4f>(from);
00303   return vec_perm(p, p, p16uc_DUPLICATE);
00304 }
00305 template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
00306 {
00307   Packet4i p;
00308   if((ptrdiff_t(&from) % 16) == 0)  p = pload<Packet4i>(from);
00309   else                              p = ploadu<Packet4i>(from);
00310   return vec_perm(p, p, p16uc_DUPLICATE);
00311 }
00312 
00313 template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }
00314 template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }
00315 
00316 template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const Packet4f& from)
00317 {
00318   EIGEN_DEBUG_UNALIGNED_STORE
00319   // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
00320   // Warning: not thread safe!
00321   Packet16uc MSQ, LSQ, edges;
00322   Packet16uc edgeAlign, align;
00323 
00324   MSQ = vec_ld(0, (unsigned char *)to);                     // most significant quadword
00325   LSQ = vec_ld(15, (unsigned char *)to);                    // least significant quadword
00326   edgeAlign = vec_lvsl(0, to);                              // permute map to extract edges
00327   edges=vec_perm(LSQ,MSQ,edgeAlign);                        // extract the edges
00328   align = vec_lvsr( 0, to );                                // permute map to misalign data
00329   MSQ = vec_perm(edges,(Packet16uc)from,align);             // misalign the data (MSQ)
00330   LSQ = vec_perm((Packet16uc)from,edges,align);             // misalign the data (LSQ)
00331   vec_st( LSQ, 15, (unsigned char *)to );                   // Store the LSQ part first
00332   vec_st( MSQ, 0, (unsigned char *)to );                    // Store the MSQ part
00333 }
00334 template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*      to, const Packet4i& from)
00335 {
00336   EIGEN_DEBUG_UNALIGNED_STORE
00337   // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
00338   // Warning: not thread safe!
00339   Packet16uc MSQ, LSQ, edges;
00340   Packet16uc edgeAlign, align;
00341 
00342   MSQ = vec_ld(0, (unsigned char *)to);                     // most significant quadword
00343   LSQ = vec_ld(15, (unsigned char *)to);                    // least significant quadword
00344   edgeAlign = vec_lvsl(0, to);                              // permute map to extract edges
00345   edges=vec_perm(LSQ, MSQ, edgeAlign);                      // extract the edges
00346   align = vec_lvsr( 0, to );                                // permute map to misalign data
00347   MSQ = vec_perm(edges, (Packet16uc) from, align);          // misalign the data (MSQ)
00348   LSQ = vec_perm((Packet16uc) from, edges, align);          // misalign the data (LSQ)
00349   vec_st( LSQ, 15, (unsigned char *)to );                   // Store the LSQ part first
00350   vec_st( MSQ, 0, (unsigned char *)to );                    // Store the MSQ part
00351 }
00352 
00353 template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); }
00354 template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*     addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); }
00355 
00356 template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }
00357 template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int   EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }
00358 
00359 template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { return (Packet4f)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE); }
00360 template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return (Packet4i)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE); }
00361 
00362 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); }
00363 template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); }
00364 
00365 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
00366 {
00367   Packet4f b, sum;
00368   b   = (Packet4f) vec_sld(a, a, 8);
00369   sum = vec_add(a, b);
00370   b   = (Packet4f) vec_sld(sum, sum, 4);
00371   sum = vec_add(sum, b);
00372   return pfirst(sum);
00373 }
00374 
00375 template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
00376 {
00377   Packet4f v[4], sum[4];
00378 
00379   // It's easier and faster to transpose then add as columns
00380   // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
00381   // Do the transpose, first set of moves
00382   v[0] = vec_mergeh(vecs[0], vecs[2]);
00383   v[1] = vec_mergel(vecs[0], vecs[2]);
00384   v[2] = vec_mergeh(vecs[1], vecs[3]);
00385   v[3] = vec_mergel(vecs[1], vecs[3]);
00386   // Get the resulting vectors
00387   sum[0] = vec_mergeh(v[0], v[2]);
00388   sum[1] = vec_mergel(v[0], v[2]);
00389   sum[2] = vec_mergeh(v[1], v[3]);
00390   sum[3] = vec_mergel(v[1], v[3]);
00391 
00392   // Now do the summation:
00393   // Lines 0+1
00394   sum[0] = vec_add(sum[0], sum[1]);
00395   // Lines 2+3
00396   sum[1] = vec_add(sum[2], sum[3]);
00397   // Add the results
00398   sum[0] = vec_add(sum[0], sum[1]);
00399 
00400   return sum[0];
00401 }
00402 
00403 template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
00404 {
00405   Packet4i sum;
00406   sum = vec_sums(a, p4i_ZERO);
00407   sum = vec_sld(sum, p4i_ZERO, 12);
00408   return pfirst(sum);
00409 }
00410 
00411 template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
00412 {
00413   Packet4i v[4], sum[4];
00414 
00415   // It's easier and faster to transpose then add as columns
00416   // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
00417   // Do the transpose, first set of moves
00418   v[0] = vec_mergeh(vecs[0], vecs[2]);
00419   v[1] = vec_mergel(vecs[0], vecs[2]);
00420   v[2] = vec_mergeh(vecs[1], vecs[3]);
00421   v[3] = vec_mergel(vecs[1], vecs[3]);
00422   // Get the resulting vectors
00423   sum[0] = vec_mergeh(v[0], v[2]);
00424   sum[1] = vec_mergel(v[0], v[2]);
00425   sum[2] = vec_mergeh(v[1], v[3]);
00426   sum[3] = vec_mergel(v[1], v[3]);
00427 
00428   // Now do the summation:
00429   // Lines 0+1
00430   sum[0] = vec_add(sum[0], sum[1]);
00431   // Lines 2+3
00432   sum[1] = vec_add(sum[2], sum[3]);
00433   // Add the results
00434   sum[0] = vec_add(sum[0], sum[1]);
00435 
00436   return sum[0];
00437 }
00438 
00439 // Other reduction functions:
00440 // mul
00441 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
00442 {
00443   Packet4f prod;
00444   prod = pmul(a, (Packet4f)vec_sld(a, a, 8));
00445   return pfirst(pmul(prod, (Packet4f)vec_sld(prod, prod, 4)));
00446 }
00447 
00448 template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
00449 {
00450   EIGEN_ALIGN16 int aux[4];
00451   pstore(aux, a);
00452   return aux[0] * aux[1] * aux[2] * aux[3];
00453 }
00454 
00455 // min
00456 template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
00457 {
00458   Packet4f b, res;
00459   b = vec_min(a, vec_sld(a, a, 8));
00460   res = vec_min(b, vec_sld(b, b, 4));
00461   return pfirst(res);
00462 }
00463 
00464 template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
00465 {
00466   Packet4i b, res;
00467   b = vec_min(a, vec_sld(a, a, 8));
00468   res = vec_min(b, vec_sld(b, b, 4));
00469   return pfirst(res);
00470 }
00471 
00472 // max
00473 template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
00474 {
00475   Packet4f b, res;
00476   b = vec_max(a, vec_sld(a, a, 8));
00477   res = vec_max(b, vec_sld(b, b, 4));
00478   return pfirst(res);
00479 }
00480 
00481 template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
00482 {
00483   Packet4i b, res;
00484   b = vec_max(a, vec_sld(a, a, 8));
00485   res = vec_max(b, vec_sld(b, b, 4));
00486   return pfirst(res);
00487 }
00488 
00489 template<int Offset>
00490 struct palign_impl<Offset,Packet4f>
00491 {
00492   static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
00493   {
00494     if (Offset!=0)
00495       first = vec_sld(first, second, Offset*4);
00496   }
00497 };
00498 
00499 template<int Offset>
00500 struct palign_impl<Offset,Packet4i>
00501 {
00502   static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
00503   {
00504     if (Offset!=0)
00505       first = vec_sld(first, second, Offset*4);
00506   }
00507 };
00508 
00509 } // end namespace internal
00510 
00511 } // end namespace Eigen
00512 
00513 #endif // EIGEN_PACKET_MATH_ALTIVEC_H