HMSBEAGLE  1.0.0
libhmsbeagle/CPU/BeagleCPUSSEImpl.hpp
00001 /*
00002  *  BeagleCPUSSEImpl.hpp
00003  *  BEAGLE
00004  *
00005  * Copyright 2010 Phylogenetic Likelihood Working Group
00006  *
00007  * This file is part of BEAGLE.
00008  *
00009  * BEAGLE is free software: you can redistribute it and/or modify
00010  * it under the terms of the GNU Lesser General Public License as
00011  * published by the Free Software Foundation, either version 3 of
00012  * the License, or (at your option) any later version.
00013  *
00014  * BEAGLE is distributed in the hope that it will be useful,
00015  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00016  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017  * GNU Lesser General Public License for more details.
00018  *
00019  * You should have received a copy of the GNU Lesser General Public
00020  * License along with BEAGLE.  If not, see
00021  * <http://www.gnu.org/licenses/>.
00022  *
00023  * @author Marc Suchard
00024  * @author David Swofford
00025  */
00026 
00027 #ifndef BEAGLE_CPU_SSE_IMPL_HPP
00028 #define BEAGLE_CPU_SSE_IMPL_HPP
00029 
00030 
00031 #ifdef HAVE_CONFIG_H
00032 #include "libhmsbeagle/config.h"
00033 #endif
00034 
00035 #include <cstdio>
00036 #include <cstdlib>
00037 #include <iostream>
00038 #include <cstring>
00039 #include <cmath>
00040 #include <cassert>
00041 
00042 #include "libhmsbeagle/beagle.h"
00043 #include "libhmsbeagle/CPU/BeagleCPUImpl.h"
00044 #include "libhmsbeagle/CPU/BeagleCPUSSEImpl.h"
00045 #include "libhmsbeagle/CPU/SSEDefinitions.h"
00046 
00047 namespace beagle {
00048 namespace cpu {
00049 
00050 BEAGLE_CPU_FACTORY_TEMPLATE
00051 inline const char* getBeagleCPUSSEName(){ return "CPU-SSE-Unknown"; };
00052 
00053 template<>
00054 inline const char* getBeagleCPUSSEName<double>(){ return "CPU-SSE-Double"; };
00055 
00056 template<>
00057 inline const char* getBeagleCPUSSEName<float>(){ return "CPU-SSE-Single"; };
00058 
00059 /*
00060  * Calculates partial likelihoods at a node when both children have states.
00061  */
00062 
00063 BEAGLE_CPU_SSE_TEMPLATE
00064 void BeagleCPUSSEImpl<BEAGLE_CPU_SSE_DOUBLE>::calcStatesStates(double* destP,
00065                                      const int* states_q,
00066                                      const double* matrices_q,
00067                                      const int* states_r,
00068                                      const double* matrices_r) {
00069 
00070         BeagleCPUImpl<BEAGLE_CPU_SSE_DOUBLE>::calcStatesStates(destP,
00071                                      states_q,
00072                                      matrices_q,
00073                                      states_r,
00074                                      matrices_r);
00075 }
00076 
00077 
00078 //template <>
00079 //void BeagleCPUSSEImpl<double>::calcStatesStates(double* destP,
00080 //                                     const int* states_q,
00081 //                                     const double* matrices_q,
00082 //                                     const int* states_r,
00083 //                                     const double* matrices_r) {
00084 //
00085 //      VecUnion vu_mq[OFFSET][2], vu_mr[OFFSET][2];
00086 //
00087 //    int w = 0;
00088 //      V_Real *destPvec = (V_Real *)destP;
00089 //
00090 //    for (int l = 0; l < kCategoryCount; l++) {
00091 //
00092 //      SSE_PREFETCH_MATRICES(matrices_q + w, matrices_r + w, vu_mq, vu_mr);
00093 //
00094 //        for (int k = 0; k < kPatternCount; k++) {
00095 //
00096 //            const int state_q = states_q[k];
00097 //            const int state_r = states_r[k];
00098 //
00099 //            *destPvec++ = VEC_MULT(vu_mq[state_q][0].vx, vu_mr[state_r][0].vx);
00100 //            *destPvec++ = VEC_MULT(vu_mq[state_q][1].vx, vu_mr[state_r][1].vx);
00101 //
00102 //        }
00103 //
00104 //        w += OFFSET*4;
00105 //        if (kExtraPatterns)
00106 //              destPvec += kExtraPatterns * 2;
00107 //    }
00108 //}
00109 
00110 /*
00111  * Calculates partial likelihoods at a node when one child has states and one has partials.
00112    SSE version
00113  */
00114 BEAGLE_CPU_SSE_TEMPLATE
00115 void BeagleCPUSSEImpl<BEAGLE_CPU_SSE_DOUBLE>::calcStatesPartials(double* destP,
00116                                        const int* states_q,
00117                                        const double* matrices_q,
00118                                        const double* partials_r,
00119                                        const double* matrices_r) {
00120         BeagleCPUImpl<BEAGLE_CPU_SSE_DOUBLE>::calcStatesPartials(
00121                                                                            destP,
00122                                                                            states_q,
00123                                                                            matrices_q,
00124                                                                            partials_r,
00125                                                                            matrices_r);
00126 }
00127 
00128 //
00129 //
00130 //template <>
00131 //void BeagleCPUSSEImpl<double>::calcStatesPartials(double* destP,
00132 //                                       const int* states_q,
00133 //                                       const double* matrices_q,
00134 //                                       const double* partials_r,
00135 //                                       const double* matrices_r) {
00136 //
00137 //    int v = 0;
00138 //    int w = 0;
00139 //
00140 //      VecUnion vu_mq[OFFSET][2], vu_mr[OFFSET][2];
00141 //      V_Real *destPvec = (V_Real *)destP;
00142 //      V_Real destr_01, destr_23;
00143 //
00144 //    for (int l = 0; l < kCategoryCount; l++) {
00145 //
00146 //      SSE_PREFETCH_MATRICES(matrices_q + w, matrices_r + w, vu_mq, vu_mr);
00147 //
00148 //        for (int k = 0; k < kPatternCount; k++) {
00149 //
00150 //            const int state_q = states_q[k];
00151 //            V_Real vp0, vp1, vp2, vp3;
00152 //            SSE_PREFETCH_PARTIALS(vp,partials_r,v);
00153 //
00154 //                      destr_01 = VEC_MULT(vp0, vu_mr[0][0].vx);
00155 //                      destr_01 = VEC_MADD(vp1, vu_mr[1][0].vx, destr_01);
00156 //                      destr_01 = VEC_MADD(vp2, vu_mr[2][0].vx, destr_01);
00157 //                      destr_01 = VEC_MADD(vp3, vu_mr[3][0].vx, destr_01);
00158 //                      destr_23 = VEC_MULT(vp0, vu_mr[0][1].vx);
00159 //                      destr_23 = VEC_MADD(vp1, vu_mr[1][1].vx, destr_23);
00160 //                      destr_23 = VEC_MADD(vp2, vu_mr[2][1].vx, destr_23);
00161 //                      destr_23 = VEC_MADD(vp3, vu_mr[3][1].vx, destr_23);
00162 //
00163 //            *destPvec++ = VEC_MULT(vu_mq[state_q][0].vx, destr_01);
00164 //            *destPvec++ = VEC_MULT(vu_mq[state_q][1].vx, destr_23);
00165 //
00166 //            v += 4;
00167 //        }
00168 //        w += OFFSET*4;
00169 //        if (kExtraPatterns) {
00170 //              destPvec += kExtraPatterns * 2;
00171 //              v += kExtraPatterns * 4;
00172 //        }
00173 //    }
00174 //}
00175 
00176 //template<>
00177 //void inline BeagleCPUSSEImpl<double>::innerPartialsPartals(
00178 //              const double* __restrict partials1,
00179 //              const double* __restrict matrices1,
00180 //              const double* __restrict partials2,
00181 //              const double* __restrict matrices2,
00182 //              V_Real& sum1_vec,
00183 //              V_Real& sum2_vec,
00184 //              V_Real& out,
00185 //              int& v,
00186 //              int& w) {
00187 //      int j = 0;
00188 //      sum1_vec = VEC_SETZERO();
00189 //      sum2_vec = VEC_SETZERO();
00190 //      for (; j < kStateCountMinusOne; j += 2) {
00191 //              sum1_vec = VEC_MADD(
00192 //                              VEC_LOAD(matrices1 + w + j), // TODO This only works if w is even
00193 //                              VEC_LOAD(partials1 + v + j), // TODO This only works if v is even
00194 //                              sum1_vec);
00195 //              sum2_vec = VEC_MADD(
00196 //                              VEC_LOAD(matrices2 + w + j),
00197 //                              VEC_LOAD(partials2 + v + j),
00198 //                              sum2_vec);
00199 //      }
00200 //
00201 //      out = VEC_MULT(
00202 //                      VEC_ADD(sum1_vec, VEC_SWAP(sum1_vec)),
00203 //                      VEC_ADD(sum2_vec, VEC_SWAP(sum2_vec))
00204 //      );
00205 //}
00206 
00207 //#define DOUBLE_UNROLL // Does not appear to save any time
00208 
00209 BEAGLE_CPU_SSE_TEMPLATE
00210 void BeagleCPUSSEImpl<BEAGLE_CPU_SSE_DOUBLE>::calcPartialsPartials(double* __restrict destP,
00211                                               const double* __restrict partials1,
00212                                               const double* __restrict matrices1,
00213                                               const double* __restrict partials2,
00214                                               const double* __restrict matrices2) {
00215     int stateCountMinusOne = kPartialsPaddedStateCount - 1;
00216 #pragma omp parallel for num_threads(kCategoryCount)
00217     for (int l = 0; l < kCategoryCount; l++) {
00218         double* destPu = destP + l*kPartialsPaddedStateCount*kPatternCount;
00219         int v = l*kPartialsPaddedStateCount*kPatternCount;
00220         for (int k = 0; k < kPatternCount; k++) {
00221             int w = l * kMatrixSize;
00222             for (int i = 0; i < kStateCount;
00223 #ifdef DOUBLE_UNROLL
00224                         i += 2 // TODO This only works if stateCount is even
00225 #else
00226                         ++i
00227 #endif
00228             ) {
00229                 register V_Real sum1_vecA = VEC_SETZERO();
00230                 register V_Real sum2_vecA = VEC_SETZERO();
00231                 for (int j = 0; j < stateCountMinusOne; j += 2) {
00232                         sum1_vecA = VEC_MADD(
00233                                                                  VEC_LOAD(matrices1 + w + j),  // TODO This only works if w is even
00234                                                                  VEC_LOAD(partials1 + v + j),  // TODO This only works if v is even
00235                                                                  sum1_vecA);
00236                         sum2_vecA = VEC_MADD(
00237                                                                  VEC_LOAD(matrices2 + w + j),
00238                                                                  VEC_LOAD(partials2 + v + j),
00239                                                                  sum2_vecA);
00240                 }
00241 
00242                 sum1_vecA = VEC_MULT(
00243                                VEC_ADD(sum1_vecA, VEC_SWAP(sum1_vecA)),
00244                                VEC_ADD(sum2_vecA, VEC_SWAP(sum2_vecA))
00245                            );
00246 
00247                 // increment for the extra column at the end
00248                 w += kStateCount + T_PAD;
00249 
00250 #ifndef DOUBLE_UNROLL
00251                 // Store single value
00252                 VEC_STORE_SCALAR(destPu, sum1_vecA);
00253                 destPu++;
00254 #endif
00255 
00256 #ifdef DOUBLE_UNROLL
00257                 register V_Real sum1_vecB = VEC_SETZERO();
00258                 register V_Real sum2_vecB = VEC_SETZERO();
00259                 for (int j = 0; j < stateCountMinusOne; j += 2) {
00260                         sum1_vecB = VEC_MADD(
00261                                                                  VEC_LOAD(matrices1 + w + j),  // TODO This only works if w is even
00262                                                                  VEC_LOAD(partials1 + v + j),  // TODO This only works if v is even
00263                                                                  sum1_vecB);
00264                         sum2_vecB = VEC_MADD(
00265                                                                  VEC_LOAD(matrices2 + w + j),
00266                                                                  VEC_LOAD(partials2 + v + j),
00267                                                                  sum2_vecB);
00268                 }
00269 
00270                 sum1_vecB = VEC_MULT(
00271                                VEC_ADD(sum1_vecB, VEC_SWAP(sum1_vecB)),
00272                                VEC_ADD(sum2_vecB, VEC_SWAP(sum2_vecB))
00273                            );
00274 
00275                 // increment for the extra column at the end
00276                 w += kStateCount + T_PAD;
00277 
00278                 // Store both partials in one transaction
00279                 VEC_STORE(destPu, VEC_MOVE(sum1_vecA, sum1_vecB));
00280                 destPu += 2;
00281 #endif
00282 
00283             }
00284             destPu += P_PAD;
00285             v += kPartialsPaddedStateCount;
00286         }
00287     }
00288 }
00289 
00290 BEAGLE_CPU_SSE_TEMPLATE
00291 void BeagleCPUSSEImpl<BEAGLE_CPU_SSE_DOUBLE>::calcPartialsPartialsFixedScaling(
00292                                                                                                         double* __restrict destP,
00293                                               const double* __restrict partials1,
00294                                               const double* __restrict matrices1,
00295                                               const double* __restrict partials2,
00296                                               const double* __restrict matrices2,
00297                                               const double* __restrict scaleFactors) {
00298     int stateCountMinusOne = kPartialsPaddedStateCount - 1;
00299 #pragma omp parallel for num_threads(kCategoryCount)
00300     for (int l = 0; l < kCategoryCount; l++) {
00301         double* destPu = destP + l*kPartialsPaddedStateCount*kPatternCount;
00302         int v = l*kPartialsPaddedStateCount*kPatternCount;
00303         for (int k = 0; k < kPatternCount; k++) {
00304             int w = l * kMatrixSize;
00305             const V_Real scalar = VEC_SPLAT(scaleFactors[k]);
00306             for (int i = 0; i < kStateCount; i++) {
00307 
00308                 register V_Real sum1_vec;
00309                 register V_Real sum2_vec;
00310 
00311                 int j = 0;
00312                 sum1_vec = VEC_SETZERO();
00313                 sum2_vec = VEC_SETZERO();
00314                 for ( ; j < stateCountMinusOne; j += 2) {
00315                         sum1_vec = VEC_MADD(
00316                                                                  VEC_LOAD(matrices1 + w + j),  // TODO This only works if w is even
00317                                                                  VEC_LOAD(partials1 + v + j),  // TODO This only works if v is even
00318                                                                  sum1_vec);
00319                         sum2_vec = VEC_MADD(
00320                                                                  VEC_LOAD(matrices2 + w + j),
00321                                                                  VEC_LOAD(partials2 + v + j),
00322                                                                  sum2_vec);
00323                 }
00324                 VEC_STORE_SCALAR(destPu,
00325                                 VEC_DIV(VEC_MULT(
00326                                                 VEC_ADD(sum1_vec, VEC_SWAP(sum1_vec)),
00327                                                 VEC_ADD(sum2_vec, VEC_SWAP(sum2_vec))
00328                                 ), scalar));
00329 
00330 
00331                 // increment for the extra column at the end
00332                 w += kStateCount + T_PAD;
00333 
00334                 destPu++;
00335             }
00336             destPu += P_PAD;
00337             v += kPartialsPaddedStateCount;
00338         }
00339     }
00340 }
00341 
00342     
00343 BEAGLE_CPU_SSE_TEMPLATE
00344 void BeagleCPUSSEImpl<BEAGLE_CPU_SSE_DOUBLE>::calcPartialsPartialsAutoScaling(double* destP,
00345                                                          const double*  partials_q,
00346                                                          const double*  matrices_q,
00347                                                          const double*  partials_r,
00348                                                          const double*  matrices_r,
00349                                                                   int* activateScaling) {
00350     BeagleCPUImpl<BEAGLE_CPU_SSE_DOUBLE>::calcPartialsPartialsAutoScaling(destP,
00351                                                      partials_q,
00352                                                      matrices_q,
00353                                                      partials_r,
00354                                                      matrices_r,
00355                                                      activateScaling);
00356 }
00357 
00358 //template <>
00359 //void BeagleCPUSSEImpl<double>::calcPartialsPartialsAutoScaling(double* destP,
00360 //                                                                    const double*  partials_q,
00361 //                                                                    const double*  matrices_q,
00362 //                                                                    const double*  partials_r,
00363 //                                                                    const double*  matrices_r,
00364 //                                                                    int* activateScaling) {
00365 //    // TODO: implement calcPartialsPartialsAutoScaling with SSE
00366 //    BeagleCPUImpl<double>::calcPartialsPartialsAutoScaling(destP,
00367 //                                                                partials_q,
00368 //                                                                matrices_q,
00369 //                                                                partials_r,
00370 //                                                                matrices_r,
00371 //                                                                activateScaling);
00372 //}
00373     
00374 BEAGLE_CPU_SSE_TEMPLATE
00375 int BeagleCPUSSEImpl<BEAGLE_CPU_SSE_DOUBLE>::calcEdgeLogLikelihoods(const int parIndex,
00376                                                            const int childIndex,
00377                                                            const int probIndex,
00378                                                            const int categoryWeightsIndex,
00379                                                            const int stateFrequenciesIndex,
00380                                                            const int scalingFactorsIndex,
00381                                                            double* outSumLogLikelihood) {
00382 return BeagleCPUImpl<BEAGLE_CPU_SSE_DOUBLE>::calcEdgeLogLikelihoods(
00383                                                     parIndex,
00384                                                     childIndex,
00385                                                     probIndex,
00386                                                     categoryWeightsIndex,
00387                                                     stateFrequenciesIndex,
00388                                                     scalingFactorsIndex,
00389                                                     outSumLogLikelihood);
00390 }
00391 
00392 //template <>
00393 //    int BeagleCPUSSEImpl<double>::calcEdgeLogLikelihoods(const int parIndex,
00394 //                                                                const int childIndex,
00395 //                                                                const int probIndex,
00396 //                                                                const int categoryWeightsIndex,
00397 //                                                                const int stateFrequenciesIndex,
00398 //                                                                const int scalingFactorsIndex,
00399 //                                                                double* outSumLogLikelihood) {
00400 //    // TODO: implement derivatives for calculateEdgeLnL
00401 //
00402 //    int returnCode = BEAGLE_SUCCESS;
00403 //
00404 //    assert(parIndex >= kTipCount);
00405 //
00406 //    const double* cl_r = gPartials[parIndex];
00407 //    double* cl_p = integrationTmp;
00408 //    const double* transMatrix = gTransitionMatrices[probIndex];
00409 //    const double* wt = gCategoryWeights[categoryWeightsIndex];
00410 //    const double* freqs = gStateFrequencies[stateFrequenciesIndex];
00411 //
00412 //    memset(cl_p, 0, (kPatternCount * kStateCount)*sizeof(double));
00413 //
00414 //    if (childIndex < kTipCount && gTipStates[childIndex]) { // Integrate against a state at the child
00415 //
00416 //        const int* statesChild = gTipStates[childIndex];
00417 //
00418 //              int w = 0;
00419 //              V_Real *vcl_r = (V_Real *)cl_r;
00420 //              for(int l = 0; l < kCategoryCount; l++) {
00421 //
00422 //                      VecUnion vu_m[OFFSET][2];
00423 //                      SSE_PREFETCH_MATRIX(transMatrix + w, vu_m)
00424 //
00425 //           V_Real *vcl_p = (V_Real *)cl_p;
00426 //
00427 //           for(int k = 0; k < kPatternCount; k++) {
00428 //
00429 //                const int stateChild = statesChild[k];
00430 //                              V_Real vwt = VEC_SPLAT(wt[l]);
00431 //
00432 //                              V_Real wtdPartials = VEC_MULT(*vcl_r++, vwt);
00433 //                *vcl_p++ = VEC_MADD(vu_m[stateChild][0].vx, wtdPartials, *vcl_p);
00434 //
00435 //                              wtdPartials = VEC_MULT(*vcl_r++, vwt);
00436 //                *vcl_p++ = VEC_MADD(vu_m[stateChild][1].vx, wtdPartials, *vcl_p);
00437 //            }
00438 //           w += OFFSET*4;
00439 //           vcl_r += 2 * kExtraPatterns;
00440 //        }
00441 //    } else { // Integrate against a partial at the child
00442 //
00443 //        const double* cl_q = gPartials[childIndex];
00444 //        V_Real * vcl_r = (V_Real *)cl_r;
00445 //        int v = 0;
00446 //        int w = 0;
00447 //
00448 //        for(int l = 0; l < kCategoryCount; l++) {
00449 //
00450 //              V_Real * vcl_p = (V_Real *)cl_p;
00451 //
00452 //                      VecUnion vu_m[OFFSET][2];
00453 //                      SSE_PREFETCH_MATRIX(transMatrix + w, vu_m)
00454 //
00455 //            for(int k = 0; k < kPatternCount; k++) {
00456 //                V_Real vclp_01, vclp_23;
00457 //                              V_Real vwt = VEC_SPLAT(wt[l]);
00458 //
00459 //                              V_Real vcl_q0, vcl_q1, vcl_q2, vcl_q3;
00460 //                              SSE_PREFETCH_PARTIALS(vcl_q,cl_q,v);
00461 //
00462 //                              vclp_01 = VEC_MULT(vcl_q0, vu_m[0][0].vx);
00463 //                              vclp_01 = VEC_MADD(vcl_q1, vu_m[1][0].vx, vclp_01);
00464 //                              vclp_01 = VEC_MADD(vcl_q2, vu_m[2][0].vx, vclp_01);
00465 //                              vclp_01 = VEC_MADD(vcl_q3, vu_m[3][0].vx, vclp_01);
00466 //                              vclp_23 = VEC_MULT(vcl_q0, vu_m[0][1].vx);
00467 //                              vclp_23 = VEC_MADD(vcl_q1, vu_m[1][1].vx, vclp_23);
00468 //                              vclp_23 = VEC_MADD(vcl_q2, vu_m[2][1].vx, vclp_23);
00469 //                              vclp_23 = VEC_MADD(vcl_q3, vu_m[3][1].vx, vclp_23);
00470 //                              vclp_01 = VEC_MULT(vclp_01, vwt);
00471 //                              vclp_23 = VEC_MULT(vclp_23, vwt);
00472 //
00473 //                              *vcl_p++ = VEC_MADD(vclp_01, *vcl_r++, *vcl_p);
00474 //                              *vcl_p++ = VEC_MADD(vclp_23, *vcl_r++, *vcl_p);
00475 //
00476 //                v += 4;
00477 //            }
00478 //            w += 4*OFFSET;
00479 //            if (kExtraPatterns) {
00480 //              vcl_r += 2 * kExtraPatterns;
00481 //              v += 4 * kExtraPatterns;
00482 //            }
00483 //
00484 //        }
00485 //    }
00486 //
00487 //    int u = 0;
00488 //    for(int k = 0; k < kPatternCount; k++) {
00489 //        double sumOverI = 0.0;
00490 //        for(int i = 0; i < kStateCount; i++) {
00491 //            sumOverI += freqs[i] * cl_p[u];
00492 //            u++;
00493 //        }
00494 //
00495 //        outLogLikelihoodsTmp[k] = log(sumOverI);
00496 //    }
00497 //
00498 //
00499 //    if (scalingFactorsIndex != BEAGLE_OP_NONE) {
00500 //        const double* scalingFactors = gScaleBuffers[scalingFactorsIndex];
00501 //        for(int k=0; k < kPatternCount; k++)
00502 //            outLogLikelihoodsTmp[k] += scalingFactors[k];
00503 //    }
00504 //
00505 //    *outSumLogLikelihood = 0.0;
00506 //    for (int i = 0; i < kPatternCount; i++) {
00507 //        *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i];
00508 //    }
00509 //    
00510 //    if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0))
00511 //        returnCode = BEAGLE_ERROR_FLOATING_POINT;
00512 //
00513 //    return returnCode;
00514 //}
00515 
00516 BEAGLE_CPU_SSE_TEMPLATE
00517 int BeagleCPUSSEImpl<BEAGLE_CPU_SSE_DOUBLE>::getPaddedPatternsModulus() {
00518         return 1;  // We currently do not vectorize across patterns
00519 }
00520     
00521 BEAGLE_CPU_SSE_TEMPLATE
00522 const char* BeagleCPUSSEImpl<BEAGLE_CPU_SSE_FLOAT>::getName() {
00523         return  getBeagleCPUSSEName<float>();
00524 }
00525 
00526 BEAGLE_CPU_SSE_TEMPLATE
00527 const char* BeagleCPUSSEImpl<BEAGLE_CPU_SSE_DOUBLE>::getName() {
00528     return  getBeagleCPUSSEName<double>();
00529 }
00530     
00531 BEAGLE_CPU_SSE_TEMPLATE
00532 const long BeagleCPUSSEImpl<BEAGLE_CPU_SSE_FLOAT>::getFlags() {
00533         return  BEAGLE_FLAG_COMPUTATION_SYNCH |
00534             BEAGLE_FLAG_THREADING_NONE |
00535             BEAGLE_FLAG_PROCESSOR_CPU |
00536             BEAGLE_FLAG_PRECISION_SINGLE |
00537             BEAGLE_FLAG_VECTOR_SSE;
00538 }
00539 
00540 BEAGLE_CPU_SSE_TEMPLATE
00541 const long BeagleCPUSSEImpl<BEAGLE_CPU_SSE_DOUBLE>::getFlags() {
00542     return  BEAGLE_FLAG_COMPUTATION_SYNCH |
00543             BEAGLE_FLAG_THREADING_NONE |
00544             BEAGLE_FLAG_PROCESSOR_CPU |
00545             BEAGLE_FLAG_PRECISION_DOUBLE |
00546             BEAGLE_FLAG_VECTOR_SSE;
00547 }
00548 
00549 
00551 // BeagleImplFactory public methods
00552 
00553 BEAGLE_CPU_FACTORY_TEMPLATE
00554 BeagleImpl* BeagleCPUSSEImplFactory<BEAGLE_CPU_FACTORY_GENERIC>::createImpl(int tipCount,
00555                                              int partialsBufferCount,
00556                                              int compactBufferCount,
00557                                              int stateCount,
00558                                              int patternCount,
00559                                              int eigenBufferCount,
00560                                              int matrixBufferCount,
00561                                              int categoryCount,
00562                                              int scaleBufferCount,
00563                                              int resourceNumber,
00564                                              long preferenceFlags,
00565                                              long requirementFlags,
00566                                              int* errorCode) {
00567 
00568     if (!CPUSupportsSSE())
00569         return NULL;
00570     
00571         if (stateCount & 1) { // is odd
00572         BeagleCPUSSEImpl<REALTYPE, T_PAD_SSE_ODD, P_PAD_SSE_ODD>* impl =
00573         new BeagleCPUSSEImpl<REALTYPE, T_PAD_SSE_ODD, P_PAD_SSE_ODD>();
00574         
00575         
00576         try {
00577             if (impl->createInstance(tipCount, partialsBufferCount, compactBufferCount, stateCount,
00578                                      patternCount, eigenBufferCount, matrixBufferCount,
00579                                      categoryCount,scaleBufferCount, resourceNumber, preferenceFlags, requirementFlags) == 0)
00580                 return impl;
00581         }
00582         catch(...) {
00583             if (DEBUGGING_OUTPUT)
00584                 std::cerr << "exception in initialize\n";
00585             delete impl;
00586             throw;
00587         }
00588         
00589         delete impl;        
00590         } else {
00591         BeagleCPUSSEImpl<REALTYPE, T_PAD_SSE_EVEN, P_PAD_SSE_EVEN>* impl =
00592                 new BeagleCPUSSEImpl<REALTYPE, T_PAD_SSE_EVEN, P_PAD_SSE_EVEN>();
00593 
00594 
00595         try {
00596             if (impl->createInstance(tipCount, partialsBufferCount, compactBufferCount, stateCount,
00597                                      patternCount, eigenBufferCount, matrixBufferCount,
00598                                      categoryCount,scaleBufferCount, resourceNumber, preferenceFlags, requirementFlags) == 0)
00599                 return impl;
00600         }
00601         catch(...) {
00602             if (DEBUGGING_OUTPUT)
00603                 std::cerr << "exception in initialize\n";
00604             delete impl;
00605             throw;
00606         }
00607 
00608         delete impl;
00609     }
00610     
00611     return NULL;
00612 }
00613 
00614 BEAGLE_CPU_FACTORY_TEMPLATE
00615 const char* BeagleCPUSSEImplFactory<BEAGLE_CPU_FACTORY_GENERIC>::getName() {
00616         return getBeagleCPUSSEName<BEAGLE_CPU_FACTORY_GENERIC>();
00617 }
00618 
00619 template <>
00620 const long BeagleCPUSSEImplFactory<double>::getFlags() {
00621     return BEAGLE_FLAG_COMPUTATION_SYNCH |
00622            BEAGLE_FLAG_SCALING_MANUAL | BEAGLE_FLAG_SCALING_ALWAYS | BEAGLE_FLAG_SCALING_AUTO |
00623            BEAGLE_FLAG_THREADING_NONE |
00624            BEAGLE_FLAG_PROCESSOR_CPU |
00625            BEAGLE_FLAG_VECTOR_SSE |
00626            BEAGLE_FLAG_PRECISION_DOUBLE |
00627            BEAGLE_FLAG_SCALERS_LOG | BEAGLE_FLAG_SCALERS_RAW |
00628            BEAGLE_FLAG_EIGEN_COMPLEX | BEAGLE_FLAG_EIGEN_REAL |
00629            BEAGLE_FLAG_INVEVEC_STANDARD | BEAGLE_FLAG_INVEVEC_TRANSPOSED;
00630 }
00631 
00632 template <>
00633 const long BeagleCPUSSEImplFactory<float>::getFlags() {
00634     return BEAGLE_FLAG_COMPUTATION_SYNCH |
00635            BEAGLE_FLAG_SCALING_MANUAL | BEAGLE_FLAG_SCALING_ALWAYS | BEAGLE_FLAG_SCALING_AUTO |
00636            BEAGLE_FLAG_THREADING_NONE |
00637            BEAGLE_FLAG_PROCESSOR_CPU |
00638            BEAGLE_FLAG_VECTOR_SSE |
00639            BEAGLE_FLAG_PRECISION_SINGLE |
00640            BEAGLE_FLAG_SCALERS_LOG | BEAGLE_FLAG_SCALERS_RAW |
00641            BEAGLE_FLAG_EIGEN_COMPLEX | BEAGLE_FLAG_EIGEN_REAL |
00642            BEAGLE_FLAG_INVEVEC_STANDARD | BEAGLE_FLAG_INVEVEC_TRANSPOSED;
00643 }
00644 
00645 // Code to save:
00646 //      int j = 0;
00667 //      sum1_vec = VEC_SETZERO();
00668 //      sum2_vec = VEC_SETZERO();
00672 // Next snippet
00673 //#if 1
00674 //                VEC_STORE_SCALAR(destP + u,
00675 //                              VEC_MULT(
00676 //                                              VEC_ADD(sum1_vec, VEC_SWAP(sum1_vec)),
00677 //                                              VEC_ADD(sum2_vec, VEC_SWAP(sum2_vec))
00678 //                              ));
00679 //#else
00680 //                VecUnion t1, t2;
00681 //                t1.vx = sum1_vec;
00682 //                t2.vx = sum2_vec;
00683 //                destP[u] = (t1.x[0] + t1.x[1] //+ endSum1
00684 //                              ) * (t2.x[0] + t2.x[1] //+ endSum2
00685 //                                              );
00686 //#endif
00687 
00688 }
00689 }
00690 
00691 #endif //BEAGLE_CPU_SSE_IMPL_HPP