libhmsbeagle/CPU/BeagleCPU4StateSSEImpl.hpp
/*
 *  BeagleCPU4StateSSEImpl.hpp
 *  BEAGLE
 *
 * Copyright 2009 Phylogenetic Likelihood Working Group
 *
 * This file is part of BEAGLE.
 *
 * BEAGLE is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * BEAGLE is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with BEAGLE.  If not, see
 * <http://www.gnu.org/licenses/>.
 *
 * @author Marc Suchard
 * @author David Swofford
 */

#ifndef BEAGLE_CPU_4STATE_SSE_IMPL_HPP
#define BEAGLE_CPU_4STATE_SSE_IMPL_HPP


#ifdef HAVE_CONFIG_H
#include "libhmsbeagle/config.h"
#endif

#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <cstring>
#include <cmath>
#include <cassert>

#include "libhmsbeagle/beagle.h"
#include "libhmsbeagle/CPU/BeagleCPU4StateSSEImpl.h"
#include "libhmsbeagle/CPU/SSEDefinitions.h"

/* Loads partials into SSE vectors */
#if 0
#define SSE_PREFETCH_PARTIALS(dest, src, v) \
    dest##0 = VEC_SPLAT(src[v + 0]); \
    dest##1 = VEC_SPLAT(src[v + 1]); \
    dest##2 = VEC_SPLAT(src[v + 2]); \
    dest##3 = VEC_SPLAT(src[v + 3]);
#else // Load four partials in two 128-bit memory transactions
#define SSE_PREFETCH_PARTIALS(dest, src, v) \
    V_Real tmp_##dest##01, tmp_##dest##23; \
    tmp_##dest##01 = _mm_load_pd(&src[v + 0]); \
    tmp_##dest##23 = _mm_load_pd(&src[v + 2]); \
    dest##0 = _mm_shuffle_pd(tmp_##dest##01, tmp_##dest##01, _MM_SHUFFLE2(0,0)); \
    dest##1 = _mm_shuffle_pd(tmp_##dest##01, tmp_##dest##01, _MM_SHUFFLE2(1,1)); \
    dest##2 = _mm_shuffle_pd(tmp_##dest##23, tmp_##dest##23, _MM_SHUFFLE2(0,0)); \
    dest##3 = _mm_shuffle_pd(tmp_##dest##23, tmp_##dest##23, _MM_SHUFFLE2(1,1));
#endif

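/* Illustration only (not compiled): after SSE_PREFETCH_PARTIALS(vp, src, v),
 * each vector holds one partial broadcast across both double lanes:
 *
 *     vp0 = { src[v + 0], src[v + 0] }
 *     vp1 = { src[v + 1], src[v + 1] }
 *     vp2 = { src[v + 2], src[v + 2] }
 *     vp3 = { src[v + 3], src[v + 3] }
 *
 * The _mm_load_pd variant assumes &src[v] is 16-byte aligned, which the
 * padded partials buffers are expected to guarantee.
 */
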
/* Loads (transposed) finite-time transition matrices into SSE vectors */
#define SSE_PREFETCH_MATRICES(src_m1, src_m2, dest_vu_m1, dest_vu_m2) \
    const double *m1 = (src_m1); \
    const double *m2 = (src_m2); \
    for (int i = 0; i < OFFSET; i++, m1++, m2++) { \
        dest_vu_m1[i][0].x[0] = m1[0*OFFSET]; \
        dest_vu_m1[i][0].x[1] = m1[1*OFFSET]; \
        dest_vu_m2[i][0].x[0] = m2[0*OFFSET]; \
        dest_vu_m2[i][0].x[1] = m2[1*OFFSET]; \
        dest_vu_m1[i][1].x[0] = m1[2*OFFSET]; \
        dest_vu_m1[i][1].x[1] = m1[3*OFFSET]; \
        dest_vu_m2[i][1].x[0] = m2[2*OFFSET]; \
        dest_vu_m2[i][1].x[1] = m2[3*OFFSET]; \
    }

#define SSE_PREFETCH_MATRIX(src_m1, dest_vu_m1) \
    const double *m1 = (src_m1); \
    for (int i = 0; i < OFFSET; i++, m1++) { \
        dest_vu_m1[i][0].x[0] = m1[0*OFFSET]; \
        dest_vu_m1[i][0].x[1] = m1[1*OFFSET]; \
        dest_vu_m1[i][1].x[0] = m1[2*OFFSET]; \
        dest_vu_m1[i][1].x[1] = m1[3*OFFSET]; \
    }

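/* Illustration only (not compiled): given the indexing above, a matrix is
 * stored row-major with rows padded to OFFSET entries, so after
 * SSE_PREFETCH_MATRIX(src, vu) each column i of the 4x4 block is split
 * across two two-lane vectors:
 *
 *     vu[i][0].vx = { src[i + 0*OFFSET], src[i + 1*OFFSET] }   // rows 0,1
 *     vu[i][1].vx = { src[i + 2*OFFSET], src[i + 3*OFFSET] }   // rows 2,3
 */
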
namespace beagle {
namespace cpu {


BEAGLE_CPU_FACTORY_TEMPLATE
inline const char* getBeagleCPU4StateSSEName(){ return "CPU-4State-SSE-Unknown"; }

template<>
inline const char* getBeagleCPU4StateSSEName<double>(){ return "CPU-4State-SSE-Double"; }

template<>
inline const char* getBeagleCPU4StateSSEName<float>(){ return "CPU-4State-SSE-Single"; }

/*
 * Calculates partial likelihoods at a node when both children have states.
 */

BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcStatesStates(float* destP,
                                     const int* states_q,
                                     const float* matrices_q,
                                     const int* states_r,
                                     const float* matrices_r) {

    BeagleCPU4StateImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcStatesStates(destP,
                                                                  states_q,
                                                                  matrices_q,
                                                                  states_r,
                                                                  matrices_r);
}

BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::calcStatesStates(double* destP,
                                     const int* states_q,
                                     const double* matrices_q,
                                     const int* states_r,
                                     const double* matrices_r) {

    VecUnion vu_mq[OFFSET][2], vu_mr[OFFSET][2];

    int w = 0;
    V_Real *destPvec = (V_Real *)destP;

    for (int l = 0; l < kCategoryCount; l++) {

        SSE_PREFETCH_MATRICES(matrices_q + w, matrices_r + w, vu_mq, vu_mr);

        for (int k = 0; k < kPatternCount; k++) {

            const int state_q = states_q[k];
            const int state_r = states_r[k];

            *destPvec++ = VEC_MULT(vu_mq[state_q][0].vx, vu_mr[state_r][0].vx);
            *destPvec++ = VEC_MULT(vu_mq[state_q][1].vx, vu_mr[state_r][1].vx);

        }

        w += OFFSET*4;
        if (kExtraPatterns)
            destPvec += kExtraPatterns * 2;
    }
}

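/* Scalar sketch of the kernel above (illustration only): for pattern k in the
 * current rate category, with M denoting an OFFSET-padded stored matrix,
 *
 *     destP[4*k + j] = Mq[j][state_q] * Mr[j][state_r],   j = 0..3,
 *
 * computed as two two-lane vector multiplies per pattern.
 */
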
/*
 * Calculates partial likelihoods at a node when one child has states and one has partials.
 * SSE version
 */
BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcStatesPartials(float* destP,
                                       const int* states_q,
                                       const float* matrices_q,
                                       const float* partials_r,
                                       const float* matrices_r) {
    BeagleCPU4StateImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcStatesPartials(
                                                                    destP,
                                                                    states_q,
                                                                    matrices_q,
                                                                    partials_r,
                                                                    matrices_r);
}

BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::calcStatesPartials(double* destP,
                                       const int* states_q,
                                       const double* matrices_q,
                                       const double* partials_r,
                                       const double* matrices_r) {

    int v = 0;
    int w = 0;

    VecUnion vu_mq[OFFSET][2], vu_mr[OFFSET][2];
    V_Real *destPvec = (V_Real *)destP;
    V_Real destr_01, destr_23;

    for (int l = 0; l < kCategoryCount; l++) {

        SSE_PREFETCH_MATRICES(matrices_q + w, matrices_r + w, vu_mq, vu_mr);

        for (int k = 0; k < kPatternCount; k++) {

            const int state_q = states_q[k];
            V_Real vp0, vp1, vp2, vp3;
            SSE_PREFETCH_PARTIALS(vp,partials_r,v);

            destr_01 = VEC_MULT(vp0, vu_mr[0][0].vx);
            destr_01 = VEC_MADD(vp1, vu_mr[1][0].vx, destr_01);
            destr_01 = VEC_MADD(vp2, vu_mr[2][0].vx, destr_01);
            destr_01 = VEC_MADD(vp3, vu_mr[3][0].vx, destr_01);
            destr_23 = VEC_MULT(vp0, vu_mr[0][1].vx);
            destr_23 = VEC_MADD(vp1, vu_mr[1][1].vx, destr_23);
            destr_23 = VEC_MADD(vp2, vu_mr[2][1].vx, destr_23);
            destr_23 = VEC_MADD(vp3, vu_mr[3][1].vx, destr_23);

            *destPvec++ = VEC_MULT(vu_mq[state_q][0].vx, destr_01);
            *destPvec++ = VEC_MULT(vu_mq[state_q][1].vx, destr_23);

            v += 4;
        }
        w += OFFSET*4;
        if (kExtraPatterns) {
            destPvec += kExtraPatterns * 2;
            v += kExtraPatterns * 4;
        }
    }
}

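/* Scalar sketch (illustration only): with p the child-r partials for pattern k,
 *
 *     destr[j]       = sum_i Mr[j][i] * p[i]    // 4x4 matrix-vector product
 *     destP[4*k + j] = Mq[j][state_q] * destr[j]
 */
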
BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcStatesPartialsFixedScaling(float* destP,
                                const int* states1,
                                const float* __restrict matrices1,
                                const float* __restrict partials2,
                                const float* __restrict matrices2,
                                const float* __restrict scaleFactors) {
    BeagleCPU4StateImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcStatesPartialsFixedScaling(
                                                                    destP,
                                                                    states1,
                                                                    matrices1,
                                                                    partials2,
                                                                    matrices2,
                                                                    scaleFactors);
}

BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::calcStatesPartialsFixedScaling(double* destP,
                                const int* states_q,
                                const double* __restrict matrices_q,
                                const double* __restrict partials_r,
                                const double* __restrict matrices_r,
                                const double* __restrict scaleFactors) {

    int v = 0;
    int w = 0;

    VecUnion vu_mq[OFFSET][2], vu_mr[OFFSET][2];
    V_Real *destPvec = (V_Real *)destP;
    V_Real destr_01, destr_23;

    for (int l = 0; l < kCategoryCount; l++) {

        SSE_PREFETCH_MATRICES(matrices_q + w, matrices_r + w, vu_mq, vu_mr);

        for (int k = 0; k < kPatternCount; k++) {

            const V_Real scaleFactor = VEC_SPLAT(scaleFactors[k]);

            const int state_q = states_q[k];
            V_Real vp0, vp1, vp2, vp3;
            SSE_PREFETCH_PARTIALS(vp,partials_r,v);

            destr_01 = VEC_MULT(vp0, vu_mr[0][0].vx);
            destr_01 = VEC_MADD(vp1, vu_mr[1][0].vx, destr_01);
            destr_01 = VEC_MADD(vp2, vu_mr[2][0].vx, destr_01);
            destr_01 = VEC_MADD(vp3, vu_mr[3][0].vx, destr_01);
            destr_23 = VEC_MULT(vp0, vu_mr[0][1].vx);
            destr_23 = VEC_MADD(vp1, vu_mr[1][1].vx, destr_23);
            destr_23 = VEC_MADD(vp2, vu_mr[2][1].vx, destr_23);
            destr_23 = VEC_MADD(vp3, vu_mr[3][1].vx, destr_23);

            *destPvec++ = VEC_DIV(VEC_MULT(vu_mq[state_q][0].vx, destr_01), scaleFactor);
            *destPvec++ = VEC_DIV(VEC_MULT(vu_mq[state_q][1].vx, destr_23), scaleFactor);

            v += 4;
        }
        w += OFFSET*4;
        if (kExtraPatterns) {
            destPvec += kExtraPatterns * 2;
            v += kExtraPatterns * 4;
        }
    }
}

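/* As calcStatesPartials above, but each pattern's result is divided by that
 * pattern's scale factor to undo earlier rescaling:
 *
 *     destP[4*k + j] = Mq[j][state_q] * destr[j] / scaleFactors[k]
 */
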
BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcPartialsPartials(float* destP,
                                                  const float*  partials_q,
                                                  const float*  matrices_q,
                                                  const float*  partials_r,
                                                  const float*  matrices_r) {

    BeagleCPU4StateImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcPartialsPartials(destP,
                                                  partials_q,
                                                  matrices_q,
                                                  partials_r,
                                                  matrices_r);
}

BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::calcPartialsPartials(double* destP,
                                                  const double*  partials_q,
                                                  const double*  matrices_q,
                                                  const double*  partials_r,
                                                  const double*  matrices_r) {

    int v = 0;
    int w = 0;

    V_Real destq_01, destq_23, destr_01, destr_23;
    VecUnion vu_mq[OFFSET][2], vu_mr[OFFSET][2];
    V_Real *destPvec = (V_Real *)destP;

    for (int l = 0; l < kCategoryCount; l++) {

        /* Load transition-probability matrices into vectors */
        SSE_PREFETCH_MATRICES(matrices_q + w, matrices_r + w, vu_mq, vu_mr);

        for (int k = 0; k < kPatternCount; k++) {

#           if 1 && !defined(_WIN32)
            __builtin_prefetch (&partials_q[v+64]);
            __builtin_prefetch (&partials_r[v+64]);
//            __builtin_prefetch (destPvec+32,1,0);
#           endif

            V_Real vpq_0, vpq_1, vpq_2, vpq_3;
            SSE_PREFETCH_PARTIALS(vpq_,partials_q,v);

            V_Real vpr_0, vpr_1, vpr_2, vpr_3;
            SSE_PREFETCH_PARTIALS(vpr_,partials_r,v);

#           if 1    /* This chained form would probably be faster on PPC/Altivec,
                       which has a fused multiply-add vector instruction */

            destq_01 = VEC_MULT(vpq_0, vu_mq[0][0].vx);
            destq_01 = VEC_MADD(vpq_1, vu_mq[1][0].vx, destq_01);
            destq_01 = VEC_MADD(vpq_2, vu_mq[2][0].vx, destq_01);
            destq_01 = VEC_MADD(vpq_3, vu_mq[3][0].vx, destq_01);
            destq_23 = VEC_MULT(vpq_0, vu_mq[0][1].vx);
            destq_23 = VEC_MADD(vpq_1, vu_mq[1][1].vx, destq_23);
            destq_23 = VEC_MADD(vpq_2, vu_mq[2][1].vx, destq_23);
            destq_23 = VEC_MADD(vpq_3, vu_mq[3][1].vx, destq_23);

            destr_01 = VEC_MULT(vpr_0, vu_mr[0][0].vx);
            destr_01 = VEC_MADD(vpr_1, vu_mr[1][0].vx, destr_01);
            destr_01 = VEC_MADD(vpr_2, vu_mr[2][0].vx, destr_01);
            destr_01 = VEC_MADD(vpr_3, vu_mr[3][0].vx, destr_01);
            destr_23 = VEC_MULT(vpr_0, vu_mr[0][1].vx);
            destr_23 = VEC_MADD(vpr_1, vu_mr[1][1].vx, destr_23);
            destr_23 = VEC_MADD(vpr_2, vu_mr[2][1].vx, destr_23);
            destr_23 = VEC_MADD(vpr_3, vu_mr[3][1].vx, destr_23);

#           else    /* SSE does not have a fused multiply-add, so a slight speed
                       gain should be achievable by decoupling these operations
                       to avoid dependency stalls */

            V_Real a, b, c, d;

            a = VEC_MULT(vpq_0, vu_mq[0][0].vx);
            b = VEC_MULT(vpq_2, vu_mq[2][0].vx);
            c = VEC_MULT(vpq_0, vu_mq[0][1].vx);
            d = VEC_MULT(vpq_2, vu_mq[2][1].vx);
            a = VEC_MADD(vpq_1, vu_mq[1][0].vx, a);
            b = VEC_MADD(vpq_3, vu_mq[3][0].vx, b);
            c = VEC_MADD(vpq_1, vu_mq[1][1].vx, c);
            d = VEC_MADD(vpq_3, vu_mq[3][1].vx, d);
            destq_01 = VEC_ADD(a, b);
            destq_23 = VEC_ADD(c, d);

            a = VEC_MULT(vpr_0, vu_mr[0][0].vx);
            b = VEC_MULT(vpr_2, vu_mr[2][0].vx);
            c = VEC_MULT(vpr_0, vu_mr[0][1].vx);
            d = VEC_MULT(vpr_2, vu_mr[2][1].vx);
            a = VEC_MADD(vpr_1, vu_mr[1][0].vx, a);
            b = VEC_MADD(vpr_3, vu_mr[3][0].vx, b);
            c = VEC_MADD(vpr_1, vu_mr[1][1].vx, c);
            d = VEC_MADD(vpr_3, vu_mr[3][1].vx, d);
            destr_01 = VEC_ADD(a, b);
            destr_23 = VEC_ADD(c, d);

#           endif

#           if 1
            destPvec[0] = VEC_MULT(destq_01, destr_01);
            destPvec[1] = VEC_MULT(destq_23, destr_23);
            destPvec += 2;

#           else    /* VEC_STORE once demonstrated a measurable performance gain,
                       as it copies both lane values to memory simultaneously,
                       but the gain is no longer reproducible */

            VEC_STORE(destP + v + 0,VEC_MULT(destq_01, destr_01));
            VEC_STORE(destP + v + 2,VEC_MULT(destq_23, destr_23));

#           endif

            v += 4;
        }
        w += OFFSET*4;
        if (kExtraPatterns) {
            destPvec += kExtraPatterns * 2;
            v += kExtraPatterns * 4;
        }
    }
}

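/* Scalar sketch of the kernel above (illustration only): per pattern k,
 *
 *     destP[4*k + j] = (sum_i Mq[j][i] * pq[i]) * (sum_i Mr[j][i] * pr[i]),
 *
 * i.e. two 4x4 matrix-vector products followed by an elementwise product
 * over the four parent states.
 */
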
BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcPartialsPartialsFixedScaling(float* destP,
                                        const float*  child0Partials,
                                        const float*  child0TransMat,
                                        const float*  child1Partials,
                                        const float*  child1TransMat,
                                        const float*  scaleFactors) {

    BeagleCPU4StateImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcPartialsPartialsFixedScaling(
                        destP,
                        child0Partials,
                        child0TransMat,
                        child1Partials,
                        child1TransMat,
                        scaleFactors);
}

BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::calcPartialsPartialsFixedScaling(double* destP,
                                                                        const double* partials_q,
                                                                        const double* matrices_q,
                                                                        const double* partials_r,
                                                                        const double* matrices_r,
                                                                        const double* scaleFactors) {

    int v = 0;
    int w = 0;

    V_Real destq_01, destq_23, destr_01, destr_23;
    VecUnion vu_mq[OFFSET][2], vu_mr[OFFSET][2];
    V_Real *destPvec = (V_Real *)destP;

    for (int l = 0; l < kCategoryCount; l++) {

        /* Load transition-probability matrices into vectors */
        SSE_PREFETCH_MATRICES(matrices_q + w, matrices_r + w, vu_mq, vu_mr);

        for (int k = 0; k < kPatternCount; k++) {

#           if 1 && !defined(_WIN32)
            __builtin_prefetch (&partials_q[v+64]);
            __builtin_prefetch (&partials_r[v+64]);
//            __builtin_prefetch (destPvec+32,1,0);
#           endif

            // Prefetch scale factor
//            const V_Real scaleFactor = VEC_LOAD_SCALAR(scaleFactors + k);
            // The option below appears faster; why?
            const V_Real scaleFactor = VEC_SPLAT(scaleFactors[k]);

            V_Real vpq_0, vpq_1, vpq_2, vpq_3;
            SSE_PREFETCH_PARTIALS(vpq_,partials_q,v);

            V_Real vpr_0, vpr_1, vpr_2, vpr_3;
            SSE_PREFETCH_PARTIALS(vpr_,partials_r,v);

            // TODO Make below into a macro since it repeats in the other calcPPs
            destq_01 = VEC_MULT(vpq_0, vu_mq[0][0].vx);
            destq_01 = VEC_MADD(vpq_1, vu_mq[1][0].vx, destq_01);
            destq_01 = VEC_MADD(vpq_2, vu_mq[2][0].vx, destq_01);
            destq_01 = VEC_MADD(vpq_3, vu_mq[3][0].vx, destq_01);
            destq_23 = VEC_MULT(vpq_0, vu_mq[0][1].vx);
            destq_23 = VEC_MADD(vpq_1, vu_mq[1][1].vx, destq_23);
            destq_23 = VEC_MADD(vpq_2, vu_mq[2][1].vx, destq_23);
            destq_23 = VEC_MADD(vpq_3, vu_mq[3][1].vx, destq_23);

            destr_01 = VEC_MULT(vpr_0, vu_mr[0][0].vx);
            destr_01 = VEC_MADD(vpr_1, vu_mr[1][0].vx, destr_01);
            destr_01 = VEC_MADD(vpr_2, vu_mr[2][0].vx, destr_01);
            destr_01 = VEC_MADD(vpr_3, vu_mr[3][0].vx, destr_01);
            destr_23 = VEC_MULT(vpr_0, vu_mr[0][1].vx);
            destr_23 = VEC_MADD(vpr_1, vu_mr[1][1].vx, destr_23);
            destr_23 = VEC_MADD(vpr_2, vu_mr[2][1].vx, destr_23);
            destr_23 = VEC_MADD(vpr_3, vu_mr[3][1].vx, destr_23);

            destPvec[0] = VEC_DIV(VEC_MULT(destq_01, destr_01), scaleFactor);
            destPvec[1] = VEC_DIV(VEC_MULT(destq_23, destr_23), scaleFactor);

            destPvec += 2;
            v += 4;
        }
        w += OFFSET*4;
        if (kExtraPatterns) {
            destPvec += kExtraPatterns * 2;
            v += kExtraPatterns * 4;
        }
    }
}

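/* Same computation as calcPartialsPartials above, with each pattern's result
 * divided by its scale factor:
 *
 *     destP[4*k + j] = (sum_i Mq[j][i]*pq[i]) * (sum_i Mr[j][i]*pr[i]) / scaleFactors[k]
 */
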

BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcPartialsPartialsAutoScaling(float* destP,
                                                         const float*  partials_q,
                                                         const float*  matrices_q,
                                                         const float*  partials_r,
                                                         const float*  matrices_r,
                                                         int* activateScaling) {
    BeagleCPU4StateImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcPartialsPartialsAutoScaling(destP,
                                                     partials_q,
                                                     matrices_q,
                                                     partials_r,
                                                     matrices_r,
                                                     activateScaling);
}

BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::calcPartialsPartialsAutoScaling(double* destP,
                                                                    const double*  partials_q,
                                                                    const double*  matrices_q,
                                                                    const double*  partials_r,
                                                                    const double*  matrices_r,
                                                                    int* activateScaling) {
    // TODO: implement calcPartialsPartialsAutoScaling with SSE
    BeagleCPU4StateImpl<BEAGLE_CPU_4_SSE_DOUBLE>::calcPartialsPartialsAutoScaling(destP,
                                                                partials_q,
                                                                matrices_q,
                                                                partials_r,
                                                                matrices_r,
                                                                activateScaling);
}

BEAGLE_CPU_4_SSE_TEMPLATE
int BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcEdgeLogLikelihoods(const int parIndex,
                                                          const int childIndex,
                                                          const int probIndex,
                                                          const int categoryWeightsIndex,
                                                          const int stateFrequenciesIndex,
                                                          const int scalingFactorsIndex,
                                                          double* outSumLogLikelihood) {
    return BeagleCPU4StateImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcEdgeLogLikelihoods(
                                                              parIndex,
                                                              childIndex,
                                                              probIndex,
                                                              categoryWeightsIndex,
                                                              stateFrequenciesIndex,
                                                              scalingFactorsIndex,
                                                              outSumLogLikelihood);
}

BEAGLE_CPU_4_SSE_TEMPLATE
int BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::calcEdgeLogLikelihoods(const int parIndex,
                                                            const int childIndex,
                                                            const int probIndex,
                                                            const int categoryWeightsIndex,
                                                            const int stateFrequenciesIndex,
                                                            const int scalingFactorsIndex,
                                                            double* outSumLogLikelihood) {
    // TODO: implement derivatives for calculateEdgeLnL

    int returnCode = BEAGLE_SUCCESS;

    assert(parIndex >= kTipCount);

    const double* cl_r = gPartials[parIndex];
    double* cl_p = integrationTmp;
    const double* transMatrix = gTransitionMatrices[probIndex];
    const double* wt = gCategoryWeights[categoryWeightsIndex];
    const double* freqs = gStateFrequencies[stateFrequenciesIndex];

    memset(cl_p, 0, (kPatternCount * kStateCount)*sizeof(double));

    if (childIndex < kTipCount && gTipStates[childIndex]) { // Integrate against a state at the child

        const int* statesChild = gTipStates[childIndex];

        int w = 0;
        V_Real *vcl_r = (V_Real *)cl_r;
        for(int l = 0; l < kCategoryCount; l++) {

            VecUnion vu_m[OFFSET][2];
            SSE_PREFETCH_MATRIX(transMatrix + w, vu_m)

            V_Real *vcl_p = (V_Real *)cl_p;

            for(int k = 0; k < kPatternCount; k++) {

                const int stateChild = statesChild[k];
                V_Real vwt = VEC_SPLAT(wt[l]);

                // Accumulate the category-weighted contribution into cl_p
                V_Real wtdPartials = VEC_MULT(*vcl_r++, vwt);
                *vcl_p++ = VEC_MADD(vu_m[stateChild][0].vx, wtdPartials, *vcl_p);

                wtdPartials = VEC_MULT(*vcl_r++, vwt);
                *vcl_p++ = VEC_MADD(vu_m[stateChild][1].vx, wtdPartials, *vcl_p);
            }
            w += OFFSET*4;
            vcl_r += 2 * kExtraPatterns;
        }
    } else { // Integrate against a partial at the child

        const double* cl_q = gPartials[childIndex];
        V_Real * vcl_r = (V_Real *)cl_r;
        int v = 0;
        int w = 0;

        for(int l = 0; l < kCategoryCount; l++) {

            V_Real * vcl_p = (V_Real *)cl_p;

            VecUnion vu_m[OFFSET][2];
            SSE_PREFETCH_MATRIX(transMatrix + w, vu_m)

            for(int k = 0; k < kPatternCount; k++) {
                V_Real vclp_01, vclp_23;
                V_Real vwt = VEC_SPLAT(wt[l]);

                V_Real vcl_q0, vcl_q1, vcl_q2, vcl_q3;
                SSE_PREFETCH_PARTIALS(vcl_q,cl_q,v);

                vclp_01 = VEC_MULT(vcl_q0, vu_m[0][0].vx);
                vclp_01 = VEC_MADD(vcl_q1, vu_m[1][0].vx, vclp_01);
                vclp_01 = VEC_MADD(vcl_q2, vu_m[2][0].vx, vclp_01);
                vclp_01 = VEC_MADD(vcl_q3, vu_m[3][0].vx, vclp_01);
                vclp_23 = VEC_MULT(vcl_q0, vu_m[0][1].vx);
                vclp_23 = VEC_MADD(vcl_q1, vu_m[1][1].vx, vclp_23);
                vclp_23 = VEC_MADD(vcl_q2, vu_m[2][1].vx, vclp_23);
                vclp_23 = VEC_MADD(vcl_q3, vu_m[3][1].vx, vclp_23);
                vclp_01 = VEC_MULT(vclp_01, vwt);
                vclp_23 = VEC_MULT(vclp_23, vwt);

                *vcl_p++ = VEC_MADD(vclp_01, *vcl_r++, *vcl_p);
                *vcl_p++ = VEC_MADD(vclp_23, *vcl_r++, *vcl_p);

                v += 4;
            }
            w += 4*OFFSET;
            if (kExtraPatterns) {
                vcl_r += 2 * kExtraPatterns;
                v += 4 * kExtraPatterns;
            }

        }
    }

    int u = 0;
    for(int k = 0; k < kPatternCount; k++) {
        double sumOverI = 0.0;
        for(int i = 0; i < kStateCount; i++) {
            sumOverI += freqs[i] * cl_p[u];
            u++;
        }

        outLogLikelihoodsTmp[k] = log(sumOverI);
    }

    if (scalingFactorsIndex != BEAGLE_OP_NONE) {
        const double* scalingFactors = gScaleBuffers[scalingFactorsIndex];
        for(int k=0; k < kPatternCount; k++)
            outLogLikelihoodsTmp[k] += scalingFactors[k];
    }

    *outSumLogLikelihood = 0.0;
    for (int i = 0; i < kPatternCount; i++) {
        *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i];
    }

    // (x - x == 0.0) fails only for NaN or +/-Inf, flagging numerical error
    if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0))
        returnCode = BEAGLE_ERROR_FLOATING_POINT;

    return returnCode;
}

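/* Summary of the computation above (illustration only): for site pattern k,
 *
 *     siteLnL[k] = log( sum_j freqs[j] * sum_l wt[l] * parent[l][k][j] * childTerm[l][k][j] )
 *
 * plus scalingFactors[k] when a scale buffer is supplied, and
 *
 *     *outSumLogLikelihood = sum_k patternWeights[k] * siteLnL[k]
 */
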

BEAGLE_CPU_4_SSE_TEMPLATE
int BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::getPaddedPatternsModulus() {
    return 1;  // We currently do not vectorize across patterns
//  return 4;  // For single-precision, can operate on 4 patterns at a time
    // TODO Vectorize final log operations over patterns
}

BEAGLE_CPU_4_SSE_TEMPLATE
int BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::getPaddedPatternsModulus() {
//  return 2;  // For double-precision, can operate on 2 patterns at a time
    return 1;  // We currently do not vectorize across patterns
}

BEAGLE_CPU_4_SSE_TEMPLATE
const char* BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::getName() {
    return getBeagleCPU4StateSSEName<float>();
}

BEAGLE_CPU_4_SSE_TEMPLATE
const char* BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::getName() {
    return getBeagleCPU4StateSSEName<double>();
}

BEAGLE_CPU_4_SSE_TEMPLATE
const long BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::getFlags() {
    return  BEAGLE_FLAG_COMPUTATION_SYNCH |
            BEAGLE_FLAG_THREADING_NONE |
            BEAGLE_FLAG_PROCESSOR_CPU |
            BEAGLE_FLAG_PRECISION_SINGLE |
            BEAGLE_FLAG_VECTOR_SSE;
}

BEAGLE_CPU_4_SSE_TEMPLATE
const long BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::getFlags() {
    return  BEAGLE_FLAG_COMPUTATION_SYNCH |
            BEAGLE_FLAG_THREADING_NONE |
            BEAGLE_FLAG_PROCESSOR_CPU |
            BEAGLE_FLAG_PRECISION_DOUBLE |
            BEAGLE_FLAG_VECTOR_SSE;
}


// BeagleImplFactory public methods

BEAGLE_CPU_FACTORY_TEMPLATE
BeagleImpl* BeagleCPU4StateSSEImplFactory<BEAGLE_CPU_FACTORY_GENERIC>::createImpl(int tipCount,
                                             int partialsBufferCount,
                                             int compactBufferCount,
                                             int stateCount,
                                             int patternCount,
                                             int eigenBufferCount,
                                             int matrixBufferCount,
                                             int categoryCount,
                                             int scaleBufferCount,
                                             int resourceNumber,
                                             long preferenceFlags,
                                             long requirementFlags,
                                             int* errorCode) {

    if (stateCount != 4) {
        return NULL;
    }

    // Check for SSE support before allocating the implementation
    if (!CPUSupportsSSE()) {
        return NULL;
    }

    BeagleCPU4StateSSEImpl<REALTYPE, T_PAD_4_SSE_DEFAULT, P_PAD_4_SSE_DEFAULT>* impl =
            new BeagleCPU4StateSSEImpl<REALTYPE, T_PAD_4_SSE_DEFAULT, P_PAD_4_SSE_DEFAULT>();

    try {
        if (impl->createInstance(tipCount, partialsBufferCount, compactBufferCount, stateCount,
                                 patternCount, eigenBufferCount, matrixBufferCount,
                                 categoryCount, scaleBufferCount, resourceNumber,
                                 preferenceFlags, requirementFlags) == 0)
            return impl;
    }
    catch(...) {
        if (DEBUGGING_OUTPUT)
            std::cerr << "exception in initialize\n";
        delete impl;
        throw;
    }

    delete impl;

    return NULL;
}
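
/* Usage note (derived from the code above): createImpl returns NULL when
 * stateCount != 4, when the processor lacks SSE support, or when
 * createInstance fails, so the surrounding factory machinery can fall back
 * to a non-SSE implementation.
 */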

BEAGLE_CPU_FACTORY_TEMPLATE
const char* BeagleCPU4StateSSEImplFactory<BEAGLE_CPU_FACTORY_GENERIC>::getName() {
    return getBeagleCPU4StateSSEName<BEAGLE_CPU_FACTORY_GENERIC>();
}

template <>
const long BeagleCPU4StateSSEImplFactory<double>::getFlags() {
    return BEAGLE_FLAG_COMPUTATION_SYNCH |
           BEAGLE_FLAG_SCALING_MANUAL | BEAGLE_FLAG_SCALING_ALWAYS | BEAGLE_FLAG_SCALING_AUTO |
           BEAGLE_FLAG_THREADING_NONE |
           BEAGLE_FLAG_PROCESSOR_CPU |
           BEAGLE_FLAG_VECTOR_SSE |
           BEAGLE_FLAG_PRECISION_DOUBLE |
           BEAGLE_FLAG_SCALERS_LOG | BEAGLE_FLAG_SCALERS_RAW |
           BEAGLE_FLAG_EIGEN_COMPLEX | BEAGLE_FLAG_EIGEN_REAL |
           BEAGLE_FLAG_INVEVEC_STANDARD | BEAGLE_FLAG_INVEVEC_TRANSPOSED;
}

template <>
const long BeagleCPU4StateSSEImplFactory<float>::getFlags() {
    return BEAGLE_FLAG_COMPUTATION_SYNCH |
           BEAGLE_FLAG_SCALING_MANUAL | BEAGLE_FLAG_SCALING_ALWAYS | BEAGLE_FLAG_SCALING_AUTO |
           BEAGLE_FLAG_THREADING_NONE |
           BEAGLE_FLAG_PROCESSOR_CPU |
           BEAGLE_FLAG_VECTOR_SSE |
           BEAGLE_FLAG_PRECISION_SINGLE |
           BEAGLE_FLAG_SCALERS_LOG | BEAGLE_FLAG_SCALERS_RAW |
           BEAGLE_FLAG_EIGEN_COMPLEX | BEAGLE_FLAG_EIGEN_REAL |
           BEAGLE_FLAG_INVEVEC_STANDARD | BEAGLE_FLAG_INVEVEC_TRANSPOSED;
}


}   // namespace cpu
}   // namespace beagle

#endif // BEAGLE_CPU_4STATE_SSE_IMPL_HPP