HMSBEAGLE  1.0.0
libhmsbeagle/CPU/BeagleCPU4StateImpl.hpp
00001 
00002 /*
00003  *  BeagleCPU4StateImpl.cpp
00004  *  BEAGLE
00005  *
00006  * Copyright 2009 Phylogenetic Likelihood Working Group
00007  *
00008  * This file is part of BEAGLE.
00009  *
00010  * BEAGLE is free software: you can redistribute it and/or modify
00011  * it under the terms of the GNU Lesser General Public License as
00012  * published by the Free Software Foundation, either version 3 of
00013  * the License, or (at your option) any later version.
00014  *
00015  * BEAGLE is distributed in the hope that it will be useful,
00016  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00017  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00018  * GNU Lesser General Public License for more details.
00019  *
00020  * You should have received a copy of the GNU Lesser General Public
00021  * License along with BEAGLE.  If not, see
00022  * <http://www.gnu.org/licenses/>.
00023  *
00024  * @author Andrew Rambaut
00025  * @author Marc Suchard
00026  * @author Daniel Ayres
00027  * @author Mark Holder
00028  * @author Aaron Darling
00029  */
00030 
00031 #ifndef BEAGLE_CPU_4STATE_IMPL_HPP
00032 #define BEAGLE_CPU_4STATE_IMPL_HPP
00033 
00034 #ifdef HAVE_CONFIG_H
00035 #include "libhmsbeagle/config.h"
00036 #endif
00037 
00038 #include <cstdio>
00039 #include <cstdlib>
00040 #include <iostream>
00041 #include <cstring>
00042 #include <cmath>
00043 #include <cassert>
00044 
00045 #include "libhmsbeagle/beagle.h"
00046 #include "libhmsbeagle/CPU/BeagleCPUImpl.h"
00047 #include "libhmsbeagle/CPU/BeagleCPU4StateImpl.h"
00048 
00049 #define EXPERIMENTAL_OPENMP
00050 
00051 
00052 #define OFFSET    (4 + T_PAD)    // For easy conversion between 4/5
00053 
00054 #define PREFETCH_MATRIX(num,matrices,w) \
00055     REALTYPE m##num##00, m##num##01, m##num##02, m##num##03, \
00056            m##num##10, m##num##11, m##num##12, m##num##13, \
00057            m##num##20, m##num##21, m##num##22, m##num##23, \
00058            m##num##30, m##num##31, m##num##32, m##num##33; \
00059     m##num##00 = matrices[w + OFFSET*0 + 0]; \
00060     m##num##01 = matrices[w + OFFSET*0 + 1]; \
00061     m##num##02 = matrices[w + OFFSET*0 + 2]; \
00062     m##num##03 = matrices[w + OFFSET*0 + 3]; \
00063     m##num##10 = matrices[w + OFFSET*1 + 0]; \
00064     m##num##11 = matrices[w + OFFSET*1 + 1]; \
00065     m##num##12 = matrices[w + OFFSET*1 + 2]; \
00066     m##num##13 = matrices[w + OFFSET*1 + 3]; \
00067     m##num##20 = matrices[w + OFFSET*2 + 0]; \
00068     m##num##21 = matrices[w + OFFSET*2 + 1]; \
00069     m##num##22 = matrices[w + OFFSET*2 + 2]; \
00070     m##num##23 = matrices[w + OFFSET*2 + 3]; \
00071     m##num##30 = matrices[w + OFFSET*3 + 0]; \
00072     m##num##31 = matrices[w + OFFSET*3 + 1]; \
00073     m##num##32 = matrices[w + OFFSET*3 + 2]; \
00074     m##num##33 = matrices[w + OFFSET*3 + 3];
00075 
00076 #define PREFETCH_PARTIALS(num,partials,v) \
00077     REALTYPE p##num##0, p##num##1, p##num##2, p##num##3; \
00078     p##num##0 = partials[v + 0]; \
00079     p##num##1 = partials[v + 1]; \
00080     p##num##2 = partials[v + 2]; \
00081     p##num##3 = partials[v + 3];
00082 
00083 //#define DO_INTEGRATION(num) \
00084 //    REALTYPE sum##num##0, sum##num##1, sum##num##2, sum##num##3; \
00085 //    sum##num##0  = m##num##00 * p##num##0; \
00086 //    sum##num##1  = m##num##10 * p##num##0; \
00087 //    sum##num##2  = m##num##20 * p##num##0; \
00088 //    sum##num##3  = m##num##30 * p##num##0; \
00089 // \
00090 //    sum##num##0 += m##num##01 * p##num##1; \
00091 //    sum##num##1 += m##num##11 * p##num##1; \
00092 //    sum##num##2 += m##num##21 * p##num##1; \
00093 //    sum##num##3 += m##num##31 * p##num##1; \
00094 // \
00095 //    sum##num##0 += m##num##02 * p##num##2; \
00096 //    sum##num##1 += m##num##12 * p##num##2; \
00097 //    sum##num##2 += m##num##22 * p##num##2; \
00098 //    sum##num##3 += m##num##32 * p##num##2; \
00099 // \
00100 //    sum##num##0 += m##num##03 * p##num##3; \
00101 //    sum##num##1 += m##num##13 * p##num##3; \
00102 //    sum##num##2 += m##num##23 * p##num##3; \
00103 //    sum##num##3 += m##num##33 * p##num##3;
00104 
00105 #define DO_INTEGRATION(num) \
00106     REALTYPE sum##num##0, sum##num##1, sum##num##2, sum##num##3; \
00107     sum##num##0  = m##num##00 * p##num##0 + \
00108                    m##num##01 * p##num##1 + \
00109                    m##num##02 * p##num##2 + \
00110                    m##num##03 * p##num##3;  \
00111  \
00112     sum##num##1  = m##num##10 * p##num##0 + \
00113                    m##num##11 * p##num##1 + \
00114                    m##num##12 * p##num##2 + \
00115                    m##num##13 * p##num##3;  \
00116  \
00117     sum##num##2  = m##num##20 * p##num##0 + \
00118                    m##num##21 * p##num##1 + \
00119                    m##num##22 * p##num##2 + \
00120                    m##num##23 * p##num##3;  \
00121 \
00122     sum##num##3  = m##num##30 * p##num##0 + \
00123                    m##num##31 * p##num##1 + \
00124                    m##num##32 * p##num##2 + \
00125                    m##num##33 * p##num##3;
00126 
00127 
00128 namespace beagle {
00129 namespace cpu {
00130 
00131 BEAGLE_CPU_FACTORY_TEMPLATE
00132 inline const char* getBeagleCPU4StateName(){ return "CPU-4State-Unknown"; };
00133 
00134 template<>
00135 inline const char* getBeagleCPU4StateName<double>(){ return "CPU-4State-Double"; };
00136 
00137 template<>
00138 inline const char* getBeagleCPU4StateName<float>(){ return "CPU-4State-Single"; };
00139 
00140 BEAGLE_CPU_TEMPLATE
00141 BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::~BeagleCPU4StateImpl() {
00142     // free all that stuff...
00143     // If you delete partials, make sure not to delete the last element
00144     // which is TEMP_SCRATCH_PARTIAL twice.
00145 }
00146 
00148 // private methods
00149 
00150 /*
00151  * Calculates partial likelihoods at a node when both children have states.
00152  */
00153 BEAGLE_CPU_TEMPLATE
00154 void BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::calcStatesStates(REALTYPE* destP,
00155                                      const int* states1,
00156                                      const REALTYPE* matrices1,
00157                                      const int* states2,
00158                                      const REALTYPE* matrices2) {
00159 
00160 #pragma omp parallel for num_threads(kCategoryCount)
00161     for (int l = 0; l < kCategoryCount; l++) {
00162         int v = l*4*kPaddedPatternCount;
00163         int w = l*4*OFFSET;
00164 
00165         for (int k = 0; k < kPatternCount; k++) {
00166 
00167             const int state1 = states1[k];
00168             const int state2 = states2[k];
00169 
00170             destP[v    ] = matrices1[w            + state1] * 
00171                            matrices2[w            + state2];
00172             destP[v + 1] = matrices1[w + OFFSET*1 + state1] * 
00173                            matrices2[w + OFFSET*1 + state2];
00174             destP[v + 2] = matrices1[w + OFFSET*2 + state1] * 
00175                            matrices2[w + OFFSET*2 + state2];
00176             destP[v + 3] = matrices1[w + OFFSET*3 + state1] * 
00177                            matrices2[w + OFFSET*3 + state2];
00178            v += 4;
00179         }
00180     }
00181 }
00182 
00183 BEAGLE_CPU_TEMPLATE
00184 void BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::calcStatesStatesFixedScaling(REALTYPE* destP,
00185                                      const int* states1,
00186                                      const REALTYPE* matrices1,
00187                                      const int* states2,
00188                                      const REALTYPE* matrices2,
00189                                      const REALTYPE* scaleFactors) {
00190     
00191 #pragma omp parallel for num_threads(kCategoryCount)
00192     for (int l = 0; l < kCategoryCount; l++) {
00193         int v = l*4*kPaddedPatternCount;
00194         int w = l*4*OFFSET;
00195         
00196         for (int k = 0; k < kPatternCount; k++) {
00197             
00198             const int state1 = states1[k];
00199             const int state2 = states2[k];
00200             const REALTYPE scaleFactor = scaleFactors[k];
00201             
00202             destP[v    ] = matrices1[w            + state1] * 
00203                            matrices2[w            + state2] / scaleFactor;
00204             destP[v + 1] = matrices1[w + OFFSET*1 + state1] * 
00205                            matrices2[w + OFFSET*1 + state2] / scaleFactor;
00206             destP[v + 2] = matrices1[w + OFFSET*2 + state1] * 
00207                            matrices2[w + OFFSET*2 + state2] / scaleFactor;
00208             destP[v + 3] = matrices1[w + OFFSET*3 + state1] * 
00209                            matrices2[w + OFFSET*3 + state2] / scaleFactor;
00210             v += 4;
00211         }
00212     }
00213 }
00214 
00215 /*
00216  * Calculates partial likelihoods at a node when one child has states and one has partials.
00217  */
00218 BEAGLE_CPU_TEMPLATE
00219 void BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::calcStatesPartials(REALTYPE* destP,
00220                                        const int* states1,
00221                                        const REALTYPE* matrices1,
00222                                        const REALTYPE* partials2,
00223                                        const REALTYPE* matrices2) {
00224 
00225 #pragma omp parallel for num_threads(kCategoryCount)
00226     for (int l = 0; l < kCategoryCount; l++) {
00227         int u = l*4*kPaddedPatternCount;
00228         int w = l*4*OFFSET;
00229                 
00230         PREFETCH_MATRIX(2,matrices2,w);
00231         
00232         for (int k = 0; k < kPatternCount; k++) {
00233             
00234             const int state1 = states1[k];
00235             
00236             PREFETCH_PARTIALS(2,partials2,u);
00237                         
00238             DO_INTEGRATION(2); // defines sum20, sum21, sum22, sum23;
00239                         
00240             destP[u    ] = matrices1[w            + state1] * sum20;
00241             destP[u + 1] = matrices1[w + OFFSET*1 + state1] * sum21;
00242             destP[u + 2] = matrices1[w + OFFSET*2 + state1] * sum22;
00243             destP[u + 3] = matrices1[w + OFFSET*3 + state1] * sum23;
00244             
00245             u += 4;
00246         }
00247     }
00248 }
00249 
00250 BEAGLE_CPU_TEMPLATE
00251 void BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::calcStatesPartialsFixedScaling(REALTYPE* destP,
00252                                        const int* states1,
00253                                        const REALTYPE* matrices1,
00254                                        const REALTYPE* partials2,
00255                                        const REALTYPE* matrices2,
00256                                        const REALTYPE* scaleFactors) {
00257     
00258 #pragma omp parallel for num_threads(kCategoryCount)
00259     for (int l = 0; l < kCategoryCount; l++) {
00260         int u = l*4*kPaddedPatternCount;
00261         int w = l*4*OFFSET;
00262                 
00263         PREFETCH_MATRIX(2,matrices2,w);
00264         
00265         for (int k = 0; k < kPatternCount; k++) {
00266             
00267             const int state1 = states1[k];
00268             const REALTYPE scaleFactor = scaleFactors[k];
00269             
00270             PREFETCH_PARTIALS(2,partials2,u);
00271             
00272             DO_INTEGRATION(2); // defines sum20, sum21, sum22, sum23
00273             
00274             destP[u    ] = matrices1[w            + state1] * sum20 / scaleFactor;
00275             destP[u + 1] = matrices1[w + OFFSET*1 + state1] * sum21 / scaleFactor;
00276             destP[u + 2] = matrices1[w + OFFSET*2 + state1] * sum22 / scaleFactor;
00277             destP[u + 3] = matrices1[w + OFFSET*3 + state1] * sum23 / scaleFactor;
00278             
00279             u += 4;            
00280         }
00281     }   
00282 }
00283 
00284 BEAGLE_CPU_TEMPLATE
00285 void BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::calcPartialsPartials(REALTYPE* destP,
00286                                          const REALTYPE* partials1,
00287                                          const REALTYPE* matrices1,
00288                                          const REALTYPE* partials2,
00289                                          const REALTYPE* matrices2) {
00290     
00291  
00292 #pragma omp parallel for num_threads(kCategoryCount)
00293     for (int l = 0; l < kCategoryCount; l++) {
00294         int u = l*4*kPaddedPatternCount;
00295         int w = l*4*OFFSET;
00296                 
00297         PREFETCH_MATRIX(1,matrices1,w);                
00298         PREFETCH_MATRIX(2,matrices2,w);
00299         for (int k = 0; k < kPatternCount; k++) {                   
00300             PREFETCH_PARTIALS(1,partials1,u);
00301             PREFETCH_PARTIALS(2,partials2,u);
00302             
00303             DO_INTEGRATION(1); // defines sum10, sum11, sum12, sum13
00304             DO_INTEGRATION(2); // defines sum20, sum21, sum22, sum23
00305             
00306             // Final results
00307             destP[u    ] = sum10 * sum20;
00308             destP[u + 1] = sum11 * sum21;
00309             destP[u + 2] = sum12 * sum22;
00310             destP[u + 3] = sum13 * sum23;
00311 
00312             u += 4;
00313 
00314         }
00315     }
00316 }
00317     
00318 BEAGLE_CPU_TEMPLATE
00319 void BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::calcPartialsPartialsAutoScaling(REALTYPE* destP,
00320                                                                     const REALTYPE* partials1,
00321                                                                     const REALTYPE* matrices1,
00322                                                                     const REALTYPE* partials2,
00323                                                                     const REALTYPE* matrices2,
00324                                                                     int* activateScaling) {
00325     
00326     
00327 #pragma omp parallel for num_threads(kCategoryCount)
00328     for (int l = 0; l < kCategoryCount; l++) {
00329         int u = l*4*kPaddedPatternCount;
00330         int w = l*4*OFFSET;
00331         
00332         PREFETCH_MATRIX(1,matrices1,w);                
00333         PREFETCH_MATRIX(2,matrices2,w);
00334         for (int k = 0; k < kPatternCount; k++) {                   
00335             PREFETCH_PARTIALS(1,partials1,u);
00336             PREFETCH_PARTIALS(2,partials2,u);
00337             
00338             DO_INTEGRATION(1); // defines sum10, sum11, sum12, sum13
00339             DO_INTEGRATION(2); // defines sum20, sum21, sum22, sum23
00340             
00341             // Final results
00342             destP[u    ] = sum10 * sum20;
00343             destP[u + 1] = sum11 * sum21;
00344             destP[u + 2] = sum12 * sum22;
00345             destP[u + 3] = sum13 * sum23;
00346             
00347             if (*activateScaling == 0) {
00348                 int expTmp;
00349                 int expMax;
00350                 frexp(destP[u], &expMax);
00351                 frexp(destP[u + 1], &expTmp);
00352                 if (abs(expTmp) > abs(expMax))
00353                     expMax = expTmp;
00354                 frexp(destP[u + 2], &expTmp);
00355                 if (abs(expTmp) > abs(expMax))
00356                     expMax = expTmp;
00357                 frexp(destP[u + 3], &expTmp);
00358                 if (abs(expTmp) > abs(expMax))
00359                     expMax = expTmp;
00360 
00361                 if(abs(expMax) > scalingExponentThreshhold) {
00362                     *activateScaling = 1;
00363                 }
00364             }
00365             
00366             u += 4;
00367             
00368         }
00369     }
00370 }
00371     
00372 
00373 BEAGLE_CPU_TEMPLATE
00374 void BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::calcPartialsPartialsFixedScaling(REALTYPE* destP,
00375                                          const REALTYPE* partials1,
00376                                          const REALTYPE* matrices1,
00377                                          const REALTYPE* partials2,
00378                                          const REALTYPE* matrices2,
00379                                          const REALTYPE* scaleFactors) {
00380     
00381 #pragma omp parallel for num_threads(kCategoryCount)
00382     for (int l = 0; l < kCategoryCount; l++) {
00383         int u = l*4*kPaddedPatternCount;
00384         int w = l*4*OFFSET;
00385         
00386         PREFETCH_MATRIX(1,matrices1,w);
00387         PREFETCH_MATRIX(2,matrices2,w);
00388         
00389         for (int k = 0; k < kPatternCount; k++) {
00390                         
00391             // Prefetch scale factor
00392             const REALTYPE scaleFactor = scaleFactors[k];
00393             
00394             PREFETCH_PARTIALS(1,partials1,u);
00395             PREFETCH_PARTIALS(2,partials2,u);
00396             
00397             DO_INTEGRATION(1); // defines sum10, sum11, sum12, sum13
00398             DO_INTEGRATION(2); // defines sum20, sum21, sum22, sum23
00399                                 
00400             // Final results
00401             destP[u    ] = sum10 * sum20 / scaleFactor;
00402             destP[u + 1] = sum11 * sum21 / scaleFactor;
00403             destP[u + 2] = sum12 * sum22 / scaleFactor;
00404             destP[u + 3] = sum13 * sum23 / scaleFactor;
00405             
00406             u += 4;
00407         }
00408     }    
00409 }
00410 
00411 BEAGLE_CPU_TEMPLATE
00412 int inline BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::integrateOutStatesAndScale(const REALTYPE* integrationTmp,
00413                                                                       const int stateFrequenciesIndex,
00414                                                             const int scalingFactorsIndex,
00415                                                             double* outSumLogLikelihood) {
00416     
00417     int returnCode = BEAGLE_SUCCESS;
00418     
00419     register REALTYPE freq0, freq1, freq2, freq3; // Is it a good idea to specify 'register'?
00420     freq0 = gStateFrequencies[stateFrequenciesIndex][0];   
00421     freq1 = gStateFrequencies[stateFrequenciesIndex][1];
00422     freq2 = gStateFrequencies[stateFrequenciesIndex][2];
00423     freq3 = gStateFrequencies[stateFrequenciesIndex][3];
00424     
00425     int u = 0;
00426     for(int k = 0; k < kPatternCount; k++) {
00427         REALTYPE sumOverI =
00428         freq0 * integrationTmp[u    ] +
00429         freq1 * integrationTmp[u + 1] +
00430         freq2 * integrationTmp[u + 2] +
00431         freq3 * integrationTmp[u + 3];
00432         
00433         u += 4;
00434                         
00435         outLogLikelihoodsTmp[k] = log(sumOverI);
00436     }        
00437 
00438     if (scalingFactorsIndex != BEAGLE_OP_NONE) {
00439         const REALTYPE* scalingFactors = gScaleBuffers[scalingFactorsIndex];
00440         for(int k=0; k < kPatternCount; k++) {
00441             outLogLikelihoodsTmp[k] += scalingFactors[k];
00442         }
00443     }
00444     
00445     *outSumLogLikelihood = 0.0;    
00446     for(int k=0; k < kPatternCount; k++) {
00447         *outSumLogLikelihood += outLogLikelihoodsTmp[k] * gPatternWeights[k];
00448     }    
00449     
00450     if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0))
00451         returnCode = BEAGLE_ERROR_FLOATING_POINT;
00452 
00453     
00454     return returnCode;
00455 }
00456 
00457 #define FAST_MAX(x,y)   (x > y ? x : y)
00458 //#define BEAGLE_TEST_OPTIMIZATION
00459 /*
00460  * Re-scales the partial likelihoods such that the largest is one.
00461  */
00462 BEAGLE_CPU_TEMPLATE
00463 void BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::rescalePartials(REALTYPE* destP,
00464                 REALTYPE* scaleFactors,
00465                 REALTYPE* cumulativeScaleFactors,
00466         const int  fillWithOnes) {
00467 
00468         bool useLogScalars = kFlags & BEAGLE_FLAG_SCALERS_LOG;
00469 
00470     for (int k = 0; k < kPatternCount; k++) {
00471         REALTYPE max = 0;       
00472         const int patternOffset = k * 4;
00473         for (int l = 0; l < kCategoryCount; l++) {
00474             int offset = l * kPaddedPatternCount * 4 + patternOffset;
00475 
00476 #ifdef BEAGLE_TEST_OPTIMIZATION
00477             REALTYPE max01 = FAST_MAX(destP[offset + 0], destP[offset + 1]);
00478             REALTYPE max23 = FAST_MAX(destP[offset + 2], destP[offset + 3]);
00479             max = FAST_MAX(max, max01);
00480             max = FAST_MAX(max, max23);
00481 #else
00482                         #pragma unroll
00483             for (int i = 0; i < 4; i++) {
00484                 if(destP[offset] > max)
00485                     max = destP[offset];
00486                 offset++;
00487             }
00488 #endif
00489         }
00490 
00491         if (max == 0)
00492             max = REALTYPE(1.0);
00493 
00494         REALTYPE oneOverMax = REALTYPE(1.0) / max;
00495         for (int l = 0; l < kCategoryCount; l++) {
00496             int offset = l * kPaddedPatternCount * 4 + patternOffset;
00497                         #pragma unroll
00498             for (int i = 0; i < 4; i++)
00499                 destP[offset++] *= oneOverMax;
00500         }
00501 
00502         if (useLogScalars) {
00503             REALTYPE logMax = log(max);
00504             scaleFactors[k] = logMax;
00505             if( cumulativeScaleFactors != NULL )
00506                 cumulativeScaleFactors[k] += logMax;
00507         } else {
00508             scaleFactors[k] = max;
00509             if( cumulativeScaleFactors != NULL )
00510                 cumulativeScaleFactors[k] += log(max);
00511         }
00512     }
00513 }
00514 
00515 BEAGLE_CPU_TEMPLATE
00516 int BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::calcEdgeLogLikelihoods(const int parIndex,
00517                                                            const int childIndex,
00518                                                            const int probIndex,
00519                                                            const int categoryWeightsIndex,
00520                                                            const int stateFrequenciesIndex,
00521                                                            const int scalingFactorsIndex,
00522                                                            double* outSumLogLikelihood) {
00523     // TODO: implement derivatives for calculateEdgeLnL
00524     
00525     assert(parIndex >= kTipCount);
00526     
00527     const REALTYPE* partialsParent = gPartials[parIndex];
00528     const REALTYPE* transMatrix = gTransitionMatrices[probIndex];
00529     const REALTYPE* wt = gCategoryWeights[categoryWeightsIndex];
00530 
00531     
00532     memset(integrationTmp, 0, (kPatternCount * kStateCount)*sizeof(REALTYPE));
00533     
00534     if (childIndex < kTipCount && gTipStates[childIndex]) { // Integrate against a state at the child
00535       
00536         const int* statesChild = gTipStates[childIndex];    
00537         int v = 0; // Index for parent partials
00538         int w = 0;
00539         for(int l = 0; l < kCategoryCount; l++) {
00540             int u = 0; // Index in resulting product-partials (summed over categories)
00541             const REALTYPE weight = wt[l];
00542             for(int k = 0; k < kPatternCount; k++) {
00543                 
00544                 const int stateChild = statesChild[k]; 
00545                 
00546                 integrationTmp[u    ] += transMatrix[w            + stateChild] * partialsParent[v    ] * weight;                                               
00547                 integrationTmp[u + 1] += transMatrix[w + OFFSET*1 + stateChild] * partialsParent[v + 1] * weight;
00548                 integrationTmp[u + 2] += transMatrix[w + OFFSET*2 + stateChild] * partialsParent[v + 2] * weight;
00549                 integrationTmp[u + 3] += transMatrix[w + OFFSET*3 + stateChild] * partialsParent[v + 3] * weight;
00550                 
00551                 u += 4;
00552                 v += 4;                
00553             }
00554             w += OFFSET*4;
00555             if (kExtraPatterns)
00556                 v += 4 * kExtraPatterns;
00557         }
00558         
00559     } else { // Integrate against a partial at the child
00560         
00561         const REALTYPE* partialsChild = gPartials[childIndex];
00562                 #if 0//
00563         int v = 0;
00564                 #endif
00565         int w = 0;
00566         for(int l = 0; l < kCategoryCount; l++) {            
00567             int u = 0;
00568                         #if 1//
00569                         int v = l*kPaddedPatternCount*4;
00570                         #endif
00571             const REALTYPE weight = wt[l];
00572             
00573             PREFETCH_MATRIX(1,transMatrix,w);
00574             
00575             for(int k = 0; k < kPatternCount; k++) {                
00576                                  
00577                 const REALTYPE* partials1 = partialsChild;
00578                 
00579                 PREFETCH_PARTIALS(1,partials1,v);
00580                 
00581                 DO_INTEGRATION(1);
00582                 
00583                 integrationTmp[u    ] += sum10 * partialsParent[v    ] * weight;
00584                 integrationTmp[u + 1] += sum11 * partialsParent[v + 1] * weight;
00585                 integrationTmp[u + 2] += sum12 * partialsParent[v + 2] * weight;
00586                 integrationTmp[u + 3] += sum13 * partialsParent[v + 3] * weight;
00587                 
00588                 u += 4;
00589                 v += 4;
00590             } 
00591             w += OFFSET*4;
00592                         #if 0//
00593             if (kExtraPatterns)
00594                 v += 4 * kExtraPatterns;
00595                         #endif//
00596         }
00597     }
00598 
00599     return integrateOutStatesAndScale(integrationTmp, stateFrequenciesIndex, scalingFactorsIndex, outSumLogLikelihood);
00600 }
00601 
00602 BEAGLE_CPU_TEMPLATE
00603 int BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::calcRootLogLikelihoods(const int bufferIndex,
00604                                                            const int categoryWeightsIndex,
00605                                                            const int stateFrequenciesIndex,
00606                                                 const int scalingFactorsIndex,
00607                                                 double* outSumLogLikelihood) {
00608 
00609     const REALTYPE* rootPartials = gPartials[bufferIndex];
00610     assert(rootPartials);
00611     const REALTYPE* wt = gCategoryWeights[categoryWeightsIndex];
00612     
00613     int u = 0;
00614     int v = 0;
00615     const REALTYPE wt0 = wt[0];
00616     for (int k = 0; k < kPatternCount; k++) {
00617         integrationTmp[v    ] = rootPartials[v    ] * wt0;
00618         integrationTmp[v + 1] = rootPartials[v + 1] * wt0;
00619         integrationTmp[v + 2] = rootPartials[v + 2] * wt0;
00620         integrationTmp[v + 3] = rootPartials[v + 3] * wt0;
00621         v += 4;
00622     }
00623     for (int l = 1; l < kCategoryCount; l++) {
00624         u = 0;
00625         const REALTYPE wtl = wt[l];
00626         for (int k = 0; k < kPatternCount; k++) {
00627             integrationTmp[u    ] += rootPartials[v    ] * wtl;
00628             integrationTmp[u + 1] += rootPartials[v + 1] * wtl;
00629             integrationTmp[u + 2] += rootPartials[v + 2] * wtl;
00630             integrationTmp[u + 3] += rootPartials[v + 3] * wtl;
00631              
00632             u += 4;
00633             v += 4;
00634         }
00635                 v += 4 * kExtraPatterns;
00636     }
00637     
00638     return integrateOutStatesAndScale(integrationTmp, stateFrequenciesIndex, scalingFactorsIndex, outSumLogLikelihood);
00639 }
00640 
00641 BEAGLE_CPU_TEMPLATE
00642 int BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::calcRootLogLikelihoodsMulti(const int* bufferIndices,
00643                                                                 const int* categoryWeightsIndices,
00644                                                                 const int* stateFrequenciesIndices,
00645                                                                 const int* scaleBufferIndices,
00646                                                                 int count,
00647                                                                 double* outSumLogLikelihood) {
00648     
00649     int returnCode = BEAGLE_SUCCESS;
00650     
00651     std::vector<int> indexMaxScale(kPatternCount);
00652     std::vector<REALTYPE> maxScaleFactor(kPatternCount);
00653     
00654     for (int subsetIndex = 0 ; subsetIndex < count; ++subsetIndex ) {
00655         const int rootPartialIndex = bufferIndices[subsetIndex];
00656         const REALTYPE* rootPartials = gPartials[rootPartialIndex];
00657         const REALTYPE* frequencies = gStateFrequencies[stateFrequenciesIndices[subsetIndex]];
00658         const REALTYPE* wt = gCategoryWeights[categoryWeightsIndices[subsetIndex]];
00659         int u = 0;
00660         int v = 0;
00661         
00662         const REALTYPE wt0 = wt[0];
00663         for (int k = 0; k < kPatternCount; k++) {
00664             integrationTmp[v    ] = rootPartials[v    ] * wt0;
00665             integrationTmp[v + 1] = rootPartials[v + 1] * wt0;
00666             integrationTmp[v + 2] = rootPartials[v + 2] * wt0;
00667             integrationTmp[v + 3] = rootPartials[v + 3] * wt0;
00668             v += 4;
00669         }
00670         for (int l = 1; l < kCategoryCount; l++) {
00671             u = 0;
00672             const REALTYPE wtl = wt[l];
00673             for (int k = 0; k < kPatternCount; k++) {
00674                 integrationTmp[u    ] += rootPartials[v    ] * wtl;
00675                 integrationTmp[u + 1] += rootPartials[v + 1] * wtl;
00676                 integrationTmp[u + 2] += rootPartials[v + 2] * wtl;
00677                 integrationTmp[u + 3] += rootPartials[v + 3] * wtl;
00678                 
00679                 u += 4;
00680                 v += 4;
00681             }
00682             v += 4 * kExtraPatterns;
00683         }
00684                 
00685         register REALTYPE freq0, freq1, freq2, freq3; // Is it a good idea to specify 'register'?
00686         freq0 = frequencies[0];   
00687         freq1 = frequencies[1];
00688         freq2 = frequencies[2];
00689         freq3 = frequencies[3];
00690         
00691         u = 0;
00692         for (int k = 0; k < kPatternCount; k++) {
00693             REALTYPE sum = 
00694                 freq0 * integrationTmp[u    ] +
00695                 freq1 * integrationTmp[u + 1] +
00696                 freq2 * integrationTmp[u + 2] +
00697                 freq3 * integrationTmp[u + 3];
00698             
00699             u += 4;     
00700             
00701             // TODO: allow only some subsets to have scale indices
00702             if (scaleBufferIndices[0] != BEAGLE_OP_NONE || (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)) {
00703                 int cumulativeScalingFactorIndex;
00704                 if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)
00705                     cumulativeScalingFactorIndex = rootPartialIndex - kTipCount; 
00706                 else
00707                     cumulativeScalingFactorIndex = scaleBufferIndices[subsetIndex];
00708                 
00709                 const REALTYPE* cumulativeScaleFactors = gScaleBuffers[cumulativeScalingFactorIndex];
00710                 
00711                 if (subsetIndex == 0) {
00712                     indexMaxScale[k] = 0;
00713                     maxScaleFactor[k] = cumulativeScaleFactors[k];
00714                     for (int j = 1; j < count; j++) {
00715                         REALTYPE tmpScaleFactor;
00716                         if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)
00717                             tmpScaleFactor = gScaleBuffers[bufferIndices[j] - kTipCount][k]; 
00718                         else
00719                             tmpScaleFactor = gScaleBuffers[scaleBufferIndices[j]][k];
00720                         
00721                         if (tmpScaleFactor > maxScaleFactor[k]) {
00722                             indexMaxScale[k] = j;
00723                             maxScaleFactor[k] = tmpScaleFactor;
00724                         }
00725                     }
00726                 }
00727                 
00728                 if (subsetIndex != indexMaxScale[k])
00729                     sum *= exp((REALTYPE)(cumulativeScaleFactors[k] - maxScaleFactor[k]));
00730             }
00731             
00732             if (subsetIndex == 0) {
00733                 outLogLikelihoodsTmp[k] = sum;
00734             } else if (subsetIndex == count - 1) {
00735                 REALTYPE tmpSum = outLogLikelihoodsTmp[k] + sum;
00736                 
00737                 outLogLikelihoodsTmp[k] = log(tmpSum);
00738             } else {
00739                 outLogLikelihoodsTmp[k] += sum;
00740             }
00741         }
00742     }
00743     
00744     if (scaleBufferIndices[0] != BEAGLE_OP_NONE || (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)) {
00745         for(int i=0; i<kPatternCount; i++)
00746             outLogLikelihoodsTmp[i] += maxScaleFactor[i];
00747     }
00748     
00749     *outSumLogLikelihood = 0.0;
00750     for (int i = 0; i < kPatternCount; i++) {
00751         *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i];
00752     }
00753     
00754     if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0))
00755         returnCode = BEAGLE_ERROR_FLOATING_POINT;
00756 
00757     
00758     return returnCode;
00759 }
00760     
00761 
00762 BEAGLE_CPU_TEMPLATE
00763 const char* BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::getName() {
00764         return getBeagleCPU4StateName<BEAGLE_CPU_FACTORY_GENERIC>();
00765 }
00766 
00768 // BeagleCPUImplFactory public methods
00769 
00770 BEAGLE_CPU_FACTORY_TEMPLATE
00771 BeagleImpl* BeagleCPU4StateImplFactory<BEAGLE_CPU_FACTORY_GENERIC>::createImpl(int tipCount,
00772                                              int partialsBufferCount,
00773                                              int compactBufferCount,
00774                                              int stateCount,
00775                                              int patternCount,
00776                                              int eigenBufferCount,
00777                                              int matrixBufferCount,
00778                                              int categoryCount,
00779                                              int scaleBufferCount,
00780                                              int resourceNumber,
00781                                              long preferenceFlags,
00782                                              long requirementFlags,
00783                                              int* errorCode) {
00784 
00785     if (stateCount != 4) {
00786         return NULL;
00787     }
00788 
00789     BeagleImpl* impl = new BeagleCPU4StateImpl<REALTYPE, T_PAD_DEFAULT, P_PAD_DEFAULT>();
00790 
00791     try {
00792         if (impl->createInstance(tipCount, partialsBufferCount, compactBufferCount, stateCount,
00793                                  patternCount, eigenBufferCount, matrixBufferCount,
00794                                  categoryCount,scaleBufferCount, resourceNumber,
00795                                  preferenceFlags, requirementFlags) == 0)
00796             return impl;
00797     }
00798     catch(...) {
00799         if (DEBUGGING_OUTPUT)
00800             std::cerr << "exception in initialize\n";
00801         delete impl;
00802         throw;
00803     }
00804 
00805     delete impl;
00806 
00807     return NULL;
00808 }
00809 
00810 BEAGLE_CPU_FACTORY_TEMPLATE
00811 const char* BeagleCPU4StateImplFactory<BEAGLE_CPU_FACTORY_GENERIC>::getName() {
00812         return getBeagleCPU4StateName<BEAGLE_CPU_FACTORY_GENERIC>();
00813 }
00814 
00815 BEAGLE_CPU_FACTORY_TEMPLATE
00816 const long BeagleCPU4StateImplFactory<BEAGLE_CPU_FACTORY_GENERIC>::getFlags() {
00817     long flags =  BEAGLE_FLAG_COMPUTATION_SYNCH |
00818                   BEAGLE_FLAG_SCALING_MANUAL | BEAGLE_FLAG_SCALING_ALWAYS | BEAGLE_FLAG_SCALING_AUTO |
00819                   BEAGLE_FLAG_THREADING_NONE |
00820                   BEAGLE_FLAG_PROCESSOR_CPU |
00821                   BEAGLE_FLAG_VECTOR_NONE |
00822                   BEAGLE_FLAG_SCALERS_LOG | BEAGLE_FLAG_SCALERS_RAW |
00823                   BEAGLE_FLAG_EIGEN_COMPLEX | BEAGLE_FLAG_EIGEN_REAL |
00824                   BEAGLE_FLAG_INVEVEC_STANDARD | BEAGLE_FLAG_INVEVEC_TRANSPOSED;
00825     
00826     if (DOUBLE_PRECISION)
00827         flags |= BEAGLE_FLAG_PRECISION_DOUBLE;
00828     else
00829         flags |= BEAGLE_FLAG_PRECISION_SINGLE;
00830     return flags;
00831 }
00832 
00833 }       // namespace cpu
00834 }       // namespace beagle
00835 
00836 #endif // BEAGLE_CPU_4STATE_IMPL_HPP
00837