HMSBEAGLE  1.0.0
libhmsbeagle/CPU/BeagleCPUImpl.hpp
00001 /*
00002  *  BeagleCPUImpl.cpp
00003  *  BEAGLE
00004  *
00005  * Copyright 2009 Phylogenetic Likelihood Working Group
00006  *
00007  * This file is part of BEAGLE.
00008  *
00009  * BEAGLE is free software: you can redistribute it and/or modify
00010  * it under the terms of the GNU Lesser General Public License as
00011  * published by the Free Software Foundation, either version 3 of
00012  * the License, or (at your option) any later version.
00013  *
00014  * BEAGLE is distributed in the hope that it will be useful,
00015  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00016  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017  * GNU Lesser General Public License for more details.
00018  *
00019  * You should have received a copy of the GNU Lesser General Public
00020  * License along with BEAGLE.  If not, see
00021  * <http://www.gnu.org/licenses/>.
00022  *
00023  * @author Andrew Rambaut
00024  * @author Marc Suchard
00025  * @author Daniel Ayres
00026  * @author Mark Holder
00027  */
00028 
00030 //      so that we can flag them. This would this would be helpful for
00031 //      implementing:
00032 //          1. an error-checking version that double-checks (to the extent
00033 //              possible) that the client is using the API correctly.  This would
00034 //              ideally be a  conditional compilation variant (so that we do
00035 //              not normally incur runtime penalties, but can enable it to help
00036 //              find bugs).
00037 //          2. a multithreading impl that checks dependencies before queuing
00038 //              partials.
00039 
00041 //      would be trivial for this impl, and would be easier for clients that want
00042 //      to cache partial like calculations for a indeterminate number of trees.
00044 //  void waitForPartials(int* instance;
00045 //                  int instanceCount;
00046 //                  int* parentPartialIndex;
00047 //                  int partialCount;
00048 //                  );
00049 //  method that blocks until the partials are valid would be important for
00050 //  clients (such as GARLI) that deal with big trees by overwriting some temporaries.
00052 //  but MTH did want to record it for posterity). We could add following
00053 //  calls:
00055 // BeagleReturnCodes swapEigens(int instance, int *firstInd, int *secondInd, int count);
00056 // BeagleReturnCodes swapTransitionMatrices(int instance, int *firstInd, int *secondInd, int count);
00057 // BeagleReturnCodes swapPartials(int instance, int *firstInd, int *secondInd, int count);
00059 //  They would be optional for the client but could improve efficiency if:
00060 //      1. The impl is load balancing, AND
00061 //      2. The client code, uses the calls to synchronize the indexing of temporaries
00062 //          between instances such that you can pass an instanceIndices list with
00063 //          multiple entries to updatePartials.
00064 //  These seem too nitty gritty and low-level, but they also make it easy to
00065 //      write a wrapper geared toward MCMC (during a move, cache the old data
00066 //      in an unused array, after a rejection swap back to the cached copy)
00067 
00068 #ifndef BEAGLE_CPU_IMPL_GENERAL_HPP
00069 #define BEAGLE_CPU_IMPL_GENERAL_HPP
00070 
00071 #ifdef HAVE_CONFIG_H
00072 #include "libhmsbeagle/config.h"
00073 #endif
00074 
00075 #include <cstdio>
00076 #include <cstdlib>
00077 #include <iostream>
00078 #include <cstring>
00079 #include <cmath>
00080 #include <cassert>
00081 #include <vector>
00082 #include <cfloat>
00083 
00084 #include "libhmsbeagle/beagle.h"
00085 #include "libhmsbeagle/CPU/Precision.h"
00086 #include "libhmsbeagle/CPU/BeagleCPUImpl.h"
00087 #include "libhmsbeagle/CPU/EigenDecompositionCube.h"
00088 #include "libhmsbeagle/CPU/EigenDecompositionSquare.h"
00089 
00090 namespace beagle {
00091 namespace cpu {
00092 
00093 //#if defined (BEAGLE_IMPL_DEBUGGING_OUTPUT) && BEAGLE_IMPL_DEBUGGING_OUTPUT
00094 //const bool DEBUGGING_OUTPUT = true;
00095 //#else
00096 //const bool DEBUGGING_OUTPUT = false;
00097 //#endif
00098 
00099 BEAGLE_CPU_FACTORY_TEMPLATE
00100 inline const char* getBeagleCPUName(){ return "CPU-Unknown"; };
00101 
00102 template<>
00103 inline const char* getBeagleCPUName<double>(){ return "CPU-Double"; };
00104 
00105 template<>
00106 inline const char* getBeagleCPUName<float>(){ return "CPU-Single"; };
00107 
00108 BEAGLE_CPU_FACTORY_TEMPLATE
00109 inline const long getBeagleCPUFlags(){ return BEAGLE_FLAG_COMPUTATION_SYNCH; };
00110 
00111 template<>
00112 inline const long getBeagleCPUFlags<double>(){ return BEAGLE_FLAG_COMPUTATION_SYNCH |
00113                                                       BEAGLE_FLAG_THREADING_NONE |
00114                                                       BEAGLE_FLAG_PROCESSOR_CPU |
00115                                                       BEAGLE_FLAG_PRECISION_DOUBLE |
00116                                                       BEAGLE_FLAG_VECTOR_NONE; };
00117 
00118 template<>
00119 inline const long getBeagleCPUFlags<float>(){ return BEAGLE_FLAG_COMPUTATION_SYNCH |
00120                                                      BEAGLE_FLAG_THREADING_NONE |
00121                                                      BEAGLE_FLAG_PROCESSOR_CPU |
00122                                                      BEAGLE_FLAG_PRECISION_SINGLE |
00123                                                      BEAGLE_FLAG_VECTOR_NONE; };
00124 
00125 
00126 
00127 BEAGLE_CPU_TEMPLATE
00128 BeagleCPUImpl<BEAGLE_CPU_GENERIC>::~BeagleCPUImpl() {
00129     // free all that stuff...
00130     // If you delete partials, make sure not to delete the last element
00131     // which is TEMP_SCRATCH_PARTIAL twice.
00132 
00133     for(unsigned int i=0; i<kEigenDecompCount; i++) {
00134             if (gCategoryWeights[i] != NULL)
00135                     free(gCategoryWeights[i]);
00136         if (gStateFrequencies[i] != NULL)
00137                     free(gStateFrequencies[i]);
00138         }
00139 
00140         for(unsigned int i=0; i<kMatrixCount; i++) {
00141             if (gTransitionMatrices[i] != NULL)
00142                     free(gTransitionMatrices[i]);
00143         }
00144     free(gTransitionMatrices);
00145 
00146         for(unsigned int i=0; i<kBufferCount; i++) {
00147             if (gPartials[i] != NULL)
00148                     free(gPartials[i]);
00149             if (gTipStates[i] != NULL)
00150                     free(gTipStates[i]);
00151         }
00152     free(gPartials);
00153     free(gTipStates);
00154     
00155     if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
00156         for(unsigned int i=0; i<kScaleBufferCount; i++) {
00157             if (gAutoScaleBuffers[i] != NULL)
00158                 free(gAutoScaleBuffers[i]);
00159         }
00160         if (gAutoScaleBuffers)
00161             free(gAutoScaleBuffers);
00162         free(gActiveScalingFactors);
00163         if (gScaleBuffers[0] != NULL)
00164             free(gScaleBuffers[0]);
00165     } else {
00166         for(unsigned int i=0; i<kScaleBufferCount; i++) {
00167             if (gScaleBuffers[i] != NULL)
00168                 free(gScaleBuffers[i]);
00169         }        
00170     }
00171     
00172     if (gScaleBuffers)
00173         free(gScaleBuffers);
00174 
00175         free(gCategoryRates);
00176     free(gPatternWeights);
00177 
00178         free(integrationTmp);
00179     free(firstDerivTmp);
00180     free(secondDerivTmp);
00181 
00182     free(outLogLikelihoodsTmp);
00183     free(outFirstDerivativesTmp);
00184     free(outSecondDerivativesTmp);
00185 
00186         free(ones);
00187         free(zeros);
00188 
00189         delete gEigenDecomposition;
00190 }
00191 
00192 BEAGLE_CPU_TEMPLATE
00193 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::createInstance(int tipCount,
00194                                   int partialsBufferCount,
00195                                   int compactBufferCount,
00196                                   int stateCount,
00197                                   int patternCount,
00198                                   int eigenDecompositionCount,
00199                                   int matrixCount,
00200                                   int categoryCount,
00201                                   int scaleBufferCount,
00202                                   int resourceNumber,
00203                                   long preferenceFlags,
00204                                   long requirementFlags) {
00205     if (DEBUGGING_OUTPUT)
00206         std::cerr << "in BeagleCPUImpl::initialize\n" ;
00207 
00208     if (DOUBLE_PRECISION) {
00209         realtypeMin = DBL_MIN;
00210         scalingExponentThreshhold = 200;
00211     } else {
00212         realtypeMin = FLT_MIN;
00213         scalingExponentThreshhold = 20;
00214     }
00215 
00216     kBufferCount = partialsBufferCount + compactBufferCount;
00217     kTipCount = tipCount;
00218     assert(kBufferCount > kTipCount);
00219     kStateCount = stateCount;
00220     kPatternCount = patternCount;
00221     
00222     kInternalPartialsBufferCount = kBufferCount - kTipCount;
00223 
00224     kTransPaddedStateCount = kStateCount + T_PAD;    
00225     kPartialsPaddedStateCount = kStateCount + P_PAD;
00226     
00227     // Handle possible padding of pattern sites for vectorization
00228     int modulus = getPaddedPatternsModulus();
00229     kPaddedPatternCount = kPatternCount;
00230     int remainder = kPatternCount % modulus;
00231     if (remainder != 0) {
00232         kPaddedPatternCount += modulus - remainder;
00233     }
00234     kExtraPatterns = kPaddedPatternCount - kPatternCount;
00235 
00236     kMatrixCount = matrixCount;
00237     kEigenDecompCount = eigenDecompositionCount;
00238         kCategoryCount = categoryCount;
00239     kScaleBufferCount = scaleBufferCount;
00240 
00241     kMatrixSize = (T_PAD + kStateCount) * kStateCount;
00242 
00243     int scaleBufferSize = kPaddedPatternCount;
00244     
00245     kFlags = 0;
00246 
00247     if (preferenceFlags & BEAGLE_FLAG_SCALING_AUTO || requirementFlags & BEAGLE_FLAG_SCALING_AUTO) {
00248         kFlags |= BEAGLE_FLAG_SCALING_AUTO;
00249         kFlags |= BEAGLE_FLAG_SCALERS_LOG;
00250         kScaleBufferCount = kInternalPartialsBufferCount;
00251     } else if (preferenceFlags & BEAGLE_FLAG_SCALING_ALWAYS || requirementFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
00252         kFlags |= BEAGLE_FLAG_SCALING_ALWAYS;
00253         kFlags |= BEAGLE_FLAG_SCALERS_LOG;
00254         kScaleBufferCount = kInternalPartialsBufferCount + 1; // +1 for temp buffer used by edgelikelihood
00255     } else if (preferenceFlags & BEAGLE_FLAG_SCALING_DYNAMIC || requirementFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
00256         kFlags |= BEAGLE_FLAG_SCALING_DYNAMIC;
00257         kFlags |= BEAGLE_FLAG_SCALERS_RAW;
00258     } else if (preferenceFlags & BEAGLE_FLAG_SCALERS_LOG || requirementFlags & BEAGLE_FLAG_SCALERS_LOG) {
00259         kFlags |= BEAGLE_FLAG_SCALING_MANUAL;
00260         kFlags |= BEAGLE_FLAG_SCALERS_LOG;
00261     } else {
00262         kFlags |= BEAGLE_FLAG_SCALING_MANUAL;
00263         kFlags |= BEAGLE_FLAG_SCALERS_RAW;
00264     }
00265     
00266     if (requirementFlags & BEAGLE_FLAG_EIGEN_COMPLEX || preferenceFlags & BEAGLE_FLAG_EIGEN_COMPLEX)
00267         kFlags |= BEAGLE_FLAG_EIGEN_COMPLEX;
00268     else
00269         kFlags |= BEAGLE_FLAG_EIGEN_REAL;
00270 
00271     if (requirementFlags & BEAGLE_FLAG_INVEVEC_TRANSPOSED || preferenceFlags & BEAGLE_FLAG_INVEVEC_TRANSPOSED)
00272         kFlags |= BEAGLE_FLAG_INVEVEC_TRANSPOSED;
00273     else
00274         kFlags |= BEAGLE_FLAG_INVEVEC_STANDARD;
00275     
00276     if (kFlags & BEAGLE_FLAG_EIGEN_COMPLEX)
00277         gEigenDecomposition = new EigenDecompositionSquare<BEAGLE_CPU_EIGEN_GENERIC>(kEigenDecompCount,
00278                         kStateCount,kCategoryCount,kFlags);
00279     else
00280         gEigenDecomposition = new EigenDecompositionCube<BEAGLE_CPU_EIGEN_GENERIC>(kEigenDecompCount,
00281                         kStateCount, kCategoryCount,kFlags);
00282 
00283         gCategoryRates = (double*) malloc(sizeof(double) * kCategoryCount);
00284         if (gCategoryRates == NULL)
00285                 throw std::bad_alloc();
00286 
00287         gPatternWeights = (double*) malloc(sizeof(double) * kPatternCount);
00288         if (gPatternWeights == NULL)
00289                 throw std::bad_alloc();
00290 
00291     // TODO: if pattern padding is implemented this will create problems with setTipPartials
00292     kPartialsSize = kPaddedPatternCount * kPartialsPaddedStateCount * kCategoryCount;
00293 
00294     gPartials = (REALTYPE**) malloc(sizeof(REALTYPE*) * kBufferCount);
00295     if (gPartials == NULL)
00296      throw std::bad_alloc();
00297 
00298     gStateFrequencies = (REALTYPE**) calloc(sizeof(REALTYPE*), kEigenDecompCount);
00299     if (gStateFrequencies == NULL)
00300         throw std::bad_alloc();
00301 
00302     gCategoryWeights = (REALTYPE**) calloc(sizeof(REALTYPE*), kEigenDecompCount);
00303     if (gCategoryWeights == NULL)
00304         throw std::bad_alloc();
00305 
00306     // assigning kBufferCount to this array so that we can just check if a tipStateBuffer is
00307     // allocated
00308     gTipStates = (int**) malloc(sizeof(int*) * kBufferCount);
00309     if (gTipStates == NULL)
00310         throw std::bad_alloc();
00311 
00312     for (int i = 0; i < kBufferCount; i++) {
00313         gPartials[i] = NULL;
00314         gTipStates[i] = NULL;
00315     }
00316 
00317     for (int i = kTipCount; i < kBufferCount; i++) {
00318         gPartials[i] = (REALTYPE*) mallocAligned(sizeof(REALTYPE) * kPartialsSize);
00319         if (gPartials[i] == NULL)
00320             throw std::bad_alloc();
00321     }
00322 
00323     gScaleBuffers = NULL;
00324 
00325     gAutoScaleBuffers = NULL;
00326 
00327     if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
00328         gAutoScaleBuffers = (signed short**) malloc(sizeof(signed short*) * kScaleBufferCount);
00329         if (gAutoScaleBuffers == NULL)
00330             throw std::bad_alloc();        
00331         for (int i = 0; i < kScaleBufferCount; i++) {
00332             gAutoScaleBuffers[i] = (signed short*) malloc(sizeof(signed short) * scaleBufferSize);
00333             if (gAutoScaleBuffers[i] == 0L)
00334                 throw std::bad_alloc();
00335         }
00336         gActiveScalingFactors = (int*) malloc(sizeof(int) * kInternalPartialsBufferCount);
00337         gScaleBuffers = (REALTYPE**) malloc(sizeof(REALTYPE*));
00338         gScaleBuffers[0] = (REALTYPE*) malloc(sizeof(REALTYPE) * scaleBufferSize);
00339     } else {
00340         gScaleBuffers = (REALTYPE**) malloc(sizeof(REALTYPE*) * kScaleBufferCount);
00341         if (gScaleBuffers == NULL)
00342             throw std::bad_alloc();
00343         
00344         for (int i = 0; i < kScaleBufferCount; i++) {
00345             gScaleBuffers[i] = (REALTYPE*) malloc(sizeof(REALTYPE) * scaleBufferSize);
00346             
00347             if (gScaleBuffers[i] == 0L)
00348                 throw std::bad_alloc();
00349 
00350             
00351             if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
00352                 for (int j=0; j < scaleBufferSize; j++) {
00353                     gScaleBuffers[i][j] = 1.0;
00354                 }
00355             }
00356         }
00357     }
00358         
00359 
00360     gTransitionMatrices = (REALTYPE**) malloc(sizeof(REALTYPE*) * kMatrixCount);
00361     if (gTransitionMatrices == NULL)
00362         throw std::bad_alloc();
00363     for (int i = 0; i < kMatrixCount; i++) {
00364         gTransitionMatrices[i] = (REALTYPE*) mallocAligned(sizeof(REALTYPE) * kMatrixSize * kCategoryCount);
00365         if (gTransitionMatrices[i] == 0L)
00366             throw std::bad_alloc();
00367     }
00368 
00369     integrationTmp = (REALTYPE*) mallocAligned(sizeof(REALTYPE) * kPatternCount * kStateCount);
00370     firstDerivTmp = (REALTYPE*) malloc(sizeof(REALTYPE) * kPatternCount * kStateCount);
00371     secondDerivTmp = (REALTYPE*) malloc(sizeof(REALTYPE) * kPatternCount * kStateCount);
00372 
00373     outLogLikelihoodsTmp = (REALTYPE*) malloc(sizeof(REALTYPE) * kPatternCount * kStateCount);
00374     outFirstDerivativesTmp = (REALTYPE*) malloc(sizeof(REALTYPE) * kPatternCount * kStateCount);
00375     outSecondDerivativesTmp = (REALTYPE*) malloc(sizeof(REALTYPE) * kPatternCount * kStateCount);
00376 
00377     zeros = (REALTYPE*) malloc(sizeof(REALTYPE) * kPaddedPatternCount);
00378     ones = (REALTYPE*) malloc(sizeof(REALTYPE) * kPaddedPatternCount);
00379     for(int i = 0; i < kPaddedPatternCount; i++) {
00380         zeros[i] = 0.0;
00381         ones[i] = 1.0;
00382     }
00383 
00384     return BEAGLE_SUCCESS;
00385 }
00386 
00387 BEAGLE_CPU_TEMPLATE
00388 const char* BeagleCPUImpl<BEAGLE_CPU_GENERIC>::getName() {
00389         return getBeagleCPUName<BEAGLE_CPU_FACTORY_GENERIC>();
00390 }
00391 
00392 BEAGLE_CPU_TEMPLATE
00393 const long BeagleCPUImpl<BEAGLE_CPU_GENERIC>::getFlags() {
00394         return getBeagleCPUFlags<BEAGLE_CPU_FACTORY_GENERIC>();
00395 }
00396 
00397 BEAGLE_CPU_TEMPLATE
00398 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::getInstanceDetails(BeagleInstanceDetails* returnInfo) {
00399     if (returnInfo != NULL) {
00400         returnInfo->resourceNumber = 0;
00401         returnInfo->flags = getFlags();
00402         returnInfo->flags |= kFlags;
00403 
00404         returnInfo->implName = (char*) getName();
00405     }
00406 
00407     return BEAGLE_SUCCESS;
00408 }
00409 
00410 BEAGLE_CPU_TEMPLATE
00411 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setTipStates(int tipIndex,
00412                                 const int* inStates) {
00413     if (tipIndex < 0 || tipIndex >= kTipCount)
00414         return BEAGLE_ERROR_OUT_OF_RANGE;
00415     gTipStates[tipIndex] = (int*) mallocAligned(sizeof(int) * kPaddedPatternCount);
00416     // TODO: What if this throws a memory full error?
00417         for (int j = 0; j < kPatternCount; j++) {
00418                 gTipStates[tipIndex][j] = (inStates[j] < kStateCount ? inStates[j] : kStateCount);
00419         }
00420         for (int j = kPatternCount; j < kPaddedPatternCount; j++) {
00421                 gTipStates[tipIndex][j] = kStateCount;
00422         }
00423 
00424     return BEAGLE_SUCCESS;
00425 }
00426 
00427 BEAGLE_CPU_TEMPLATE
00428 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setTipPartials(int tipIndex,
00429                                   const double* inPartials) {
00430     if (tipIndex < 0 || tipIndex >= kTipCount)
00431         return BEAGLE_ERROR_OUT_OF_RANGE;
00432     if(gPartials[tipIndex] == NULL) {
00433         gPartials[tipIndex] = (REALTYPE*) mallocAligned(sizeof(REALTYPE) * kPartialsSize);
00434         // TODO: What if this throws a memory full error?
00435         if (gPartials[tipIndex] == 0L)
00436             return BEAGLE_ERROR_OUT_OF_MEMORY;
00437     }
00438 
00439     const double* inPartialsOffset;
00440     REALTYPE* tmpRealPartialsOffset = gPartials[tipIndex];
00441     for (int l = 0; l < kCategoryCount; l++) {
00442         inPartialsOffset = inPartials;
00443         for (int i = 0; i < kPatternCount; i++) {
00444                 beagleMemCpy(tmpRealPartialsOffset, inPartialsOffset, kStateCount);
00445             tmpRealPartialsOffset += kPartialsPaddedStateCount;
00446             inPartialsOffset += kStateCount;
00447         }
00448         // Pad extra buffer with zeros
00449         for(int k = 0; k < kPartialsPaddedStateCount * (kPaddedPatternCount - kPatternCount); k++) {
00450                 *tmpRealPartialsOffset++ = 0;
00451         }
00452     }
00453 
00454     return BEAGLE_SUCCESS;
00455 }
00456 
00457 BEAGLE_CPU_TEMPLATE
00458 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setPartials(int bufferIndex,
00459                                const double* inPartials) {
00460     if (bufferIndex < 0 || bufferIndex >= kBufferCount)
00461         return BEAGLE_ERROR_OUT_OF_RANGE;
00462     if (gPartials[bufferIndex] == NULL) {
00463         gPartials[bufferIndex] = (REALTYPE*) malloc(sizeof(REALTYPE) * kPartialsSize);
00464         if (gPartials[bufferIndex] == 0L)
00465             return BEAGLE_ERROR_OUT_OF_MEMORY;
00466     }
00467     
00468     const double* inPartialsOffset = inPartials;
00469     REALTYPE* tmpRealPartialsOffset = gPartials[bufferIndex];
00470     for (int l = 0; l < kCategoryCount; l++) {
00471         for (int i = 0; i < kPatternCount; i++) {
00472                 beagleMemCpy(tmpRealPartialsOffset, inPartialsOffset, kStateCount);
00473             tmpRealPartialsOffset += kPartialsPaddedStateCount;
00474             inPartialsOffset += kStateCount;
00475         }
00476         // Pad extra buffer with zeros
00477         for(int k = 0; k < kPartialsPaddedStateCount * (kPaddedPatternCount - kPatternCount); k++) {
00478                 *tmpRealPartialsOffset++ = 0;
00479         }
00480     }
00481 
00482     return BEAGLE_SUCCESS;
00483 }
00484 
00485 BEAGLE_CPU_TEMPLATE
00486 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::getPartials(int bufferIndex,
00487                                int cumulativeScaleIndex,
00488                                double* outPartials) {
00489     // TODO: Make this work with partials padding
00490     
00491         // TODO: Test with and without padding
00492     if (bufferIndex < 0 || bufferIndex >= kBufferCount)
00493         return BEAGLE_ERROR_OUT_OF_RANGE;
00494 
00495     if (kPatternCount == kPaddedPatternCount) {
00496         beagleMemCpy(outPartials, gPartials[bufferIndex], kPartialsSize);
00497     } else { // Need to remove padding
00498         double *offsetOutPartials;
00499         REALTYPE* offsetBeaglePartials = gPartials[bufferIndex];
00500         for(int i = 0; i < kCategoryCount; i++) {
00501                 beagleMemCpy(offsetOutPartials,offsetBeaglePartials,
00502                                 kPatternCount * kStateCount);
00503                 offsetOutPartials += kPatternCount * kStateCount;
00504                 offsetBeaglePartials += kPaddedPatternCount * kStateCount;
00505         }
00506     }
00507 
00508     if (cumulativeScaleIndex != BEAGLE_OP_NONE) {
00509         REALTYPE* cumulativeScaleBuffer = gScaleBuffers[cumulativeScaleIndex];
00510         int index = 0;
00511         for(int k=0; k<kPatternCount; k++) {
00512                 REALTYPE scaleFactor = exp(cumulativeScaleBuffer[k]);
00513                 for(int i=0; i<kStateCount; i++) {
00514                         outPartials[index] *= scaleFactor;
00515                         index++;
00516                 }
00517         }
00518         // TODO: Do we assume the cumulativeScaleBuffer is on the log-scale?
00519     }
00520 
00521     return BEAGLE_SUCCESS;
00522 }
00523 
00524 BEAGLE_CPU_TEMPLATE
00525 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setEigenDecomposition(int eigenIndex,
00526                                          const double* inEigenVectors,
00527                                          const double* inInverseEigenVectors,
00528                                          const double* inEigenValues) {
00529 
00530         gEigenDecomposition->setEigenDecomposition(eigenIndex, inEigenVectors, inInverseEigenVectors, inEigenValues);
00531         return BEAGLE_SUCCESS;
00532 }
00533 
00534 BEAGLE_CPU_TEMPLATE
00535 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setCategoryRates(const double* inCategoryRates) {
00536         memcpy(gCategoryRates, inCategoryRates, sizeof(double) * kCategoryCount);
00537     return BEAGLE_SUCCESS;
00538 }
00539 
00540 BEAGLE_CPU_TEMPLATE
00541 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setPatternWeights(const double* inPatternWeights) {
00542     assert(inPatternWeights != 0L);
00543     memcpy(gPatternWeights, inPatternWeights, sizeof(double) * kPatternCount);
00544     return BEAGLE_SUCCESS;
00545 }
00546 
00547 BEAGLE_CPU_TEMPLATE
00548     int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setStateFrequencies(int stateFrequenciesIndex,
00549                                                      const double* inStateFrequencies) {
00550     if (stateFrequenciesIndex < 0 || stateFrequenciesIndex >= kEigenDecompCount)
00551         return BEAGLE_ERROR_OUT_OF_RANGE;
00552     if (gStateFrequencies[stateFrequenciesIndex] == NULL) {
00553         gStateFrequencies[stateFrequenciesIndex] = (REALTYPE*) malloc(sizeof(REALTYPE) * kStateCount);
00554         if (gStateFrequencies[stateFrequenciesIndex] == 0L)
00555             return BEAGLE_ERROR_OUT_OF_MEMORY;
00556     }
00557     beagleMemCpy(gStateFrequencies[stateFrequenciesIndex], inStateFrequencies, kStateCount);
00558 
00559     return BEAGLE_SUCCESS;
00560 }
00561 
00562 BEAGLE_CPU_TEMPLATE
00563 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setCategoryWeights(int categoryWeightsIndex,
00564                                                  const double* inCategoryWeights) {
00565     if (categoryWeightsIndex < 0 || categoryWeightsIndex >= kEigenDecompCount)
00566         return BEAGLE_ERROR_OUT_OF_RANGE;
00567     if (gCategoryWeights[categoryWeightsIndex] == NULL) {
00568         gCategoryWeights[categoryWeightsIndex] = (REALTYPE*) malloc(sizeof(REALTYPE) * kCategoryCount);
00569         if (gCategoryWeights[categoryWeightsIndex] == 0L)
00570             return BEAGLE_ERROR_OUT_OF_MEMORY;
00571     }
00572     beagleMemCpy(gCategoryWeights[categoryWeightsIndex], inCategoryWeights, kCategoryCount);
00573 
00574     return BEAGLE_SUCCESS;
00575 }
00576 
00577 BEAGLE_CPU_TEMPLATE
00578 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::getTransitionMatrix(int matrixIndex,
00579                                                                                                  double* outMatrix) {
00580         // TODO Test with multiple rate categories
00581 if (T_PAD != 0) {
00582         double* offsetOutMatrix = outMatrix;
00583         REALTYPE* offsetBeagleMatrix = gTransitionMatrices[matrixIndex];
00584         for(int i = 0; i < kCategoryCount; i++) {
00585                 for(int j = 0; j < kStateCount; j++) {
00586                         beagleMemCpy(offsetOutMatrix,offsetBeagleMatrix,kStateCount);
00587                         offsetBeagleMatrix += kTransPaddedStateCount; // Skip padding
00588                         offsetOutMatrix += kStateCount;
00589                 }
00590         }
00591 } else {
00592         beagleMemCpy(outMatrix,gTransitionMatrices[matrixIndex],
00593                         kMatrixSize * kCategoryCount);
00594 }
00595         return BEAGLE_SUCCESS;
00596 }
00597 
00598 BEAGLE_CPU_TEMPLATE
00599 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::getSiteLogLikelihoods(double* outLogLikelihoods) {
00600     beagleMemCpy(outLogLikelihoods, outLogLikelihoodsTmp, kPatternCount);
00601 
00602     return BEAGLE_SUCCESS;
00603 }
00604 
00605 BEAGLE_CPU_TEMPLATE
00606 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::getSiteDerivatives(double* outFirstDerivatives,
00607                                                 double* outSecondDerivatives) {
00608     beagleMemCpy(outFirstDerivatives, outFirstDerivativesTmp, kPatternCount);
00609     if (outSecondDerivatives != NULL)
00610         beagleMemCpy(outSecondDerivatives, outSecondDerivativesTmp, kPatternCount);
00611 
00612     return BEAGLE_SUCCESS;
00613 }
00614 
00615 
00616 BEAGLE_CPU_TEMPLATE
00617 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setTransitionMatrix(int matrixIndex,
00618                                        const double* inMatrix,
00619                                        double paddedValue) {
00620 
00621 if (T_PAD != 0) {
00622     const double* offsetInMatrix = inMatrix;
00623     REALTYPE* offsetBeagleMatrix = gTransitionMatrices[matrixIndex];
00624     for(int i = 0; i < kCategoryCount; i++) {
00625         for(int j = 0; j < kStateCount; j++) {
00626             beagleMemCpy(offsetBeagleMatrix, offsetInMatrix, kStateCount);
00627             offsetBeagleMatrix[kStateCount] = paddedValue;
00628             offsetBeagleMatrix += kTransPaddedStateCount; // Skip padding
00629             offsetInMatrix += kStateCount;
00630         }
00631     }
00632 } else {
00633     beagleMemCpy(gTransitionMatrices[matrixIndex], inMatrix,
00634                  kMatrixSize * kCategoryCount);
00635 }
00636     return BEAGLE_SUCCESS;
00637 }
00638     
00639 BEAGLE_CPU_TEMPLATE
00640 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setTransitionMatrices(const int* matrixIndices,
00641                                                              const double* inMatrices,
00642                                                              const double* paddedValues,
00643                                                              int count) {
00644     for (int k = 0; k < count; k++) {
00645         const double* inMatrix = inMatrices + k*kStateCount*kStateCount*kCategoryCount;
00646         int matrixIndex = matrixIndices[k];
00647         
00648 if (T_PAD != 0) {
00649         const double* offsetInMatrix = inMatrix;
00650         REALTYPE* offsetBeagleMatrix = gTransitionMatrices[matrixIndex];
00651         for(int i = 0; i < kCategoryCount; i++) {
00652             for(int j = 0; j < kStateCount; j++) {
00653                 beagleMemCpy(offsetBeagleMatrix, offsetInMatrix, kStateCount);
00654                 offsetBeagleMatrix[kStateCount] = paddedValues[k];
00655                 offsetBeagleMatrix += kTransPaddedStateCount; // Skip padding
00656                 offsetInMatrix += kStateCount;
00657             }
00658         }
00659 } else {
00660         beagleMemCpy(gTransitionMatrices[matrixIndex], inMatrix,
00661                      kMatrixSize * kCategoryCount);
00662 }
00663     }
00664     
00665     return BEAGLE_SUCCESS;
00666 }
00667 
00668 BEAGLE_CPU_TEMPLATE
00669 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::updateTransitionMatrices(int eigenIndex,
00670                                             const int* probabilityIndices,
00671                                             const int* firstDerivativeIndices,
00672                                             const int* secondDerivativeIndices,
00673                                             const double* edgeLengths,
00674                                             int count) {
00675         gEigenDecomposition->updateTransitionMatrices(eigenIndex,probabilityIndices,firstDerivativeIndices,secondDerivativeIndices,
00676                                                                                                   edgeLengths,gCategoryRates,gTransitionMatrices,count);
00677         return BEAGLE_SUCCESS;
00678 }
00679 
00680 BEAGLE_CPU_TEMPLATE
00681 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::updatePartials(const int* operations,
00682                                   int count,
00683                                   int cumulativeScaleIndex) {
00684 
00685     REALTYPE* cumulativeScaleBuffer = NULL;
00686     if (cumulativeScaleIndex != BEAGLE_OP_NONE)
00687         cumulativeScaleBuffer = gScaleBuffers[cumulativeScaleIndex];
00688 
00689     for (int op = 0; op < count; op++) {
00690         if (DEBUGGING_OUTPUT) {
00691             std::cerr << "op[0]= " << operations[0] << "\n";
00692             std::cerr << "op[1]= " << operations[1] << "\n";
00693             std::cerr << "op[2]= " << operations[2] << "\n";
00694             std::cerr << "op[3]= " << operations[3] << "\n";
00695             std::cerr << "op[4]= " << operations[4] << "\n";
00696             std::cerr << "op[5]= " << operations[5] << "\n";
00697             std::cerr << "op[6]= " << operations[6] << "\n";
00698         }
00699 
00700         const int parIndex = operations[op * 7];
00701         const int writeScalingIndex = operations[op * 7 + 1];
00702         const int readScalingIndex = operations[op * 7 + 2];
00703         const int child1Index = operations[op * 7 + 3];
00704         const int child1TransMatIndex = operations[op * 7 + 4];
00705         const int child2Index = operations[op * 7 + 5];
00706         const int child2TransMatIndex = operations[op * 7 + 6];
00707 
00708         const REALTYPE* partials1 = gPartials[child1Index];
00709         const REALTYPE* partials2 = gPartials[child2Index];
00710 
00711         const int* tipStates1 = gTipStates[child1Index];
00712         const int* tipStates2 = gTipStates[child2Index];
00713 
00714         const REALTYPE* matrices1 = gTransitionMatrices[child1TransMatIndex];
00715         const REALTYPE* matrices2 = gTransitionMatrices[child2TransMatIndex];
00716 
00717         REALTYPE* destPartials = gPartials[parIndex];
00718 
00719         int rescale = BEAGLE_OP_NONE;
00720         REALTYPE* scalingFactors = NULL;
00721         
00722         if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
00723             gActiveScalingFactors[parIndex - kTipCount] = 0;
00724             if (tipStates1 == 0 && tipStates2 == 0)
00725                 rescale = 2;
00726         } else if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
00727             rescale = 1;
00728             scalingFactors = gScaleBuffers[parIndex - kTipCount];
00729         } else if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) { // TODO: this is a quick and dirty implementation just so it returns correct results
00730             if (tipStates1 == 0 && tipStates2 == 0) {
00731                 rescale = 1;
00732                 removeScaleFactors(&readScalingIndex, 1, cumulativeScaleIndex);
00733                 scalingFactors = gScaleBuffers[writeScalingIndex];
00734             }
00735         } else if (writeScalingIndex >= 0) {
00736             rescale = 1;
00737             scalingFactors = gScaleBuffers[writeScalingIndex];
00738         } else if (readScalingIndex >= 0) {
00739             rescale = 0;
00740             scalingFactors = gScaleBuffers[readScalingIndex];
00741         }
00742 
00743         if (DEBUGGING_OUTPUT) {
00744             std::cerr << "Rescale= " << rescale << " writeIndex= " << writeScalingIndex
00745                      << " readIndex = " << readScalingIndex << "\n";
00746         }
00747 
00748         if (tipStates1 != NULL) {
00749             if (tipStates2 != NULL ) {
00750                 if (rescale == 0) { // Use fixed scaleFactors
00751                     calcStatesStatesFixedScaling(destPartials, tipStates1, matrices1, tipStates2, matrices2,
00752                                                  scalingFactors);
00753                 } else {
00754                     // First compute without any scaling
00755                     calcStatesStates(destPartials, tipStates1, matrices1, tipStates2, matrices2);
00756                     if (rescale == 1) // Recompute scaleFactors
00757                         rescalePartials(destPartials,scalingFactors,cumulativeScaleBuffer,0);
00758                 }
00759             } else {
00760                 if (rescale == 0) {
00761                     calcStatesPartialsFixedScaling(destPartials, tipStates1, matrices1, partials2, matrices2,
00762                                                    scalingFactors);
00763                 } else {
00764                     calcStatesPartials(destPartials, tipStates1, matrices1, partials2, matrices2);
00765                     if (rescale == 1)
00766                         rescalePartials(destPartials,scalingFactors,cumulativeScaleBuffer,0);
00767                 }
00768             }
00769         } else {
00770             if (tipStates2 != NULL) {
00771                 if (rescale == 0) {
00772                     calcStatesPartialsFixedScaling(destPartials,tipStates2,matrices2,partials1,matrices1,
00773                                                    scalingFactors);
00774                 } else {
00775                     calcStatesPartials(destPartials, tipStates2, matrices2, partials1, matrices1);
00776                     if (rescale == 1)
00777                         rescalePartials(destPartials,scalingFactors,cumulativeScaleBuffer,0);
00778                 }
00779             } else {
00780                 if (rescale == 2) {
00781                     int sIndex = parIndex - kTipCount;
00782                     calcPartialsPartialsAutoScaling(destPartials,partials1,matrices1,partials2,matrices2,
00783                                                      &gActiveScalingFactors[sIndex]);
00784                     if (gActiveScalingFactors[sIndex])
00785                         autoRescalePartials(destPartials, gAutoScaleBuffers[sIndex]);
00786 
00787                 } else if (rescale == 0) {
00788                     calcPartialsPartialsFixedScaling(destPartials,partials1,matrices1,partials2,matrices2,
00789                                                      scalingFactors);
00790                 } else {
00791                     calcPartialsPartials(destPartials, partials1, matrices1, partials2, matrices2);
00792                     if (rescale == 1)
00793                         rescalePartials(destPartials,scalingFactors,cumulativeScaleBuffer,0);
00794                 }
00795             }
00796         }
00797         
00798         if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
00799             int parScalingIndex = parIndex - kTipCount;
00800             int child1ScalingIndex = child1Index - kTipCount;
00801             int child2ScalingIndex = child2Index - kTipCount;
00802             if (child1ScalingIndex >= 0 && child2ScalingIndex >= 0) {
00803                 int scalingIndices[2] = {child1ScalingIndex, child2ScalingIndex};
00804                 accumulateScaleFactors(scalingIndices, 2, parScalingIndex);
00805             } else if (child1ScalingIndex >= 0) {
00806                 int scalingIndices[1] = {child1ScalingIndex};
00807                 accumulateScaleFactors(scalingIndices, 1, parScalingIndex);
00808             } else if (child2ScalingIndex >= 0) {
00809                 int scalingIndices[1] = {child2ScalingIndex};
00810                 accumulateScaleFactors(scalingIndices, 1, parScalingIndex);
00811             }
00812         }
00813         
00814         if (DEBUGGING_OUTPUT) {
00815             if (scalingFactors != NULL && rescale == 0) {
00816                 for(int i=0; i<kPatternCount; i++)
00817                     fprintf(stderr,"old scaleFactor[%d] = %.5f\n",i,scalingFactors[i]);
00818             }
00819             fprintf(stderr,"Result partials:\n");
00820             for(int i = 0; i < kPartialsSize; i++)
00821                 fprintf(stderr,"destP[%d] = %.5f\n",i,destPartials[i]);
00822         }
00823     }
00824 
00825     return BEAGLE_SUCCESS;
00826 }
00827 
00828 
00829 BEAGLE_CPU_TEMPLATE
00830 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::waitForPartials(const int* destinationPartials,
00831                                    int destinationPartialsCount) {
00832     return BEAGLE_SUCCESS;
00833 }
00834 
00835 
00836 BEAGLE_CPU_TEMPLATE
00837     int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calculateRootLogLikelihoods(const int* bufferIndices,
00838                                                              const int* categoryWeightsIndices,
00839                                                              const int* stateFrequenciesIndices,
00840                                                              const int* cumulativeScaleIndices,
00841                                                              int count,
00842                                                              double* outSumLogLikelihood) {
00843 
00844     if (count == 1) {
00845         // We treat this as a special case so that we don't have convoluted logic
00846         //      at the end of the loop over patterns
00847         int cumulativeScalingFactorIndex;
00848         if (kFlags & BEAGLE_FLAG_SCALING_AUTO)
00849             cumulativeScalingFactorIndex = 0;
00850         else if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)
00851             cumulativeScalingFactorIndex = bufferIndices[0] - kTipCount; 
00852         else
00853             cumulativeScalingFactorIndex = cumulativeScaleIndices[0];
00854         return calcRootLogLikelihoods(bufferIndices[0], categoryWeightsIndices[0], stateFrequenciesIndices[0],
00855                                cumulativeScalingFactorIndex, outSumLogLikelihood);
00856     }
00857     else
00858     {
00859         return calcRootLogLikelihoodsMulti(bufferIndices, categoryWeightsIndices, stateFrequenciesIndices,
00860                                     cumulativeScaleIndices, count, outSumLogLikelihood);
00861     }
00862 }
00863 
00864 BEAGLE_CPU_TEMPLATE
00865 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcRootLogLikelihoodsMulti(const int* bufferIndices,
00866                                                          const int* categoryWeightsIndices,
00867                                                          const int* stateFrequenciesIndices,
00868                                                          const int* scaleBufferIndices,
00869                                                          int count,
00870                                                          double* outSumLogLikelihood) {
00871     // Here we do the 3 similar operations:
00872     //              1. to set the lnL to the contribution of the first subset,
00873     //              2. to add the lnL for other subsets up to the penultimate
00874     //              3. to add the final subset and take the lnL
00875     //      This form of the calc would not work when count == 1 because
00876     //              we need operation 1 and 3 in the preceding list.  This is not
00877     //              a problem, though as we deal with count == 1 in the previous
00878     //              branch.
00879 
00880     std::vector<int> indexMaxScale(kPatternCount);
00881     std::vector<REALTYPE> maxScaleFactor(kPatternCount);
00882 
00883     int returnCode = BEAGLE_SUCCESS;
00884 
00885     for (int subsetIndex = 0 ; subsetIndex < count; ++subsetIndex ) {
00886         const int rootPartialIndex = bufferIndices[subsetIndex];
00887         const REALTYPE* rootPartials = gPartials[rootPartialIndex];
00888         const REALTYPE* frequencies = gStateFrequencies[stateFrequenciesIndices[subsetIndex]];
00889         const REALTYPE* wt = gCategoryWeights[categoryWeightsIndices[subsetIndex]];
00890         int u = 0;
00891         int v = 0;
00892         for (int k = 0; k < kPatternCount; k++) {
00893             for (int i = 0; i < kStateCount; i++) {
00894                 integrationTmp[u] = rootPartials[v] * (REALTYPE) wt[0];
00895                 u++;
00896                 v++;
00897             }
00898             v += P_PAD;
00899         }
00900         for (int l = 1; l < kCategoryCount; l++) {
00901             u = 0;
00902             for (int k = 0; k < kPatternCount; k++) {
00903                 for (int i = 0; i < kStateCount; i++) {
00904                     integrationTmp[u] += rootPartials[v] * (REALTYPE) wt[l];
00905                     u++;
00906                     v++;
00907                 }
00908                 v += P_PAD;
00909             }
00910         }
00911         u = 0;
00912         for (int k = 0; k < kPatternCount; k++) {
00913             REALTYPE sum = 0.0;
00914             for (int i = 0; i < kStateCount; i++) {
00915                 sum += ((REALTYPE)frequencies[i]) * integrationTmp[u];
00916                 u++;
00917             }
00918 
00919             // TODO: allow only some subsets to have scale indices
00920             if (scaleBufferIndices[0] != BEAGLE_OP_NONE || (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)) {
00921                 int cumulativeScalingFactorIndex;
00922                 if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)
00923                     cumulativeScalingFactorIndex = rootPartialIndex - kTipCount; 
00924                 else
00925                     cumulativeScalingFactorIndex = scaleBufferIndices[subsetIndex];
00926                 
00927                 const REALTYPE* cumulativeScaleFactors = gScaleBuffers[cumulativeScalingFactorIndex];
00928 
00929                 if (subsetIndex == 0) {
00930                     indexMaxScale[k] = 0;
00931                     maxScaleFactor[k] = cumulativeScaleFactors[k];
00932                     for (int j = 1; j < count; j++) {
00933                         REALTYPE tmpScaleFactor;
00934                         if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)
00935                             tmpScaleFactor = gScaleBuffers[bufferIndices[j] - kTipCount][k]; 
00936                         else
00937                             tmpScaleFactor = gScaleBuffers[scaleBufferIndices[j]][k];
00938 
00939                         if (tmpScaleFactor > maxScaleFactor[k]) {
00940                             indexMaxScale[k] = j;
00941                             maxScaleFactor[k] = tmpScaleFactor;
00942                         }
00943                     }
00944                 }
00945 
00946                 if (subsetIndex != indexMaxScale[k])
00947                     sum *= exp((REALTYPE)(cumulativeScaleFactors[k] - maxScaleFactor[k]));
00948             }
00949 
00950             if (subsetIndex == 0) {
00951                 outLogLikelihoodsTmp[k] = sum;
00952             } else if (subsetIndex == count - 1) {
00953                 REALTYPE tmpSum = outLogLikelihoodsTmp[k] + sum;
00954 
00955                 outLogLikelihoodsTmp[k] = log(tmpSum);
00956             } else {
00957                 outLogLikelihoodsTmp[k] += sum;
00958             }
00959         }
00960     }
00961 
00962     if (scaleBufferIndices[0] != BEAGLE_OP_NONE || (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)) {
00963         for(int i=0; i<kPatternCount; i++)
00964             outLogLikelihoodsTmp[i] += maxScaleFactor[i];
00965     }
00966 
00967     *outSumLogLikelihood = 0.0;
00968     for (int i = 0; i < kPatternCount; i++) {
00969         *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i];
00970     }
00971 
00972     if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0))
00973         returnCode = BEAGLE_ERROR_FLOATING_POINT;
00974     
00975     return returnCode;
00976 
00977 }
00978 
00979 BEAGLE_CPU_TEMPLATE
00980 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcRootLogLikelihoods(const int bufferIndex,
00981                             const int categoryWeightsIndex,
00982                             const int stateFrequenciesIndex,
00983                             const int scalingFactorsIndex,
00984                             double* outSumLogLikelihood) {
00985 
00986     int returnCode = BEAGLE_SUCCESS;
00987 
00988     const REALTYPE* rootPartials = gPartials[bufferIndex];
00989     const REALTYPE* wt = gCategoryWeights[categoryWeightsIndex];
00990     const REALTYPE* freqs = gStateFrequencies[stateFrequenciesIndex];
00991     int u = 0;
00992     int v = 0;
00993     for (int k = 0; k < kPatternCount; k++) {
00994         for (int i = 0; i < kStateCount; i++) {
00995             integrationTmp[u] = rootPartials[v] * (REALTYPE) wt[0];
00996             u++;
00997             v++;
00998         }
00999         v += P_PAD;
01000     }
01001     for (int l = 1; l < kCategoryCount; l++) {
01002         u = 0;
01003         for (int k = 0; k < kPatternCount; k++) {
01004             for (int i = 0; i < kStateCount; i++) {
01005                 integrationTmp[u] += rootPartials[v] * (REALTYPE) wt[l];
01006                 u++;
01007                 v++;
01008             }
01009             v += P_PAD;
01010         }
01011     }
01012     u = 0;
01013     for (int k = 0; k < kPatternCount; k++) {
01014         REALTYPE sum = 0.0;
01015         for (int i = 0; i < kStateCount; i++) {
01016             sum += freqs[i] * integrationTmp[u];
01017             u++;
01018         }
01019 
01020         outLogLikelihoodsTmp[k] = log(sum);
01021     }
01022 
01023     if (scalingFactorsIndex >= 0) {
01024         const REALTYPE* cumulativeScaleFactors = gScaleBuffers[scalingFactorsIndex];
01025         for(int i=0; i<kPatternCount; i++) {
01026                 outLogLikelihoodsTmp[i] += cumulativeScaleFactors[i];
01027         }
01028     }
01029 
01030     *outSumLogLikelihood = 0.0;
01031     for (int i = 0; i < kPatternCount; i++) {
01032         *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i];
01033     }
01034 
01035     if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0))
01036         returnCode = BEAGLE_ERROR_FLOATING_POINT;
01037 
01038     
01039     // TODO: merge the three kPatternCount loops above into one
01040 
01041     return returnCode;
01042 }
01043 
01044 BEAGLE_CPU_TEMPLATE
01045 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::accumulateScaleFactors(const int* scalingIndices,
01046                                                 int  count,
01047                                                 int  cumulativeScalingIndex) {
01048     if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
01049         REALTYPE* cumulativeScaleBuffer = gScaleBuffers[0];
01050         for(int j=0; j<kPatternCount; j++)
01051             cumulativeScaleBuffer[j] =  0;
01052         for(int i=0; i<count; i++) {
01053             int sIndex = scalingIndices[i] - kTipCount;
01054             if (gActiveScalingFactors[sIndex]) {
01055                 const signed short* scaleBuffer = gAutoScaleBuffers[sIndex];
01056                 for(int j=0; j<kPatternCount; j++) {
01057                     cumulativeScaleBuffer[j] += M_LN2 * scaleBuffer[j];
01058                 }
01059             }
01060         }
01061                 
01062     } else {
01063         REALTYPE* cumulativeScaleBuffer = gScaleBuffers[cumulativeScalingIndex];
01064         for(int i=0; i<count; i++) {
01065             const REALTYPE* scaleBuffer = gScaleBuffers[scalingIndices[i]];
01066             for(int j=0; j<kPatternCount; j++) {
01067                 if (kFlags & BEAGLE_FLAG_SCALERS_LOG)
01068                     cumulativeScaleBuffer[j] += scaleBuffer[j];
01069                 else
01070                     cumulativeScaleBuffer[j] += log(scaleBuffer[j]);
01071             }
01072         }
01073 
01074         if (DEBUGGING_OUTPUT) {
01075             fprintf(stderr,"Accumulating %d scale buffers into #%d\n",count,cumulativeScalingIndex);
01076             for(int j=0; j<kPatternCount; j++) {
01077                 fprintf(stderr,"cumulativeScaleBuffer[%d] = %2.5e\n",j,cumulativeScaleBuffer[j]);
01078             }
01079         }
01080     }
01081     
01082     return BEAGLE_SUCCESS;
01083 }
01084 
01085 BEAGLE_CPU_TEMPLATE
01086 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::removeScaleFactors(const int* scalingIndices,
01087                                             int  count,
01088                                             int  cumulativeScalingIndex) {
01089         REALTYPE* cumulativeScaleBuffer = gScaleBuffers[cumulativeScalingIndex];
01090     for(int i=0; i<count; i++) {
01091         const REALTYPE* scaleBuffer = gScaleBuffers[scalingIndices[i]];
01092         for(int j=0; j<kPatternCount; j++) {
01093             if (kFlags & BEAGLE_FLAG_SCALERS_LOG)
01094                 cumulativeScaleBuffer[j] -= scaleBuffer[j];
01095             else
01096                 cumulativeScaleBuffer[j] -= log(scaleBuffer[j]);
01097         }
01098     }
01099 
01100     return BEAGLE_SUCCESS;
01101 }
01102 
01103 BEAGLE_CPU_TEMPLATE
01104 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::resetScaleFactors(int cumulativeScalingIndex) {
01105     //memcpy(gScaleBuffers[cumulativeScalingIndex],zeros,sizeof(double) * kPatternCount);
01106         
01107          if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
01108                  memset(gScaleBuffers[cumulativeScalingIndex], 0, sizeof(signed short) * kPaddedPatternCount);
01109          } else {               
01110                  memset(gScaleBuffers[cumulativeScalingIndex], 0, sizeof(REALTYPE) * kPaddedPatternCount);
01111          }
01112     return BEAGLE_SUCCESS;
01113 }
01114     
01115 BEAGLE_CPU_TEMPLATE
01116 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::copyScaleFactors(int destScalingIndex,
01117                                                         int srcScalingIndex) {
01118     memcpy(gScaleBuffers[destScalingIndex],gScaleBuffers[srcScalingIndex],sizeof(REALTYPE) * kPatternCount);
01119 
01120     return BEAGLE_SUCCESS;
01121 }
01122 
01123 BEAGLE_CPU_TEMPLATE
01124     int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calculateEdgeLogLikelihoods(const int* parentBufferIndices,
01125                                                              const int* childBufferIndices,
01126                                                              const int* probabilityIndices,
01127                                                              const int* firstDerivativeIndices,
01128                                                              const int* secondDerivativeIndices,
01129                                                              const int* categoryWeightsIndices,
01130                                                              const int* stateFrequenciesIndices,
01131                                                              const int* cumulativeScaleIndices,
01132                                                              int count,
01133                                                              double* outSumLogLikelihood,
01134                                                              double* outSumFirstDerivative,
01135                                                              double* outSumSecondDerivative) {
01136     // TODO: implement for count > 1
01137 
01138     if (count == 1) {
01139         int cumulativeScalingFactorIndex;
01140         if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
01141             cumulativeScalingFactorIndex = 0;
01142         } else if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
01143             cumulativeScalingFactorIndex = kInternalPartialsBufferCount;
01144             int child1ScalingIndex = parentBufferIndices[0] - kTipCount;
01145             int child2ScalingIndex = childBufferIndices[0] - kTipCount;
01146             resetScaleFactors(cumulativeScalingFactorIndex);
01147             if (child1ScalingIndex >= 0 && child2ScalingIndex >= 0) {
01148                 int scalingIndices[2] = {child1ScalingIndex, child2ScalingIndex};
01149                 accumulateScaleFactors(scalingIndices, 2, cumulativeScalingFactorIndex);
01150             } else if (child1ScalingIndex >= 0) {
01151                 int scalingIndices[1] = {child1ScalingIndex};
01152                 accumulateScaleFactors(scalingIndices, 1, cumulativeScalingFactorIndex);
01153             } else if (child2ScalingIndex >= 0) {
01154                 int scalingIndices[1] = {child2ScalingIndex};
01155                 accumulateScaleFactors(scalingIndices, 1, cumulativeScalingFactorIndex);
01156             }
01157         } else {
01158             cumulativeScalingFactorIndex = cumulativeScaleIndices[0];
01159         }
01160                 if (firstDerivativeIndices == NULL && secondDerivativeIndices == NULL)
01161                         return calcEdgeLogLikelihoods(parentBufferIndices[0], childBufferIndices[0], probabilityIndices[0],
01162                                    categoryWeightsIndices[0], stateFrequenciesIndices[0], cumulativeScalingFactorIndex,
01163                                    outSumLogLikelihood);
01164                 else if (secondDerivativeIndices == NULL)
01165                         return calcEdgeLogLikelihoodsFirstDeriv(parentBufferIndices[0], childBufferIndices[0], probabilityIndices[0],
01166                                              firstDerivativeIndices[0], categoryWeightsIndices[0], stateFrequenciesIndices[0],
01167                                              cumulativeScalingFactorIndex, outSumLogLikelihood, outSumFirstDerivative);
01168                 else
01169                         return calcEdgeLogLikelihoodsSecondDeriv(parentBufferIndices[0], childBufferIndices[0], probabilityIndices[0],
01170                                               firstDerivativeIndices[0], secondDerivativeIndices[0], categoryWeightsIndices[0],
01171                                               stateFrequenciesIndices[0], cumulativeScalingFactorIndex, outSumLogLikelihood,
01172                                               outSumFirstDerivative, outSumSecondDerivative);
01173     } else {
01174         if ((kFlags & BEAGLE_FLAG_SCALING_AUTO) || (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)) {
01175             fprintf(stderr,"BeagleCPUImpl::calculateEdgeLogLikelihoods not yet implemented for count > 1 and auto/always scaling\n");
01176         }
01177         
01178                 if (firstDerivativeIndices == NULL && secondDerivativeIndices == NULL) {
01179                         return calcEdgeLogLikelihoodsMulti(parentBufferIndices, childBufferIndices, probabilityIndices,
01180                                           categoryWeightsIndices, stateFrequenciesIndices, cumulativeScaleIndices, count,
01181                                           outSumLogLikelihood);
01182                 } else {
01183             fprintf(stderr,"BeagleCPUImpl::calculateEdgeLogLikelihoods not yet implemented for count > 1 and derivatives\n");
01184         }
01185             
01186 
01187     }
01188 
01189 }
01190 
01191 BEAGLE_CPU_TEMPLATE
01192 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcEdgeLogLikelihoods(const int parIndex,
01193                                                                                                          const int childIndex,
01194                                                                                                          const int probIndex,
01195                                                      const int categoryWeightsIndex,
01196                                                      const int stateFrequenciesIndex,
01197                                                                                                          const int scalingFactorsIndex,
01198                                                      double* outSumLogLikelihood) {
01199 
01200         assert(parIndex >= kTipCount);
01201 
01202     int returnCode = BEAGLE_SUCCESS;
01203 
01204         const REALTYPE* partialsParent = gPartials[parIndex];
01205         const REALTYPE* transMatrix = gTransitionMatrices[probIndex];
01206     const REALTYPE* wt = gCategoryWeights[categoryWeightsIndex];
01207     const REALTYPE* freqs = gStateFrequencies[stateFrequenciesIndex];
01208 
01209         memset(integrationTmp, 0, (kPatternCount * kStateCount)*sizeof(REALTYPE));
01210 
01211     
01212         if (childIndex < kTipCount && gTipStates[childIndex]) { // Integrate against a state at the child
01213 
01214                 const int* statesChild = gTipStates[childIndex];
01215                 int v = 0; // Index for parent partials
01216 
01217                 for(int l = 0; l < kCategoryCount; l++) {
01218                         int u = 0; // Index in resulting product-partials (summed over categories)
01219                         const REALTYPE weight = wt[l];
01220                         for(int k = 0; k < kPatternCount; k++) {
01221 
01222                                 const int stateChild = statesChild[k];  // DISCUSSION PT: Does it make sense to change the order of the partials,
01223                                 // so we can interchange the patterCount and categoryCount loop order?
01224                                 int w =  l * kMatrixSize;
01225                                 for(int i = 0; i < kStateCount; i++) {
01226                                         integrationTmp[u] += transMatrix[w + stateChild] * partialsParent[v + i] * weight;
01227                                         u++;
01228 
01229                                         w += kTransPaddedStateCount;
01230                                 }
01231                                 v += kPartialsPaddedStateCount;
01232                         }
01233                 }
01234 
01235         } else { // Integrate against a partial at the child
01236 
01237         const REALTYPE* partialsChild = gPartials[childIndex];
01238         int v = 0;
01239         int stateCountModFour = (kStateCount / 4) * 4;
01240         
01241         for(int l = 0; l < kCategoryCount; l++) {
01242             int u = 0;
01243             const REALTYPE weight = wt[l];
01244             for(int k = 0; k < kPatternCount; k++) {
01245                 int w = l * kMatrixSize;
01246                 const REALTYPE* partialsChildPtr = &partialsChild[v];
01247                 for(int i = 0; i < kStateCount; i++) {
01248                     double sumOverJA = 0.0, sumOverJB = 0.0;
01249                     int j = 0;
01250                     const REALTYPE* transMatrixPtr = &transMatrix[w];
01251                     for (; j < stateCountModFour; j += 4) {
01252                         sumOverJA += transMatrixPtr[j + 0] * partialsChildPtr[j + 0];
01253                         sumOverJB += transMatrixPtr[j + 1] * partialsChildPtr[j + 1];
01254                         sumOverJA += transMatrixPtr[j + 2] * partialsChildPtr[j + 2];
01255                         sumOverJB += transMatrixPtr[j + 3] * partialsChildPtr[j + 3];
01256                         
01257                     }
01258                     for (; j < kStateCount; j++) {
01259                         sumOverJA += transMatrixPtr[j] * partialsChildPtr[j];
01260                     }
01261                     //                        for(int j = 0; j < kStateCount; j++) {
01262                     //                            sumOverJ += transMatrix[w] * partialsChild[v + j];
01263                     //                            w++;
01264                     //                        }
01265                     integrationTmp[u] += (sumOverJA + sumOverJB) * partialsParent[v + i] * weight;
01266                     u++;
01267                     
01268                     w += kStateCount;
01269 
01270                     // increment for the extra column at the end
01271                     w += T_PAD;
01272                 }
01273                 v += kPartialsPaddedStateCount;
01274             }
01275         }
01276     }
01277     
01278         int u = 0;
01279         for(int k = 0; k < kPatternCount; k++) {
01280                 REALTYPE sumOverI = 0.0;
01281                 for(int i = 0; i < kStateCount; i++) {
01282                         sumOverI += freqs[i] * integrationTmp[u];
01283                         u++;
01284                 }
01285 
01286         outLogLikelihoodsTmp[k] = log(sumOverI);
01287         }
01288 
01289 
01290         if (scalingFactorsIndex != BEAGLE_OP_NONE) {
01291                 const REALTYPE* scalingFactors = gScaleBuffers[scalingFactorsIndex];
01292                 for(int k=0; k < kPatternCount; k++)
01293                         outLogLikelihoodsTmp[k] += scalingFactors[k];
01294         }
01295 
01296     *outSumLogLikelihood = 0.0;
01297     for (int i = 0; i < kPatternCount; i++) {
01298         *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i];
01299     }
01300 
01301     if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0))
01302         returnCode = BEAGLE_ERROR_FLOATING_POINT;
01303     
01304     return returnCode;
01305 }
01306 
01307 BEAGLE_CPU_TEMPLATE
01308 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcEdgeLogLikelihoodsMulti(const int* parentBufferIndices,
01309                                                                    const int* childBufferIndices,
01310                                                                    const int* probabilityIndices,
01311                                                                    const int* categoryWeightsIndices,
01312                                                                    const int* stateFrequenciesIndices,
01313                                                                    const int* scalingFactorsIndices,
01314                                                                    int count,
01315                                                                    double* outSumLogLikelihood) {
01316 
01317     std::vector<int> indexMaxScale(kPatternCount);
01318     std::vector<REALTYPE> maxScaleFactor(kPatternCount);
01319     
01320     int returnCode = BEAGLE_SUCCESS;
01321     
01322     for (int subsetIndex = 0 ; subsetIndex < count; ++subsetIndex ) {
01323         const REALTYPE* partialsParent = gPartials[parentBufferIndices[subsetIndex]];
01324         const REALTYPE* transMatrix = gTransitionMatrices[probabilityIndices[subsetIndex]];
01325         const REALTYPE* wt = gCategoryWeights[categoryWeightsIndices[subsetIndex]];
01326         const REALTYPE* freqs = gStateFrequencies[stateFrequenciesIndices[subsetIndex]];
01327         int childIndex = childBufferIndices[subsetIndex];
01328 
01329         memset(integrationTmp, 0, (kPatternCount * kStateCount)*sizeof(REALTYPE));
01330         
01331         if (childIndex < kTipCount && gTipStates[childIndex]) { // Integrate against a state at the child
01332             
01333             const int* statesChild = gTipStates[childIndex];
01334             int v = 0; // Index for parent partials
01335             
01336             for(int l = 0; l < kCategoryCount; l++) {
01337                 int u = 0; // Index in resulting product-partials (summed over categories)
01338                 const REALTYPE weight = wt[l];
01339                 for(int k = 0; k < kPatternCount; k++) {
01340                     
01341                     const int stateChild = statesChild[k];  // DISCUSSION PT: Does it make sense to change the order of the partials,
01342                     // so we can interchange the patterCount and categoryCount loop order?
01343                     int w =  l * kMatrixSize;
01344                     for(int i = 0; i < kStateCount; i++) {
01345                         integrationTmp[u] += transMatrix[w + stateChild] * partialsParent[v + i] * weight;
01346                         u++;
01347                         
01348                         w += kTransPaddedStateCount;
01349                     }
01350                     v += kPartialsPaddedStateCount;
01351                 }
01352             }                
01353         } else {
01354             const REALTYPE* partialsChild = gPartials[childIndex];
01355             int v = 0;
01356             int stateCountModFour = (kStateCount / 4) * 4;
01357             
01358             for(int l = 0; l < kCategoryCount; l++) {
01359                 int u = 0;
01360                 const REALTYPE weight = wt[l];
01361                 for(int k = 0; k < kPatternCount; k++) {
01362                     int w = l * kMatrixSize;
01363                     const REALTYPE* partialsChildPtr = &partialsChild[v];
01364                     for(int i = 0; i < kStateCount; i++) {
01365                         double sumOverJA = 0.0, sumOverJB = 0.0;
01366                         int j = 0;
01367                         const REALTYPE* transMatrixPtr = &transMatrix[w];
01368                         for (; j < stateCountModFour; j += 4) {
01369                             sumOverJA += transMatrixPtr[j + 0] * partialsChildPtr[j + 0];
01370                             sumOverJB += transMatrixPtr[j + 1] * partialsChildPtr[j + 1];
01371                             sumOverJA += transMatrixPtr[j + 2] * partialsChildPtr[j + 2];
01372                             sumOverJB += transMatrixPtr[j + 3] * partialsChildPtr[j + 3];
01373                             
01374                         }
01375                         for (; j < kStateCount; j++) {
01376                             sumOverJA += transMatrixPtr[j] * partialsChildPtr[j];
01377                         }
01378 //                        for(int j = 0; j < kStateCount; j++) {
01379 //                            sumOverJ += transMatrix[w] * partialsChild[v + j];
01380 //                            w++;
01381 //                        }
01382                         integrationTmp[u] += (sumOverJA + sumOverJB) * partialsParent[v + i] * weight;
01383                         u++;
01384                         
01385                         w += kStateCount;
01386 
01387                         // increment for the extra column at the end
01388                         w += T_PAD;
01389                     }
01390                     v += kPartialsPaddedStateCount;
01391                 }
01392             }
01393                             
01394         }
01395         int u = 0;
01396         for(int k = 0; k < kPatternCount; k++) {
01397             REALTYPE sumOverI = 0.0;
01398             for(int i = 0; i < kStateCount; i++) {
01399                 sumOverI += freqs[i] * integrationTmp[u];
01400                 u++;
01401             }            
01402             
01403             if (scalingFactorsIndices[0] != BEAGLE_OP_NONE) {
01404                 int cumulativeScalingFactorIndex;
01405                 cumulativeScalingFactorIndex = scalingFactorsIndices[subsetIndex];
01406                 
01407                 const REALTYPE* cumulativeScaleFactors = gScaleBuffers[cumulativeScalingFactorIndex];
01408                 
01409                 if (subsetIndex == 0) {
01410                     indexMaxScale[k] = 0;
01411                     maxScaleFactor[k] = cumulativeScaleFactors[k];
01412                     for (int j = 1; j < count; j++) {
01413                         REALTYPE tmpScaleFactor;
01414                         tmpScaleFactor = gScaleBuffers[scalingFactorsIndices[j]][k];
01415                         
01416                         if (tmpScaleFactor > maxScaleFactor[k]) {
01417                             indexMaxScale[k] = j;
01418                             maxScaleFactor[k] = tmpScaleFactor;
01419                         }
01420                     }
01421                 }
01422                 
01423                 if (subsetIndex != indexMaxScale[k])
01424                     sumOverI *= exp((REALTYPE)(cumulativeScaleFactors[k] - maxScaleFactor[k]));
01425             }
01426 
01427 
01428             
01429             if (subsetIndex == 0) {
01430                 outLogLikelihoodsTmp[k] = sumOverI;
01431             } else if (subsetIndex == count - 1) {
01432                 REALTYPE tmpSum = outLogLikelihoodsTmp[k] + sumOverI;
01433                 
01434                 outLogLikelihoodsTmp[k] = log(tmpSum);
01435             } else {
01436                 outLogLikelihoodsTmp[k] += sumOverI;
01437             }
01438                         
01439         }        
01440         
01441     }
01442     
01443     if (scalingFactorsIndices[0] != BEAGLE_OP_NONE) {
01444         for(int i=0; i<kPatternCount; i++)
01445             outLogLikelihoodsTmp[i] += maxScaleFactor[i];
01446     }
01447     
01448 
01449     *outSumLogLikelihood = 0.0;
01450     for (int i = 0; i < kPatternCount; i++) {
01451         *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i];
01452     }
01453     
01454     if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0))
01455         returnCode = BEAGLE_ERROR_FLOATING_POINT;
01456     
01457     return returnCode;
01458 }
01459 
01460     
01461 BEAGLE_CPU_TEMPLATE
01462 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcEdgeLogLikelihoodsFirstDeriv(const int parIndex,
01463                                                                const int childIndex,
01464                                                                const int probIndex,
01465                                                                const int firstDerivativeIndex,
01466                                                                const int categoryWeightsIndex,
01467                                                                const int stateFrequenciesIndex,
01468                                                                const int scalingFactorsIndex,
01469                                                                double* outSumLogLikelihood,
01470                                                                double* outSumFirstDerivative) {
01471 
01472         assert(parIndex >= kTipCount);
01473 
01474     int returnCode = BEAGLE_SUCCESS;
01475 
01476         const REALTYPE* partialsParent = gPartials[parIndex];
01477         const REALTYPE* transMatrix = gTransitionMatrices[probIndex];
01478         const REALTYPE* firstDerivMatrix = gTransitionMatrices[firstDerivativeIndex];
01479     const REALTYPE* wt = gCategoryWeights[categoryWeightsIndex];
01480     const REALTYPE* freqs = gStateFrequencies[stateFrequenciesIndex];
01481 
01482 
01483         memset(integrationTmp, 0, (kPatternCount * kStateCount)*sizeof(REALTYPE));
01484         memset(firstDerivTmp, 0, (kPatternCount * kStateCount)*sizeof(REALTYPE));
01485 
01486         if (childIndex < kTipCount && gTipStates[childIndex]) { // Integrate against a state at the child
01487 
01488                 const int* statesChild = gTipStates[childIndex];
01489                 int v = 0; // Index for parent partials
01490 
01491                 for(int l = 0; l < kCategoryCount; l++) {
01492                         int u = 0; // Index in resulting product-partials (summed over categories)
01493                         const REALTYPE weight = wt[l];
01494                         for(int k = 0; k < kPatternCount; k++) {
01495 
01496                                 const int stateChild = statesChild[k];  // DISCUSSION PT: Does it make sense to change the order of the partials,
01497                                 // so we can interchange the patterCount and categoryCount loop order?
01498                                 int w =  l * kMatrixSize;
01499                                 for(int i = 0; i < kStateCount; i++) {
01500                                         integrationTmp[u] += transMatrix[w + stateChild] * partialsParent[v + i] * weight;
01501                                         firstDerivTmp[u] += firstDerivMatrix[w + stateChild] * partialsParent[v + i] * weight;
01502                                         u++;
01503 
01504                                         w += kTransPaddedStateCount;
01505                                 }
01506                                 v += kPartialsPaddedStateCount;
01507                         }
01508                 }
01509 
01510         } else { // Integrate against a partial at the child
01511 
01512                 const REALTYPE* partialsChild = gPartials[childIndex];
01513                 int v = 0;
01514 
01515                 for(int l = 0; l < kCategoryCount; l++) {
01516                         int u = 0;
01517                         const REALTYPE weight = wt[l];
01518                         for(int k = 0; k < kPatternCount; k++) {
01519                                 int w = l * kMatrixSize;
01520                                 for(int i = 0; i < kStateCount; i++) {
01521                                         double sumOverJ = 0.0;
01522                                         double sumOverJD1 = 0.0;
01523                                         for(int j = 0; j < kStateCount; j++) {
01524                                                 sumOverJ += transMatrix[w] * partialsChild[v + j];
01525                                                 sumOverJD1 += firstDerivMatrix[w] * partialsChild[v + j];
01526                                                 w++;
01527                                         }
01528 
01529                                         // increment for the extra column at the end
01530                                         w += T_PAD;
01531 
01532                                         integrationTmp[u] += sumOverJ * partialsParent[v + i] * weight;
01533                                         firstDerivTmp[u] += sumOverJD1 * partialsParent[v + i] * weight;
01534                                         u++;
01535                                 }
01536                                 v += kPartialsPaddedStateCount;
01537                         }
01538                 }
01539         }
01540 
01541         int u = 0;
01542         for(int k = 0; k < kPatternCount; k++) {
01543                 REALTYPE sumOverI = 0.0;
01544                 REALTYPE sumOverID1 = 0.0;
01545                 for(int i = 0; i < kStateCount; i++) {
01546                         sumOverI += freqs[i] * integrationTmp[u];
01547                         sumOverID1 += freqs[i] * firstDerivTmp[u];
01548                         u++;
01549                 }
01550 
01551         outLogLikelihoodsTmp[k] = log(sumOverI);
01552                 outFirstDerivativesTmp[k] = sumOverID1 / sumOverI;
01553         }
01554 
01555 
01556         if (scalingFactorsIndex != BEAGLE_OP_NONE) {
01557                 const REALTYPE* scalingFactors = gScaleBuffers[scalingFactorsIndex];
01558                 for(int k=0; k < kPatternCount; k++)
01559                         outLogLikelihoodsTmp[k] += scalingFactors[k];
01560         }
01561 
01562     *outSumLogLikelihood = 0.0;
01563     *outSumFirstDerivative = 0.0;
01564     for (int i = 0; i < kPatternCount; i++) {
01565         *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i];
01566 
01567         *outSumFirstDerivative += outFirstDerivativesTmp[i] * gPatternWeights[i];
01568     }
01569     
01570     if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0))
01571         returnCode = BEAGLE_ERROR_FLOATING_POINT;
01572 
01573     return returnCode;
01574 }
01575 
01576 BEAGLE_CPU_TEMPLATE
01577 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcEdgeLogLikelihoodsSecondDeriv(const int parIndex,
01578                                                                 const int childIndex,
01579                                                                 const int probIndex,
01580                                                                 const int firstDerivativeIndex,
01581                                                                 const int secondDerivativeIndex,
01582                                                                 const int categoryWeightsIndex,
01583                                                                 const int stateFrequenciesIndex,
01584                                                                 const int scalingFactorsIndex,
01585                                                                 double* outSumLogLikelihood,
01586                                                                 double* outSumFirstDerivative,
01587                                                                 double* outSumSecondDerivative) {
01588 
01589         assert(parIndex >= kTipCount);
01590 
01591     int returnCode = BEAGLE_SUCCESS;
01592 
01593         const REALTYPE* partialsParent = gPartials[parIndex];
01594         const REALTYPE* transMatrix = gTransitionMatrices[probIndex];
01595         const REALTYPE* firstDerivMatrix = gTransitionMatrices[firstDerivativeIndex];
01596         const REALTYPE* secondDerivMatrix = gTransitionMatrices[secondDerivativeIndex];
01597     const REALTYPE* wt = gCategoryWeights[categoryWeightsIndex];
01598     const REALTYPE* freqs = gStateFrequencies[stateFrequenciesIndex];
01599 
01600 
01601         memset(integrationTmp, 0, (kPatternCount * kStateCount)*sizeof(REALTYPE));
01602         memset(firstDerivTmp, 0, (kPatternCount * kStateCount)*sizeof(REALTYPE));
01603         memset(secondDerivTmp, 0, (kPatternCount * kStateCount)*sizeof(REALTYPE));
01604 
01605         if (childIndex < kTipCount && gTipStates[childIndex]) { // Integrate against a state at the child
01606 
01607                 const int* statesChild = gTipStates[childIndex];
01608                 int v = 0; // Index for parent partials
01609 
01610                 for(int l = 0; l < kCategoryCount; l++) {
01611                         int u = 0; // Index in resulting product-partials (summed over categories)
01612                         const REALTYPE weight = wt[l];
01613                         for(int k = 0; k < kPatternCount; k++) {
01614 
01615                                 const int stateChild = statesChild[k];  // DISCUSSION PT: Does it make sense to change the order of the partials,
01616                                 // so we can interchange the patterCount and categoryCount loop order?
01617                                 int w =  l * kMatrixSize;
01618                                 for(int i = 0; i < kStateCount; i++) {
01619                                         integrationTmp[u] += transMatrix[w + stateChild] * partialsParent[v + i] * weight;
01620                                         firstDerivTmp[u] += firstDerivMatrix[w + stateChild] * partialsParent[v + i] * weight;
01621                                         secondDerivTmp[u] += secondDerivMatrix[w + stateChild] * partialsParent[v + i] * weight;
01622                                         u++;
01623 
01624                                         w += kTransPaddedStateCount;
01625                                 }
01626                                 v += kPartialsPaddedStateCount;
01627                         }
01628                 }
01629 
01630         } else { // Integrate against a partial at the child
01631 
01632                 const REALTYPE* partialsChild = gPartials[childIndex];
01633                 int v = 0;
01634 
01635                 for(int l = 0; l < kCategoryCount; l++) {
01636                         int u = 0;
01637                         const REALTYPE weight = wt[l];
01638                         for(int k = 0; k < kPatternCount; k++) {
01639                                 int w = l * kMatrixSize;
01640                                 for(int i = 0; i < kStateCount; i++) {
01641                                         double sumOverJ = 0.0;
01642                                         double sumOverJD1 = 0.0;
01643                                         double sumOverJD2 = 0.0;
01644                                         for(int j = 0; j < kStateCount; j++) {
01645                                                 sumOverJ += transMatrix[w] * partialsChild[v + j];
01646                                                 sumOverJD1 += firstDerivMatrix[w] * partialsChild[v + j];
01647                                                 sumOverJD2 += secondDerivMatrix[w] * partialsChild[v + j];
01648                                                 w++;
01649                                         }
01650 
01651                                         // increment for the extra column at the end
01652                                         w += T_PAD;
01653 
01654                                         integrationTmp[u] += sumOverJ * partialsParent[v + i] * weight;
01655                                         firstDerivTmp[u] += sumOverJD1 * partialsParent[v + i] * weight;
01656                                         secondDerivTmp[u] += sumOverJD2 * partialsParent[v + i] * weight;
01657                                         u++;
01658                                 }
01659                                 v += kPartialsPaddedStateCount;
01660                         }
01661                 }
01662         }
01663 
01664         int u = 0;
01665         for(int k = 0; k < kPatternCount; k++) {
01666                 REALTYPE sumOverI = 0.0;
01667                 REALTYPE sumOverID1 = 0.0;
01668                 REALTYPE sumOverID2 = 0.0;
01669                 for(int i = 0; i < kStateCount; i++) {
01670                         sumOverI += freqs[i] * integrationTmp[u];
01671                         sumOverID1 += freqs[i] * firstDerivTmp[u];
01672                         sumOverID2 += freqs[i] * secondDerivTmp[u];
01673                         u++;
01674                 }
01675 
01676         outLogLikelihoodsTmp[k] = log(sumOverI);
01677                 outFirstDerivativesTmp[k] = sumOverID1 / sumOverI;
01678                 outSecondDerivativesTmp[k] = sumOverID2 / sumOverI - outFirstDerivativesTmp[k] * outFirstDerivativesTmp[k];
01679         }
01680 
01681 
01682         if (scalingFactorsIndex != BEAGLE_OP_NONE) {
01683                 const REALTYPE* scalingFactors = gScaleBuffers[scalingFactorsIndex];
01684                 for(int k=0; k < kPatternCount; k++)
01685                         outLogLikelihoodsTmp[k] += scalingFactors[k];
01686         }
01687 
01688     *outSumLogLikelihood = 0.0;
01689     *outSumFirstDerivative = 0.0;
01690     *outSumSecondDerivative = 0.0;
01691     for (int i = 0; i < kPatternCount; i++) {
01692         *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i];
01693 
01694         *outSumFirstDerivative += outFirstDerivativesTmp[i] * gPatternWeights[i];
01695 
01696         *outSumSecondDerivative += outSecondDerivativesTmp[i] * gPatternWeights[i];
01697     }
01698 
01699     if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0))
01700         returnCode = BEAGLE_ERROR_FLOATING_POINT;
01701     
01702     return returnCode;
01703 }
01704 
01705 
01706 
01707 BEAGLE_CPU_TEMPLATE
01708 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::block(void) {
01709         // Do nothing.
01710         return BEAGLE_SUCCESS;
01711 }
01712 
01714 // private methods
01715 
01716 /*
01717  * Re-scales the partial likelihoods such that the largest is one.
01718  */
01719 BEAGLE_CPU_TEMPLATE
01720 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::rescalePartials(REALTYPE* destP,
01721                 REALTYPE* scaleFactors,
01722                 REALTYPE* cumulativeScaleFactors,
01723         const int  fillWithOnes) {
01724     if (DEBUGGING_OUTPUT) {
01725         std::cerr << "destP (before rescale): \n";// << destP << "\n";
01726         for(int i=0; i<kPartialsSize; i++)
01727             fprintf(stderr,"destP[%d] = %.5f\n",i,destP[i]);
01728     }
01729 
01730     // TODO None of the code below has been optimized.
01731     for (int k = 0; k < kPatternCount; k++) {
01732         REALTYPE max = 0;
01733         const int patternOffset = k * kPartialsPaddedStateCount;
01734         for (int l = 0; l < kCategoryCount; l++) {
01735             int offset = l * kPaddedPatternCount * kPartialsPaddedStateCount + patternOffset;
01736             for (int i = 0; i < kStateCount; i++) {
01737                 if(destP[offset] > max)
01738                     max = destP[offset];
01739                 offset++;
01740             }
01741         }
01742         
01743         if (max == 0)
01744             max = 1.0;
01745                         
01746         REALTYPE oneOverMax = REALTYPE(1.0) / max;
01747         for (int l = 0; l < kCategoryCount; l++) {
01748             int offset = l * kPaddedPatternCount * kPartialsPaddedStateCount + patternOffset;
01749             for (int i = 0; i < kStateCount; i++)
01750                 destP[offset++] *= oneOverMax;
01751         }
01752 
01753         if (kFlags & BEAGLE_FLAG_SCALERS_LOG) {
01754             REALTYPE logMax = log(max);
01755             scaleFactors[k] = logMax;
01756             if( cumulativeScaleFactors != NULL )
01757                 cumulativeScaleFactors[k] += logMax;
01758         } else {
01759             scaleFactors[k] = max;
01760             if( cumulativeScaleFactors != NULL )
01761                 cumulativeScaleFactors[k] += log(max);
01762         }
01763     }
01764     if (DEBUGGING_OUTPUT) {
01765         for(int i=0; i<kPatternCount; i++)
01766             fprintf(stderr,"new scaleFactor[%d] = %.5f\n",i,scaleFactors[i]);
01767     }
01768 }
01769     
01770 BEAGLE_CPU_TEMPLATE
01771 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::autoRescalePartials(REALTYPE* destP,
01772                                               signed short* scaleFactors) {
01773     
01774     
01775     for (int k = 0; k < kPatternCount; k++) {
01776         REALTYPE max = 0;
01777         const int patternOffset = k * kPartialsPaddedStateCount;
01778         for (int l = 0; l < kCategoryCount; l++) {
01779             int offset = l * kPaddedPatternCount * kPartialsPaddedStateCount + patternOffset;
01780             for (int i = 0; i < kStateCount; i++) {
01781                 if(destP[offset] > max)
01782                     max = destP[offset];
01783                 offset++;
01784             }
01785         }
01786         
01787         int expMax;
01788         frexp(max, &expMax);
01789         scaleFactors[k] = expMax;
01790         
01791         if (expMax != 0) {
01792             for (int l = 0; l < kCategoryCount; l++) {
01793                 int offset = l * kPaddedPatternCount * kPartialsPaddedStateCount + patternOffset;
01794                 for (int i = 0; i < kStateCount; i++)
01795                     destP[offset++] *= pow(2.0, -expMax);
01796             }
01797         }
01798     }
01799 }
01800 
01801 /*
01802  * Calculates partial likelihoods at a node when both children have states.
01803  */
01804 BEAGLE_CPU_TEMPLATE
01805 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcStatesStates(REALTYPE* destP,
01806                                      const int* states1,
01807                                      const REALTYPE* matrices1,
01808                                      const int* states2,
01809                                      const REALTYPE* matrices2) {
01810 
01811 #pragma omp parallel for num_threads(kCategoryCount)
01812     for (int l = 0; l < kCategoryCount; l++) {
01813         int v = l*kPartialsPaddedStateCount*kPatternCount;
01814         for (int k = 0; k < kPatternCount; k++) {
01815             const int state1 = states1[k];
01816             const int state2 = states2[k];
01817             if (DEBUGGING_OUTPUT) {
01818                 std::cerr << "calcStatesStates s1 = " << state1 << '\n';
01819                 std::cerr << "calcStatesStates s2 = " << state2 << '\n';
01820             }
01821             int w = l * kMatrixSize;
01822             for (int i = 0; i < kStateCount; i++) {
01823                 destP[v] = matrices1[w + state1] * matrices2[w + state2];
01824                 v++;
01825 
01826                 w += kTransPaddedStateCount;
01827             }
01828             v += P_PAD;
01829         }
01830     }
01831 }
01832 
01833 BEAGLE_CPU_TEMPLATE
01834 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcStatesStatesFixedScaling(REALTYPE* destP,
01835                                               const int* child1States,
01836                                            const REALTYPE* child1TransMat,
01837                                               const int* child2States,
01838                                            const REALTYPE* child2TransMat,
01839                                            const REALTYPE* scaleFactors) {
01840 #pragma omp parallel for num_threads(kCategoryCount)
01841     for (int l = 0; l < kCategoryCount; l++) {
01842         int v = l*kPartialsPaddedStateCount*kPatternCount;
01843         for (int k = 0; k < kPatternCount; k++) {
01844             const int state1 = child1States[k];
01845             const int state2 = child2States[k];
01846             int w = l * kMatrixSize;
01847             REALTYPE scaleFactor = scaleFactors[k];
01848             for (int i = 0; i < kStateCount; i++) {
01849                 destP[v] = child1TransMat[w + state1] *
01850                            child2TransMat[w + state2] / scaleFactor;
01851                 v++;
01852 
01853                 w += kTransPaddedStateCount;
01854             }
01855             v += P_PAD;
01856         }
01857     }
01858 }
01859 
01860 /*
01861  * Calculates partial likelihoods at a node when one child has states and one has partials.
01862  */
01863 BEAGLE_CPU_TEMPLATE
01864 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcStatesPartials(REALTYPE* destP,
01865                                        const int* states1,
01866                                        const REALTYPE* matrices1,
01867                                        const REALTYPE* partials2,
01868                                        const REALTYPE* matrices2) {
01869     int matrixIncr = kStateCount;
01870 
01871     // increment for the extra column at the end
01872     matrixIncr += T_PAD;
01873 
01874 
01875         int stateCountModFour = (kStateCount / 4) * 4;
01876 
01877 #pragma omp parallel for num_threads(kCategoryCount)
01878     for (int l = 0; l < kCategoryCount; l++) {
01879         int v = l*kPartialsPaddedStateCount*kPatternCount;
01880         int matrixOffset = l*kMatrixSize;
01881         const REALTYPE* partials2Ptr = &partials2[v];
01882         REALTYPE* destPtr = &destP[v];
01883         for (int k = 0; k < kPatternCount; k++) {
01884             int w = l * kMatrixSize;
01885             int state1 = states1[k];
01886             for (int i = 0; i < kStateCount; i++) {
01887                 const REALTYPE* matrices2Ptr = matrices2 + matrixOffset + i * matrixIncr;
01888                 REALTYPE tmp = matrices1[w + state1];
01889                 REALTYPE sumA = 0.0;
01890                                 REALTYPE sumB = 0.0;                            
01891                                 int j = 0;
01892                 for (; j < stateCountModFour; j += 4) {
01893                     sumA += matrices2Ptr[j + 0] * partials2Ptr[j + 0];                                  
01894                                         sumB += matrices2Ptr[j + 1] * partials2Ptr[j + 1];
01895                                         sumA += matrices2Ptr[j + 2] * partials2Ptr[j + 2];                                      
01896                                         sumB += matrices2Ptr[j + 3] * partials2Ptr[j + 3];                                      
01897                 }
01898                                 for (; j < kStateCount; j++) {
01899                                         sumA += matrices2Ptr[j] * partials2Ptr[j];                                      
01900                                 }                               
01901                 
01902                 w += matrixIncr;
01903                 
01904                 *(destPtr++) = tmp * (sumA + sumB);
01905             }
01906             destPtr += P_PAD;
01907             partials2Ptr += kPartialsPaddedStateCount;
01908         }
01909     }
01910 }
01911 
01912 BEAGLE_CPU_TEMPLATE
01913 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcStatesPartialsFixedScaling(REALTYPE* destP,
01914                                                 const int* states1,
01915                                              const REALTYPE* matrices1,
01916                                              const REALTYPE* partials2,
01917                                              const REALTYPE* matrices2,
01918                                              const REALTYPE* scaleFactors) {
01919     int matrixIncr = kStateCount;
01920 
01921     // increment for the extra column at the end
01922     matrixIncr += T_PAD;
01923 
01924         int stateCountModFour = (kStateCount / 4) * 4;
01925 
01926 #pragma omp parallel for num_threads(kCategoryCount)
01927     for (int l = 0; l < kCategoryCount; l++) {
01928         int v = l*kPartialsPaddedStateCount*kPatternCount;
01929         int matrixOffset = l*kMatrixSize;
01930         const REALTYPE* partials2Ptr = &partials2[v];
01931         REALTYPE* destPtr = &destP[v];
01932         for (int k = 0; k < kPatternCount; k++) {
01933             int w = l * kMatrixSize;
01934             int state1 = states1[k];
01935                         REALTYPE oneOverScaleFactor = REALTYPE(1.0) / scaleFactors[k];
01936             for (int i = 0; i < kStateCount; i++) {
01937                 const REALTYPE* matrices2Ptr = matrices2 + matrixOffset + i * matrixIncr;
01938                 REALTYPE tmp = matrices1[w + state1];
01939                 REALTYPE sumA = 0.0;
01940                                 REALTYPE sumB = 0.0;                            
01941                                 int j = 0;
01942                 for (; j < stateCountModFour; j += 4) {
01943                     sumA += matrices2Ptr[j + 0] * partials2Ptr[j + 0];                                  
01944                                         sumB += matrices2Ptr[j + 1] * partials2Ptr[j + 1];
01945                                         sumA += matrices2Ptr[j + 2] * partials2Ptr[j + 2];                                      
01946                                         sumB += matrices2Ptr[j + 3] * partials2Ptr[j + 3];                                      
01947                 }
01948                                 for (; j < kStateCount; j++) {
01949                                         sumA += matrices2Ptr[j] * partials2Ptr[j];                                      
01950                                 }                               
01951                 
01952                 w += matrixIncr;
01953                 
01954                 *(destPtr++) = tmp * (sumA + sumB) * oneOverScaleFactor;
01955             }
01956                         destPtr += P_PAD;
01957             partials2Ptr += kPartialsPaddedStateCount;
01958         }
01959     }                                                                                    
01960 }
01961 
01962 /*
01963  * Calculates partial likelihoods at a node when both children have partials.
01964  */
01965 BEAGLE_CPU_TEMPLATE
01966 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcPartialsPartials(REALTYPE* destP,
01967                                          const REALTYPE* partials1,
01968                                          const REALTYPE* matrices1,
01969                                          const REALTYPE* partials2,
01970                                          const REALTYPE* matrices2) {
01971     int matrixIncr = kStateCount;
01972 
01973     // increment for the extra column at the end
01974     matrixIncr += T_PAD;
01975 
01976         int stateCountModFour = (kStateCount / 4) * 4;
01977 
01978 #pragma omp parallel for num_threads(kCategoryCount)
01979     for (int l = 0; l < kCategoryCount; l++) {
01980         int v = l*kPartialsPaddedStateCount*kPatternCount;
01981         int matrixOffset = l*kMatrixSize;
01982         const REALTYPE* partials1Ptr = &partials1[v];
01983         const REALTYPE* partials2Ptr = &partials2[v];
01984         REALTYPE* destPtr = &destP[v];
01985         for (int k = 0; k < kPatternCount; k++) {
01986 
01987             for (int i = 0; i < kStateCount; i++) {
01988                 const REALTYPE* matrices1Ptr = matrices1 + matrixOffset + i * matrixIncr;
01989                 const REALTYPE* matrices2Ptr = matrices2 + matrixOffset + i * matrixIncr;
01990                 REALTYPE sum1A = 0.0, sum2A = 0.0;
01991                                 REALTYPE sum1B = 0.0, sum2B = 0.0;
01992                                 int j = 0;
01993                                 for (; j < stateCountModFour; j += 4) {
01994                                         sum1A += matrices1Ptr[j + 0] * partials1Ptr[j + 0];
01995                                         sum2A += matrices2Ptr[j + 0] * partials2Ptr[j + 0];
01996                                         
01997                                         sum1B += matrices1Ptr[j + 1] * partials1Ptr[j + 1];
01998                                         sum2B += matrices2Ptr[j + 1] * partials2Ptr[j + 1];
01999 
02000                                         sum1A += matrices1Ptr[j + 2] * partials1Ptr[j + 2];
02001                                         sum2A += matrices2Ptr[j + 2] * partials2Ptr[j + 2];
02002                                         
02003                                         sum1B += matrices1Ptr[j + 3] * partials1Ptr[j + 3];
02004                                         sum2B += matrices2Ptr[j + 3] * partials2Ptr[j + 3];                                                                             
02005                                 }                               
02006                                 
02007                 for (; j < kStateCount; j++) {
02008                     sum1A += matrices1Ptr[j] * partials1Ptr[j];
02009                     sum2A += matrices2Ptr[j] * partials2Ptr[j];
02010                 }
02011 
02012                 *(destPtr++) = (sum1A + sum1B) * (sum2A + sum2B);
02013             }
02014             destPtr += P_PAD;
02015             partials1Ptr += kPartialsPaddedStateCount;
02016             partials2Ptr += kPartialsPaddedStateCount;
02017         }
02018     }
02019 }
02020     
02021 BEAGLE_CPU_TEMPLATE
02022 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcPartialsPartialsFixedScaling(REALTYPE* destP,
02023                                                const REALTYPE* partials1,
02024                                                const REALTYPE* matrices1,
02025                                                const REALTYPE* partials2,
02026                                                const REALTYPE* matrices2,
02027                                                const REALTYPE* scaleFactors) {
02028     int matrixIncr = kStateCount;
02029 
02030     // increment for the extra column at the end
02031     matrixIncr += T_PAD;
02032 
02033         int stateCountModFour = (kStateCount / 4) * 4;
02034     
02035 #pragma omp parallel for num_threads(kCategoryCount)
02036     for (int l = 0; l < kCategoryCount; l++) {
02037         int v = l*kPartialsPaddedStateCount*kPatternCount;
02038         int matrixOffset = l*kMatrixSize;
02039         const REALTYPE* partials1Ptr = &partials1[v];
02040         const REALTYPE* partials2Ptr = &partials2[v];
02041         REALTYPE* destPtr = &destP[v];
02042         for (int k = 0; k < kPatternCount; k++) {
02043             REALTYPE oneOverScaleFactor = REALTYPE(1.0) / scaleFactors[k];
02044             for (int i = 0; i < kStateCount; i++) {
02045                 const REALTYPE* matrices1Ptr = matrices1 + matrixOffset + i * matrixIncr;
02046                 const REALTYPE* matrices2Ptr = matrices2 + matrixOffset + i * matrixIncr;
02047                 REALTYPE sum1A = 0.0, sum2A = 0.0;
02048                                 REALTYPE sum1B = 0.0, sum2B = 0.0;
02049                                 int j = 0;
02050                                 for (; j < stateCountModFour; j += 4) {
02051                                         sum1A += matrices1Ptr[j + 0] * partials1Ptr[j + 0];
02052                                         sum2A += matrices2Ptr[j + 0] * partials2Ptr[j + 0];
02053                                         
02054                                         sum1B += matrices1Ptr[j + 1] * partials1Ptr[j + 1];
02055                                         sum2B += matrices2Ptr[j + 1] * partials2Ptr[j + 1];
02056 
02057                                         sum1A += matrices1Ptr[j + 2] * partials1Ptr[j + 2];
02058                                         sum2A += matrices2Ptr[j + 2] * partials2Ptr[j + 2];
02059                                         
02060                                         sum1B += matrices1Ptr[j + 3] * partials1Ptr[j + 3];
02061                                         sum2B += matrices2Ptr[j + 3] * partials2Ptr[j + 3];                                                                             
02062                                 }                               
02063                                 
02064                 for (; j < kStateCount; j++) {
02065                     sum1A += matrices1Ptr[j] * partials1Ptr[j];
02066                     sum2A += matrices2Ptr[j] * partials2Ptr[j];
02067                 }
02068 
02069                 *(destPtr++) = (sum1A + sum1B) * (sum2A + sum2B) * oneOverScaleFactor;
02070             }
02071                         destPtr += P_PAD;
02072                         partials1Ptr += kPartialsPaddedStateCount;
02073             partials2Ptr += kPartialsPaddedStateCount;
02074         }
02075     }
02076 }
02077     
02078 BEAGLE_CPU_TEMPLATE
02079 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcPartialsPartialsAutoScaling(REALTYPE* destP,
02080                                                                const REALTYPE* partials1,
02081                                                                const REALTYPE* matrices1,
02082                                                                const REALTYPE* partials2,
02083                                                                const REALTYPE* matrices2,
02084                                                                int* activateScaling) {
02085     
02086 #pragma omp parallel for num_threads(kCategoryCount)
02087     for (int l = 0; l < kCategoryCount; l++) {
02088         int u = l*kPartialsPaddedStateCount*kPatternCount;
02089         int v = l*kPartialsPaddedStateCount*kPatternCount;
02090         for (int k = 0; k < kPatternCount; k++) {
02091             int w = l * kMatrixSize;
02092             for (int i = 0; i < kStateCount; i++) {
02093                 REALTYPE sum1 = 0.0, sum2 = 0.0;
02094                 for (int j = 0; j < kStateCount; j++) {
02095                     sum1 += matrices1[w] * partials1[v + j];
02096                     sum2 += matrices2[w] * partials2[v + j];
02097                     w++;
02098                 }
02099 
02100                 // increment for the extra column at the end
02101                 w += T_PAD;
02102 
02103                 destP[u] = sum1 * sum2;
02104 
02105                 if (*activateScaling == 0) {
02106                     int expTmp;
02107                     frexp(destP[u], &expTmp);
02108                     if (abs(expTmp) > scalingExponentThreshhold) 
02109                         *activateScaling = 1;
02110                 }
02111                 
02112                 u++;
02113             }
02114             u += P_PAD;
02115             v += kPartialsPaddedStateCount;
02116         }
02117     }
02118 }
02119 
02120 BEAGLE_CPU_TEMPLATE
02121 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::getPaddedPatternsModulus() {
02122         // Padding only necessary for SSE implementations that vectorize across patterns
02123         return 1;  // No padding
02124 }
02125 
02126 BEAGLE_CPU_TEMPLATE
02127 void* BeagleCPUImpl<BEAGLE_CPU_GENERIC>::mallocAligned(size_t size) {
02128         void *ptr = (void *) NULL;
02129 
02130 #if defined (__APPLE__) || defined(WIN32)
02131         /*
02132          presumably malloc on OS X always returns
02133          a 16-byte aligned pointer
02134          */
02135         /* Windows malloc() always gives 16-byte alignment */    
02136         assert(size > 0);
02137         ptr = malloc(size);
02138         if(ptr == (void*)NULL) {
02139                 assert(0);
02140         }
02141 #else
02142     #if (T_PAD == 1)    
02143         const size_t align = 32;
02144     #else // T_PAD == 2
02145         const size_t align = 16;
02146     #endif
02147         int res;
02148         res = posix_memalign(&ptr, align, size);
02149         if (res != 0) {
02150                 assert(0);
02151         }
02152 #endif
02153 
02154         return ptr;
02155 }
02156 
02158 // BeagleCPUImplFactory public methods
02159 BEAGLE_CPU_FACTORY_TEMPLATE
02160 BeagleImpl* BeagleCPUImplFactory<BEAGLE_CPU_FACTORY_GENERIC>::createImpl(int tipCount,
02161                                              int partialsBufferCount,
02162                                              int compactBufferCount,
02163                                              int stateCount,
02164                                              int patternCount,
02165                                              int eigenBufferCount,
02166                                              int matrixBufferCount,
02167                                              int categoryCount,
02168                                              int scaleBufferCount,
02169                                              int resourceNumber,
02170                                              long preferenceFlags,
02171                                              long requirementFlags,
02172                                              int* errorCode) {
02173 
02174     BeagleImpl* impl = new BeagleCPUImpl<REALTYPE, T_PAD_DEFAULT, P_PAD_DEFAULT>();
02175 
02176     try {
02177         *errorCode =
02178             impl->createInstance(tipCount, partialsBufferCount, compactBufferCount, stateCount,
02179                                  patternCount, eigenBufferCount, matrixBufferCount,
02180                                  categoryCount,scaleBufferCount, resourceNumber, preferenceFlags, requirementFlags);
02181         if (*errorCode == BEAGLE_SUCCESS) {
02182             return impl;
02183         }
02184         delete impl;
02185         return NULL;
02186     }
02187     catch(...) {
02188         if (DEBUGGING_OUTPUT)
02189             std::cerr << "exception in initialize\n";
02190         delete impl;
02191         throw;
02192     }
02193 
02194     delete impl;
02195 
02196     return NULL;
02197 }
02198 
02199 
02200 BEAGLE_CPU_FACTORY_TEMPLATE
02201 const char* BeagleCPUImplFactory<BEAGLE_CPU_FACTORY_GENERIC>::getName() {
02202         return getBeagleCPUName<BEAGLE_CPU_FACTORY_GENERIC>();
02203 }
02204 
02205 BEAGLE_CPU_FACTORY_TEMPLATE
02206 const long BeagleCPUImplFactory<BEAGLE_CPU_FACTORY_GENERIC>::getFlags() {
02207     long flags = BEAGLE_FLAG_COMPUTATION_SYNCH |
02208                  BEAGLE_FLAG_SCALING_MANUAL | BEAGLE_FLAG_SCALING_ALWAYS | BEAGLE_FLAG_SCALING_AUTO | BEAGLE_FLAG_SCALING_DYNAMIC |
02209                  BEAGLE_FLAG_THREADING_NONE |
02210                  BEAGLE_FLAG_PROCESSOR_CPU |
02211                  BEAGLE_FLAG_VECTOR_NONE |
02212                  BEAGLE_FLAG_SCALERS_LOG | BEAGLE_FLAG_SCALERS_RAW |
02213                  BEAGLE_FLAG_EIGEN_COMPLEX | BEAGLE_FLAG_EIGEN_REAL |
02214                  BEAGLE_FLAG_INVEVEC_STANDARD | BEAGLE_FLAG_INVEVEC_TRANSPOSED;
02215         if (DOUBLE_PRECISION)
02216                 flags |= BEAGLE_FLAG_PRECISION_DOUBLE;
02217         else
02218                 flags |= BEAGLE_FLAG_PRECISION_SINGLE;
02219     return flags;
02220 }
02221 
02222 }       // namespace cpu
02223 }       // namespace beagle
02224 
02225 #endif // BEAGLE_CPU_IMPL_GENERAL_HPP
02226